249 lines
7.7 KiB
C#
249 lines
7.7 KiB
C#
using bzit.bomg.Models;
|
|
using HtmlAgilityPack;
|
|
using RyzStudio.Net;
|
|
using System;
|
|
using System.Net;
|
|
|
|
namespace BookmarkManager
|
|
{
|
|
/// <summary>
/// Downloads a web page and extracts bookmark metadata (title, description and
/// icon address) from its HTML using XPath lookups.
/// </summary>
public class WebParser
{
    // Alphabets fed to XPath translate() so attribute values compare case-insensitively.
    private const string upperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    private const string lowerCaseLetters = "abcdefghijklmnopqrstuvwxyz";

    /// <summary>HTTP client used to download pages; created on first use.</summary>
    protected HttpWeb webClient = null;

    /// <summary>
    /// Downloads <paramref name="url"/> and builds a <see cref="BookmarkItem"/> from the page markup.
    /// </summary>
    /// <param name="url">Address of the page to inspect; stored unchanged as the item's site address.</param>
    /// <returns>
    /// The populated item, or <c>null</c> when no page source could be retrieved.
    /// The icon address is resolved against <paramref name="url"/> when that is possible.
    /// </returns>
    public BookmarkItem RetrieveDetails(string url)
    {
        string sourceCode = retrieveSourceCode(url);
        if (string.IsNullOrWhiteSpace(sourceCode))
        {
            return null;
        }

        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(sourceCode);

        BookmarkItem rs = new BookmarkItem();
        rs.SiteName = parseSiteTitle(document);
        rs.SiteAddress = url;
        rs.SiteDescription = parseSiteDescription(document);

        // resolve relative URL
        rs.FaviconAddress = resolveAddress(url, parseSiteIcon(document));

        return rs;
    }

    /// <summary>
    /// Fetches the body of <paramref name="url"/> through <see cref="webClient"/>.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <returns>
    /// The response body when the reported status code is 200, 301 or 302;
    /// otherwise <c>null</c>. Any exception raised by the request also yields <c>null</c>.
    /// </returns>
    protected string retrieveSourceCode(string url)
    {
        // A blank address cannot be requested; skip the round trip.
        if (string.IsNullOrWhiteSpace(url))
        {
            return null;
        }

        if (webClient == null)
        {
            webClient = new HttpWeb();
        }

        try
        {
            string sourceCode;
            int statusCode = webClient.GetResponse(out sourceCode, url);

            bool accepted = (statusCode == (int)HttpStatusCode.OK)
                || (statusCode == (int)HttpStatusCode.MovedPermanently)
                || (statusCode == (int)HttpStatusCode.Found);

            return accepted ? sourceCode : null;
        }
        catch (Exception)
        {
            // Deliberately best-effort: a failed download is reported as "no source".
            return null;
        }
    }

    /// <summary>
    /// Reads the page description from the first meta tag that provides one.
    /// </summary>
    /// <param name="doc">Parsed page.</param>
    /// <returns>The description, or an empty string when none of the known tags has a value.</returns>
    protected string parseSiteDescription(HtmlDocument doc)
    {
        return firstAttributeValue(
            doc,
            "content",
            "//meta[@name='description']",
            "//meta[@property='og:description']",
            "//meta[@name='twitter:description']",
            "//meta[@itemprop='description']");
    }

    /// <summary>
    /// Reads the page icon address, preferring <c>link</c> icons over social-preview images.
    /// </summary>
    /// <param name="doc">Parsed page.</param>
    /// <returns>
    /// The icon address exactly as written in the page (it may be relative), or an empty
    /// string when the page declares none. No default such as "/favicon.ico" is assumed.
    /// </returns>
    protected string parseSiteIcon(HtmlDocument doc)
    {
        string rs = firstAttributeValue(
            doc,
            "href",
            caseInsensitiveMatch("link", "rel", "shortcut icon"),
            caseInsensitiveMatch("link", "rel", "icon"),
            caseInsensitiveMatch("link", "rel", "apple-touch-icon"),
            caseInsensitiveMatch("link", "rel", "apple-touch-icon-precomposed"));

        if (string.IsNullOrWhiteSpace(rs))
        {
            rs = firstAttributeValue(
                doc,
                "content",
                caseInsensitiveMatch("meta", "property", "og:image"),
                caseInsensitiveMatch("meta", "name", "twitter:image"),
                caseInsensitiveMatch("meta", "itemprop", "image"));
        }

        return rs;
    }

    /// <summary>
    /// Reads the page title from the <c>title</c> element, falling back to title-like meta tags.
    /// </summary>
    /// <param name="doc">Parsed page.</param>
    /// <returns>The trimmed title, or an empty string when nothing usable is found.</returns>
    protected string parseSiteTitle(HtmlDocument doc)
    {
        string rs = parseTagValue(doc, "//title", string.Empty);

        if (string.IsNullOrWhiteSpace(rs))
        {
            rs = firstAttributeValue(
                doc,
                "content",
                "//meta[@property='og:title']",
                "//meta[@name='twitter:title']",
                "//meta[@property='og:site_name']",
                "//meta[@itemprop='name']");
        }

        return rs?.Trim() ?? string.Empty;
    }

    /// <summary>
    /// Returns the decoded inner HTML of the first node matching <paramref name="xpath"/>
    /// that has non-blank content.
    /// </summary>
    /// <param name="doc">Parsed page.</param>
    /// <param name="xpath">XPath expression selecting candidate nodes.</param>
    /// <param name="defaultValue">Value returned when no candidate has usable content.</param>
    /// <returns>
    /// The HTML-decoded content with carriage returns removed, line feeds replaced by spaces
    /// and surrounding whitespace trimmed; otherwise <paramref name="defaultValue"/>.
    /// </returns>
    protected string parseTagValue(HtmlDocument doc, string xpath, string defaultValue = "")
    {
        HtmlNodeCollection hnc = doc.DocumentNode.SelectNodes(xpath);
        if (hnc == null)
        {
            return defaultValue;
        }

        foreach (HtmlNode hn in hnc)
        {
            string raw = hn.InnerHtml;
            if (string.IsNullOrWhiteSpace(raw))
            {
                continue;
            }

            string rs = WebUtility.HtmlDecode(raw).Replace("\r", "").Replace("\n", " ").Trim();
            if (!string.IsNullOrWhiteSpace(rs))
            {
                return rs;
            }
        }

        return defaultValue;
    }

    /// <summary>
    /// Returns the decoded value of attribute <paramref name="attr"/> from the first node
    /// matching <paramref name="xpath"/> that carries a non-blank value.
    /// </summary>
    /// <param name="doc">Parsed page.</param>
    /// <param name="xpath">XPath expression selecting candidate nodes.</param>
    /// <param name="attr">Name of the attribute to read.</param>
    /// <param name="defaultValue">Value returned when no candidate has a usable attribute.</param>
    /// <returns>
    /// The HTML-decoded, trimmed attribute value; otherwise <paramref name="defaultValue"/>.
    /// </returns>
    protected string parseTagValue_Attr(HtmlDocument doc, string xpath, string attr, string defaultValue = "")
    {
        HtmlNodeCollection hnc = doc.DocumentNode.SelectNodes(xpath);
        if (hnc == null)
        {
            return defaultValue;
        }

        foreach (HtmlNode hn in hnc)
        {
            HtmlAttribute attribute = hn.Attributes[attr];
            if ((attribute == null) || string.IsNullOrWhiteSpace(attribute.Value))
            {
                continue;
            }

            // Decode before trimming and re-check, mirroring parseTagValue: a value such as
            // "&nbsp;" is blank once decoded and must not hide a later usable node.
            string rs = WebUtility.HtmlDecode(attribute.Value).Trim();
            if (!string.IsNullOrWhiteSpace(rs))
            {
                return rs;
            }
        }

        return defaultValue;
    }

    /// <summary>
    /// Tries each XPath in order and returns the first non-blank value of <paramref name="attr"/>.
    /// </summary>
    /// <param name="doc">Parsed page.</param>
    /// <param name="attr">Name of the attribute to read.</param>
    /// <param name="xpaths">Candidate expressions, highest priority first.</param>
    /// <returns>The first usable value, or an empty string when every lookup comes back blank.</returns>
    private string firstAttributeValue(HtmlDocument doc, string attr, params string[] xpaths)
    {
        string rs = string.Empty;

        foreach (string xpath in xpaths)
        {
            rs = parseTagValue_Attr(doc, xpath, attr, string.Empty);
            if (!string.IsNullOrWhiteSpace(rs))
            {
                break;
            }
        }

        return rs;
    }

    /// <summary>
    /// Builds an XPath selecting <paramref name="element"/> nodes whose
    /// <paramref name="attribute"/> equals <paramref name="value"/> after the attribute's
    /// ASCII upper-case letters are mapped to lower case.
    /// </summary>
    private static string caseInsensitiveMatch(string element, string attribute, string value)
    {
        return "//" + element
            + "[translate(@" + attribute + ", '" + upperCaseLetters + "', '" + lowerCaseLetters + "') = '"
            + value + "']";
    }

    /// <summary>
    /// Resolves <paramref name="address"/> against <paramref name="baseAddress"/>.
    /// </summary>
    /// <returns>
    /// The resolved address as a string, or <paramref name="address"/> unchanged when it is
    /// blank, when <paramref name="baseAddress"/> is not an absolute URI, or when the
    /// combination cannot be formed.
    /// </returns>
    private static string resolveAddress(string baseAddress, string address)
    {
        if (string.IsNullOrWhiteSpace(address))
        {
            return address;
        }

        // TryCreate instead of "new Uri(...)" so a malformed base never throws here.
        Uri baseUri;
        if (!Uri.TryCreate(baseAddress, UriKind.Absolute, out baseUri))
        {
            return address;
        }

        Uri resolved;
        return Uri.TryCreate(baseUri, address, out resolved) ? resolved.ToString() : address;
    }
}
|
|
} |