bookmark-manager-r4/Net/WebParser.cs

283 lines
8.7 KiB
C#
Raw Normal View History

2021-09-17 15:37:01 +00:00
using bzit.bomg.Models;
using HtmlAgilityPack;
using RyzStudio.Net;
using System;
2021-09-30 21:26:17 +00:00
using System.Drawing;
using System.IO;
2021-09-17 15:37:01 +00:00
using System.Net;
namespace BookmarkManager
{
public class WebParser
{
// Client used by retrieveSourceCode to download page source; created there on first use.
protected HttpWeb webClient = null;
2021-09-30 21:26:17 +00:00
// Client used by RetrieveImage to download image bytes; created there on first use.
protected WebClient webClient2 = null;
2021-09-17 15:37:01 +00:00
2021-10-12 18:55:00 +00:00
/// <summary>
/// Downloads the page at <paramref name="url"/> and extracts its title,
/// description and icon address.
/// </summary>
/// <param name="url">Address of the page; also the base used to resolve a relative icon address.</param>
/// <param name="ignoreSSL">Passed through to <c>retrieveSourceCode</c>.</param>
/// <returns>The populated result, or <c>null</c> when no page source could be retrieved.</returns>
public BookmarkResult RetrieveDetails(string url, bool ignoreSSL)
{
    string html = retrieveSourceCode(url, ignoreSSL);
    if (string.IsNullOrWhiteSpace(html))
    {
        return null;
    }

    BookmarkResult result = new BookmarkResult();
    result.Item = new BookmarkItem();

    HtmlDocument page = new HtmlDocument();
    page.LoadHtml(html);

    result.Item.SiteName = parseSiteTitle(page);
    result.Item.SiteAddress = url;
    result.Item.SiteDescription = parseSiteDescription(page);
    result.IconURL = parseSiteIcon(page);

    // Nothing to resolve when no icon address was found.
    if (string.IsNullOrWhiteSpace(result.IconURL))
    {
        return result;
    }

    // Turn a relative icon address into an absolute one, using the page address as base.
    Uri absoluteIcon;
    if (Uri.TryCreate(new Uri(url), result.IconURL, out absoluteIcon))
    {
        result.IconURL = absoluteIcon.ToString();
    }

    return result;
}
2021-09-30 21:26:17 +00:00
/// <summary>
/// Downloads the image at <paramref name="url"/> and returns it scaled to 16x16.
/// </summary>
/// <param name="url">Address of the image to download.</param>
/// <returns>
/// A new 16x16 bitmap, or <c>null</c> when the address is blank, the download
/// fails, or the data is not a supported image.
/// </returns>
public Bitmap RetrieveImage(string url)
{
    if (string.IsNullOrWhiteSpace(url))
    {
        return null;
    }

    if (webClient2 == null)
    {
        webClient2 = new WebClient();
    }

    webClient2.CachePolicy = new System.Net.Cache.RequestCachePolicy(System.Net.Cache.RequestCacheLevel.NoCacheNoStore);

    try
    {
        byte[] byteData = webClient2.DownloadData(url);
        if (!RyzStudio.IO.FileType.IsImage(byteData))
        {
            // Not a supported image: report failure directly instead of throwing into our own catch.
            return null;
        }

        // The stream must stay open while the decoded image is in use, so both are
        // scoped together. The returned bitmap is a separate, resized copy, which
        // lets the stream and the intermediate image be released here.
        using (MemoryStream stream = new MemoryStream(byteData))
        using (Image img = Image.FromStream(stream))
        {
            return new Bitmap(img, 16, 16);
        }
    }
    catch (Exception)
    {
        // Best-effort: any download or decode failure means "no image".
        return null;
    }
}
2021-09-17 15:37:01 +00:00
2021-10-12 18:55:00 +00:00
/// <summary>
/// Fetches the response body for <paramref name="url"/>.
/// </summary>
/// <param name="url">Address to request.</param>
/// <param name="ignoreSSL">Value assigned to the client's <c>IgnoreSSL</c> setting before the request.</param>
/// <returns>
/// The response body when the status code is 200, 301 or 302; otherwise
/// <c>null</c>, including when the request throws.
/// </returns>
protected string retrieveSourceCode(string url, bool ignoreSSL)
{
    if (webClient == null)
    {
        webClient = new HttpWeb();
    }

    webClient.IgnoreSSL = ignoreSSL;

    try
    {
        string body;
        int status = webClient.GetResponse(out body, url);

        bool accepted = (status == 200) || (status == 301) || (status == 302);
        return accepted ? body : null;
    }
    catch (Exception)
    {
        // Best-effort: a failed request is reported as "no source".
        return null;
    }
}
/// <summary>
/// Finds the page description by trying a fixed list of meta tags in priority order.
/// </summary>
/// <param name="doc">Parsed HTML document to search.</param>
/// <returns>
/// The <c>content</c> value of the first candidate that yields a non-blank
/// result; otherwise the result of the last lookup.
/// </returns>
protected string parseSiteDescription(HtmlDocument doc)
{
    // Priority order. The original queried og:description twice; the second
    // query could never return anything the first had not.
    string[] candidates =
    {
        "//meta[@name='description']",
        "//meta[@property='og:description']",
        "//meta[@name='twitter:description']",
        "//meta[@itemprop='description']"
    };

    string rs = null;
    foreach (string xpath in candidates)
    {
        rs = parseTagValue_Attr(doc, xpath, "content", string.Empty);
        if (!string.IsNullOrWhiteSpace(rs))
        {
            break;
        }
    }

    return rs;
}
/// <summary>
/// Finds the page icon address by trying a fixed list of link and meta tags in
/// priority order, matching the identifying attribute case-insensitively (ASCII).
/// </summary>
/// <param name="doc">Parsed HTML document to search.</param>
/// <returns>
/// The first non-blank address found, or <c>"/favicon.ico"</c> when no
/// candidate yields one.
/// </returns>
protected string parseSiteIcon(HtmlDocument doc)
{
    // {0} = element, {1} = attribute to compare, {2} = expected lower-case value.
    // translate() lower-cases the attribute so that e.g. rel="ICON" also matches.
    const string lowerCaseMatch = "//{0}[translate(@{1}, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz') = '{2}']";

    // Each row: element, attribute to compare, expected value, attribute to read.
    // Priority order. The original queried og:image twice; the second query
    // could never return anything the first had not.
    string[][] candidates =
    {
        new[] { "link", "rel", "shortcut icon", "href" },
        new[] { "link", "rel", "icon", "href" },
        new[] { "link", "rel", "apple-touch-icon", "href" },
        new[] { "link", "rel", "apple-touch-icon-precomposed", "href" },
        new[] { "meta", "property", "og:image", "content" },
        new[] { "meta", "name", "twitter:image", "content" },
        new[] { "meta", "itemprop", "image", "content" }
    };

    string rs = null;
    foreach (string[] candidate in candidates)
    {
        string xpath = string.Format(lowerCaseMatch, candidate[0], candidate[1], candidate[2]);
        rs = parseTagValue_Attr(doc, xpath, candidate[3], string.Empty);
        if (!string.IsNullOrWhiteSpace(rs))
        {
            break;
        }
    }

    // Fall back to the conventional root-relative icon location.
    if (string.IsNullOrWhiteSpace(rs))
    {
        rs = "/favicon.ico";
    }

    return rs;
}
/// <summary>
/// Finds the page title: the <c>title</c> element first, then a fixed list of
/// meta tags in priority order.
/// </summary>
/// <param name="doc">Parsed HTML document to search.</param>
/// <returns>The trimmed title, or an empty string when the result is null.</returns>
protected string parseSiteTitle(HtmlDocument doc)
{
    string title = parseTagValue(doc, "//title", string.Empty);

    string[] metaFallbacks =
    {
        "//meta[@property='og:title']",
        "//meta[@name='twitter:title']",
        "//meta[@property='og:site_name']",
        "//meta[@itemprop='name']"
    };

    // Walk the fallbacks only while no usable title has been found.
    for (int i = 0; (i < metaFallbacks.Length) && string.IsNullOrWhiteSpace(title); i++)
    {
        title = parseTagValue_Attr(doc, metaFallbacks[i], "content", string.Empty);
    }

    if (title == null)
    {
        return string.Empty;
    }

    return title.Trim();
}
/// <summary>
/// Returns the HTML-decoded inner content of the first node matching
/// <paramref name="xpath"/> that is not blank.
/// </summary>
/// <param name="doc">Parsed HTML document to search.</param>
/// <param name="xpath">XPath expression selecting the candidate nodes.</param>
/// <param name="defaultValue">Value returned when no node yields usable text.</param>
/// <returns>
/// The decoded text with carriage returns removed, line feeds replaced by
/// spaces, and surrounding whitespace trimmed; otherwise <paramref name="defaultValue"/>.
/// </returns>
protected string parseTagValue(HtmlDocument doc, string xpath, string defaultValue = "")
{
    HtmlNodeCollection matches = doc.DocumentNode.SelectNodes(xpath);
    if ((matches == null) || (matches.Count <= 0))
    {
        return defaultValue;
    }

    foreach (HtmlNode node in matches)
    {
        if (!string.IsNullOrWhiteSpace(node.InnerHtml))
        {
            string decoded = WebUtility.HtmlDecode(node.InnerHtml);

            // Collapse the text onto a single line before trimming.
            string text = (decoded == null)
                ? null
                : decoded.Replace("\r", "").Replace("\n", " ").Trim();

            if (!string.IsNullOrWhiteSpace(text))
            {
                return text;
            }
        }
    }

    return defaultValue;
}
/// <summary>
/// Returns the HTML-decoded, trimmed value of attribute <paramref name="attr"/>
/// from the first node matching <paramref name="xpath"/> whose value is not blank.
/// </summary>
/// <param name="doc">Parsed HTML document to search.</param>
/// <param name="xpath">XPath expression selecting the candidate nodes.</param>
/// <param name="attr">Name of the attribute to read from each candidate node.</param>
/// <param name="defaultValue">Value returned when no node yields a usable attribute value.</param>
/// <returns>The decoded attribute value, or <paramref name="defaultValue"/>.</returns>
protected string parseTagValue_Attr(HtmlDocument doc, string xpath, string attr, string defaultValue = "")
{
    HtmlNodeCollection hnc = doc.DocumentNode.SelectNodes(xpath);
    if ((hnc == null) || (hnc.Count <= 0))
    {
        return defaultValue;
    }

    foreach (HtmlNode hn in hnc)
    {
        // Look the attribute up once instead of re-indexing the collection.
        HtmlAttribute attribute = hn.Attributes[attr];
        if ((attribute == null) || string.IsNullOrWhiteSpace(attribute.Value))
        {
            continue;
        }

        // Decode first, then trim, so whitespace produced by entities such as
        // &nbsp; is removed too. Uses the same decoder as parseTagValue.
        string rs = WebUtility.HtmlDecode(attribute.Value).Trim();
        if (string.IsNullOrWhiteSpace(rs))
        {
            // Blank after decoding: keep looking, as parseTagValue does.
            continue;
        }

        return rs;
    }

    return defaultValue;
}
}
}