using System;
using System.Net;
using bzit.bomg.Models;
using HtmlAgilityPack;
using RyzStudio.Net;

namespace BookmarkManager
{
    /// <summary>
    /// Downloads a web page and extracts the details needed to create a bookmark:
    /// title, description and icon address.
    /// </summary>
    public class WebParser
    {
        private const string UpperAlphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
        private const string LowerAlphabet = "abcdefghijklmnopqrstuvwxyz";

        // Description sources, tried in order until one yields a non-blank value.
        private static readonly string[] descriptionXPaths =
        {
            "//meta[@name='description']",
            "//meta[@property='og:description']",
            "//meta[@name='twitter:description']",
            "//meta[@itemprop='description']"
        };

        // Title sources consulted only when the title element itself is missing or blank.
        private static readonly string[] titleMetaXPaths =
        {
            "//meta[@property='og:title']",
            "//meta[@name='twitter:title']",
            "//meta[@property='og:site_name']",
            "//meta[@itemprop='name']"
        };

        // Icon sources declared through link elements (value read from "href").
        private static readonly string[] iconLinkXPaths =
        {
            caseInsensitiveXPath("link", "rel", "shortcut icon"),
            caseInsensitiveXPath("link", "rel", "icon"),
            caseInsensitiveXPath("link", "rel", "apple-touch-icon"),
            caseInsensitiveXPath("link", "rel", "apple-touch-icon-precomposed")
        };

        // Icon sources declared through meta elements (value read from "content").
        private static readonly string[] iconMetaXPaths =
        {
            caseInsensitiveXPath("meta", "property", "og:image"),
            caseInsensitiveXPath("meta", "name", "twitter:image"),
            caseInsensitiveXPath("meta", "itemprop", "image")
        };

        /// <summary>HTTP client used to download pages; created on first use.</summary>
        protected HttpWeb webClient = null;

        /// <summary>
        /// Downloads <paramref name="url"/> and extracts the bookmark details from its HTML.
        /// </summary>
        /// <param name="url">Address of the page to inspect.</param>
        /// <returns>
        /// The parsed details, or <c>null</c> when the page source could not be retrieved
        /// or is blank.
        /// </returns>
        public BookmarkResult RetrieveDetails(string url)
        {
            string sourceCode = retrieveSourceCode(url);
            if (string.IsNullOrWhiteSpace(sourceCode))
            {
                return null;
            }

            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(sourceCode);

            BookmarkResult rs = new BookmarkResult();
            rs.Item = new BookmarkItem();
            rs.Item.SiteName = parseSiteTitle(document);
            rs.Item.SiteAddress = url;
            rs.Item.SiteDescription = parseSiteDescription(document);
            rs.IconURL = parseSiteIcon(document);

            // Resolve a relative icon address against the page address. Both steps use
            // TryCreate so a malformed address leaves the icon URL as found instead of throwing.
            if (!string.IsNullOrWhiteSpace(rs.IconURL))
            {
                Uri baseAddress;
                Uri iconAddress;
                if (Uri.TryCreate(url, UriKind.Absolute, out baseAddress)
                    && Uri.TryCreate(baseAddress, rs.IconURL, out iconAddress))
                {
                    rs.IconURL = iconAddress.ToString();
                }
            }

            return rs;
        }

        /// <summary>
        /// Fetches the page body for <paramref name="url"/> using <see cref="webClient"/>.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>
        /// The response body when the status code is 200, 301 or 302; otherwise <c>null</c>.
        /// Any exception raised by the request also results in <c>null</c>.
        /// </returns>
        protected string retrieveSourceCode(string url)
        {
            if (webClient == null)
            {
                webClient = new HttpWeb();
            }

            try
            {
                string sourceCode;
                int statusCode = webClient.GetResponse(out sourceCode, url);

                bool accepted = (statusCode == 200) || (statusCode == 301) || (statusCode == 302);
                return accepted ? sourceCode : null;
            }
            catch (Exception)
            {
                // Deliberate best effort: a failed download is reported to the caller as "no source".
                return null;
            }
        }

        /// <summary>
        /// Reads the page description from the first meta tag that provides a non-blank one.
        /// </summary>
        /// <param name="doc">Parsed HTML document.</param>
        /// <returns>The description, or an empty string when no candidate tag matched.</returns>
        protected string parseSiteDescription(HtmlDocument doc)
        {
            return firstAttrValue(doc, "content", descriptionXPaths);
        }

        /// <summary>
        /// Reads the page icon address, preferring link elements (icon rel values) over
        /// image meta tags. The rel/property/name/itemprop values are matched case-insensitively.
        /// </summary>
        /// <param name="doc">Parsed HTML document.</param>
        /// <returns>
        /// The icon address exactly as written in the page (it may be relative), or an empty
        /// string when no candidate tag matched. No "/favicon.ico" fallback is applied.
        /// </returns>
        protected string parseSiteIcon(HtmlDocument doc)
        {
            string rs = firstAttrValue(doc, "href", iconLinkXPaths);
            if (string.IsNullOrWhiteSpace(rs))
            {
                rs = firstAttrValue(doc, "content", iconMetaXPaths);
            }

            return rs;
        }

        /// <summary>
        /// Reads the page title from the title element, falling back to title-like meta tags.
        /// </summary>
        /// <param name="doc">Parsed HTML document.</param>
        /// <returns>The trimmed title; never <c>null</c>, empty when nothing was found.</returns>
        protected string parseSiteTitle(HtmlDocument doc)
        {
            string rs = parseTagValue(doc, "//title", string.Empty);
            if (string.IsNullOrWhiteSpace(rs))
            {
                rs = firstAttrValue(doc, "content", titleMetaXPaths);
            }

            return (rs ?? string.Empty).Trim();
        }

        /// <summary>
        /// Returns the decoded inner HTML of the first node matching <paramref name="xpath"/>
        /// whose content is not blank.
        /// </summary>
        /// <param name="doc">Parsed HTML document.</param>
        /// <param name="xpath">XPath expression selecting the candidate nodes.</param>
        /// <param name="defaultValue">Value returned when no node yields usable text.</param>
        /// <returns>
        /// The HTML-decoded text with carriage returns removed, line feeds replaced by spaces
        /// and surrounding whitespace trimmed; otherwise <paramref name="defaultValue"/>.
        /// </returns>
        protected string parseTagValue(HtmlDocument doc, string xpath, string defaultValue = "")
        {
            HtmlNodeCollection hnc = doc.DocumentNode.SelectNodes(xpath);
            if (hnc == null)
            {
                return defaultValue;
            }

            foreach (HtmlNode hn in hnc)
            {
                if (string.IsNullOrWhiteSpace(hn.InnerHtml))
                {
                    continue;
                }

                string text = WebUtility.HtmlDecode(hn.InnerHtml)
                    .Replace("\r", "")
                    .Replace("\n", " ")
                    .Trim();

                // Decoding can turn entities into whitespace, so check again after cleaning.
                if (!string.IsNullOrWhiteSpace(text))
                {
                    return text;
                }
            }

            return defaultValue;
        }

        /// <summary>
        /// Returns the decoded value of attribute <paramref name="attr"/> from the first node
        /// matching <paramref name="xpath"/> that carries a non-blank value for it.
        /// </summary>
        /// <param name="doc">Parsed HTML document.</param>
        /// <param name="xpath">XPath expression selecting the candidate nodes.</param>
        /// <param name="attr">Name of the attribute to read.</param>
        /// <param name="defaultValue">Value returned when no node has a usable attribute.</param>
        /// <returns>
        /// The trimmed, HTML-decoded attribute value; otherwise <paramref name="defaultValue"/>.
        /// </returns>
        protected string parseTagValue_Attr(HtmlDocument doc, string xpath, string attr, string defaultValue = "")
        {
            HtmlNodeCollection hnc = doc.DocumentNode.SelectNodes(xpath);
            if (hnc == null)
            {
                return defaultValue;
            }

            foreach (HtmlNode hn in hnc)
            {
                HtmlAttribute attribute = hn.Attributes[attr];
                if ((attribute == null) || string.IsNullOrWhiteSpace(attribute.Value))
                {
                    continue;
                }

                // Same decoder as parseTagValue, so both helpers treat entities identically.
                return WebUtility.HtmlDecode(attribute.Value.Trim());
            }

            return defaultValue;
        }

        /// <summary>
        /// Tries each XPath in order and returns the first non-blank attribute value found.
        /// </summary>
        /// <param name="doc">Parsed HTML document.</param>
        /// <param name="attr">Name of the attribute to read from the matched nodes.</param>
        /// <param name="xpaths">Candidate XPath expressions, highest priority first.</param>
        /// <returns>
        /// The first non-blank value; when every candidate is blank, the result of the last
        /// candidate tried (an empty string when nothing matched or no candidates were given).
        /// </returns>
        private string firstAttrValue(HtmlDocument doc, string attr, params string[] xpaths)
        {
            string rs = string.Empty;

            foreach (string xpath in xpaths)
            {
                rs = parseTagValue_Attr(doc, xpath, attr, string.Empty);
                if (!string.IsNullOrWhiteSpace(rs))
                {
                    return rs;
                }
            }

            return rs;
        }

        /// <summary>
        /// Builds an XPath selecting <paramref name="element"/> nodes whose
        /// <paramref name="attribute"/> equals <paramref name="value"/> once the attribute's
        /// A-Z letters are lower-cased via translate().
        /// </summary>
        /// <param name="element">Element name, e.g. "link".</param>
        /// <param name="attribute">Attribute name, e.g. "rel".</param>
        /// <param name="value">Expected attribute value, written in lower case.</param>
        /// <returns>The XPath expression.</returns>
        private static string caseInsensitiveXPath(string element, string attribute, string value)
        {
            return "//" + element + "[translate(@" + attribute + ", '" + UpperAlphabet + "', '" + LowerAlphabet + "') = '" + value + "']";
        }
    }
}