using System;
using System.Collections.Generic;
using System.Drawing;
using System.Linq;
using System.Net;
using System.Security.Policy;
using System.Threading.Tasks;
using HtmlAgilityPack;
using RyzStudio.Net;

namespace BookmarkManager.Services
{
    /// <summary>
    /// Downloads web pages and images through a <see cref="WebClientProvider"/> and extracts
    /// bookmark metadata (title, description, favicon URL) from parsed HTML documents.
    /// </summary>
    /// <remarks>
    /// All retrieval methods are best-effort: any failure (invalid URL, transport error,
    /// non-200 status, unreadable body) is reported as a <c>null</c> result, not an exception.
    /// </remarks>
    public class WebProvider
    {
        /// <summary>Meta tags consulted, in order, when the document has no usable &lt;title&gt;.</summary>
        private static readonly string[] TitleMetaXPaths =
        {
            "//meta[@property='og:title']",
            "//meta[@name='twitter:title']",
            "//meta[@property='og:site_name']",
            "//meta[@itemprop='name']"
        };

        /// <summary>Meta tags consulted, in order, for the page description.</summary>
        private static readonly string[] DescriptionMetaXPaths =
        {
            "//meta[@name='description']",
            "//meta[@property='og:description']",
            "//meta[@name='twitter:description']",
            "//meta[@itemprop='description']"
        };

        /// <summary>Meta tags consulted, in order, when no icon &lt;link&gt; is present.</summary>
        private static readonly string[] ImageMetaXPaths =
        {
            "//meta[@property='og:image']",
            "//meta[@name='twitter:image']",
            "//meta[@itemprop='image']"
        };

        /// <summary>Exact <c>rel</c> values (compared case-insensitively) that identify Apple touch icons.</summary>
        private static readonly string[] AppleIconRelValues =
        {
            "apple-touch-icon",
            "apple-touch-icon-precomposed"
        };

        private readonly WebClientProvider _webClientProvider;

        /// <summary>
        /// Creates a provider backed by a new <see cref="WebClientProvider"/>.
        /// </summary>
        public WebProvider()
        {
            _webClientProvider = new WebClientProvider();

            // NOTE(review): the unit of Timeout is defined by WebClientProvider (presumably seconds) - confirm.
            _webClientProvider.Timeout = 4;
        }

        /// <summary>
        /// Gets or sets the <c>IgnoreSSL</c> flag of the underlying <see cref="WebClientProvider"/>.
        /// </summary>
        public bool IgnoreSSL
        {
            get => _webClientProvider.IgnoreSSL;
            set
            {
                _webClientProvider.IgnoreSSL = value;
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and parses the response body as HTML.
        /// </summary>
        /// <param name="url">Absolute URL of the page.</param>
        /// <returns>The parsed document, or <c>null</c> when the download or the parse fails.</returns>
        public async Task<HtmlAgilityPack.HtmlDocument> RetrieveHtmlDocument(string url)
        {
            var sourceCode = await this.RetrieveSourceCode(url);
            if (string.IsNullOrWhiteSpace(sourceCode))
            {
                return null;
            }

            var document = new HtmlAgilityPack.HtmlDocument();

            try
            {
                document.LoadHtml(sourceCode);
            }
            catch (Exception)
            {
                return null;
            }

            return document;
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and returns the response body as text.
        /// </summary>
        /// <param name="url">Absolute URL of the resource.</param>
        /// <returns>
        /// The body text, or <c>null</c> when the URL is blank or not absolute, the request fails,
        /// the status is not 200 OK, or the body is missing, unreadable or blank.
        /// </returns>
        public async Task<string> RetrieveSourceCode(string url)
        {
            var response = await this.GetOkResponse(url);
            if (response == null)
            {
                return null;
            }

            // The body is fully buffered into a string, so the response can be released here.
            using (response)
            {
                if (response.Content == null)
                {
                    return null;
                }

                string sourceCode;

                try
                {
                    sourceCode = await response.Content.ReadAsStringAsync();
                }
                catch (Exception)
                {
                    // Best-effort: an unreadable body is treated the same as an empty one.
                    return null;
                }

                return string.IsNullOrWhiteSpace(sourceCode) ? null : sourceCode;
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and decodes the response body as an image.
        /// </summary>
        /// <param name="url">Absolute URL of the image.</param>
        /// <returns>
        /// The decoded image, or <c>null</c> when the URL is blank or not absolute, the request fails,
        /// the status is not 200 OK, or the body is missing or cannot be decoded.
        /// </returns>
        public async Task<Image> RetrieveImage(string url)
        {
            var response = await this.GetOkResponse(url);
            if (response == null)
            {
                return null;
            }

            if (response.Content == null)
            {
                response.Dispose();
                return null;
            }

            try
            {
                var stream = await response.Content.ReadAsStreamAsync();

                // The response/stream are intentionally left open on success: Image.FromStream
                // requires the source stream to stay open for the lifetime of the returned Image.
                return Image.FromStream(stream);
            }
            catch (Exception)
            {
                response.Dispose();
                return null;
            }
        }

        /// <summary>
        /// Locates the favicon referenced by <paramref name="document"/>, resolves it against
        /// <paramref name="url"/> and downloads it.
        /// </summary>
        /// <param name="url">URL of the page the document was loaded from; used as the base for relative icon URLs.</param>
        /// <param name="document">Parsed page.</param>
        /// <returns>The icon image, or <c>null</c> when the icon URL cannot be resolved or downloaded.</returns>
        public async Task<Image> RetrieveImage(string url, HtmlAgilityPack.HtmlDocument document)
        {
            var iconUrl = this.ParseFavicon(document);
            if (string.IsNullOrWhiteSpace(iconUrl))
            {
                return null;
            }

            try
            {
                var baseUri = new Uri(url);
                var absoluteUri = new Uri(baseUri, iconUrl);

                iconUrl = absoluteUri.AbsoluteUri;
            }
            catch
            {
                // url was null/not a valid absolute URI, or the icon reference could not be combined with it.
                return null;
            }

            return await this.RetrieveImage(iconUrl);
        }

        /// <summary>
        /// Extracts the page title: the first non-blank &lt;title&gt; element, otherwise the first
        /// non-blank of the og:title, twitter:title, og:site_name and itemprop=name meta tags.
        /// </summary>
        /// <param name="document">Parsed page.</param>
        /// <returns>The trimmed title, or <see cref="string.Empty"/> when none is found.</returns>
        public string ParseTitle(HtmlAgilityPack.HtmlDocument document)
        {
            var title = ParseTagValue(document, "//title", string.Empty)?.Trim();
            if (!string.IsNullOrWhiteSpace(title))
            {
                return title;
            }

            return FindFirstAttrValue(document, "content", TitleMetaXPaths);
        }

        /// <summary>
        /// Extracts the page description from the first non-blank of the description,
        /// og:description, twitter:description and itemprop=description meta tags.
        /// </summary>
        /// <param name="document">Parsed page.</param>
        /// <returns>The trimmed description, or <see cref="string.Empty"/> when none is found.</returns>
        public string ParseMetaDescription(HtmlAgilityPack.HtmlDocument document)
        {
            return FindFirstAttrValue(document, "content", DescriptionMetaXPaths);
        }

        /// <summary>
        /// Determines the (possibly relative) URL of the page icon.
        /// </summary>
        /// <remarks>
        /// Lookup order: a &lt;link&gt; whose <c>rel</c> contains the word "icon"; a &lt;link&gt; whose
        /// <c>rel</c> is an Apple touch icon value; the og:image, twitter:image and itemprop=image
        /// meta tags; finally the conventional <c>/favicon.ico</c>.
        /// </remarks>
        /// <param name="document">Parsed page.</param>
        /// <returns>The HTML-decoded icon reference; never blank.</returns>
        public string ParseFavicon(HtmlAgilityPack.HtmlDocument document)
        {
            var linkNodes = FindNode(document, "link", "rel");

            // Find link-rel contains "icon"
            var iconHref = FindLinkHref(linkNodes, rel => ContainsWord(rel, "icon"));
            if (iconHref != null)
            {
                return iconHref;
            }

            // Find link-rel contains apple-icon
            var appleIconHref = FindLinkHref(linkNodes, rel => AppleIconRelValues.Contains(rel, StringComparer.OrdinalIgnoreCase));
            if (appleIconHref != null)
            {
                return appleIconHref;
            }

            var metaImage = FindFirstAttrValue(document, "content", ImageMetaXPaths);
            if (!string.IsNullOrWhiteSpace(metaImage))
            {
                return metaImage;
            }

            return "/favicon.ico";
        }

        /// <summary>
        /// Issues a GET for <paramref name="url"/> and returns the response only when it is usable.
        /// </summary>
        /// <returns>
        /// The response when the URL is a non-blank absolute URI and the status is 200 OK; otherwise
        /// <c>null</c> (a rejected response is disposed). The caller owns the returned response.
        /// </returns>
        private async Task<System.Net.Http.HttpResponseMessage> GetOkResponse(string url)
        {
            if (string.IsNullOrWhiteSpace(url))
            {
                return null;
            }

            if (!Uri.TryCreate(url, UriKind.Absolute, out _))
            {
                return null;
            }

            System.Net.Http.HttpResponseMessage response;

            try
            {
                response = await _webClientProvider.Get(url.Trim());
            }
            catch (Exception)
            {
                return null;
            }

            if (response == null)
            {
                return null;
            }

            if (response.StatusCode != HttpStatusCode.OK)
            {
                response.Dispose();
                return null;
            }

            return response;
        }

        /// <summary>
        /// Returns the HTML-decoded <c>href</c> of the first link node whose trimmed <c>rel</c> value
        /// satisfies <paramref name="relMatches"/> and whose <c>href</c> is present and non-blank.
        /// </summary>
        /// <returns>The decoded href, or <c>null</c> when no node qualifies.</returns>
        private string FindLinkHref(List<HtmlNode> linkNodes, Func<string, bool> relMatches)
        {
            foreach (var item in linkNodes)
            {
                var relValue = item.Attributes["rel"]?.Value?.Trim() ?? string.Empty;
                if (!relMatches(relValue))
                {
                    continue;
                }

                // The attribute indexer yields null for a missing attribute; links without href are skipped.
                var hrefValue = item.Attributes["href"]?.Value?.Trim() ?? string.Empty;
                if (string.IsNullOrWhiteSpace(hrefValue))
                {
                    continue;
                }

                return System.Web.HttpUtility.HtmlDecode(hrefValue);
            }

            return null;
        }

        /// <summary>
        /// Evaluates each XPath in order and returns the first non-blank, trimmed value of
        /// attribute <paramref name="attr"/>.
        /// </summary>
        /// <returns>The value found, or <see cref="string.Empty"/> when every lookup comes up blank.</returns>
        private string FindFirstAttrValue(HtmlAgilityPack.HtmlDocument document, string attr, params string[] xPaths)
        {
            foreach (var xPath in xPaths)
            {
                var value = FindNodeAttrValue(document, xPath, attr, string.Empty)?.Trim();
                if (!string.IsNullOrWhiteSpace(value))
                {
                    return value;
                }
            }

            return string.Empty;
        }

        /// <summary>
        /// Returns the HTML-decoded inner markup of the first node matching <paramref name="xPath"/>
        /// that is not blank, with carriage returns removed and line feeds replaced by spaces.
        /// </summary>
        /// <returns>The cleaned text, or <paramref name="defaultValue"/> when the document is null or nothing qualifies.</returns>
        private string ParseTagValue(HtmlAgilityPack.HtmlDocument document, string xPath, string defaultValue = "")
        {
            // SelectNodes yields null (not an empty collection) when nothing matches.
            var nodes = document?.DocumentNode?.SelectNodes(xPath);
            if (nodes == null)
            {
                return defaultValue;
            }

            foreach (HtmlNode node in nodes)
            {
                if (string.IsNullOrWhiteSpace(node.InnerHtml))
                {
                    continue;
                }

                var text = WebUtility.HtmlDecode(node.InnerHtml)?.Replace("\r", "")?.Replace("\n", " ")?.Trim();
                if (string.IsNullOrWhiteSpace(text))
                {
                    continue;
                }

                return text;
            }

            return defaultValue;
        }

        /// <summary>
        /// Returns the trimmed, HTML-decoded value of attribute <paramref name="attr"/> on the first
        /// node matching <paramref name="xPath"/> that carries a non-blank value for it.
        /// </summary>
        /// <returns>The decoded value, or <paramref name="defaultValue"/> when the document is null or nothing qualifies.</returns>
        private string FindNodeAttrValue(HtmlAgilityPack.HtmlDocument document, string xPath, string attr, string defaultValue = "")
        {
            var nodes = document?.DocumentNode?.SelectNodes(xPath);
            if (nodes == null)
            {
                return defaultValue;
            }

            foreach (HtmlNode node in nodes)
            {
                var value = node.Attributes[attr]?.Value;
                if (string.IsNullOrWhiteSpace(value))
                {
                    continue;
                }

                return System.Web.HttpUtility.HtmlDecode(value.Trim());
            }

            return defaultValue;
        }

        /// <summary>
        /// Selects every <paramref name="nodeName"/> element, restricted to those carrying attribute
        /// <paramref name="attrName"/> when that name is non-blank.
        /// </summary>
        /// <returns>The matching nodes; an empty list when the document is null or nothing matches.</returns>
        private List<HtmlNode> FindNode(HtmlAgilityPack.HtmlDocument document, string nodeName, string attrName)
        {
            var xPath = (string.IsNullOrWhiteSpace(attrName) ? $"//{nodeName}" : $"//{nodeName}[@{attrName}]");

            var nodes = document?.DocumentNode?.SelectNodes(xPath);
            if (nodes == null)
            {
                return new List<HtmlNode>();
            }

            return nodes.ToList();
        }

        /// <summary>
        /// Tests whether <paramref name="needle"/> is one of the whitespace-separated tokens of
        /// <paramref name="haystack"/>.
        /// </summary>
        /// <remarks>
        /// Comparison is ordinal and case-insensitive: the tokens are HTML keywords, so the result
        /// must not depend on the current culture (e.g. "ICON" vs "icon" under Turkish casing rules).
        /// </remarks>
        private bool ContainsWord(string haystack, string needle)
        {
            if (string.IsNullOrWhiteSpace(haystack))
            {
                return false;
            }

            // A null separator array splits on any white-space character (space, tab, line breaks).
            var words = haystack.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

            foreach (var word in words)
            {
                if (word.Equals(needle, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }

            return false;
        }
    }
}