2024-07-17 00:56:17 +00:00
|
|
|
|
using System;
|
2024-09-14 22:03:06 +00:00
|
|
|
|
using System.Collections.Generic;
|
2024-07-17 00:56:17 +00:00
|
|
|
|
using System.Drawing;
|
2024-09-14 22:03:06 +00:00
|
|
|
|
using System.Linq;
|
2024-07-17 00:56:17 +00:00
|
|
|
|
using System.Net;
|
|
|
|
|
using System.Threading.Tasks;
|
|
|
|
|
using HtmlAgilityPack;
|
|
|
|
|
using RyzStudio.Net;
|
|
|
|
|
|
|
|
|
|
namespace BookmarkManager.Services
|
|
|
|
|
{
|
|
|
|
|
public class WebProvider
|
|
|
|
|
{
|
|
|
|
|
private readonly WebClientProvider _webClientProvider;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Creates the provider together with its own <see cref="WebClientProvider"/>,
/// whose <c>Timeout</c> is set to 4.
/// </summary>
public WebProvider()
{
    var client = new WebClientProvider();
    client.Timeout = 4;

    _webClientProvider = client;
}
|
|
|
|
|
|
2024-07-18 22:45:51 +00:00
|
|
|
|
|
|
|
|
|
/// <summary>
/// Gets or sets the <c>IgnoreSSL</c> flag of the underlying web client provider.
/// The value is not stored here; it is forwarded to the provider in both directions.
/// </summary>
public bool IgnoreSSL
{
    get
    {
        return _webClientProvider.IgnoreSSL;
    }
    set => _webClientProvider.IgnoreSSL = value;
}
|
|
|
|
|
|
|
|
|
|
|
2024-07-17 00:56:17 +00:00
|
|
|
|
/// <summary>
/// Downloads the page at <paramref name="url"/> and parses it into an HTML document.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <returns>
/// The parsed document, or <c>null</c> when no source code could be retrieved
/// or the parser threw while loading it.
/// </returns>
public async Task<HtmlAgilityPack.HtmlDocument> RetrieveHtmlDocument(string url)
{
    var html = await this.RetrieveSourceCode(url);
    var hasHtml = !string.IsNullOrWhiteSpace(html);
    if (!hasHtml)
    {
        return null;
    }

    var parsed = new HtmlAgilityPack.HtmlDocument();

    try
    {
        parsed.LoadHtml(html);

        return parsed;
    }
    catch (Exception)
    {
        // A parser failure is reported to the caller as "no document".
        return null;
    }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Downloads the body of <paramref name="url"/> as text.
/// </summary>
/// <param name="url">Absolute address to request; surrounding whitespace is ignored.</param>
/// <returns>
/// The response body, or <c>null</c> when the URL is blank or not absolute, the request
/// throws, no response is returned, the status is not 200 OK, or the body is missing,
/// unreadable or blank.
/// </returns>
public async Task<string> RetrieveSourceCode(string url)
{
    if (string.IsNullOrWhiteSpace(url))
    {
        return null;
    }

    // Validate exactly the value that is handed to the client.
    url = url.Trim();

    if (!Uri.TryCreate(url, UriKind.Absolute, out _))
    {
        return null;
    }

    System.Net.Http.HttpResponseMessage response;

    try
    {
        response = await _webClientProvider.Get(url);
    }
    catch (Exception)
    {
        return null;
    }

    if (response == null)
    {
        return null;
    }

    // HttpResponseMessage is IDisposable; release it on every exit path.
    using (response)
    {
        if (response.StatusCode != HttpStatusCode.OK)
        {
            return null;
        }

        if (response.Content == null)
        {
            return null;
        }

        string sourceCode;

        try
        {
            sourceCode = await response.Content.ReadAsStringAsync();
        }
        catch (Exception)
        {
            // Best effort: an unreadable body is treated the same as an empty one.
            return null;
        }

        return string.IsNullOrWhiteSpace(sourceCode) ? null : sourceCode;
    }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it as an image.
/// </summary>
/// <param name="url">Absolute address of the image; surrounding whitespace is ignored.</param>
/// <returns>
/// The decoded image, or <c>null</c> when the URL is blank or not absolute, the request
/// throws, no response is returned, the status is not 200 OK, or the body is missing,
/// empty or cannot be decoded.
/// </returns>
public async Task<Image> RetrieveImage(string url)
{
    if (string.IsNullOrWhiteSpace(url))
    {
        return null;
    }

    // Validate exactly the value that is handed to the client.
    url = url.Trim();

    if (!Uri.TryCreate(url, UriKind.Absolute, out _))
    {
        return null;
    }

    System.Net.Http.HttpResponseMessage response;

    try
    {
        response = await _webClientProvider.Get(url);
    }
    catch (Exception)
    {
        return null;
    }

    // Same guard as RetrieveSourceCode: the client may hand back no response at all.
    if (response == null)
    {
        return null;
    }

    // HttpResponseMessage is IDisposable; release it on every exit path.
    using (response)
    {
        if (response.StatusCode != HttpStatusCode.OK)
        {
            return null;
        }

        if (response.Content == null)
        {
            return null;
        }

        try
        {
            var bytes = await response.Content.ReadAsByteArrayAsync();
            if (bytes == null || bytes.Length == 0)
            {
                return null;
            }

            // Image.FromStream needs its stream to stay open for the lifetime of the image,
            // so decode from a private in-memory copy rather than the response stream that
            // is disposed together with the response. The MemoryStream is deliberately not
            // disposed here; the returned image keeps using it.
            return Image.FromStream(new System.IO.MemoryStream(bytes));
        }
        catch (Exception)
        {
            // Unreadable body or not an image.
            return null;
        }
    }
}
|
|
|
|
|
|
2024-09-14 22:03:06 +00:00
|
|
|
|
/// <summary>
/// Locates the favicon referenced by <paramref name="document"/>, resolves it against
/// <paramref name="url"/> and downloads it.
/// </summary>
/// <param name="url">Absolute address of the page; used as the base for a relative icon reference.</param>
/// <param name="document">Parsed page to search for an icon reference; may be <c>null</c>.</param>
/// <returns>
/// The downloaded icon, or <c>null</c> when there is no document, no icon reference is found,
/// the reference cannot be resolved to an absolute address, or the download fails.
/// </returns>
public async Task<Image> RetrieveImage(string url, HtmlAgilityPack.HtmlDocument document)
{
    // RetrieveHtmlDocument returns null on failure, so tolerate a missing document here.
    if (document == null)
    {
        return null;
    }

    var iconUrl = this.ParseFavicon(document);
    if (string.IsNullOrWhiteSpace(iconUrl))
    {
        return null;
    }

    // Resolve the (possibly relative) icon reference without using exceptions as control flow.
    if (!Uri.TryCreate(url, UriKind.Absolute, out Uri baseUri))
    {
        return null;
    }

    if (!Uri.TryCreate(baseUri, iconUrl, out Uri absoluteUri))
    {
        return null;
    }

    return await this.RetrieveImage(absoluteUri.AbsoluteUri);
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Extracts a title for the page represented by <paramref name="document"/>.
/// </summary>
/// <param name="document">Parsed page to inspect.</param>
/// <returns>
/// The trimmed text of the <c>title</c> element when it is not blank; otherwise the first
/// non-blank <c>content</c> value of the og:title, og:site_name, twitter:title or
/// itemprop=name meta tags (checked in that order); otherwise an empty string.
/// </returns>
public string ParseTitle(HtmlAgilityPack.HtmlDocument document)
{
    // The regular <title> element wins whenever it carries text.
    var title = FindNodeValue(document, "//title", string.Empty)?.Trim();
    if (!string.IsNullOrWhiteSpace(title))
    {
        return title;
    }

    // Otherwise fall back to the extended meta tags, in priority order.
    string[] metaQueries =
    {
        "//meta[@property='og:title']",
        "//meta[@property='og:site_name']",
        "//meta[@name='twitter:title']",
        "//meta[@itemprop='name']"
    };

    for (var i = 0; i < metaQueries.Length; i++)
    {
        var candidate = FindNodeAttrValue(document, metaQueries[i], "content", string.Empty)?.Trim();
        if (!string.IsNullOrWhiteSpace(candidate))
        {
            return candidate;
        }
    }

    return string.Empty;
}
|
|
|
|
|
|
2024-09-15 11:48:31 +00:00
|
|
|
|
/// <summary>
/// Extracts a description for the page represented by <paramref name="document"/>.
/// </summary>
/// <param name="document">Parsed page to inspect.</param>
/// <returns>
/// The first non-blank, trimmed <c>content</c> value of the description, og:description,
/// twitter:description or itemprop=description meta tags (checked in that order);
/// an empty string when none of them has a value.
/// </returns>
public string ParseDescription(HtmlAgilityPack.HtmlDocument document)
{
    var metaQueries = new[]
    {
        "//meta[@name='description']",
        "//meta[@property='og:description']",
        "//meta[@name='twitter:description']",
        "//meta[@itemprop='description']"
    };

    // Select is lazy, so later queries only run when the earlier ones produced nothing.
    var description = metaQueries
        .Select(query => FindNodeAttrValue(document, query, "content", string.Empty)?.Trim())
        .FirstOrDefault(value => !string.IsNullOrWhiteSpace(value));

    return description ?? string.Empty;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Finds an icon reference in <paramref name="document"/>.
/// </summary>
/// <param name="document">Parsed page to inspect.</param>
/// <returns>
/// The <c>href</c> of the first <c>link</c> whose <c>rel</c> contains the word "icon";
/// failing that, the <c>href</c> of a <c>link</c> whose <c>rel</c> is "apple-touch-icon" or
/// "apple-touch-icon-precomposed"; failing that, the first non-blank <c>content</c> of the
/// og:image, twitter:image or itemprop=image meta tags; otherwise an empty string.
/// The value is returned as found and may be a relative reference.
/// </returns>
public string ParseFavicon(HtmlAgilityPack.HtmlDocument document)
{
    // 1. <link rel="... icon ..."> ; 2. Apple touch icons.
    var href = FindNodeAttrValue_ContainsWord(document, "link", "rel", "href", "icon");
    if (string.IsNullOrWhiteSpace(href))
    {
        var appleRels = new List<string> { "apple-touch-icon", "apple-touch-icon-precomposed" };
        href = FindNodeAttrValue_Equals(document, "link", "rel", "href", appleRels);
    }

    if (!string.IsNullOrWhiteSpace(href))
    {
        return href;
    }

    // 3. Images advertised through the extended meta tags, in priority order.
    var metaQueries = new[]
    {
        "//meta[@property='og:image']",
        "//meta[@name='twitter:image']",
        "//meta[@itemprop='image']"
    };

    foreach (var query in metaQueries)
    {
        var image = FindNodeAttrValue(document, query, "content", string.Empty)?.Trim();
        if (!string.IsNullOrWhiteSpace(image))
        {
            return image;
        }
    }

    return string.Empty;
}
|
|
|
|
|
|
2024-09-15 11:48:31 +00:00
|
|
|
|
/// <summary>
/// Returns the decoded inner HTML of the first node matching <paramref name="xPath"/>
/// that has non-blank content.
/// </summary>
/// <param name="document">Document to query.</param>
/// <param name="xPath">XPath expression selecting the candidate nodes.</param>
/// <param name="defaultValue">Value returned when no node yields text.</param>
/// <returns>
/// The HTML-decoded inner HTML with carriage returns removed, line feeds replaced by spaces
/// and surrounding whitespace trimmed; <paramref name="defaultValue"/> when nothing matches.
/// </returns>
private string FindNodeValue(HtmlAgilityPack.HtmlDocument document, string xPath, string defaultValue = "")
{
    var matches = document.DocumentNode.SelectNodes(xPath);
    if (matches == null || matches.Count <= 0)
    {
        return defaultValue;
    }

    foreach (HtmlNode node in matches)
    {
        if (string.IsNullOrWhiteSpace(node.InnerHtml))
        {
            continue;
        }

        // Collapse the markup onto a single line before handing it back.
        var decoded = WebUtility.HtmlDecode(node.InnerHtml);
        var text = decoded?.Replace("\r", "").Replace("\n", " ").Trim();

        if (!string.IsNullOrWhiteSpace(text))
        {
            return text;
        }
    }

    return defaultValue;
}
|
|
|
|
|
|
2024-09-14 22:03:06 +00:00
|
|
|
|
/// <summary>
/// Returns the value of attribute <paramref name="attr"/> from the first node matching
/// <paramref name="xPath"/> that carries a non-blank value for it.
/// </summary>
/// <param name="document">Document to query.</param>
/// <param name="xPath">XPath expression selecting the candidate nodes.</param>
/// <param name="attr">Name of the attribute to read.</param>
/// <param name="defaultValue">Value returned when no node has the attribute set.</param>
/// <returns>The trimmed, HTML-decoded attribute value, or <paramref name="defaultValue"/>.</returns>
private string FindNodeAttrValue(HtmlAgilityPack.HtmlDocument document, string xPath, string attr, string defaultValue = "")
{
    var matches = document.DocumentNode.SelectNodes(xPath);
    if (matches == null || matches.Count <= 0)
    {
        return defaultValue;
    }

    foreach (HtmlNode node in matches)
    {
        // The indexer yields null when the node lacks the attribute.
        var attribute = node.Attributes[attr];
        if (attribute == null || string.IsNullOrWhiteSpace(attribute.Value))
        {
            continue;
        }

        return System.Web.HttpUtility.HtmlDecode(attribute.Value?.Trim());
    }

    return defaultValue;
}
|
|
|
|
|
|
2024-09-14 22:03:06 +00:00
|
|
|
|
/// <summary>
/// Collects every <paramref name="nodeName"/> element in the document, optionally limited
/// to elements that carry the attribute <paramref name="attrName"/>.
/// </summary>
/// <param name="document">Document to query.</param>
/// <param name="nodeName">Element name to search for anywhere in the document.</param>
/// <param name="attrName">Attribute the element must have; blank means no restriction.</param>
/// <returns>The matching nodes; an empty list (never <c>null</c>) when there are none.</returns>
private List<HtmlNode> FindNode(HtmlAgilityPack.HtmlDocument document, string nodeName, string attrName)
{
    string xPath;
    if (string.IsNullOrWhiteSpace(attrName))
    {
        xPath = $"//{nodeName}";
    }
    else
    {
        xPath = $"//{nodeName}[@{attrName}]";
    }

    var matches = document.DocumentNode.SelectNodes(xPath);

    // SelectNodes reports "nothing found" as null; normalise that to an empty list.
    var nothingFound = matches == null || matches.Count <= 0;

    return nothingFound ? new List<HtmlNode>() : matches.ToList();
}
|
|
|
|
|
|
2024-09-15 11:48:31 +00:00
|
|
|
|
/// <summary>
/// Searches <paramref name="nodeName"/> elements whose <paramref name="attrName"/> attribute
/// contains <paramref name="matchEqualList"/> as a whole word, and returns the value of their
/// <paramref name="returnAttrName"/> attribute.
/// </summary>
/// <param name="document">Document to query.</param>
/// <param name="nodeName">Element name to search for (e.g. "link").</param>
/// <param name="attrName">Attribute whose value is tested (e.g. "rel").</param>
/// <param name="returnAttrName">Attribute whose value is returned (e.g. "href").</param>
/// <param name="matchEqualList">The single word to look for in the tested attribute.</param>
/// <param name="defaultValue">Value returned when no element qualifies.</param>
/// <returns>
/// The trimmed, HTML-decoded <paramref name="returnAttrName"/> value of the first qualifying
/// element that has a non-blank one; otherwise <paramref name="defaultValue"/>.
/// </returns>
private string FindNodeAttrValue_ContainsWord(HtmlAgilityPack.HtmlDocument document, string nodeName, string attrName, string returnAttrName, string matchEqualList, string defaultValue = "")
{
    foreach (var item in FindNode(document, nodeName, attrName))
    {
        // The attribute indexer yields null for a missing attribute, so guard every lookup.
        var relValue = item.Attributes[attrName]?.Value?.Trim() ?? string.Empty;
        if (!ContainsWord(relValue, matchEqualList))
        {
            continue;
        }

        // An element may match on attrName yet lack returnAttrName (e.g. <link rel="icon">
        // without href); skip it instead of failing with a NullReferenceException.
        var hrefValue = item.Attributes[returnAttrName]?.Value?.Trim() ?? string.Empty;
        if (string.IsNullOrWhiteSpace(hrefValue))
        {
            continue;
        }

        return System.Web.HttpUtility.HtmlDecode(hrefValue);
    }

    return defaultValue;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Searches <paramref name="nodeName"/> elements whose <paramref name="attrName"/> attribute
/// equals one of <paramref name="matchValueList"/> (ignoring case), and returns the value of
/// their <paramref name="returnAttrName"/> attribute.
/// </summary>
/// <param name="document">Document to query.</param>
/// <param name="nodeName">Element name to search for (e.g. "link").</param>
/// <param name="attrName">Attribute whose value is tested (e.g. "rel").</param>
/// <param name="returnAttrName">Attribute whose value is returned (e.g. "href").</param>
/// <param name="matchValueList">Accepted values for the tested attribute; null or empty matches nothing.</param>
/// <param name="defaultValue">Value returned when no element qualifies.</param>
/// <returns>
/// The trimmed, HTML-decoded <paramref name="returnAttrName"/> value of the first qualifying
/// element that has a non-blank one; otherwise <paramref name="defaultValue"/>.
/// </returns>
private string FindNodeAttrValue_Equals(HtmlAgilityPack.HtmlDocument document, string nodeName, string attrName, string returnAttrName, List<string> matchValueList, string defaultValue = "")
{
    if (matchValueList == null || matchValueList.Count == 0)
    {
        return defaultValue;
    }

    foreach (var item in FindNode(document, nodeName, attrName))
    {
        // The attribute indexer yields null for a missing attribute, so guard every lookup.
        var relValue = item.Attributes[attrName]?.Value?.Trim() ?? string.Empty;

        // Ordinal, case-insensitive comparison: these are markup keywords, so the result
        // must not depend on the current culture (as ToLower() does).
        var isMatch = matchValueList.Any(candidate => string.Equals(candidate, relValue, StringComparison.OrdinalIgnoreCase));
        if (!isMatch)
        {
            continue;
        }

        // An element may match on attrName yet lack returnAttrName; skip it instead of
        // failing with a NullReferenceException.
        var hrefValue = item.Attributes[returnAttrName]?.Value?.Trim() ?? string.Empty;
        if (string.IsNullOrWhiteSpace(hrefValue))
        {
            continue;
        }

        return System.Web.HttpUtility.HtmlDecode(hrefValue);
    }

    return defaultValue;
}
|
|
|
|
|
|
2024-09-14 22:03:06 +00:00
|
|
|
|
/// <summary>
/// Tests whether <paramref name="needle"/> appears in <paramref name="haystack"/> as a whole,
/// whitespace-delimited token. The comparison ignores case and is culture-independent.
/// </summary>
/// <param name="haystack">Whitespace-separated list of tokens (e.g. a <c>rel</c> value); may be null.</param>
/// <param name="needle">Token to look for.</param>
/// <returns>
/// <c>true</c> when one of the tokens equals <paramref name="needle"/>; for a blank
/// <paramref name="haystack"/>, <c>true</c> only when <paramref name="needle"/> is the empty string.
/// </returns>
private bool ContainsWord(string haystack, string needle)
{
    // A null separator array splits on any whitespace, so tokens separated by tabs or
    // line breaks are recognised as well, not only those separated by a space.
    var tokens = (haystack ?? string.Empty).Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

    if (tokens.Length == 0)
    {
        // Keeps the long-standing result for a blank haystack: it only "contains" the empty word.
        return needle != null && needle.Length == 0;
    }

    foreach (var token in tokens)
    {
        // Ordinal comparison: these are markup keywords, not linguistic text, so the result
        // must not vary with the machine's culture.
        if (string.Equals(token, needle, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
|
|
|
|
|
|
2024-07-17 00:56:17 +00:00
|
|
|
|
}
|
|
|
|
|
}
|