bookmark-manager-r4/Services/WebProvider.cs

483 lines
14 KiB
C#
Raw Normal View History

2024-07-17 00:56:17 +00:00
using System;
2024-09-14 22:03:06 +00:00
using System.Collections.Generic;
2024-07-17 00:56:17 +00:00
using System.Drawing;
2024-09-14 22:03:06 +00:00
using System.Linq;
2024-07-17 00:56:17 +00:00
using System.Net;
2024-09-14 22:03:06 +00:00
using System.Security.Policy;
2024-07-17 00:56:17 +00:00
using System.Threading.Tasks;
using HtmlAgilityPack;
using RyzStudio.Net;
namespace BookmarkManager.Services
{
public class WebProvider
{
private readonly WebClientProvider _webClientProvider;
/// <summary>
/// Creates the provider together with its own <see cref="WebClientProvider"/> instance.
/// </summary>
public WebProvider()
{
    // NOTE(review): the unit of Timeout is defined by WebClientProvider (presumably seconds) — confirm.
    _webClientProvider = new WebClientProvider
    {
        Timeout = 4
    };
}
2024-07-18 22:45:51 +00:00
/// <summary>
/// Gets or sets the IgnoreSSL flag; the value is read from and written straight
/// through to the underlying web client provider.
/// </summary>
public bool IgnoreSSL
{
    get
    {
        return _webClientProvider.IgnoreSSL;
    }
    set => _webClientProvider.IgnoreSSL = value;
}
2024-07-17 00:56:17 +00:00
/// <summary>
/// Downloads the page at <paramref name="url"/> and parses it into an HTML document.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>
/// The parsed document, or <c>null</c> when no source code could be retrieved
/// or loading the HTML threw an exception.
/// </returns>
public async Task<HtmlAgilityPack.HtmlDocument> RetrieveHtmlDocument(string url)
{
    var html = await this.RetrieveSourceCode(url);
    var hasContent = !string.IsNullOrWhiteSpace(html);
    if (!hasContent)
    {
        return null;
    }

    var parsed = new HtmlAgilityPack.HtmlDocument();

    try
    {
        parsed.LoadHtml(html);
        return parsed;
    }
    catch (Exception)
    {
        // Any failure while loading the markup is reported as "no document".
        return null;
    }
}
/// <summary>
/// Downloads the body of <paramref name="url"/> as text.
/// </summary>
/// <param name="url">Absolute URL to request; surrounding whitespace is ignored.</param>
/// <returns>
/// The response body, or <c>null</c> when the URL is blank or not absolute, the request
/// throws or yields no response, the status code is not 200 (OK), or the body is
/// missing, unreadable or blank.
/// </returns>
public async Task<string> RetrieveSourceCode(string url)
{
    if (string.IsNullOrWhiteSpace(url))
    {
        return null;
    }

    // Validate exactly the string that is sent, so validation and request cannot disagree.
    var requestUrl = url.Trim();
    if (!Uri.TryCreate(requestUrl, UriKind.Absolute, out _))
    {
        return null;
    }

    System.Net.Http.HttpResponseMessage response;
    try
    {
        response = await _webClientProvider.Get(requestUrl);
    }
    catch (Exception)
    {
        return null;
    }

    if (response == null)
    {
        return null;
    }

    // The body is fully read into a string below, so the response can be released
    // as soon as this method is done with it.
    using (response)
    {
        if (response.StatusCode != HttpStatusCode.OK)
        {
            return null;
        }

        if (response.Content == null)
        {
            return null;
        }

        string sourceCode;
        try
        {
            sourceCode = await response.Content.ReadAsStringAsync();
        }
        catch (Exception)
        {
            // Best effort: an unreadable body is treated the same as an empty one.
            return null;
        }

        return string.IsNullOrWhiteSpace(sourceCode) ? null : sourceCode;
    }
}
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it as an image.
/// </summary>
/// <param name="url">Absolute URL of the image; surrounding whitespace is ignored.</param>
/// <returns>
/// The decoded image, or <c>null</c> when the URL is blank or not absolute, the request
/// throws or yields no response, the status code is not 200 (OK), or the body is
/// missing, unreadable or not a decodable image.
/// </returns>
public async Task<Image> RetrieveImage(string url)
{
    if (string.IsNullOrWhiteSpace(url))
    {
        return null;
    }

    // Validate exactly the string that is sent, so validation and request cannot disagree.
    var requestUrl = url.Trim();
    if (!Uri.TryCreate(requestUrl, UriKind.Absolute, out _))
    {
        return null;
    }

    System.Net.Http.HttpResponseMessage response;
    try
    {
        response = await _webClientProvider.Get(requestUrl);
    }
    catch (Exception)
    {
        return null;
    }

    // Same guard as RetrieveSourceCode: a missing response is "no image", not a crash.
    if (response == null)
    {
        return null;
    }

    using (response)
    {
        if (response.StatusCode != HttpStatusCode.OK)
        {
            return null;
        }

        if (response.Content == null)
        {
            return null;
        }

        try
        {
            var imageBytes = await response.Content.ReadAsByteArrayAsync();

            // Image.FromStream needs its stream to stay open for the lifetime of the image.
            // Buffering into a MemoryStream (intentionally not disposed here) lets the
            // HTTP response be released while the returned image remains usable.
            var buffer = new System.IO.MemoryStream(imageBytes);
            return Image.FromStream(buffer);
        }
        catch (Exception)
        {
            // Unreadable body or data that is not a supported image format.
            return null;
        }
    }
}
2024-09-14 22:03:06 +00:00
/// <summary>
/// Finds the favicon reference inside <paramref name="document"/>, resolves it against
/// <paramref name="url"/> and downloads the resulting image.
/// </summary>
/// <param name="url">Address of the page the document came from; used as the base URI.</param>
/// <param name="document">Parsed page to search for an icon reference.</param>
/// <returns>
/// The downloaded image, or <c>null</c> when no icon reference is found, the reference
/// cannot be resolved into a URI, or the download yields nothing.
/// </returns>
public async Task<Image> RetrieveImage(string url, HtmlAgilityPack.HtmlDocument document)
{
    var faviconReference = this.ParseFavicon(document);
    if (string.IsNullOrWhiteSpace(faviconReference))
    {
        return null;
    }

    string resolvedUrl;
    try
    {
        // The reference may be relative, so combine it with the page address.
        resolvedUrl = new Uri(new Uri(url), faviconReference).AbsoluteUri;
    }
    catch
    {
        return null;
    }

    return await this.RetrieveImage(resolvedUrl);
}
/// <summary>
/// Extracts a page title from <paramref name="document"/>.
/// </summary>
/// <remarks>
/// The <c>title</c> element is tried first, followed by the <c>content</c> attribute of
/// these meta tags in order: <c>og:title</c>, <c>twitter:title</c>, <c>og:site_name</c>,
/// <c>itemprop="name"</c>. The first non-blank value wins.
/// </remarks>
/// <param name="document">Parsed page to inspect.</param>
/// <returns>The trimmed title, or an empty string when none of the sources has a value.</returns>
public string ParseTitle(HtmlAgilityPack.HtmlDocument document)
{
    var titleText = ParseTagValue(document, "//title", string.Empty)?.Trim();
    if (!string.IsNullOrWhiteSpace(titleText))
    {
        return titleText;
    }

    var metaQueries = new[]
    {
        "//meta[@property='og:title']",
        "//meta[@name='twitter:title']",
        "//meta[@property='og:site_name']",
        "//meta[@itemprop='name']"
    };

    foreach (var query in metaQueries)
    {
        var candidate = FindNodeAttrValue(document, query, "content", string.Empty)?.Trim();
        if (!string.IsNullOrWhiteSpace(candidate))
        {
            return candidate;
        }
    }

    return string.Empty;
}
/// <summary>
/// Extracts a page description from the meta tags of <paramref name="document"/>.
/// </summary>
/// <remarks>
/// The <c>content</c> attribute of these meta tags is tried in order:
/// <c>name="description"</c>, <c>og:description</c>, <c>twitter:description</c>,
/// <c>itemprop="description"</c>. The first non-blank value wins.
/// </remarks>
/// <param name="document">Parsed page to inspect.</param>
/// <returns>The trimmed description, or an empty string when no tag has a value.</returns>
public string ParseMetaDescription(HtmlAgilityPack.HtmlDocument document)
{
    // Each query is listed once; repeating a query that already came back blank
    // cannot produce a different answer.
    var metaQueries = new[]
    {
        "//meta[@name='description']",
        "//meta[@property='og:description']",
        "//meta[@name='twitter:description']",
        "//meta[@itemprop='description']"
    };

    foreach (var query in metaQueries)
    {
        var candidate = FindNodeAttrValue(document, query, "content", string.Empty)?.Trim();
        if (!string.IsNullOrWhiteSpace(candidate))
        {
            return candidate;
        }
    }

    // Explicit empty result, matching ParseTitle.
    return string.Empty;
}
/// <summary>
/// Finds the best icon reference for the page in <paramref name="document"/>.
/// </summary>
/// <remarks>
/// Sources are tried in this order, and the first non-blank value wins:
/// <list type="number">
/// <item><c>link</c> elements whose <c>rel</c> contains the word <c>icon</c>;</item>
/// <item><c>link</c> elements whose <c>rel</c> is <c>apple-touch-icon</c> or <c>apple-touch-icon-precomposed</c>;</item>
/// <item>the <c>content</c> of the <c>og:image</c>, <c>twitter:image</c> and <c>itemprop="image"</c> meta tags;</item>
/// <item>the literal <c>/favicon.ico</c>.</item>
/// </list>
/// The returned value is whatever the page declared and may be a relative reference.
/// </remarks>
/// <param name="document">Parsed page to inspect.</param>
/// <returns>The HTML-decoded icon reference; never blank because of the final fallback.</returns>
public string ParseFavicon(HtmlAgilityPack.HtmlDocument document)
{
    var linkNodes = FindNode(document, "link", "rel");

    // Returns the decoded href of the first link whose rel value satisfies the predicate
    // and whose href is present and non-blank; null when there is no such link.
    string FirstHref(Func<string, bool> relMatches)
    {
        foreach (var link in linkNodes)
        {
            var relValue = link.Attributes["rel"]?.Value?.Trim() ?? string.Empty;
            if (!relMatches(relValue))
            {
                continue;
            }

            // Only "rel" is guaranteed by the node query; "href" may be absent entirely.
            var hrefValue = link.Attributes["href"]?.Value?.Trim();
            if (string.IsNullOrWhiteSpace(hrefValue))
            {
                continue;
            }

            return System.Web.HttpUtility.HtmlDecode(hrefValue);
        }

        return null;
    }

    // Find link-rel contains "icon"
    var iconHref = FirstHref(rel => ContainsWord(rel, "icon"));
    if (iconHref != null)
    {
        return iconHref;
    }

    // Find link-rel equal to one of the apple icon values (culture-independent, case-insensitive)
    var appleIconPatterns = new[] { "apple-touch-icon", "apple-touch-icon-precomposed" };
    var appleHref = FirstHref(rel => appleIconPatterns.Contains(rel, StringComparer.OrdinalIgnoreCase));
    if (appleHref != null)
    {
        return appleHref;
    }

    // Fall back to preview images declared in meta tags.
    var metaQueries = new[]
    {
        "//meta[@property='og:image']",
        "//meta[@name='twitter:image']",
        "//meta[@itemprop='image']"
    };

    foreach (var query in metaQueries)
    {
        var candidate = FindNodeAttrValue(document, query, "content", string.Empty)?.Trim();
        if (!string.IsNullOrWhiteSpace(candidate))
        {
            return candidate;
        }
    }

    return "/favicon.ico";
}
/// <summary>
/// Returns the text of the first node matching <paramref name="xPath"/> that has usable content.
/// </summary>
/// <remarks>
/// A node's inner HTML is HTML-decoded, carriage returns are removed, line feeds become
/// spaces, and the result is trimmed. Nodes that are blank before or after this clean-up
/// are skipped.
/// </remarks>
/// <param name="document">Parsed page to query.</param>
/// <param name="xPath">XPath expression selecting the candidate nodes.</param>
/// <param name="defaultValue">Value returned when no node yields any text.</param>
/// <returns>The cleaned text of the first usable node, otherwise <paramref name="defaultValue"/>.</returns>
private string ParseTagValue(HtmlAgilityPack.HtmlDocument document, string xPath, string defaultValue = "")
{
    var matches = document.DocumentNode.SelectNodes(xPath);
    if (matches == null || matches.Count <= 0)
    {
        return defaultValue;
    }

    foreach (HtmlNode node in matches)
    {
        if (!string.IsNullOrWhiteSpace(node.InnerHtml))
        {
            var decoded = WebUtility.HtmlDecode(node.InnerHtml);
            var flattened = decoded?.Replace("\r", "")?.Replace("\n", " ")?.Trim();

            if (!string.IsNullOrWhiteSpace(flattened))
            {
                return flattened;
            }
        }
    }

    return defaultValue;
}
2024-09-14 22:03:06 +00:00
/// <summary>
/// Returns the value of attribute <paramref name="attr"/> from the first node matching
/// <paramref name="xPath"/> that carries a non-blank value for it.
/// </summary>
/// <param name="document">Parsed page to query.</param>
/// <param name="xPath">XPath expression selecting the candidate nodes.</param>
/// <param name="attr">Name of the attribute to read.</param>
/// <param name="defaultValue">Value returned when no node has a usable attribute value.</param>
/// <returns>The trimmed, HTML-decoded attribute value, otherwise <paramref name="defaultValue"/>.</returns>
private string FindNodeAttrValue(HtmlAgilityPack.HtmlDocument document, string xPath, string attr, string defaultValue = "")
{
    var matches = document.DocumentNode.SelectNodes(xPath);
    if (matches == null || matches.Count <= 0)
    {
        return defaultValue;
    }

    foreach (HtmlNode node in matches)
    {
        var attribute = node.Attributes[attr];
        if (attribute == null)
        {
            continue;
        }

        var rawValue = attribute.Value;
        if (!string.IsNullOrWhiteSpace(rawValue))
        {
            return System.Web.HttpUtility.HtmlDecode(rawValue.Trim());
        }
    }

    return defaultValue;
}
2024-09-14 22:03:06 +00:00
//private List<HtmlNode> FindNode_AtrributeContains(HtmlAgilityPack.HtmlDocument document, string nodeName, string attrName, string findValue)
//{
// var response = new List<HtmlNode>();
// var xPath = $"//{nodeName}[@{attrName}]";
// var hnc = document.DocumentNode.SelectNodes(xPath);
// if (hnc == null)
// {
// return response;
// }
// if (hnc.Count <= 0)
// {
// return response;
// }
// foreach (HtmlNode item in hnc)
// {
// if (!item.Attributes.Contains(attrName))
// {
// continue;
// }
// if (!ContainsWord(item.Attributes[attrName].Value ?? string.Empty, findValue))
// {
// continue;
// }
// response.Add(item);
// }
// return response;
//}
/// <summary>
/// Collects every <paramref name="nodeName"/> element in the document, optionally
/// restricted to elements that have the attribute <paramref name="attrName"/>.
/// </summary>
/// <param name="document">Parsed page to query.</param>
/// <param name="nodeName">Element name to select, e.g. <c>link</c>.</param>
/// <param name="attrName">Attribute the element must have; blank means no attribute filter.</param>
/// <returns>The matching nodes; an empty list (never <c>null</c>) when nothing matches.</returns>
private List<HtmlNode> FindNode(HtmlAgilityPack.HtmlDocument document, string nodeName, string attrName)
{
    string query;
    if (string.IsNullOrWhiteSpace(attrName))
    {
        query = $"//{nodeName}";
    }
    else
    {
        query = $"//{nodeName}[@{attrName}]";
    }

    var matches = document.DocumentNode.SelectNodes(query);

    // A query without results may come back as null rather than an empty collection.
    var nothingFound = matches == null || matches.Count <= 0;
    return nothingFound ? new List<HtmlNode>() : matches.ToList();
}
/// <summary>
/// Tells whether <paramref name="needle"/> appears as a whole whitespace-separated token
/// in <paramref name="haystack"/>.
/// </summary>
/// <remarks>
/// Tokens are separated by space, tab, line feed, form feed or carriage return, and are
/// compared with ordinal case-insensitivity so the result does not depend on the current
/// culture. Substrings do not count: <c>apple-touch-icon</c> does not contain the word
/// <c>icon</c>. An empty <paramref name="needle"/> never matches, because empty tokens
/// are discarded.
/// </remarks>
/// <param name="haystack">Token list to search, e.g. a <c>rel</c> attribute value; may be <c>null</c>.</param>
/// <param name="needle">Token to look for.</param>
/// <returns><c>true</c> when a token equals <paramref name="needle"/>; otherwise <c>false</c>.</returns>
private bool ContainsWord(string haystack, string needle)
{
    if (string.IsNullOrWhiteSpace(haystack))
    {
        return false;
    }

    var separators = new[] { ' ', '\t', '\n', '\f', '\r' };
    var tokens = haystack.Trim().Split(separators, StringSplitOptions.RemoveEmptyEntries);

    foreach (var token in tokens)
    {
        if (string.Equals(token, needle, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
2024-07-17 00:56:17 +00:00
}
}