bookmark-manager-r4/Services/WebProvider.cs
2024-07-17 01:56:17 +01:00

328 lines
9.9 KiB
C#

using System;
using System.Drawing;
using System.Net;
using System.Threading.Tasks;
using HtmlAgilityPack;
using RyzStudio.Net;
namespace BookmarkManager.Services
{
public class WebProvider
{
private readonly WebClientProvider _webClientProvider;
/// <summary>
/// Creates a provider backed by a new <see cref="WebClientProvider"/> whose
/// <c>Timeout</c> is set to 4.
/// </summary>
public WebProvider()
{
    // NOTE(review): the unit of Timeout is defined by WebClientProvider
    // (presumably seconds) — confirm against that type.
    _webClientProvider = new WebClientProvider
    {
        Timeout = 4
    };
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and parses it into an HTML document.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The parsed document, or <c>null</c> when no source code could be retrieved
/// or loading the markup threw an exception.
/// </returns>
public async Task<HtmlAgilityPack.HtmlDocument> RetrieveHtmlDocument(string url)
{
    string html = await this.RetrieveSourceCode(url);
    if (string.IsNullOrWhiteSpace(html))
    {
        return null;
    }

    var parsed = new HtmlAgilityPack.HtmlDocument();

    try
    {
        parsed.LoadHtml(html);
        return parsed;
    }
    catch (Exception)
    {
        // Best effort: markup that cannot be loaded is reported as "nothing retrieved".
        return null;
    }
}
/// <summary>
/// Downloads the body of <paramref name="url"/> as text.
/// </summary>
/// <param name="url">Absolute URL to request; surrounding whitespace is ignored.</param>
/// <returns>
/// The response body, or <c>null</c> when the URL is blank or not absolute, the
/// request or the body read fails, the status is not 200 OK, or the body is blank.
/// </returns>
public async Task<string> RetrieveSourceCode(string url)
{
    if (string.IsNullOrWhiteSpace(url))
    {
        return null;
    }

    // Validate exactly the string that is going to be requested.
    var requestUrl = url.Trim();
    if (!Uri.TryCreate(requestUrl, UriKind.Absolute, out _))
    {
        return null;
    }

    System.Net.Http.HttpResponseMessage response;
    try
    {
        response = await _webClientProvider.Get(requestUrl);
    }
    catch (Exception)
    {
        return null;
    }

    if (response == null)
    {
        return null;
    }

    // The response is disposable; release it as soon as the body has been read.
    using (response)
    {
        if (response.StatusCode != HttpStatusCode.OK || response.Content == null)
        {
            return null;
        }

        string sourceCode;
        try
        {
            sourceCode = await response.Content.ReadAsStringAsync();
        }
        catch (Exception)
        {
            // Reading the body can fail just like the request itself;
            // keep the method's "null on any failure" contract.
            return null;
        }

        return string.IsNullOrWhiteSpace(sourceCode) ? null : sourceCode;
    }
}
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it as an image.
/// </summary>
/// <param name="url">Absolute URL of the image; surrounding whitespace is ignored.</param>
/// <returns>
/// The decoded image, or <c>null</c> when the URL is blank or not absolute, the
/// request or the body read fails, the status is not 200 OK, or the payload
/// cannot be decoded as an image.
/// </returns>
public async Task<Image> RetrieveImage(string url)
{
    if (string.IsNullOrWhiteSpace(url))
    {
        return null;
    }

    // Validate exactly the string that is going to be requested.
    var requestUrl = url.Trim();
    if (!Uri.TryCreate(requestUrl, UriKind.Absolute, out _))
    {
        return null;
    }

    System.Net.Http.HttpResponseMessage response;
    try
    {
        response = await _webClientProvider.Get(requestUrl);
    }
    catch (Exception)
    {
        return null;
    }

    if (response == null)
    {
        return null;
    }

    // The response is disposable; release it once the payload has been copied out.
    using (response)
    {
        if (response.StatusCode != HttpStatusCode.OK || response.Content == null)
        {
            return null;
        }

        // Image.FromStream requires its stream to stay open for the lifetime of
        // the image, so decode from a private in-memory copy rather than from the
        // response stream, which is closed when the response is disposed.
        // The buffer is deliberately not disposed on success: the returned image
        // keeps using it.
        var buffer = new System.IO.MemoryStream();
        try
        {
            using (var stream = await response.Content.ReadAsStreamAsync())
            {
                await stream.CopyToAsync(buffer);
            }

            buffer.Position = 0;

            return Image.FromStream(buffer);
        }
        catch (Exception)
        {
            // Download or decode failed; nothing references the buffer any more.
            buffer.Dispose();
            return null;
        }
    }
}
/// <summary>
/// Locates an icon reference in <paramref name="document"/> via
/// <see cref="ParseFavicon"/> and downloads it as an image.
/// </summary>
/// <param name="document">Parsed page to search for an icon reference.</param>
/// <returns>
/// The downloaded image, or <c>null</c> when no icon URL is found or the
/// download yields nothing.
/// </returns>
public async Task<Image> RetrieveImage(HtmlAgilityPack.HtmlDocument document)
{
    // NOTE(review): ParseFavicon can yield a site-relative path (its fallback is
    // "/favicon.ico"), while the URL overload only accepts absolute URLs, so such
    // values may end up as null — confirm whether callers resolve them first.
    string faviconAddress = this.ParseFavicon(document);

    return string.IsNullOrWhiteSpace(faviconAddress)
        ? null
        : await this.RetrieveImage(faviconAddress);
}
/// <summary>
/// Extracts a page title from <paramref name="document"/>.
/// </summary>
/// <remarks>
/// The <c>&lt;title&gt;</c> element is tried first, followed by the
/// <c>og:title</c>, <c>twitter:title</c>, <c>og:site_name</c> and
/// <c>itemprop="name"</c> meta tags; the first non-blank value wins.
/// </remarks>
/// <param name="document">Parsed page to inspect.</param>
/// <returns>The trimmed title, or an empty string when none of the sources has a value.</returns>
public string ParseTitle(HtmlAgilityPack.HtmlDocument document)
{
    string title = ParseTagValue(document, "//title", string.Empty)?.Trim();
    if (!string.IsNullOrWhiteSpace(title))
    {
        return title;
    }

    // Meta-tag fallbacks, in priority order; each one carries its value in "content".
    string[] metaQueries =
    {
        "//meta[@property='og:title']",
        "//meta[@name='twitter:title']",
        "//meta[@property='og:site_name']",
        "//meta[@itemprop='name']"
    };

    foreach (string query in metaQueries)
    {
        string candidate = ParseTagValue_Attr(document, query, "content", string.Empty)?.Trim();
        if (!string.IsNullOrWhiteSpace(candidate))
        {
            return candidate;
        }
    }

    return string.Empty;
}
/// <summary>
/// Extracts a page description from the meta tags of <paramref name="document"/>.
/// </summary>
/// <remarks>
/// Sources are tried in this order and the first non-blank value wins:
/// <c>name="description"</c>, <c>og:description</c>, <c>twitter:description</c>,
/// <c>itemprop="description"</c>.
/// </remarks>
/// <param name="document">Parsed page to inspect.</param>
/// <returns>The trimmed description, or an empty string when none of the sources has a value.</returns>
public string ParseMetaDescription(HtmlAgilityPack.HtmlDocument document)
{
    // Each XPath is listed once; querying the same tag twice cannot change the outcome.
    string[] metaQueries =
    {
        "//meta[@name='description']",
        "//meta[@property='og:description']",
        "//meta[@name='twitter:description']",
        "//meta[@itemprop='description']"
    };

    foreach (string query in metaQueries)
    {
        string candidate = ParseTagValue_Attr(document, query, "content", string.Empty)?.Trim();
        if (!string.IsNullOrWhiteSpace(candidate))
        {
            return candidate;
        }
    }

    return string.Empty;
}
/// <summary>
/// Finds the URL of an icon or representative image declared in <paramref name="document"/>.
/// </summary>
/// <remarks>
/// Candidates are tried in this order and the first non-blank value wins:
/// <c>link rel</c> values "shortcut icon", "icon", "apple-touch-icon" and
/// "apple-touch-icon-precomposed" (reading <c>href</c>), then the
/// <c>og:image</c>, <c>twitter:image</c> and <c>itemprop="image"</c> meta tags
/// (reading <c>content</c>). The identifying attribute is compared
/// case-insensitively for ASCII letters.
/// </remarks>
/// <param name="document">Parsed page to inspect.</param>
/// <returns>
/// The trimmed URL exactly as written in the page, or <c>"/favicon.ico"</c>
/// when no candidate has a value.
/// </returns>
public string ParseFavicon(HtmlAgilityPack.HtmlDocument document)
{
    // XPath 1.0 has no lower-case(); translate() over the ASCII alphabet is the
    // usual substitute and makes the attribute comparison case-insensitive.
    const string xPathFormat = "//{0}[translate(@{1}, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz') = '{2}']";

    // { element, identifying attribute, expected lower-case value, attribute holding the URL }
    string[][] candidates =
    {
        new[] { "link", "rel", "shortcut icon", "href" },
        new[] { "link", "rel", "icon", "href" },
        new[] { "link", "rel", "apple-touch-icon", "href" },
        new[] { "link", "rel", "apple-touch-icon-precomposed", "href" },
        new[] { "meta", "property", "og:image", "content" },
        new[] { "meta", "name", "twitter:image", "content" },
        new[] { "meta", "itemprop", "image", "content" }
    };

    foreach (string[] candidate in candidates)
    {
        string xPath = string.Format(xPathFormat, candidate[0], candidate[1], candidate[2]);

        string address = ParseTagValue_Attr(document, xPath, candidate[3], string.Empty)?.Trim();
        if (!string.IsNullOrWhiteSpace(address))
        {
            return address;
        }
    }

    // Conventional location to fall back on when the page declares nothing.
    return "/favicon.ico";
}
/// <summary>
/// Returns the decoded inner markup of the first node matching
/// <paramref name="xPath"/> that has non-blank content.
/// </summary>
/// <param name="document">Parsed page to search; may be <c>null</c>.</param>
/// <param name="xPath">XPath expression selecting the candidate nodes.</param>
/// <param name="defaultValue">Value returned when nothing usable is found.</param>
/// <returns>
/// The HTML-decoded content with carriage returns removed, line feeds replaced
/// by spaces and surrounding whitespace trimmed; otherwise
/// <paramref name="defaultValue"/>.
/// </returns>
private string ParseTagValue(HtmlAgilityPack.HtmlDocument document, string xPath, string defaultValue = "")
{
    // RetrieveHtmlDocument reports failure as null; treat that as "nothing found"
    // instead of failing with a NullReferenceException.
    var root = document?.DocumentNode;
    if (root == null)
    {
        return defaultValue;
    }

    // SelectNodes is null-checked because the absence of matches is not
    // guaranteed to come back as an empty collection.
    var nodes = root.SelectNodes(xPath);
    if (nodes == null)
    {
        return defaultValue;
    }

    foreach (HtmlNode node in nodes)
    {
        var markup = node.InnerHtml;
        if (string.IsNullOrWhiteSpace(markup))
        {
            continue;
        }

        // Flatten the content to a single line.
        var text = WebUtility.HtmlDecode(markup)
            .Replace("\r", "")
            .Replace("\n", " ")
            .Trim();

        if (!string.IsNullOrWhiteSpace(text))
        {
            return text;
        }
    }

    return defaultValue;
}
/// <summary>
/// Returns the decoded value of attribute <paramref name="attr"/> from the first
/// node matching <paramref name="xPath"/> that carries a non-blank value for it.
/// </summary>
/// <param name="document">Parsed page to search; may be <c>null</c>.</param>
/// <param name="xPath">XPath expression selecting the candidate nodes.</param>
/// <param name="attr">Name of the attribute to read from each candidate.</param>
/// <param name="defaultValue">Value returned when nothing usable is found.</param>
/// <returns>
/// The attribute value, trimmed and then HTML-decoded; otherwise
/// <paramref name="defaultValue"/>.
/// </returns>
private string ParseTagValue_Attr(HtmlAgilityPack.HtmlDocument document, string xPath, string attr, string defaultValue = "")
{
    // RetrieveHtmlDocument reports failure as null; treat that as "nothing found"
    // instead of failing with a NullReferenceException.
    var root = document?.DocumentNode;
    if (root == null)
    {
        return defaultValue;
    }

    // SelectNodes is null-checked because the absence of matches is not
    // guaranteed to come back as an empty collection.
    var nodes = root.SelectNodes(xPath);
    if (nodes == null)
    {
        return defaultValue;
    }

    foreach (HtmlNode node in nodes)
    {
        // Look the attribute up once per node.
        var attribute = node.Attributes[attr];
        if (attribute == null || string.IsNullOrWhiteSpace(attribute.Value))
        {
            continue;
        }

        // Same decoder as ParseTagValue, so both helpers treat entities alike.
        return WebUtility.HtmlDecode(attribute.Value.Trim());
    }

    return defaultValue;
}
}
}