bukkubuddy-bookmark-manager/Services/WebPageService.cs
2026-05-22 01:16:08 +01:00

428 lines
13 KiB
C#

using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Runtime.InteropServices;
using System.Threading;
using System.Threading.Tasks;
using HtmlAgilityPack;
namespace BukkuBuddy.Services
{
public class WebPageService
{
private readonly HttpClient _httpClient;
/// <summary>
/// Creates the service and configures the single <see cref="HttpClient"/> it uses for every request.
/// </summary>
/// <param name="allowUnsafeSSL">When true, every server certificate is accepted without validation.</param>
/// <param name="timeout">Request timeout in seconds, applied to <see cref="HttpClient.Timeout"/>.</param>
/// <param name="allowCookies">When true, cookies are kept in a fresh <see cref="CookieContainer"/>; when false, cookie handling is switched off.</param>
/// <param name="allowRedirect">Whether the handler follows redirect responses automatically.</param>
public WebPageService(bool allowUnsafeSSL, int timeout, bool allowCookies, bool allowRedirect)
{
    var handler = new HttpClientHandler
    {
        AllowAutoRedirect = allowRedirect,
        // UseCookies defaults to true, so it must be assigned unconditionally
        // for allowCookies == false to actually disable cookie handling.
        UseCookies = allowCookies,
    };
    if (allowCookies)
    {
        handler.CookieContainer = new CookieContainer();
    }
    if (allowUnsafeSSL)
    {
        handler.ServerCertificateCustomValidationCallback = HttpClientHandler.DangerousAcceptAnyServerCertificateValidator;
    }
    _httpClient = new HttpClient(handler)
    {
        Timeout = TimeSpan.FromSeconds(timeout),
    };
}
/// <summary>
/// Downloads the page at <paramref name="url"/> via <c>GetSource</c> and parses it into an HTML document.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="cancellationToken">Token forwarded to the download.</param>
/// <returns>The parsed document, or null when the download produced no non-blank source.</returns>
public async Task<HtmlAgilityPack.HtmlDocument> GetDocument(string url, CancellationToken cancellationToken = default)
{
    var html = await this.GetSource(url, cancellationToken);
    if (!string.IsNullOrWhiteSpace(html))
    {
        var parsed = new HtmlAgilityPack.HtmlDocument();
        parsed.LoadHtml(html);
        return parsed;
    }
    return null;
}
/// <summary>
/// Issues a GET request for <paramref name="url"/> with a generated User-Agent header
/// and returns the response body as text.
/// </summary>
/// <param name="url">Address to request.</param>
/// <param name="cancellationToken">Token passed to the request and to the body read.</param>
/// <returns>
/// The response body, or null when <paramref name="url"/> is blank or the response
/// status is not a success code.
/// </returns>
/// <remarks>
/// Exceptions raised while building or sending the request are not caught here
/// and propagate to the caller.
/// </remarks>
public async Task<string> GetSource(string url, CancellationToken cancellationToken = default)
{
    if (string.IsNullOrWhiteSpace(url))
    {
        return null;
    }
    var userAgent = GenerateUserAgent();
    using var request = new HttpRequestMessage(HttpMethod.Get, url);
    request.Headers.UserAgent.ParseAdd(userAgent);
    using var response = await _httpClient.SendAsync(request, cancellationToken);
    // A non-success status is reported as null instead of an exception, so no
    // EnsureSuccessStatusCode() call is needed once this check has passed.
    if (!response.IsSuccessStatusCode)
    {
        return null;
    }
    return await response.Content.ReadAsStringAsync(cancellationToken);
}
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it as an image.
/// </summary>
/// <param name="url">Address of the image to download.</param>
/// <param name="cancellationToken">Token passed to the request and to the stream operations.</param>
/// <returns>
/// A <see cref="Bitmap"/> copy of the downloaded image, or null when the URL is blank,
/// the response status is not a success code, the Content-Type is not <c>image/*</c>,
/// or any exception occurs along the way.
/// </returns>
public async Task<Image> GetImage(string url, CancellationToken cancellationToken = default)
{
    // Same early exit as GetSource: nothing to request for a blank URL.
    if (string.IsNullOrWhiteSpace(url))
    {
        return null;
    }
    try
    {
        var userAgent = GenerateUserAgent();
        using var request = new HttpRequestMessage(HttpMethod.Get, url);
        request.Headers.UserAgent.ParseAdd(userAgent);
        // Only the headers are read up front so a non-image response can be rejected
        // before its body is downloaded.
        using var response = await _httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken);
        if (!response.IsSuccessStatusCode)
        {
            return null;
        }
        var contentType = response.Content.Headers.ContentType?.MediaType;
        if (contentType == null || !contentType.StartsWith("image/", StringComparison.OrdinalIgnoreCase))
        {
            return null;
        }
        await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken);
        using var memoryStream = new MemoryStream();
        await stream.CopyToAsync(memoryStream, cancellationToken);
        memoryStream.Position = 0;
        // The decoded image is tied to memoryStream; the Bitmap copy is what gets returned,
        // so the intermediate image is disposed here instead of being leaked.
        using var decoded = Image.FromStream(memoryStream);
        return new Bitmap(decoded);
    }
    catch (Exception)
    {
        // Best effort: any failure (network, cancellation, decoding) is reported as "no image".
        return null;
    }
}
/// <summary>
/// Extracts a page title from <paramref name="document"/>.
/// </summary>
/// <param name="document">Parsed HTML document to inspect.</param>
/// <returns>
/// The trimmed text of the <c>title</c> element when it is non-blank; otherwise the first
/// non-blank <c>content</c> value among the fallback meta tags; otherwise null.
/// </returns>
public string ParseTitle(HtmlAgilityPack.HtmlDocument document)
{
    // The plain <title> element wins when it has content.
    var plainTitle = FindNodeValue(document, "//title", string.Empty)?.Trim();
    if (!string.IsNullOrWhiteSpace(plainTitle))
    {
        return plainTitle;
    }

    // Otherwise fall back to the meta tags, tried in this order.
    string[] metaXPaths =
    {
        "//meta[@property='og:title']",
        "//meta[@property='og:site_name']",
        "//meta[@name='twitter:title']",
        "//meta[@itemprop='name']"
    };
    return metaXPaths
        .Select(xPath => FindNodeAttrValue(document, xPath, "content", string.Empty)?.Trim())
        .FirstOrDefault(value => !string.IsNullOrWhiteSpace(value));
}
/// <summary>
/// Extracts a page description from the meta tags of <paramref name="document"/>.
/// </summary>
/// <param name="document">Parsed HTML document to inspect.</param>
/// <returns>
/// The first non-blank, trimmed <c>content</c> value among the description meta tags,
/// or an empty string when none is found.
/// </returns>
public string ParseDescription(HtmlAgilityPack.HtmlDocument document)
{
    // Candidate meta tags, tried in this order.
    string[] metaXPaths =
    {
        "//meta[@name='description']",
        "//meta[@property='og:description']",
        "//meta[@name='twitter:description']",
        "//meta[@itemprop='description']"
    };
    var description = metaXPaths
        .Select(xPath => FindNodeAttrValue(document, xPath, "content", string.Empty)?.Trim())
        .FirstOrDefault(value => !string.IsNullOrWhiteSpace(value));
    return description ?? string.Empty;
}
/// <summary>
/// Looks for an icon URL in <paramref name="document"/>.
/// </summary>
/// <param name="document">Parsed HTML document to inspect.</param>
/// <returns>
/// The <c>href</c> of the first matching <c>link</c> element, or the <c>content</c> of the first
/// matching image meta tag, or an empty string when nothing is found. The value is returned
/// as written in the page; it is not resolved against the page address here.
/// </returns>
public string ParseFavicon(HtmlAgilityPack.HtmlDocument document)
{
    // 1) <link> whose rel contains the word "icon".
    var iconHref = FindNodeAttrValue_ContainsWord(document, "link", "rel", "href", "icon");
    if (!string.IsNullOrWhiteSpace(iconHref))
    {
        return iconHref;
    }

    // 2) <link> whose rel is one of the Apple touch-icon values.
    var appleRelValues = new List<string> { "apple-touch-icon", "apple-touch-icon-precomposed" };
    var appleHref = FindNodeAttrValue_Equals(document, "link", "rel", "href", appleRelValues);
    if (!string.IsNullOrWhiteSpace(appleHref))
    {
        return appleHref;
    }

    // 3) Image meta tags, tried in this order.
    string[] metaXPaths =
    {
        "//meta[@property='og:image']",
        "//meta[@name='twitter:image']",
        "//meta[@itemprop='image']"
    };
    var metaImage = metaXPaths
        .Select(xPath => FindNodeAttrValue(document, xPath, "content", string.Empty)?.Trim())
        .FirstOrDefault(value => !string.IsNullOrWhiteSpace(value));
    return metaImage ?? string.Empty;
}
/// <summary>
/// Checks whether <paramref name="url"/> is non-blank and starts with an HTTP or HTTPS scheme prefix.
/// </summary>
/// <param name="url">Text to check.</param>
/// <returns>
/// True when the text begins with <c>http://</c> or <c>https://</c> (any letter case);
/// false for null, blank, or any other prefix. Nothing beyond the prefix is validated.
/// </returns>
public bool IsValidUrl(string url)
{
    if (string.IsNullOrWhiteSpace(url))
    {
        return false;
    }
    // Ordinal comparison: a scheme prefix is a protocol token, not linguistic text, and a
    // culture-sensitive StartsWith would skip ignorable characters placed before the scheme.
    return url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Returns the decoded inner HTML of the first node matching <paramref name="xPath"/> that has
/// non-blank content.
/// </summary>
/// <param name="document">Document to query.</param>
/// <param name="xPath">XPath expression selecting the candidate nodes.</param>
/// <param name="defaultValue">Value returned when no node yields non-blank text.</param>
/// <returns>
/// The HTML-decoded inner HTML with carriage returns removed, line feeds replaced by spaces,
/// and surrounding whitespace trimmed; or <paramref name="defaultValue"/>.
/// </returns>
private string FindNodeValue(HtmlAgilityPack.HtmlDocument document, string xPath, string defaultValue = "")
{
    var matches = document.DocumentNode.SelectNodes(xPath);
    if (matches == null || matches.Count <= 0)
    {
        return defaultValue;
    }

    var text = matches
        .Where(node => !string.IsNullOrWhiteSpace(node.InnerHtml))
        .Select(node => WebUtility.HtmlDecode(node.InnerHtml)?.Replace("\r", "")?.Replace("\n", " ")?.Trim())
        .FirstOrDefault(candidate => !string.IsNullOrWhiteSpace(candidate));
    return text ?? defaultValue;
}
/// <summary>
/// Returns the decoded value of attribute <paramref name="attr"/> from the first node matching
/// <paramref name="xPath"/> that carries a non-blank value for it.
/// </summary>
/// <param name="document">Document to query.</param>
/// <param name="xPath">XPath expression selecting the candidate nodes.</param>
/// <param name="attr">Name of the attribute to read.</param>
/// <param name="defaultValue">Value returned when no node has a usable attribute value.</param>
/// <returns>The trimmed, HTML-decoded attribute value, or <paramref name="defaultValue"/>.</returns>
private string FindNodeAttrValue(HtmlAgilityPack.HtmlDocument document, string xPath, string attr, string defaultValue = "")
{
    var matches = document.DocumentNode.SelectNodes(xPath);
    if (matches == null || matches.Count <= 0)
    {
        return defaultValue;
    }

    foreach (HtmlNode node in matches)
    {
        var attribute = node.Attributes[attr];
        if (attribute != null && !string.IsNullOrWhiteSpace(attribute.Value))
        {
            return System.Web.HttpUtility.HtmlDecode(attribute.Value?.Trim());
        }
    }
    return defaultValue;
}
/// <summary>
/// Collects every <paramref name="nodeName"/> element in the document, restricted to those that
/// carry <paramref name="attrName"/> when an attribute name is supplied.
/// </summary>
/// <param name="document">Document to query.</param>
/// <param name="nodeName">Element name to search for anywhere in the document.</param>
/// <param name="attrName">Attribute that must be present; blank means no attribute filter.</param>
/// <returns>The matching nodes, or an empty list when there are none.</returns>
private List<HtmlNode> FindNode(HtmlAgilityPack.HtmlDocument document, string nodeName, string attrName)
{
    var query = "//" + nodeName;
    if (!string.IsNullOrWhiteSpace(attrName))
    {
        query += "[@" + attrName + "]";
    }

    var matches = document.DocumentNode.SelectNodes(query);
    return (matches == null || matches.Count <= 0)
        ? new List<HtmlNode>()
        : matches.ToList();
}
/// <summary>
/// Finds the first <paramref name="nodeName"/> element whose <paramref name="attrName"/> value
/// contains the word <paramref name="matchEqualList"/> and returns its
/// <paramref name="returnAttrName"/> value.
/// </summary>
/// <param name="document">Document to query.</param>
/// <param name="nodeName">Element name to search for.</param>
/// <param name="attrName">Attribute whose value is tested for the word.</param>
/// <param name="returnAttrName">Attribute whose value is returned on a match.</param>
/// <param name="matchEqualList">The single word to look for (despite the parameter name).</param>
/// <param name="defaultValue">Value returned when no element qualifies.</param>
/// <returns>The trimmed, HTML-decoded attribute value, or <paramref name="defaultValue"/>.</returns>
private string FindNodeAttrValue_ContainsWord(HtmlAgilityPack.HtmlDocument document, string nodeName, string attrName, string returnAttrName, string matchEqualList, string defaultValue = "")
{
    foreach (var node in FindNode(document, nodeName, attrName))
    {
        var testedValue = node.Attributes[attrName]?.Value?.Trim() ?? string.Empty;
        if (!ContainsWord(testedValue, matchEqualList))
        {
            continue;
        }
        // The returned attribute is not guaranteed to exist on a matching node
        // (e.g. <link rel="icon"> without href), so a missing attribute is skipped, not dereferenced.
        var returnedValue = node.Attributes[returnAttrName]?.Value?.Trim();
        if (string.IsNullOrWhiteSpace(returnedValue))
        {
            continue;
        }
        return System.Web.HttpUtility.HtmlDecode(returnedValue);
    }
    return defaultValue;
}
/// <summary>
/// Finds the first <paramref name="nodeName"/> element whose trimmed <paramref name="attrName"/>
/// value equals one of <paramref name="matchValueList"/> (ignoring case) and returns its
/// <paramref name="returnAttrName"/> value.
/// </summary>
/// <param name="document">Document to query.</param>
/// <param name="nodeName">Element name to search for.</param>
/// <param name="attrName">Attribute whose value is compared.</param>
/// <param name="returnAttrName">Attribute whose value is returned on a match.</param>
/// <param name="matchValueList">Accepted values for <paramref name="attrName"/>.</param>
/// <param name="defaultValue">Value returned when no element qualifies.</param>
/// <returns>The trimmed, HTML-decoded attribute value, or <paramref name="defaultValue"/>.</returns>
private string FindNodeAttrValue_Equals(HtmlAgilityPack.HtmlDocument document, string nodeName, string attrName, string returnAttrName, List<string> matchValueList, string defaultValue = "")
{
    foreach (var node in FindNode(document, nodeName, attrName))
    {
        var testedValue = node.Attributes[attrName]?.Value?.Trim() ?? string.Empty;
        // Ordinal, case-insensitive match: lower-casing with the current culture would
        // mangle the letter "I" under Turkish casing rules and miss valid values.
        var isAccepted = matchValueList.Any(accepted => string.Equals(accepted, testedValue, StringComparison.OrdinalIgnoreCase));
        if (!isAccepted)
        {
            continue;
        }
        // The returned attribute may be absent on a matching node; skip instead of dereferencing null.
        var returnedValue = node.Attributes[returnAttrName]?.Value?.Trim();
        if (string.IsNullOrWhiteSpace(returnedValue))
        {
            continue;
        }
        return System.Web.HttpUtility.HtmlDecode(returnedValue);
    }
    return defaultValue;
}
/// <summary>
/// Tests whether <paramref name="needle"/> appears as a whole whitespace-separated word in
/// <paramref name="haystack"/>, ignoring case.
/// </summary>
/// <param name="haystack">Whitespace-separated list of words; null is treated as empty.</param>
/// <param name="needle">Word to look for.</param>
/// <returns>True when one of the words equals <paramref name="needle"/>; otherwise false.</returns>
private bool ContainsWord(string haystack, string needle)
{
    var trimmed = haystack?.Trim() ?? string.Empty;
    if (trimmed.Length == 0)
    {
        // An empty haystack has no words; it matches only an empty needle.
        return string.Equals(trimmed, needle, StringComparison.OrdinalIgnoreCase);
    }

    // A null separator splits on every whitespace character, so tab- or newline-separated
    // tokens are recognised as well as space-separated ones.
    var words = trimmed.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
    foreach (var word in words)
    {
        if (string.Equals(word, needle, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }
    return false;
}
/// <summary>
/// Builds a Chrome-style User-Agent string from the host OS version, the host CPU architecture
/// and a generated Chrome version number.
/// </summary>
/// <returns>The User-Agent header value.</returns>
private string GenerateUserAgent()
{
    var os = GetWindowsVersion();
    // Architectures without an explicit mapping fall back to the x64 token.
    var arch = RuntimeInformation.OSArchitecture switch
    {
        Architecture.X64 => "Win64; x64",
        Architecture.X86 => "Win32",
        Architecture.Arm64 => "ARM64",
        _ => "Win64; x64"
    };
    var chromeVersion = GetChromeLikeVersion();
    // Layout follows the Chrome UA format: Mozilla/5.0 (platform) AppleWebKit/… (KHTML, like Gecko) Chrome/… Safari/…
    // with no extra product tokens between the platform comment and AppleWebKit.
    return $"Mozilla/5.0 (Windows NT {os}; {arch}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{chromeVersion} Safari/537.36";
}
/// <summary>
/// Formats the version reported by <see cref="Environment.OSVersion"/> as a "major.minor" string.
/// </summary>
/// <returns>
/// "10.0" for any major version 10; "6.3", "6.2" or "6.1" for those 6.x releases;
/// otherwise the reported major and minor numbers joined by a dot.
/// </returns>
private string GetWindowsVersion()
{
    var version = Environment.OSVersion.Version;

    // Windows 10 and 11 both report major version 10.
    if (version.Major == 10)
    {
        return "10.0";
    }

    if (version.Major == 6)
    {
        switch (version.Minor)
        {
            case 3:
                return "6.3"; // Windows 8.1
            case 2:
                return "6.2"; // Windows 8
            case 1:
                return "6.1"; // Windows 7
        }
    }

    return $"{version.Major}.{version.Minor}";
}
/// <summary>
/// Produces a random version string shaped like a Chrome version: "major.0.build.patch".
/// </summary>
/// <returns>
/// A string whose major part is in 120..125, build part in 0..6999 and patch part in 0..199.
/// </returns>
private string GetChromeLikeVersion()
{
    var random = new Random();
    // Upper bounds passed to Next are exclusive.
    var majorPart = random.Next(120, 126);
    var buildPart = random.Next(0, 7000);
    var patchPart = random.Next(0, 200);
    return majorPart + ".0." + buildPart + "." + patchPart;
}
}
}