diff --git a/API/Schema/MangaConnectors/Webtoons.cs b/API/Schema/MangaConnectors/Webtoons.cs
new file mode 100644
index 0000000..238bf51
--- /dev/null
+++ b/API/Schema/MangaConnectors/Webtoons.cs
@@ -0,0 +1,309 @@
+using System.Net;
+using System.Text.RegularExpressions;
+using HtmlAgilityPack;
+using Tranga.Jobs;
+
+namespace Tranga.MangaConnectors;
+
+/// <summary>
+/// Connector that searches and downloads series from the English section of webtoons.com.
+/// Publication ids are the "title|category|id" strings produced by <see cref="PublicationManager"/>.
+/// </summary>
+public class Webtoons : MangaConnector
+{
+    private const string BaseUrl = "https://www.webtoons.com";
+
+    // Splits a series url of the form ".../en/{category}/{title}/list?title_no={id}" into its parts.
+    private static readonly Regex SeriesUrlRex =
+        new(@".*webtoons\.com/en/(?<category>[^/]+)/(?<title>[^/]+)/list\?title_no=(?<id>\d+).*");
+
+    // Captures the target of a CSS "url(...)" expression inside a style attribute.
+    private static readonly Regex CssUrlRex = new(@"url\((?<url>.*?)\)");
+
+    public Webtoons(GlobalBase clone) : base(clone, "Webtoons", ["en"])
+    {
+        this.downloadClient = new HttpDownloadClient(clone);
+    }
+
+    // True for 2xx status codes.
+    private static bool IsSuccess(HttpStatusCode statusCode) => (int)statusCode is >= 200 and < 300;
+
+    /// <summary>
+    /// Searches webtoons.com for <paramref name="publicationTitle"/> and parses every series in the result list.
+    /// </summary>
+    /// <param name="publicationTitle">Search term; only its ASCII letters are sent to the site.</param>
+    /// <returns>The parsed publications, or an empty array if the search page could not be retrieved.</returns>
+    public override Manga[] GetManga(string publicationTitle = "")
+    {
+        // "[A-z]" also matches '[', '\', ']', '^', '_' and '`'; the explicit ranges keep letters only.
+        string sanitizedTitle = string.Join(' ',
+            Regex.Matches(publicationTitle, "[A-Za-z]+").Select(m => m.Value)).ToLowerInvariant();
+        Log($"Searching Publications. Term=\"{publicationTitle}\"");
+        string requestUrl = $"{BaseUrl}/en/search?keyword={Uri.EscapeDataString(sanitizedTitle)}&searchType=WEBTOON";
+        RequestResult requestResult = downloadClient.MakeRequest(requestUrl, RequestType.Default);
+        if (!IsSuccess(requestResult.statusCode) || requestResult.htmlDocument is null)
+        {
+            Log($"Failed to retrieve site");
+            return Array.Empty<Manga>();
+        }
+
+        Manga[] publications = ParsePublicationsFromHtml(requestResult.htmlDocument);
+        Log($"Retrieved {publications.Length} publications. Term=\"{publicationTitle}\"");
+        return publications;
+    }
+
+    /// <summary>Resolves a "title|category|id" publication id to its series page and parses it.</summary>
+    /// <returns>The publication, or null if the id is malformed or the page could not be parsed.</returns>
+    public override Manga? GetMangaFromId(string publicationId)
+    {
+        PublicationManager pm = new PublicationManager(publicationId);
+        // PublicationManager leaves all parts empty when the id does not consist of exactly three parts.
+        if (pm.Title.Length == 0 || pm.Category.Length == 0 || pm.Id.Length == 0)
+        {
+            Log($"Invalid publication id \"{publicationId}\"");
+            return null;
+        }
+
+        return GetMangaFromUrl($"{BaseUrl}/en/{pm.Category}/{pm.Title}/list?title_no={pm.Id}");
+    }
+
+    /// <summary>Downloads and parses the series page at <paramref name="url"/>.</summary>
+    /// <returns>The publication, or null if the url has an unexpected form or the page could not be retrieved.</returns>
+    public override Manga? GetMangaFromUrl(string url)
+    {
+        // Check the url first so no request is made for an address no publication id can be built from.
+        Match match = SeriesUrlRex.Match(url);
+        if (!match.Success)
+        {
+            Log($"Failed match Regex ID");
+            return null;
+        }
+
+        RequestResult requestResult = downloadClient.MakeRequest(url, RequestType.MangaInfo);
+        if (!IsSuccess(requestResult.statusCode) || requestResult.htmlDocument is null)
+        {
+            Log($"Failed to retrieve site");
+            return null;
+        }
+
+        PublicationManager pm = new PublicationManager(match.Groups["title"].Value, match.Groups["category"].Value, match.Groups["id"].Value);
+        return ParseSinglePublicationFromHtml(requestResult.htmlDocument, pm.getPublicationId(), url);
+    }
+
+    // Follows every series link of a search-result page and collects the publications that could be parsed.
+    private Manga[] ParsePublicationsFromHtml(HtmlDocument document)
+    {
+        // SelectNodes yields null instead of an empty collection when nothing matches.
+        HtmlNodeCollection? links = document.DocumentNode.SelectNodes("//ul[contains(@class, 'card_lst')]/li/a");
+        if (links is null)
+        {
+            Log($"Failed to parse publication");
+            return Array.Empty<Manga>();
+        }
+
+        HashSet<Manga> ret = new();
+        foreach (string url in links.Select(node => node.GetAttributeValue("href", BaseUrl)))
+        {
+            Manga? manga = GetMangaFromUrl(url);
+            if (manga is not null)
+                ret.Add((Manga)manga);
+        }
+
+        return ret.ToArray();
+    }
+
+    // Upper-cases the first character and lower-cases the rest. Not called from within this class.
+    private string capitalizeString(string str = "")
+    {
+        if (str.Length == 0) return "";
+        if (str.Length == 1) return str.ToUpper();
+        return char.ToUpper(str[0]) + str.Substring(1).ToLower();
+    }
+
+    // Builds a Manga from a series page. Elements missing from the page fall back to placeholder
+    // values instead of throwing, because SelectSingleNode returns null when nothing matches.
+    private Manga ParseSinglePublicationFromHtml(HtmlDocument document, string publicationId, string websiteUrl)
+    {
+        HtmlNode root = document.DocumentNode;
+        HtmlNode? infoNode1 = root.SelectSingleNode("//*[@id='content']/div[2]/div[1]/div[1]");
+        HtmlNode? infoNode2 = root.SelectSingleNode("//*[@id='content']/div[2]/div[2]/div[2]");
+
+        string sortName = infoNode1?.SelectSingleNode(".//h1[contains(@class, 'subj')]")?.InnerText ?? "Undefined";
+        string description = infoNode2?.SelectSingleNode(".//p[contains(@class, 'summary')]")?.InnerText.Trim() ?? "Undefined";
+
+        // The cover address is embedded in the banner's inline style as url(...), possibly quoted.
+        HtmlNode? posterNode = root.SelectSingleNode("//div[contains(@class, 'detail_body') and contains(@class, 'banner')]");
+        Match match = CssUrlRex.Match(posterNode?.GetAttributeValue("style", "") ?? "");
+        string posterUrl = match.Groups["url"].Value.Trim('\'', '"');
+        string coverFileNameInCache = SaveCoverImageToCache(posterUrl, publicationId, RequestType.MangaCover, websiteUrl);
+
+        string? genre = infoNode1?.SelectSingleNode(".//h2[contains(@class, 'genre')]")?.InnerText.Trim();
+        string[] tags = genre is null ? Array.Empty<string>() : new string[] { genre };
+
+        List<string> authors = infoNode1?.SelectSingleNode(".//div[contains(@class, 'author_area')]")
+            ?.Descendants("a").Select(node => node.InnerText.Trim()).ToList() ?? new List<string>();
+
+        string originalLanguage = "";
+
+        // No release year is parsed from the page; the current year is stored instead.
+        int year = DateTime.Now.Year;
+
+        Manga.ReleaseStatusByte releaseStatus = Manga.ReleaseStatusByte.Unreleased;
+        if (infoNode2 is not null)
+        {
+            string status1 = infoNode2.SelectSingleNode(".//p")?.InnerText.Trim() ?? "";
+            // A missing span yields an empty string, which the check below treats as "completed".
+            string status2 = infoNode2.SelectSingleNode(".//p/span")?.InnerText.Trim() ?? "";
+            if (status2.Length == 0 || status1.Equals("completed", StringComparison.OrdinalIgnoreCase))
+                releaseStatus = Manga.ReleaseStatusByte.Completed;
+            else if (status2.Equals("up", StringComparison.OrdinalIgnoreCase))
+                releaseStatus = Manga.ReleaseStatusByte.Continuing;
+        }
+
+        Manga manga = new(sortName, authors, description, new Dictionary<string, string>(), tags, posterUrl, coverFileNameInCache, new Dictionary<string, string>(),
+            year, originalLanguage, publicationId, releaseStatus, websiteUrl: websiteUrl);
+        AddMangaToCache(manga);
+        return manga;
+    }
+
+    /// <summary>Collects the chapters listed on all pages of the series' episode list.</summary>
+    /// <returns>The chapters sorted with <c>Order()</c>, or an empty array if the series page could not be retrieved.</returns>
+    public override Chapter[] GetChapters(Manga manga, string language = "en")
+    {
+        PublicationManager pm = new PublicationManager(manga.publicationId);
+        string requestUrl = $"{BaseUrl}/en/{pm.Category}/{pm.Title}/list?title_no={pm.Id}";
+        // This request verifies that the series page exists and provides the pagination.
+        RequestResult requestResult = downloadClient.MakeRequest(requestUrl, RequestType.Default);
+        if (!IsSuccess(requestResult.statusCode) || requestResult.htmlDocument is null)
+            return Array.Empty<Chapter>();
+
+        // Count element children only: ChildNodes also contains the text nodes between them.
+        // Without a pagination element the list is treated as a single page.
+        HtmlNode? paginate = requestResult.htmlDocument.DocumentNode.SelectSingleNode("//div[contains(@class, 'paginate')]");
+        int pages = Math.Max(1, paginate?.ChildNodes.Count(node => node.NodeType == HtmlNodeType.Element) ?? 1);
+
+        List<Chapter> chapters = new();
+        for (int page = 1; page <= pages; page++)
+            chapters.AddRange(ParseChaptersFromHtml(manga, $"{requestUrl}&page={page}"));
+
+        // Keep one entry per url in case the same episode shows up on more than one requested page.
+        chapters = chapters.DistinctBy(chapter => chapter.url).ToList();
+        Log($"Got {chapters.Count} chapters. {manga}");
+        return chapters.Order().ToArray();
+    }
+
+    // Requests one page of the episode list and parses the chapters it contains.
+    private List<Chapter> ParseChaptersFromHtml(Manga manga, string mangaUrl)
+    {
+        RequestResult result = downloadClient.MakeRequest(mangaUrl, RequestType.Default);
+        if (!IsSuccess(result.statusCode) || result.htmlDocument is null)
+        {
+            Log("Failed to load site");
+            return new List<Chapter>();
+        }
+
+        List<Chapter> ret = new();
+        HtmlNodeCollection? episodes = result.htmlDocument.DocumentNode.SelectNodes("//ul/li[contains(@class, '_episodeItem')]");
+        if (episodes is null)
+            return ret;
+
+        foreach (HtmlNode chapterInfo in episodes)
+        {
+            HtmlNode? infoNode = chapterInfo.SelectSingleNode(".//a");
+            string url = infoNode?.GetAttributeValue("href", "") ?? "";
+            string id = chapterInfo.GetAttributeValue("id", "");
+            string chapterNumber = chapterInfo.GetAttributeValue("data-episode-no", "");
+            // Skip entries that lack a link, an id or an episode number.
+            if (infoNode is null || url.Length == 0 || id.Length == 0 || chapterNumber.Length == 0)
+                continue;
+
+            string? volumeNumber = null;
+            string? chapterName = infoNode.SelectSingleNode(".//span[contains(@class, 'subj')]/span")?.InnerText.Trim();
+            ret.Add(new Chapter(manga, chapterName, volumeNumber, chapterNumber, url));
+        }
+
+        return ret;
+    }
+
+    /// <summary>Downloads all images of <paramref name="chapter"/>.</summary>
+    /// <returns>
+    /// <see cref="HttpStatusCode.RequestTimeout"/> if cancellation was requested, the failing status code if the
+    /// chapter page could not be retrieved, otherwise the result of <c>DownloadChapterImages</c>.
+    /// </returns>
+    public override HttpStatusCode DownloadChapter(Chapter chapter, ProgressToken? progressToken = null)
+    {
+        if (progressToken?.cancellationRequested ?? false)
+        {
+            progressToken.Cancel();
+            return HttpStatusCode.RequestTimeout;
+        }
+
+        Manga chapterParentManga = chapter.parentManga;
+        Log($"Retrieving chapter-info {chapter} {chapterParentManga}");
+        string requestUrl = chapter.url;
+        RequestResult requestResult = downloadClient.MakeRequest(requestUrl, RequestType.Default);
+        if (!IsSuccess(requestResult.statusCode))
+        {
+            progressToken?.Cancel();
+            return requestResult.statusCode;
+        }
+
+        // Parse the document fetched above rather than requesting the same page a second time.
+        string[] imageUrls = ParseImageUrlsFromHtml(requestResult.htmlDocument);
+        return DownloadChapterImages(imageUrls, chapter, RequestType.MangaImage, progressToken:progressToken, referrer: requestUrl);
+    }
+
+    // Reads the "data-url" attribute of every image in the chapter's image list.
+    private string[] ParseImageUrlsFromHtml(HtmlDocument? document)
+    {
+        if (document is null)
+        {
+            Log($"Failed to retrieve site");
+            return Array.Empty<string>();
+        }
+
+        HtmlNodeCollection? images = document.DocumentNode.SelectNodes("//*[@id='_imageList']/img");
+        if (images is null)
+            return Array.Empty<string>();
+
+        return images.Select(node => node.GetAttributeValue("data-url", "")).ToArray();
+    }
+}
+
+// Holds the three parts of a Webtoons publication id and converts them from/to the "title|category|id" form.
+internal class PublicationManager
+{
+    public PublicationManager(string title = "", string category = "", string id = "")
+    {
+        this.Title = title;
+        this.Category = category;
+        this.Id = id;
+    }
+
+    // Parses a "title|category|id" string; any other shape leaves all three parts empty.
+    public PublicationManager(string publicationId)
+    {
+        string[] parts = publicationId.Split("|");
+        if (parts.Length == 3)
+        {
+            this.Title = parts[0];
+            this.Category = parts[1];
+            this.Id = parts[2];
+        }
+        else
+        {
+            this.Title = "";
+            this.Category = "";
+            this.Id = "";
+        }
+    }
+
+    public string getPublicationId()
+    {
+        return $"{this.Title}|{this.Category}|{this.Id}";
+    }
+
+    public string Title { get; set; }
+    public string Category { get; set; }
+    public string Id { get; set; }
+}
\ No newline at end of
file diff --git a/README.md b/README.md index 3a557cd..5a5ea41 100644 --- a/README.md +++ b/README.md @@ -45,14 +45,13 @@ Tranga can download Chapters and Metadata from "Scanlation" sites such as - [MangaDex.org](https://mangadex.org/) (Multilingual) - [Manganato.com](https://manganato.com/) (en) -- [Mangasee.com](https://mangasee123.com/) (en) - [MangaKatana.com](https://mangakatana.com) (en) - [Mangaworld.bz](https://www.mangaworld.bz/) (it) - [Bato.to](https://bato.to/v3x) (en) -- [Manga4Life](https://manga4life.com) (en) - [ManhuaPlus](https://manhuaplus.org/) (en) - [MangaHere](https://www.mangahere.cc/) (en) (Their covers aren't scrapeable.) - [Weebcentral](https://weebcentral.com) (en) +- [Webtoons](https://www.webtoons.com/en/) - ❓ Open an [issue](https://github.com/C9Glax/tranga/issues/new?assignees=&labels=New+Connector&projects=&template=new_connector.yml&title=%5BNew+Connector%5D%3A+) and trigger a library-scan with [Komga](https://komga.org/) and [Kavita](https://www.kavitareader.com/).
diff --git a/Tranga/MangaConnectors/WeebCentral.cs b/Tranga/MangaConnectors/WeebCentral.cs
new file mode 100644
index 0000000..50d63c4
--- /dev/null
+++ b/Tranga/MangaConnectors/WeebCentral.cs
@@ -0,0 +1,276 @@
+using System.Net;
+using System.Text.RegularExpressions;
+using HtmlAgilityPack;
+using Soenneker.Utils.String.NeedlemanWunsch;
+using Tranga.Jobs;
+
+namespace Tranga.MangaConnectors;
+
+/// <summary>
+/// Connector that searches and downloads series from weebcentral.com.
+/// The publication id is the series id taken from the series url.
+/// </summary>
+public class Weebcentral : MangaConnector
+{
+    private readonly string _baseUrl = "https://weebcentral.com";
+
+    private readonly string[] _filterWords =
+        { "a", "the", "of", "as", "to", "no", "for", "on", "with", "be", "and", "in", "wa", "at", "ni" };
+
+    // Captures the series id. Everything after the id is optional, so the slug-less urls built by
+    // GetMangaFromId match as well.
+    private static readonly Regex PublicationIdRex = new(@"https:\/\/weebcentral\.com\/series\/(\w+)(?:\/.*)?");
+
+    // First (possibly dotted) number inside a chapter label.
+    private static readonly Regex ChapterRex = new(@"(\d+(?:\.\d+)*)");
+
+    // Captures the chapter id of a chapter url.
+    private static readonly Regex ChapterIdRex = new(@"https:\/\/weebcentral\.com\/chapters\/(\w*)");
+
+    public Weebcentral(GlobalBase clone) : base(clone, "Weebcentral", ["en"])
+    {
+        downloadClient = new ChromiumDownloadClient(clone);
+    }
+
+    // True for 2xx status codes.
+    private static bool IsSuccess(HttpStatusCode statusCode) => (int)statusCode is >= 200 and < 300;
+
+    /// <summary>
+    /// Searches weebcentral.com for <paramref name="publicationTitle"/> and parses every series of the first result page.
+    /// </summary>
+    /// <returns>The parsed publications, or an empty array if the search could not be retrieved.</returns>
+    public override Manga[] GetManga(string publicationTitle = "")
+    {
+        Log($"Searching Publications. Term=\"{publicationTitle}\"");
+        const int limit = 32; //How many values we want returned at once
+        const int offset = 0; //"Page"
+        // The term is escaped so that characters such as '&' or '#' cannot alter the query string.
+        string requestUrl =
+            $"{_baseUrl}/search/data?limit={limit}&offset={offset}&text={Uri.EscapeDataString(publicationTitle)}&sort=Best+Match&order=Ascending&official=Any&display_mode=Minimal%20Display";
+        RequestResult requestResult = downloadClient.MakeRequest(requestUrl, RequestType.Default);
+        if (!IsSuccess(requestResult.statusCode) || requestResult.htmlDocument is null)
+        {
+            Log($"Failed to retrieve search: {requestResult.statusCode}");
+            return [];
+        }
+
+        Manga[] publications = ParsePublicationsFromHtml(requestResult.htmlDocument);
+        Log($"Retrieved {publications.Length} publications. Term=\"{publicationTitle}\"");
+
+        return publications;
+    }
+
+    // Follows every series link of a search result and collects the publications that could be parsed.
+    private Manga[] ParsePublicationsFromHtml(HtmlDocument document)
+    {
+        // SelectNodes yields null instead of an empty collection when nothing matches.
+        HtmlNodeCollection? links = document.DocumentNode.SelectNodes("/html/body/article/a[@class='link link-hover']");
+        if (links is null)
+            return Array.Empty<Manga>();
+
+        HashSet<Manga> ret = new();
+        foreach (string url in links.Select(elem => elem.GetAttributeValue("href", "")))
+        {
+            Manga? manga = GetMangaFromUrl(url);
+            if (manga is not null)
+                ret.Add((Manga)manga);
+        }
+
+        return ret.ToArray();
+    }
+
+    /// <summary>Downloads and parses the series page at <paramref name="url"/>.</summary>
+    /// <returns>The publication, or null if no series id can be read from the url or the page could not be retrieved.</returns>
+    public override Manga? GetMangaFromUrl(string url)
+    {
+        Match idMatch = PublicationIdRex.Match(url);
+        if (!idMatch.Success)
+        {
+            Log($"Failed to read publication id from url \"{url}\"");
+            return null;
+        }
+
+        RequestResult requestResult = downloadClient.MakeRequest(url, RequestType.MangaInfo);
+        if (!IsSuccess(requestResult.statusCode) || requestResult.htmlDocument is null)
+            return null;
+
+        return ParseSinglePublicationFromHtml(requestResult.htmlDocument, idMatch.Groups[1].Value, url);
+    }
+
+    // Builds a Manga from a series page; elements missing from the page fall back to placeholder values.
+    private Manga ParseSinglePublicationFromHtml(HtmlDocument document, string publicationId, string websiteUrl)
+    {
+        HtmlNode root = document.DocumentNode;
+
+        HtmlNode? posterNode = root.SelectSingleNode("//section[@class='flex items-center justify-center']/picture/img");
+        string posterUrl = posterNode?.GetAttributeValue("src", "") ?? "";
+        string coverFileNameInCache = SaveCoverImageToCache(posterUrl, publicationId, RequestType.MangaCover);
+
+        string sortName = root.SelectSingleNode("//section/h1")?.InnerText ?? "Undefined";
+
+        List<string> authors = root.SelectNodes("//ul/li[strong/text() = 'Author(s): ']/span")
+            ?.Select(n => n.InnerText).ToList() ?? new List<string>();
+
+        string[] tags = root.SelectNodes("//ul/li[strong/text() = 'Tags(s): ']/span")
+            ?.Select(n => n.InnerText).Distinct().ToArray() ?? Array.Empty<string>();
+
+        HtmlNode? statusNode = root.SelectSingleNode("//ul/li[strong/text() = 'Status: ']/a");
+        // Only report a parsing problem when the status element is actually missing.
+        if (statusNode is null)
+            Log("unable to parse status");
+        Manga.ReleaseStatusByte releaseStatus = statusNode?.InnerText.Trim().ToLowerInvariant() switch
+        {
+            "cancelled" => Manga.ReleaseStatusByte.Cancelled,
+            "hiatus" => Manga.ReleaseStatusByte.OnHiatus,
+            "complete" => Manga.ReleaseStatusByte.Completed,
+            "ongoing" => Manga.ReleaseStatusByte.Continuing,
+            _ => Manga.ReleaseStatusByte.Unreleased
+        };
+
+        // TryParse leaves the year at 0 for missing or non-numeric text instead of throwing.
+        HtmlNode? yearNode = root.SelectSingleNode("//ul/li[strong/text() = 'Released: ']/span");
+        int.TryParse(yearNode?.InnerText, out int year);
+
+        string description = root.SelectSingleNode("//ul/li[strong/text() = 'Description']/p")?.InnerText ?? "Undefined";
+
+        HtmlNode[] altTitleNodes = root.SelectNodes("//ul/li[strong/text() = 'Associated Name(s)']/ul/li")?.ToArray() ?? [];
+        Dictionary<string, string> altTitles = new(), links = new();
+        for (int i = 0; i < altTitleNodes.Length; i++)
+            altTitles.Add(i.ToString(), altTitleNodes[i].InnerText);
+
+        string originalLanguage = "";
+
+        Manga manga = new(sortName, authors, description, altTitles, tags, posterUrl,
+            coverFileNameInCache, links,
+            year, originalLanguage, publicationId, releaseStatus, websiteUrl);
+        AddMangaToCache(manga);
+        return manga;
+    }
+
+    /// <summary>Resolves a series id to its series page and parses it.</summary>
+    public override Manga? GetMangaFromId(string publicationId)
+    {
+        return GetMangaFromUrl($"{_baseUrl}/series/{publicationId}");
+    }
+
+    // Lower-cases the input and drops the words listed in _filterWords.
+    private string ToFilteredString(string input)
+    {
+        return string.Join(' ', input.ToLower().Split(' ').Where(word => _filterWords.Contains(word) == false));
+    }
+
+    // Scores each search result against the title and returns up to ten of them. Not called from within this class.
+    // NOTE(review): OrderBy sorts ascending, so this keeps the ten LOWEST scores - confirm that a lower
+    // CalculateSimilarity value means a closer match, otherwise this should be OrderByDescending.
+    private SearchResult[] FilteredResults(string publicationTitle, SearchResult[] unfilteredSearchResults)
+    {
+        // The filtered title does not depend on the result, so it is computed once.
+        string filteredPublicationString = ToFilteredString(publicationTitle);
+        Dictionary<SearchResult, int> similarity = new();
+        foreach (SearchResult sr in unfilteredSearchResults)
+        {
+            List<int> scores = new();
+            string filteredSString = ToFilteredString(sr.s);
+            scores.Add(NeedlemanWunschStringUtil.CalculateSimilarity(filteredSString, filteredPublicationString));
+            foreach (string srA in sr.a)
+            {
+                string filteredAString = ToFilteredString(srA);
+                scores.Add(NeedlemanWunschStringUtil.CalculateSimilarity(filteredAString, filteredPublicationString));
+            }
+
+            similarity.Add(sr, scores.Sum() / scores.Count);
+        }
+
+        return similarity.OrderBy(s => s.Value).Take(10).Select(s => s.Key).ToArray();
+    }
+
+    /// <summary>Retrieves the full chapter list of <paramref name="manga"/>.</summary>
+    /// <returns>The chapters sorted with <c>Order()</c>, or an empty array if the list could not be retrieved.</returns>
+    public override Chapter[] GetChapters(Manga manga, string language = "en")
+    {
+        Log($"Getting chapters {manga}");
+        string requestUrl = $"{_baseUrl}/series/{manga.publicationId}/full-chapter-list";
+        RequestResult requestResult = downloadClient.MakeRequest(requestUrl, RequestType.Default);
+        if (!IsSuccess(requestResult.statusCode) || requestResult.htmlDocument is null)
+            return Array.Empty<Chapter>();
+
+        List<Chapter> chapters = ParseChaptersFromHtml(manga, requestResult.htmlDocument);
+        Log($"Got {chapters.Count} chapters. {manga}");
+        //Return Chapters ordered by Chapter-Number
+        return chapters.Order().ToArray();
+    }
+
+    // Turns every absolute link of the chapter-list document that carries a chapter number into a Chapter.
+    private List<Chapter> ParseChaptersFromHtml(Manga manga, HtmlDocument document)
+    {
+        List<Chapter> ret = new();
+        HtmlNode? chaptersWrapper = document.DocumentNode.SelectSingleNode("/html/body");
+        if (chaptersWrapper is null)
+            return ret;
+
+        foreach (HtmlNode elem in chaptersWrapper.Descendants("a"))
+        {
+            string url = elem.GetAttributeValue("href", "");
+            if (!url.StartsWith("https://", StringComparison.Ordinal) && !url.StartsWith("http://", StringComparison.Ordinal))
+                continue;
+
+            string chapterLabel = elem.SelectSingleNode("span[@class='grow flex items-center gap-2']/span")?.InnerText ?? "Undefined";
+            Match chapterNumberMatch = ChapterRex.Match(chapterLabel);
+            // Links whose label contains no number are skipped.
+            if (!chapterNumberMatch.Success)
+                continue;
+
+            Match idMatch = ChapterIdRex.Match(url);
+            string? id = idMatch.Success ? idMatch.Groups[1].Value : null;
+            ret.Add(new Chapter(manga, null, null, chapterNumberMatch.Groups[1].Value, url, id));
+        }
+
+        ret.Reverse();
+        return ret;
+    }
+
+    /// <summary>Downloads all images of <paramref name="chapter"/>.</summary>
+    /// <returns>
+    /// <see cref="HttpStatusCode.RequestTimeout"/> if cancellation was requested or no document was returned, the
+    /// failing status code if the chapter page could not be retrieved, otherwise the result of <c>DownloadChapterImages</c>.
+    /// </returns>
+    public override HttpStatusCode DownloadChapter(Chapter chapter, ProgressToken? progressToken = null)
+    {
+        if (progressToken?.cancellationRequested ?? false)
+        {
+            progressToken.Cancel();
+            return HttpStatusCode.RequestTimeout;
+        }
+
+        Manga chapterParentManga = chapter.parentManga;
+        Log($"Retrieving chapter-info {chapter} {chapterParentManga}");
+
+        RequestResult requestResult = downloadClient.MakeRequest(chapter.url, RequestType.Default);
+        if (!IsSuccess(requestResult.statusCode))
+        {
+            progressToken?.Cancel();
+            return requestResult.statusCode;
+        }
+
+        if (requestResult.htmlDocument is null)
+        {
+            progressToken?.Cancel();
+            return HttpStatusCode.RequestTimeout;
+        }
+
+        HtmlDocument document = requestResult.htmlDocument;
+
+        HtmlNode[] imageNodes =
+            document.DocumentNode.SelectNodes($"//section[@hx-get='{chapter.url}/images']/img")?.ToArray() ?? [];
+        string[] urls = imageNodes.Select(imgNode => imgNode.GetAttributeValue("src", "")).ToArray();
+
+        return DownloadChapterImages(urls, chapter, RequestType.MangaImage, progressToken: progressToken);
+    }
+
+    private struct SearchResult
+    {
+        public string i { get; set; }
+        public string s { get; set; }
+        public string[] a { get; set; }
+    }
+}
\ No newline at end of file