From fb0755eb89b4471c7a5b9897546d14aef1e5df9c Mon Sep 17 00:00:00 2001 From: Glax Date: Sat, 27 Apr 2024 19:09:12 +0200 Subject: [PATCH] Use NeedlemanWunsch for string comparison on Mangasee.cs Resolves #132 #167 --- README.md | 1 + Tranga/MangaConnectors/Mangasee.cs | 54 +++++++----------------------- Tranga/Tranga.csproj | 1 + 3 files changed, 14 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 05d6aee..6137396 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,7 @@ That is why I wanted to create my own project, in a language I understand, and t - Newtonsoft.JSON - [PuppeteerSharp](https://www.puppeteersharp.com/) - [Html Agility Pack (HAP)](https://html-agility-pack.net/) +- [Soenneker.Utils.String.NeedlemanWunsch](https://github.com/soenneker/soenneker.utils.string.needlemanwunsch) - 💙 Blåhaj 🦈

(back to top)

diff --git a/Tranga/MangaConnectors/Mangasee.cs b/Tranga/MangaConnectors/Mangasee.cs index e1ce0c2..067f840 100644 --- a/Tranga/MangaConnectors/Mangasee.cs +++ b/Tranga/MangaConnectors/Mangasee.cs @@ -4,6 +4,7 @@ using System.Text.RegularExpressions; using System.Xml.Linq; using HtmlAgilityPack; using Newtonsoft.Json; +using Soenneker.Utils.String.NeedlemanWunsch; using Tranga.Jobs; namespace Tranga.MangaConnectors; @@ -41,14 +42,6 @@ public class Mangasee : MangaConnector SearchResult[] filteredResults = FilteredResults(publicationTitle, searchResults); Log($"Total available manga: {searchResults.Length} Filtered down to: {filteredResults.Length}"); - /* - Dictionary levenshteinRelation = filteredResults.ToDictionary(result => result, - result => - { - Log($"Levenshtein {result.s}"); - return LevenshteinDistance(publicationTitle.Replace(" ", "").ToLower(), result.s.Replace(" ", "").ToLower()); - }); - Log($"After levenshtein: {levenshteinRelation.Count}");*/ string[] urls = filteredResults.Select(result => $"https://mangasee123.com/manga/{result.i}").ToArray(); List searchResultManga = new(); @@ -70,42 +63,19 @@ public class Mangasee : MangaConnector private SearchResult[] FilteredResults(string publicationTitle, SearchResult[] unfilteredSearchResults) { - string[] bannedStrings = {"a", "the", "of", "as", "to", "no", "for", "on", "with", "be", "and", "in", "wa", "at"}; - string[] cleanSplitPublicationTitle = publicationTitle.Split(' ') - .Where(part => part.Length > 0 && !bannedStrings.Contains(part.ToLower())).ToArray(); - - return unfilteredSearchResults.Where(usr => + Dictionary similarity = new(); + foreach (SearchResult sr in unfilteredSearchResults) { - string cleanSearchResultString = string.Join(' ', usr.s.Split(' ').Where(part => part.Length > 0 && !bannedStrings.Contains(part.ToLower()))); - foreach(string splitPublicationTitlePart in cleanSplitPublicationTitle) - if (cleanSearchResultString.Contains(splitPublicationTitlePart, StringComparison.InvariantCultureIgnoreCase) || - cleanSearchResultString.Contains(splitPublicationTitlePart, StringComparison.InvariantCultureIgnoreCase)) - return true; - return false; - }).ToArray(); - } - - private int LevenshteinDistance(string a, string b) - { - if (b.Length == 0) - return a.Length; - if (a.Length == 0) - return b.Length; - if (a[0] == b[0]) - return LevenshteinDistance(a[1..], b[1..]); - - int case1 = LevenshteinDistance(a, b[1..]); - int case2 = LevenshteinDistance(a[1..], b[1..]); - int case3 = LevenshteinDistance(a[1..], b); - - if (case1 < case2) - { - return 1 + (case1 < case3 ? case1 : case3); - } - else - { - return 1 + (case2 < case3 ? case2 : case3); + List scores = new(); + foreach (string se in sr.a) + scores.Add(NeedlemanWunschStringUtil.CalculateSimilarity(se.ToLower(), publicationTitle.ToLower())); + scores.Add(NeedlemanWunschStringUtil.CalculateSimilarity(sr.s.ToLower(), publicationTitle.ToLower())); + similarity.Add(sr, scores.Sum() / scores.Count); } + + SearchResult[] similarity90 = similarity.Where(s => s.Value < 10).Select(s => s.Key).ToArray(); + + return similarity90; } public override Manga? GetMangaFromId(string publicationId) diff --git a/Tranga/Tranga.csproj b/Tranga/Tranga.csproj index 2e4242b..47b765f 100644 --- a/Tranga/Tranga.csproj +++ b/Tranga/Tranga.csproj @@ -11,6 +11,7 @@ +