From 28c868b46ca8154f00e54edc559d3d15980c61fb Mon Sep 17 00:00:00 2001 From: tjarls Date: Sun, 18 Sep 2022 19:26:17 +0100 Subject: [PATCH] Parser optimization part1 (#1531) * Optimize CleanTitle * Optimize MangaEditionRegex * Optimize special regexes * Refactor manga|comic special parsing into simple tests * Word bind the special regexps. Support additional "special" use cases. * Updates to address PR comments * CleanTitle benchmarking * Use a smaller Comics Data set for benchmarking --- API.Benchmark/API.Benchmark.csproj | 4 +- API.Benchmark/CleanTitleBenchmark.cs | 26 ++ API.Benchmark/Data/Comics.txt | 112 ++++++++ API.Benchmark/Program.cs | 9 +- API.Tests/Parser/ComicParserTests.cs | 8 +- API.Tests/Parser/MangaParserTests.cs | 15 +- API.Tests/Parser/ParserTest.cs | 52 ++++ .../Tasks/Scanner/Parser/DefaultParser.cs | 6 +- API/Services/Tasks/Scanner/Parser/Parser.cs | 240 +++++------------- 9 files changed, 269 insertions(+), 203 deletions(-) create mode 100644 API.Benchmark/CleanTitleBenchmark.cs create mode 100644 API.Benchmark/Data/Comics.txt diff --git a/API.Benchmark/API.Benchmark.csproj b/API.Benchmark/API.Benchmark.csproj index b6f60b873..5461138d5 100644 --- a/API.Benchmark/API.Benchmark.csproj +++ b/API.Benchmark/API.Benchmark.csproj @@ -16,9 +16,9 @@ - + Always - + diff --git a/API.Benchmark/CleanTitleBenchmark.cs b/API.Benchmark/CleanTitleBenchmark.cs new file mode 100644 index 000000000..a32b4beb2 --- /dev/null +++ b/API.Benchmark/CleanTitleBenchmark.cs @@ -0,0 +1,26 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Text.RegularExpressions; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Order; + +namespace API.Benchmark; + +[MemoryDiagnoser] +public class CleanTitleBenchmarks +{ + private static IList _names; + + [GlobalSetup] + public void LoadData() => _names = File.ReadAllLines("Data/Comics.txt"); + + [Benchmark] + public void TestCleanTitle() + { + foreach (var name in _names) + { + Services.Tasks.Scanner.Parser.Parser.CleanTitle(name, true); + } + } +} diff --git a/API.Benchmark/Data/Comics.txt b/API.Benchmark/Data/Comics.txt new file mode 100644 index 000000000..05eb2d52f --- /dev/null +++ b/API.Benchmark/Data/Comics.txt @@ -0,0 +1,112 @@ +One-Star Squadron 02 (of 06) (2022) (digital) (Son of Ultron-Empire).cbz +Batman & the Monster Men 06 (2006) (Kryptonia-DCP).cbr +Hauteville House -07- Expedition Vanikoro.cbr +Fantastic Four v3 #020.cbz +Thunderbolts 053.cbr +Moon Knight 010 2007 Red Lion-DCP .cbr +New X-Men 037.cbr +X-Men - Deadly Genesis 02 (2006) (BigBlue-DCP).cbr +Incredible Hercules 128.cbr +JLA - Year One 03 of 12.cbr +Daredevil v2 082 (2006) (Reiu-DCP).cbr +069 - Iron Man v4 035 (2009) (Minutemen-ZonesDiva).cbr +2000AD prog 2285 (2022) (digital) (Minutemen-juvecube).cbz +Tanguy et Laverdure - Intégrale - T07.cbz +Excalibur 026 (2022) (Digital) (Zone-Empire).cbz +DC vs. Vampires - Killers 001 (2022) (Webrip) (The Last Kryptonian-DCP).cbz +By the Horns 003 (2021) (Digital) (Mephisto-Empire).cbz +Incredible Hulks 630 (2011) (Minutemen-Fiji).cbz +Red Robin 010 (2010) (Minutemen-OTT).cbr +Les Droits de lHomme - OneShot - Collectif.cbz +Tout Gaston - Intégrale.cbr +Good Night, Hem (2021) (Digital) (Dipole-Empire).cbz +Bunny Mask - The Hollow Inside 001 (2022) (Digital) (Mephisto-Empire).cbz +Les MYTHICS - T14 - Avarice.cbr +Fantastic Four Special 01 (2006) (Nascent-DCP).cbr +Sonjaversal 006 (2021) (5 covers) (digital) (The Seeker-Empire).cbz +The Flash 779 (2022) (Digital) (Zone-Empire).cbz +Supergirl and the Legion of Super-Heroes 020 (2006) (CamelotScans-DCP).cbr +Time Before Time 015 (2022) (Digital) (Zone-Empire).cbz +Union Jack 02 (2006) (Red Lion-DCP).cbr +Le Corps est un Vêtement que l'on quitte.pdf +Helmet of Fate - Black Alice 01 (2007) (Racerx-DCP).cbz +Villains United 003 [2005] (Team-DCP).cbr +Punisher 002.cbr +Grendel - Devil's Odyssey 008 (2021) (digital) (NeverAngel-Empire).cbz +Uncanny X-Force 05.1 (2011) (Minutemen-Megatonic).cbz +Orcs & Gobelins - T14 - Shaaka.cbr +Les grands personnages de l'histoire en bandes dessinées - T67 - Suffren - La Bataille de Gondelou.cbz +Batman Adventures 013 (Jorl - Dcp).cbr +Norse Mythology II 003 (2021) (digital) (Son of Ultron-Empire).cbz +Ghost Rider 012 (2007) (Team-DCP).cbr +Once & Future 021 (2021) (digital) (Son of Ultron-Empire).cbz +The Seven Deadly Sins #1_ Seven Deadly Her - Nakaba Suzuki.epub +Kimagure Orange Road Omnibus #5_ Vol. 5 - Izumi Matsumoto.cbz +Booster Gold 36 2010 Minutemen-Oracle Saxon .cbr +New X-Men 023 (2006) (Reiu-DCP).cbr +World of Betty and Veronica Comics Digest 016 (2022) (Forsythe-DCP).cbz +Deadpool Team-Up 889 (2010) (noads) (LegionNever-CPS).cbr +Les bêtes de black city - T03 - le feu de la vengeance.cbr +The Brother of All Men 002 (2022) (digital) (Son of Ultron-Empire).cbz +DC Fifty-Two (52) Week One (2006) (Kryptonia-DCP).cbr +Heroes For Hire v2 09 (2007) (DarthScanner-DCP).cbr +Doom Patrol v4 012 [2005] (Bchry-DCP).cbr +Black Panther's Prey #1(Aieiebrazoff-DCP)-Repack.cbz +Hello Neighbor 02 - The Raven Brooks Disaster (2021) (Digital Rip) (Hourman-DCP).cbz +Grimm Spotlight - Cinderella vs. Zombies (2021) (digital) (The Seeker-Empire).cbz +Black's Myth 001 (2021) (digital) (Son of Ultron-Empire).cbz +Donjon Antipodes T02 +10001 Le Coffre aux Âmes.pdf +Ghost Rider 016 (2007) (Noads) (Team-DCP).cbr +JLA Classified 38 (2007) (Wolfrider-DCP).cbr +Olive 003 - On the Trail of the Nerpa (2021) (digital) (Mr Norrell-Empire).cbz +Avengers v3 #054.cbz +Doctor Strange - The Oath 01 (2006) (Kryptonia-DCP).cbr +Red Robin 006 2010 Minutemen-DTermined.cbr +056 - She-Hulk v2 032 (2008) (2 covers) (Minutemen-ReZone).cbr +DC Fifty-Two (52) Week 030 (2007) (Kryptonia-DCP).cbr +Detective Comics 1055 (2022) (Webrip) (The Last Kryptonian-DCP).cbz +Spider-Man vs. Vampires 01 2010 Minutemen-DTs .cbz +Grim 003 (2022) (digital) (Son of Ultron-Empire).cbz +Wastelanders - Star-Lord 001 (2022) (Digital) (Zone-Empire).cbz +Superman [2003-38] Adventures of Superman 621.cbr +Elektra - Black, White & Blood 001 (2022) (Digital) (Zone-Empire).cbz +Félix #15 - Heroic Album -1950- Le Tueur Fantome.cbz +Ms. Marvel v2 09 (2006) (Team-DCP).cbr +Stray Dogs - Dog Days 002 (2022) (digital) (Son of Ultron-Empire).cbz +My Date With Monsters 002 (2021) (Digital) (Mephisto-Empire).cbz +Friendly Neighborhood Spider-Man 02 (2006) (Variant Cvr) (Wildcarde1-DCP).cbr +Acriboréa -T03- Des millions de soleils.cbr +X-Men: Phoenix - Endsong 05 (of 5) [2005] (Team-DCP).cbr +Usagi Yojimbo - Lone Goat and Kid 006 (2022) (digital) (Son of Ultron-Empire).cbz +Robyn Hood Annual - The Swarm (2021) (digital) (The Seeker-Empire).cbz +Azrael #025.cbr +Nita Hawes' Nightmare Blog 002 (2021) (Digital) (Zone-Empire).cbz +Dark Avengers-Uncanny X-Men - Utopia 001.cbr +Naughty List 004 (2022) (digital) (Son of Ultron-Empire).cbz +Atalante - La Légende-04-L'Envol Des Boréades.cbz +Warlord of Mars 02 (6 covers).cbr +Action Comics 857 (2007) (CamelotScans-DCP).cbr +War For Earth - 3 002 (2022) (Webrip) (The Last Kryptonian-DCP).cbz +Oracle - T04 - Le Malformé.cbz +Battle Angel Alita #9_ Vol. 9 - Yukito Kishiro.epub +Les aventuriers de l'intermonde - T01 - Mission Athènes.cbz +Captain_America_and_The_Secret_Avengers_(2011)_(Minutemen-DTermined).cbr +She-Hulk 002 (2022) (Digital) (Zone-Empire).cbz +infinity inc 01 (2007) (racerx-dcp).cbz +Wonder Girl 004 (2021) (digital) (Son of Ultron-Empire).cbz +SEULS - T07 - Les Terres Basses.cbr +Out of Body 003 (2021) (digital) (Son of Ultron-Empire).cbz +Power Girl 09.cbr +Thor 614 (2 covers) (2010) (noads) (Archangel & FP-CPS).cbr +Iron Man 011 (2021) (Digital) (Zone-Empire).cbz +Ms. Marvel - Beyond the Limit 002 (2022) (Digital) (Zone-Empire).cbz +Ultimate X-Men #038.cbr +Excalibur 022 (2021) (Digital) (Zone-Empire).cbz +New Avengers 025 (2006) (Fixed) (Team-DCP).cbr +T06.2 - Topkapi.pdf +Thor Corps 2 of 4.cbr +Shang-Chi - Brothers & Sisters Infinity Comic 003 (2021) (Digital-Mobile) (Infinity-Empire) (WebP).cbz +X-Men To Serve And Protect 01 of 04 2010 .cbr +08A - Blue Beetle 020.cbz +The Joker Presents - A Puzzlebox Director's Cut 013 (2021) (digital) (Son of Ultron-Empire).cbz +Alice Matheson - T01 - Jour Z.cbz diff --git a/API.Benchmark/Program.cs b/API.Benchmark/Program.cs index d43b84240..76ed97c70 100644 --- a/API.Benchmark/Program.cs +++ b/API.Benchmark/Program.cs @@ -10,12 +10,5 @@ namespace API.Benchmark; /// public static class Program { - private static void Main(string[] args) - { - //BenchmarkRunner.Run(); - //BenchmarkRunner.Run(); - //BenchmarkRunner.Run(); - BenchmarkRunner.Run(); - - } + private static void Main(string[] args) => BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); } diff --git a/API.Tests/Parser/ComicParserTests.cs b/API.Tests/Parser/ComicParserTests.cs index 9a2c576c9..2787f0ed0 100644 --- a/API.Tests/Parser/ComicParserTests.cs +++ b/API.Tests/Parser/ComicParserTests.cs @@ -197,8 +197,12 @@ public class ComicParserTests [InlineData("Adventure Time 2013 Annual #001 (2013)", true)] [InlineData("Adventure Time 2013_Annual_#001 (2013)", true)] [InlineData("Adventure Time 2013_-_Annual #001 (2013)", true)] - public void ParseComicSpecialTest(string input, bool expected) + [InlineData("G.I. Joe - A Real American Hero Yearbook 004 Reprint (2021)", false)] + [InlineData("Mazebook 001", false)] + [InlineData("X-23 One Shot (2010)", true)] + [InlineData("Casus Belli v1 Hors-Série 21 - Mousquetaires et Sorcellerie", true)] + public void IsComicSpecialTest(string input, bool expected) { - Assert.Equal(expected, !string.IsNullOrEmpty(API.Services.Tasks.Scanner.Parser.Parser.ParseComicSpecial(input))); + Assert.Equal(expected, API.Services.Tasks.Scanner.Parser.Parser.IsComicSpecial(input)); } } diff --git a/API.Tests/Parser/MangaParserTests.cs b/API.Tests/Parser/MangaParserTests.cs index c482bcabd..010e5ea3f 100644 --- a/API.Tests/Parser/MangaParserTests.cs +++ b/API.Tests/Parser/MangaParserTests.cs @@ -284,6 +284,7 @@ public class MangaParserTests [InlineData("Wotakoi - Love is Hard for Otaku Omnibus v01 (2018) (Digital) (danke-Empire)", "Omnibus")] [InlineData("To Love Ru v01 Uncensored (Ch.001-007)", "Uncensored")] [InlineData("Chobits Omnibus Edition v01 [Dark Horse]", "Omnibus Edition")] + [InlineData("Chobits_Omnibus_Edition_v01_[Dark_Horse]", "Omnibus Edition")] [InlineData("[dmntsf.net] One Piece - Digital Colored Comics Vol. 20 Ch. 177 - 30 Million vs 81 Million.cbz", "")] [InlineData("AKIRA - c003 (v01) [Full Color] [Darkhorse].cbz", "")] [InlineData("Love Hina Omnibus v05 (2015) (Digital-HD) (Asgard-Empire).cbz", "Omnibus")] @@ -306,9 +307,11 @@ public class MangaParserTests [InlineData("Beastars SP01", false)] [InlineData("The League of Extraordinary Gentlemen", false)] [InlineData("The League of Extra-ordinary Gentlemen", false)] - public void ParseMangaSpecialTest(string input, bool expected) + [InlineData("Gifting The Wonderful World With Blessings! - 3 Side Stories [yuNS][Unknown].epub", true)] + [InlineData("Dr. Ramune - Mysterious Disease Specialist v01 (2020) (Digital) (danke-Empire).cbz", false)] + public void IsMangaSpecialTest(string input, bool expected) { - Assert.Equal(expected, !string.IsNullOrEmpty(API.Services.Tasks.Scanner.Parser.Parser.ParseMangaSpecial(input))); + Assert.Equal(expected, API.Services.Tasks.Scanner.Parser.Parser.IsMangaSpecial(input)); } [Theory] @@ -320,13 +323,5 @@ public class MangaParserTests Assert.Equal(expected, API.Services.Tasks.Scanner.Parser.Parser.ParseFormat(inputFile)); } - [Theory] - [InlineData("Gifting The Wonderful World With Blessings! - 3 Side Stories [yuNS][Unknown].epub", "Side Stories")] - public void ParseSpecialTest(string inputFile, string expected) - { - Assert.Equal(expected, API.Services.Tasks.Scanner.Parser.Parser.ParseMangaSpecial(inputFile)); - } - - } diff --git a/API.Tests/Parser/ParserTest.cs b/API.Tests/Parser/ParserTest.cs index fb88f2d3e..cbb25f57b 100644 --- a/API.Tests/Parser/ParserTest.cs +++ b/API.Tests/Parser/ParserTest.cs @@ -64,6 +64,10 @@ public class ParserTests [InlineData("[Suihei Kiki]_Kasumi_Otoko_no_Ko_[Taruby]_v1.1", false, "Kasumi Otoko no Ko v1.1")] [InlineData("Batman - Detective Comics - Rebirth Deluxe Edition Book 04 (2019) (digital) (Son of Ultron-Empire)", true, "Batman - Detective Comics - Rebirth Deluxe Edition")] [InlineData("Something - Full Color Edition", false, "Something - Full Color Edition")] + [InlineData("Witchblade 089 (2005) (Bittertek-DCP) (Top Cow (Image Comics))", true, "Witchblade 089")] + [InlineData("(C99) Kami-sama Hiroimashita. (SSSS.GRIDMAN)", false, "Kami-sama Hiroimashita.")] + [InlineData("Dr. Ramune - Mysterious Disease Specialist v01 (2020) (Digital) (danke-Empire)", false, "Dr. Ramune - Mysterious Disease Specialist v01")] + [InlineData("Magic Knight Rayearth {Omnibus Edition}", false, "Magic Knight Rayearth {}")] public void CleanTitleTest(string input, bool isComic, string expected) { Assert.Equal(expected, CleanTitle(input, isComic)); @@ -236,4 +240,52 @@ public class ParserTests { Assert.Equal(expected, NormalizePath(inputPath)); } + + [Theory] + [InlineData("The quick brown fox jumps over the lazy dog")] + [InlineData("(The quick brown fox jumps over the lazy dog)")] + [InlineData("()The quick brown fox jumps over the lazy dog")] + [InlineData("The ()quick brown fox jumps over the lazy dog")] + [InlineData("The (quick (brown)) fox jumps over the lazy dog")] + [InlineData("The (quick (brown) fox jumps over the lazy dog)")] + public void BalancedParenTestMatches(string input) + { + Assert.Matches($@"^{BalancedParen}$", input); + } + + [Theory] + [InlineData("(The quick brown fox jumps over the lazy dog")] + [InlineData("The quick brown fox jumps over the lazy dog)")] + [InlineData("The )(quick brown fox jumps over the lazy dog")] + [InlineData("The quick (brown)) fox jumps over the lazy dog")] + [InlineData("The quick (brown) fox jumps over the lazy dog)")] + [InlineData("(The ))(quick (brown) fox jumps over the lazy dog")] + public void BalancedParenTestDoesNotMatch(string input) + { + Assert.DoesNotMatch($@"^{BalancedParen}$", input); + } + + [Theory] + [InlineData("The quick brown fox jumps over the lazy dog")] + [InlineData("[The quick brown fox jumps over the lazy dog]")] + [InlineData("[]The quick brown fox jumps over the lazy dog")] + [InlineData("The []quick brown fox jumps over the lazy dog")] + [InlineData("The [quick [brown]] fox jumps over the lazy dog")] + [InlineData("The [quick [brown] fox jumps over the lazy dog]")] + public void BalancedBrackTestMatches(string input) + { + Assert.Matches($@"^{BalancedBrack}$", input); + } + + [Theory] + [InlineData("[The quick brown fox jumps over the lazy dog")] + [InlineData("The quick brown fox jumps over the lazy dog]")] + [InlineData("The ][quick brown fox jumps over the lazy dog")] + [InlineData("The quick [brown]] fox jumps over the lazy dog")] + [InlineData("The quick [brown] fox jumps over the lazy dog]")] + [InlineData("[The ]][quick [brown] fox jumps over the lazy dog")] + public void BalancedBrackTestDoesNotMatch(string input) + { + Assert.DoesNotMatch($@"^{BalancedBrack}$", input); + } } diff --git a/API/Services/Tasks/Scanner/Parser/DefaultParser.cs b/API/Services/Tasks/Scanner/Parser/DefaultParser.cs index 60317e97d..a92256941 100644 --- a/API/Services/Tasks/Scanner/Parser/DefaultParser.cs +++ b/API/Services/Tasks/Scanner/Parser/DefaultParser.cs @@ -85,10 +85,10 @@ public class DefaultParser : IDefaultParser ret.Edition = edition; } - var isSpecial = type == LibraryType.Comic ? Services.Tasks.Scanner.Parser.Parser.ParseComicSpecial(fileName) : Services.Tasks.Scanner.Parser.Parser.ParseMangaSpecial(fileName); + var isSpecial = type == LibraryType.Comic ? Services.Tasks.Scanner.Parser.Parser.IsComicSpecial(fileName) : Services.Tasks.Scanner.Parser.Parser.IsMangaSpecial(fileName); // We must ensure that we can only parse a special out. As some files will have v20 c171-180+Omake and that // could cause a problem as Omake is a special term, but there is valid volume/chapter information. - if (ret.Chapters == Services.Tasks.Scanner.Parser.Parser.DefaultChapter && ret.Volumes == Services.Tasks.Scanner.Parser.Parser.DefaultVolume && !string.IsNullOrEmpty(isSpecial)) + if (ret.Chapters == Services.Tasks.Scanner.Parser.Parser.DefaultChapter && ret.Volumes == Services.Tasks.Scanner.Parser.Parser.DefaultVolume && isSpecial) { ret.IsSpecial = true; ParseFromFallbackFolders(filePath, rootPath, type, ref ret); // NOTE: This can cause some complications, we should try to be a bit less aggressive to fallback to folder @@ -131,7 +131,7 @@ public class DefaultParser : IDefaultParser for (var i = 0; i < fallbackFolders.Count; i++) { var folder = fallbackFolders[i]; - if (!string.IsNullOrEmpty(Services.Tasks.Scanner.Parser.Parser.ParseMangaSpecial(folder))) continue; + if (Services.Tasks.Scanner.Parser.Parser.IsMangaSpecial(folder)) continue; var parsedVolume = type is LibraryType.Manga ? Services.Tasks.Scanner.Parser.Parser.ParseVolume(folder) : Services.Tasks.Scanner.Parser.Parser.ParseComicVolume(folder); var parsedChapter = type is LibraryType.Manga ? Services.Tasks.Scanner.Parser.Parser.ParseChapter(folder) : Services.Tasks.Scanner.Parser.Parser.ParseComicChapter(folder); diff --git a/API/Services/Tasks/Scanner/Parser/Parser.cs b/API/Services/Tasks/Scanner/Parser/Parser.cs index 4b4f62130..cf292d66c 100644 --- a/API/Services/Tasks/Scanner/Parser/Parser.cs +++ b/API/Services/Tasks/Scanner/Parser/Parser.cs @@ -70,6 +70,12 @@ public static class Parser private const string Number = @"\d+(\.\d)?"; private const string NumberRange = Number + @"(-" + Number + @")?"; + // Some generic reusage regex patterns: + // - non greedy matching of a string where parenthesis are balanced + public const string BalancedParen = @"(?:[^()]|(?\()|(?<-open>\)))*?(?(open)(?!))"; + // - non greedy matching of a string where square brackets are balanced + public const string BalancedBrack = @"(?:[^\[\]]|(?\[)|(?<-open>\]))*?(?(open)(?!))"; + private static readonly Regex[] MangaVolumeRegex = new[] { // Dance in the Vampire Bund v16-17 @@ -499,16 +505,6 @@ public static class Parser MatchOptions, RegexTimeout), }; - private static readonly Regex[] ReleaseGroupRegex = new[] - { - // [TrinityBAKumA Finella&anon], [BAA]_, [SlowManga&OverloadScans], [batoto] - new Regex(@"(?:\[(?(?!\s).+?(?(?!\s).+?(?Omnibus(( |_)?Edition)?)(\b|_)?", - MatchOptions, RegexTimeout), // To Love Ru v01 Uncensored (Ch.001-007) - new Regex( - @"(\b|_)(?Uncensored)(\b|_)", - MatchOptions, RegexTimeout), - }; + @"\b(?:Omnibus(?:\s?Edition)?|Uncensored)\b", + MatchOptions, RegexTimeout + ); - private static readonly Regex[] CleanupRegex = - { - // (), {}, [] - new Regex( - @"(?(\{\}|\[\]|\(\)))", - MatchOptions, RegexTimeout), - // (Complete) - new Regex( - @"(?(\{Complete\}|\[Complete\]|\(Complete\)))", - MatchOptions, RegexTimeout), - // Anything in parenthesis - new Regex( - @"\(.*\)", - MatchOptions, RegexTimeout), - }; + // Matches [Complete], release tags like [kmts] but not [ Complete ] or [kmts ] + private const string TagsInBrackets = $@"\[(?!\s){BalancedBrack}(?Specials?|OneShot|One\-Shot|Omake|Extra(?:(\sChapter)?[^\S])|Art Collection|Side( |_)Stories|Bonus)", - MatchOptions, RegexTimeout), - }; - - private static readonly Regex[] ComicSpecialRegex = - { - // All Keywords, does not account for checking if contains volume/chapter identification. Parser.Parse() will handle. - new Regex( - @"(?Specials?|OneShot|One\-Shot|\d.+?(\W|_|-)Annual|Annual(\W|_|-)\d.+?|Extra(?:(\sChapter)?[^\S])|Book \d.+?|Compendium \d.+?|Omnibus \d.+?|[_\s\-]TPB[_\s\-]|FCBD \d.+?|Absolute \d.+?|Preview \d.+?|Art Collection|Side(\s|_)Stories|Bonus|Hors Série|(\W|_|-)HS(\W|_|-)|(\W|_|-)THS(\W|_|-))", - MatchOptions, RegexTimeout), - }; - - private static readonly Regex[] EuropeanComicRegex = - { - // All Keywords, does not account for checking if contains volume/chapter identification. Parser.Parse() will handle. - new Regex( - @"(?Bd(\s|_|-)Fr)", - MatchOptions, RegexTimeout), - }; + @"\b(?:Bd[-\s]Fr)\b", + MatchOptions, RegexTimeout + ); // If SP\d+ is in the filename, we force treat it as a special regardless if volume or chapter might have been found. private static readonly Regex SpecialMarkerRegex = new Regex( - @"(?SP\d+)", + @"SP\d+", MatchOptions, RegexTimeout ); private static readonly Regex EmptySpaceRegex = new Regex( - @"(?!=.+)(\s{2,})(?!=.+)", + @"\s{2,}", MatchOptions, RegexTimeout ); @@ -642,6 +624,8 @@ public static class Parser private static readonly char[] LeadingZeroesTrimChars = new[] { '0' }; + private static readonly char[] SpacesAndSeparators = { '\0', '\t', '\r', ' ', '-', ','}; + public static MangaFormat ParseFormat(string filePath) { if (IsArchive(filePath)) return MangaFormat.Archive; @@ -653,20 +637,9 @@ public static class Parser public static string ParseEdition(string filePath) { - foreach (var regex in MangaEditionRegex) - { - var matches = regex.Matches(filePath); - foreach (var group in matches.Select(match => match.Groups["Edition"]) - .Where(group => group.Success && group != Match.Empty)) - { - return group.Value - .Replace("{", "").Replace("}", "") - .Replace("[", "").Replace("]", "") - .Replace("(", "").Replace(")", ""); - } - } - - return string.Empty; + filePath = ReplaceUnderscores(filePath); + var match = MangaEditionRegex.Match(filePath); + return match.Success ? match.Value : string.Empty; } /// @@ -676,39 +649,19 @@ public static class Parser /// public static bool HasSpecialMarker(string filePath) { - var matches = SpecialMarkerRegex.Matches(filePath); - return matches.Select(match => match.Groups["Special"]) - .Any(group => group.Success && group != Match.Empty); + return SpecialMarkerRegex.IsMatch(filePath); } - public static string ParseMangaSpecial(string filePath) + public static bool IsMangaSpecial(string filePath) { - foreach (var regex in MangaSpecialRegex) - { - var matches = regex.Matches(filePath); - foreach (var group in matches.Select(match => match.Groups["Special"]) - .Where(group => group.Success && group != Match.Empty)) - { - return group.Value; - } - } - - return string.Empty; + filePath = ReplaceUnderscores(filePath); + return MangaSpecialRegex.IsMatch(filePath); } - public static string ParseComicSpecial(string filePath) + public static bool IsComicSpecial(string filePath) { - foreach (var regex in ComicSpecialRegex) - { - var matches = regex.Matches(filePath); - foreach (var group in matches.Select(match => match.Groups["Special"]) - .Where(group => group.Success && group != Match.Empty)) - { - return group.Value; - } - } - - return string.Empty; + filePath = ReplaceUnderscores(filePath); + return ComicSpecialRegex.IsMatch(filePath); } public static string ParseSeries(string filename) @@ -840,73 +793,26 @@ public static class Parser private static string RemoveEditionTagHolders(string title) { - foreach (var regex in CleanupRegex) - { - var matches = regex.Matches(title); - foreach (Match match in matches) - { - if (match.Success) - { - title = title.Replace(match.Value, string.Empty).Trim(); - } - } - } + title = CleanupRegex.Replace(title, string.Empty); - foreach (var regex in MangaEditionRegex) - { - var matches = regex.Matches(title); - foreach (Match match in matches) - { - if (match.Success) - { - title = title.Replace(match.Value, string.Empty).Trim(); - } - } - } + title = MangaEditionRegex.Replace(title, string.Empty); return title; } private static string RemoveMangaSpecialTags(string title) { - foreach (var regex in MangaSpecialRegex) - { - var matches = regex.Matches(title); - foreach (var match in matches.Where(m => m.Success)) - { - title = title.Replace(match.Value, string.Empty).Trim(); - } - } - - return title; + return MangaSpecialRegex.Replace(title, string.Empty); } private static string RemoveEuropeanTags(string title) { - foreach (var regex in EuropeanComicRegex) - { - var matches = regex.Matches(title); - foreach (var match in matches.Where(m => m.Success)) - { - title = title.Replace(match.Value, string.Empty).Trim(); - } - } - - return title; + return EuropeanComicRegex.Replace(title, string.Empty); } private static string RemoveComicSpecialTags(string title) { - foreach (var regex in ComicSpecialRegex) - { - var matches = regex.Matches(title); - foreach (var match in matches.Where(m => m.Success)) - { - title = title.Replace(match.Value, string.Empty).Trim(); - } - } - - return title; + return ComicSpecialRegex.Replace(title, string.Empty); } @@ -920,14 +826,14 @@ public static class Parser /// /// /// + public static string CleanTitle(string title, bool isComic = false) { - title = RemoveReleaseGroup(title); + + title = ReplaceUnderscores(title); title = RemoveEditionTagHolders(title); - title = isComic ? RemoveComicSpecialTags(title) : RemoveMangaSpecialTags(title); - if (isComic) { title = RemoveComicSpecialTags(title); @@ -938,34 +844,10 @@ public static class Parser title = RemoveMangaSpecialTags(title); } - - title = title.Replace("_", " ").Trim(); - if (title.EndsWith("-") || title.EndsWith(",")) - { - title = title.Substring(0, title.Length - 1); - } - - if (title.StartsWith("-") || title.StartsWith(",")) - { - title = title.Substring(1); - } + title = title.Trim(SpacesAndSeparators); title = EmptySpaceRegex.Replace(title, " "); - return title.Trim(); - } - - private static string RemoveReleaseGroup(string title) - { - foreach (var regex in ReleaseGroupRegex) - { - var matches = regex.Matches(title); - foreach (var match in matches.Where(m => m.Success)) - { - title = title.Replace(match.Value, string.Empty); - } - } - return title; } @@ -1150,4 +1032,6 @@ public static class Parser { return FormatTagSpecialKeywords.Contains(comicInfoFormat); } + + private static string ReplaceUnderscores(string name) => name?.Replace("_", " "); }