mirror of
https://github.com/Kareadita/Kavita.git
synced 2025-07-09 03:04:19 -04:00
Parser optimization part1 (#1531)
* Optimize CleanTitle * Optimize MangaEditionRegex * Optimize special regexes * Refactor manga|comic special parsing into simple tests * Word bind the special regexps. Support additional "special" use cases. * Updates to address PR comments * CleanTitle benchmarking * Use a smaller Comics Data set for benchmarking
This commit is contained in:
parent
0403f938b0
commit
28c868b46c
@ -16,9 +16,9 @@
|
|||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<None Update="Data\SeriesNamesForNormalization.txt">
|
<Content Include="Data/*.txt">
|
||||||
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
|
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
|
||||||
</None>
|
</Content>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
</Project>
|
</Project>
|
||||||
|
26
API.Benchmark/CleanTitleBenchmark.cs
Normal file
26
API.Benchmark/CleanTitleBenchmark.cs
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.IO;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
|
using BenchmarkDotNet.Attributes;
|
||||||
|
using BenchmarkDotNet.Order;
|
||||||
|
|
||||||
|
namespace API.Benchmark;
|
||||||
|
|
||||||
|
/// <summary>
/// BenchmarkDotNet benchmark that runs <c>Parser.CleanTitle</c> over every line of
/// <c>Data/Comics.txt</c>. Memory allocations are reported as well because of
/// <c>[MemoryDiagnoser]</c>.
/// </summary>
[MemoryDiagnoser]
|
||||||
|
public class CleanTitleBenchmarks
|
||||||
|
{
|
||||||
|
// Input file names, one per line of Data/Comics.txt; populated by LoadData().
private static IList<string> _names;
|
||||||
|
|
||||||
|
/// <summary>
/// Reads all lines of <c>Data/Comics.txt</c> into <c>_names</c>. The path is relative,
/// so it is resolved against the process working directory.
/// </summary>
// Marked [GlobalSetup] so the file read happens in setup rather than inside the benchmark method.
[GlobalSetup]
|
||||||
|
public void LoadData() => _names = File.ReadAllLines("Data/Comics.txt");
|
||||||
|
|
||||||
|
/// <summary>
/// Calls <c>CleanTitle(name, true)</c> once for every loaded name.
/// </summary>
[Benchmark]
|
||||||
|
public void TestCleanTitle()
|
||||||
|
{
|
||||||
|
foreach (var name in _names)
|
||||||
|
{
|
||||||
|
// The second argument is the isComic flag (see CleanTitleTest); the returned title is discarded.
Services.Tasks.Scanner.Parser.Parser.CleanTitle(name, true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
112
API.Benchmark/Data/Comics.txt
Normal file
112
API.Benchmark/Data/Comics.txt
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
One-Star Squadron 02 (of 06) (2022) (digital) (Son of Ultron-Empire).cbz
|
||||||
|
Batman & the Monster Men 06 (2006) (Kryptonia-DCP).cbr
|
||||||
|
Hauteville House -07- Expedition Vanikoro.cbr
|
||||||
|
Fantastic Four v3 #020.cbz
|
||||||
|
Thunderbolts 053.cbr
|
||||||
|
Moon Knight 010 2007 Red Lion-DCP .cbr
|
||||||
|
New X-Men 037.cbr
|
||||||
|
X-Men - Deadly Genesis 02 (2006) (BigBlue-DCP).cbr
|
||||||
|
Incredible Hercules 128.cbr
|
||||||
|
JLA - Year One 03 of 12.cbr
|
||||||
|
Daredevil v2 082 (2006) (Reiu-DCP).cbr
|
||||||
|
069 - Iron Man v4 035 (2009) (Minutemen-ZonesDiva).cbr
|
||||||
|
2000AD prog 2285 (2022) (digital) (Minutemen-juvecube).cbz
|
||||||
|
Tanguy et Laverdure - Intégrale - T07.cbz
|
||||||
|
Excalibur 026 (2022) (Digital) (Zone-Empire).cbz
|
||||||
|
DC vs. Vampires - Killers 001 (2022) (Webrip) (The Last Kryptonian-DCP).cbz
|
||||||
|
By the Horns 003 (2021) (Digital) (Mephisto-Empire).cbz
|
||||||
|
Incredible Hulks 630 (2011) (Minutemen-Fiji).cbz
|
||||||
|
Red Robin 010 (2010) (Minutemen-OTT).cbr
|
||||||
|
Les Droits de lHomme - OneShot - Collectif.cbz
|
||||||
|
Tout Gaston - Intégrale.cbr
|
||||||
|
Good Night, Hem (2021) (Digital) (Dipole-Empire).cbz
|
||||||
|
Bunny Mask - The Hollow Inside 001 (2022) (Digital) (Mephisto-Empire).cbz
|
||||||
|
Les MYTHICS - T14 - Avarice.cbr
|
||||||
|
Fantastic Four Special 01 (2006) (Nascent-DCP).cbr
|
||||||
|
Sonjaversal 006 (2021) (5 covers) (digital) (The Seeker-Empire).cbz
|
||||||
|
The Flash 779 (2022) (Digital) (Zone-Empire).cbz
|
||||||
|
Supergirl and the Legion of Super-Heroes 020 (2006) (CamelotScans-DCP).cbr
|
||||||
|
Time Before Time 015 (2022) (Digital) (Zone-Empire).cbz
|
||||||
|
Union Jack 02 (2006) (Red Lion-DCP).cbr
|
||||||
|
Le Corps est un Vêtement que l'on quitte.pdf
|
||||||
|
Helmet of Fate - Black Alice 01 (2007) (Racerx-DCP).cbz
|
||||||
|
Villains United 003 [2005] (Team-DCP).cbr
|
||||||
|
Punisher 002.cbr
|
||||||
|
Grendel - Devil's Odyssey 008 (2021) (digital) (NeverAngel-Empire).cbz
|
||||||
|
Uncanny X-Force 05.1 (2011) (Minutemen-Megatonic).cbz
|
||||||
|
Orcs & Gobelins - T14 - Shaaka.cbr
|
||||||
|
Les grands personnages de l'histoire en bandes dessinées - T67 - Suffren - La Bataille de Gondelou.cbz
|
||||||
|
Batman Adventures 013 (Jorl - Dcp).cbr
|
||||||
|
Norse Mythology II 003 (2021) (digital) (Son of Ultron-Empire).cbz
|
||||||
|
Ghost Rider 012 (2007) (Team-DCP).cbr
|
||||||
|
Once & Future 021 (2021) (digital) (Son of Ultron-Empire).cbz
|
||||||
|
The Seven Deadly Sins #1_ Seven Deadly Her - Nakaba Suzuki.epub
|
||||||
|
Kimagure Orange Road Omnibus #5_ Vol. 5 - Izumi Matsumoto.cbz
|
||||||
|
Booster Gold 36 2010 Minutemen-Oracle Saxon .cbr
|
||||||
|
New X-Men 023 (2006) (Reiu-DCP).cbr
|
||||||
|
World of Betty and Veronica Comics Digest 016 (2022) (Forsythe-DCP).cbz
|
||||||
|
Deadpool Team-Up 889 (2010) (noads) (LegionNever-CPS).cbr
|
||||||
|
Les bêtes de black city - T03 - le feu de la vengeance.cbr
|
||||||
|
The Brother of All Men 002 (2022) (digital) (Son of Ultron-Empire).cbz
|
||||||
|
DC Fifty-Two (52) Week One (2006) (Kryptonia-DCP).cbr
|
||||||
|
Heroes For Hire v2 09 (2007) (DarthScanner-DCP).cbr
|
||||||
|
Doom Patrol v4 012 [2005] (Bchry-DCP).cbr
|
||||||
|
Black Panther's Prey #1(Aieiebrazoff-DCP)-Repack.cbz
|
||||||
|
Hello Neighbor 02 - The Raven Brooks Disaster (2021) (Digital Rip) (Hourman-DCP).cbz
|
||||||
|
Grimm Spotlight - Cinderella vs. Zombies (2021) (digital) (The Seeker-Empire).cbz
|
||||||
|
Black's Myth 001 (2021) (digital) (Son of Ultron-Empire).cbz
|
||||||
|
Donjon Antipodes T02 +10001 Le Coffre aux Âmes.pdf
|
||||||
|
Ghost Rider 016 (2007) (Noads) (Team-DCP).cbr
|
||||||
|
JLA Classified 38 (2007) (Wolfrider-DCP).cbr
|
||||||
|
Olive 003 - On the Trail of the Nerpa (2021) (digital) (Mr Norrell-Empire).cbz
|
||||||
|
Avengers v3 #054.cbz
|
||||||
|
Doctor Strange - The Oath 01 (2006) (Kryptonia-DCP).cbr
|
||||||
|
Red Robin 006 2010 Minutemen-DTermined.cbr
|
||||||
|
056 - She-Hulk v2 032 (2008) (2 covers) (Minutemen-ReZone).cbr
|
||||||
|
DC Fifty-Two (52) Week 030 (2007) (Kryptonia-DCP).cbr
|
||||||
|
Detective Comics 1055 (2022) (Webrip) (The Last Kryptonian-DCP).cbz
|
||||||
|
Spider-Man vs. Vampires 01 2010 Minutemen-DTs .cbz
|
||||||
|
Grim 003 (2022) (digital) (Son of Ultron-Empire).cbz
|
||||||
|
Wastelanders - Star-Lord 001 (2022) (Digital) (Zone-Empire).cbz
|
||||||
|
Superman [2003-38] Adventures of Superman 621.cbr
|
||||||
|
Elektra - Black, White & Blood 001 (2022) (Digital) (Zone-Empire).cbz
|
||||||
|
Félix #15 - Heroic Album -1950- Le Tueur Fantome.cbz
|
||||||
|
Ms. Marvel v2 09 (2006) (Team-DCP).cbr
|
||||||
|
Stray Dogs - Dog Days 002 (2022) (digital) (Son of Ultron-Empire).cbz
|
||||||
|
My Date With Monsters 002 (2021) (Digital) (Mephisto-Empire).cbz
|
||||||
|
Friendly Neighborhood Spider-Man 02 (2006) (Variant Cvr) (Wildcarde1-DCP).cbr
|
||||||
|
Acriboréa -T03- Des millions de soleils.cbr
|
||||||
|
X-Men: Phoenix - Endsong 05 (of 5) [2005] (Team-DCP).cbr
|
||||||
|
Usagi Yojimbo - Lone Goat and Kid 006 (2022) (digital) (Son of Ultron-Empire).cbz
|
||||||
|
Robyn Hood Annual - The Swarm (2021) (digital) (The Seeker-Empire).cbz
|
||||||
|
Azrael #025.cbr
|
||||||
|
Nita Hawes' Nightmare Blog 002 (2021) (Digital) (Zone-Empire).cbz
|
||||||
|
Dark Avengers-Uncanny X-Men - Utopia 001.cbr
|
||||||
|
Naughty List 004 (2022) (digital) (Son of Ultron-Empire).cbz
|
||||||
|
Atalante - La Légende-04-L'Envol Des Boréades.cbz
|
||||||
|
Warlord of Mars 02 (6 covers).cbr
|
||||||
|
Action Comics 857 (2007) (CamelotScans-DCP).cbr
|
||||||
|
War For Earth - 3 002 (2022) (Webrip) (The Last Kryptonian-DCP).cbz
|
||||||
|
Oracle - T04 - Le Malformé.cbz
|
||||||
|
Battle Angel Alita #9_ Vol. 9 - Yukito Kishiro.epub
|
||||||
|
Les aventuriers de l'intermonde - T01 - Mission Athènes.cbz
|
||||||
|
Captain_America_and_The_Secret_Avengers_(2011)_(Minutemen-DTermined).cbr
|
||||||
|
She-Hulk 002 (2022) (Digital) (Zone-Empire).cbz
|
||||||
|
infinity inc 01 (2007) (racerx-dcp).cbz
|
||||||
|
Wonder Girl 004 (2021) (digital) (Son of Ultron-Empire).cbz
|
||||||
|
SEULS - T07 - Les Terres Basses.cbr
|
||||||
|
Out of Body 003 (2021) (digital) (Son of Ultron-Empire).cbz
|
||||||
|
Power Girl 09.cbr
|
||||||
|
Thor 614 (2 covers) (2010) (noads) (Archangel & FP-CPS).cbr
|
||||||
|
Iron Man 011 (2021) (Digital) (Zone-Empire).cbz
|
||||||
|
Ms. Marvel - Beyond the Limit 002 (2022) (Digital) (Zone-Empire).cbz
|
||||||
|
Ultimate X-Men #038.cbr
|
||||||
|
Excalibur 022 (2021) (Digital) (Zone-Empire).cbz
|
||||||
|
New Avengers 025 (2006) (Fixed) (Team-DCP).cbr
|
||||||
|
T06.2 - Topkapi.pdf
|
||||||
|
Thor Corps 2 of 4.cbr
|
||||||
|
Shang-Chi - Brothers & Sisters Infinity Comic 003 (2021) (Digital-Mobile) (Infinity-Empire) (WebP).cbz
|
||||||
|
X-Men To Serve And Protect 01 of 04 2010 .cbr
|
||||||
|
08A - Blue Beetle 020.cbz
|
||||||
|
The Joker Presents - A Puzzlebox Director's Cut 013 (2021) (digital) (Son of Ultron-Empire).cbz
|
||||||
|
Alice Matheson - T01 - Jour Z.cbz
|
@ -10,12 +10,5 @@ namespace API.Benchmark;
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
public static class Program
|
public static class Program
|
||||||
{
|
{
|
||||||
private static void Main(string[] args)
|
private static void Main(string[] args) => BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args);
|
||||||
{
|
|
||||||
//BenchmarkRunner.Run<ParseScannedFilesBenchmarks>();
|
|
||||||
//BenchmarkRunner.Run<TestBenchmark>();
|
|
||||||
//BenchmarkRunner.Run<ParserBenchmarks>();
|
|
||||||
BenchmarkRunner.Run<EpubBenchmark>();
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -197,8 +197,12 @@ public class ComicParserTests
|
|||||||
[InlineData("Adventure Time 2013 Annual #001 (2013)", true)]
|
[InlineData("Adventure Time 2013 Annual #001 (2013)", true)]
|
||||||
[InlineData("Adventure Time 2013_Annual_#001 (2013)", true)]
|
[InlineData("Adventure Time 2013_Annual_#001 (2013)", true)]
|
||||||
[InlineData("Adventure Time 2013_-_Annual #001 (2013)", true)]
|
[InlineData("Adventure Time 2013_-_Annual #001 (2013)", true)]
|
||||||
public void ParseComicSpecialTest(string input, bool expected)
|
[InlineData("G.I. Joe - A Real American Hero Yearbook 004 Reprint (2021)", false)]
|
||||||
|
[InlineData("Mazebook 001", false)]
|
||||||
|
[InlineData("X-23 One Shot (2010)", true)]
|
||||||
|
[InlineData("Casus Belli v1 Hors-Série 21 - Mousquetaires et Sorcellerie", true)]
|
||||||
|
public void IsComicSpecialTest(string input, bool expected)
|
||||||
{
|
{
|
||||||
Assert.Equal(expected, !string.IsNullOrEmpty(API.Services.Tasks.Scanner.Parser.Parser.ParseComicSpecial(input)));
|
Assert.Equal(expected, API.Services.Tasks.Scanner.Parser.Parser.IsComicSpecial(input));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -284,6 +284,7 @@ public class MangaParserTests
|
|||||||
[InlineData("Wotakoi - Love is Hard for Otaku Omnibus v01 (2018) (Digital) (danke-Empire)", "Omnibus")]
|
[InlineData("Wotakoi - Love is Hard for Otaku Omnibus v01 (2018) (Digital) (danke-Empire)", "Omnibus")]
|
||||||
[InlineData("To Love Ru v01 Uncensored (Ch.001-007)", "Uncensored")]
|
[InlineData("To Love Ru v01 Uncensored (Ch.001-007)", "Uncensored")]
|
||||||
[InlineData("Chobits Omnibus Edition v01 [Dark Horse]", "Omnibus Edition")]
|
[InlineData("Chobits Omnibus Edition v01 [Dark Horse]", "Omnibus Edition")]
|
||||||
|
[InlineData("Chobits_Omnibus_Edition_v01_[Dark_Horse]", "Omnibus Edition")]
|
||||||
[InlineData("[dmntsf.net] One Piece - Digital Colored Comics Vol. 20 Ch. 177 - 30 Million vs 81 Million.cbz", "")]
|
[InlineData("[dmntsf.net] One Piece - Digital Colored Comics Vol. 20 Ch. 177 - 30 Million vs 81 Million.cbz", "")]
|
||||||
[InlineData("AKIRA - c003 (v01) [Full Color] [Darkhorse].cbz", "")]
|
[InlineData("AKIRA - c003 (v01) [Full Color] [Darkhorse].cbz", "")]
|
||||||
[InlineData("Love Hina Omnibus v05 (2015) (Digital-HD) (Asgard-Empire).cbz", "Omnibus")]
|
[InlineData("Love Hina Omnibus v05 (2015) (Digital-HD) (Asgard-Empire).cbz", "Omnibus")]
|
||||||
@ -306,9 +307,11 @@ public class MangaParserTests
|
|||||||
[InlineData("Beastars SP01", false)]
|
[InlineData("Beastars SP01", false)]
|
||||||
[InlineData("The League of Extraordinary Gentlemen", false)]
|
[InlineData("The League of Extraordinary Gentlemen", false)]
|
||||||
[InlineData("The League of Extra-ordinary Gentlemen", false)]
|
[InlineData("The League of Extra-ordinary Gentlemen", false)]
|
||||||
public void ParseMangaSpecialTest(string input, bool expected)
|
[InlineData("Gifting The Wonderful World With Blessings! - 3 Side Stories [yuNS][Unknown].epub", true)]
|
||||||
|
[InlineData("Dr. Ramune - Mysterious Disease Specialist v01 (2020) (Digital) (danke-Empire).cbz", false)]
|
||||||
|
public void IsMangaSpecialTest(string input, bool expected)
|
||||||
{
|
{
|
||||||
Assert.Equal(expected, !string.IsNullOrEmpty(API.Services.Tasks.Scanner.Parser.Parser.ParseMangaSpecial(input)));
|
Assert.Equal(expected, API.Services.Tasks.Scanner.Parser.Parser.IsMangaSpecial(input));
|
||||||
}
|
}
|
||||||
|
|
||||||
[Theory]
|
[Theory]
|
||||||
@ -320,13 +323,5 @@ public class MangaParserTests
|
|||||||
Assert.Equal(expected, API.Services.Tasks.Scanner.Parser.Parser.ParseFormat(inputFile));
|
Assert.Equal(expected, API.Services.Tasks.Scanner.Parser.Parser.ParseFormat(inputFile));
|
||||||
}
|
}
|
||||||
|
|
||||||
[Theory]
|
|
||||||
[InlineData("Gifting The Wonderful World With Blessings! - 3 Side Stories [yuNS][Unknown].epub", "Side Stories")]
|
|
||||||
public void ParseSpecialTest(string inputFile, string expected)
|
|
||||||
{
|
|
||||||
Assert.Equal(expected, API.Services.Tasks.Scanner.Parser.Parser.ParseMangaSpecial(inputFile));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -64,6 +64,10 @@ public class ParserTests
|
|||||||
[InlineData("[Suihei Kiki]_Kasumi_Otoko_no_Ko_[Taruby]_v1.1", false, "Kasumi Otoko no Ko v1.1")]
|
[InlineData("[Suihei Kiki]_Kasumi_Otoko_no_Ko_[Taruby]_v1.1", false, "Kasumi Otoko no Ko v1.1")]
|
||||||
[InlineData("Batman - Detective Comics - Rebirth Deluxe Edition Book 04 (2019) (digital) (Son of Ultron-Empire)", true, "Batman - Detective Comics - Rebirth Deluxe Edition")]
|
[InlineData("Batman - Detective Comics - Rebirth Deluxe Edition Book 04 (2019) (digital) (Son of Ultron-Empire)", true, "Batman - Detective Comics - Rebirth Deluxe Edition")]
|
||||||
[InlineData("Something - Full Color Edition", false, "Something - Full Color Edition")]
|
[InlineData("Something - Full Color Edition", false, "Something - Full Color Edition")]
|
||||||
|
[InlineData("Witchblade 089 (2005) (Bittertek-DCP) (Top Cow (Image Comics))", true, "Witchblade 089")]
|
||||||
|
[InlineData("(C99) Kami-sama Hiroimashita. (SSSS.GRIDMAN)", false, "Kami-sama Hiroimashita.")]
|
||||||
|
[InlineData("Dr. Ramune - Mysterious Disease Specialist v01 (2020) (Digital) (danke-Empire)", false, "Dr. Ramune - Mysterious Disease Specialist v01")]
|
||||||
|
[InlineData("Magic Knight Rayearth {Omnibus Edition}", false, "Magic Knight Rayearth {}")]
|
||||||
public void CleanTitleTest(string input, bool isComic, string expected)
|
public void CleanTitleTest(string input, bool isComic, string expected)
|
||||||
{
|
{
|
||||||
Assert.Equal(expected, CleanTitle(input, isComic));
|
Assert.Equal(expected, CleanTitle(input, isComic));
|
||||||
@ -236,4 +240,52 @@ public class ParserTests
|
|||||||
{
|
{
|
||||||
Assert.Equal(expected, NormalizePath(inputPath));
|
Assert.Equal(expected, NormalizePath(inputPath));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[Theory]
|
||||||
|
[InlineData("The quick brown fox jumps over the lazy dog")]
|
||||||
|
[InlineData("(The quick brown fox jumps over the lazy dog)")]
|
||||||
|
[InlineData("()The quick brown fox jumps over the lazy dog")]
|
||||||
|
[InlineData("The ()quick brown fox jumps over the lazy dog")]
|
||||||
|
[InlineData("The (quick (brown)) fox jumps over the lazy dog")]
|
||||||
|
[InlineData("The (quick (brown) fox jumps over the lazy dog)")]
|
||||||
|
public void BalancedParenTestMatches(string input)
|
||||||
|
{
|
||||||
|
Assert.Matches($@"^{BalancedParen}$", input);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Theory]
|
||||||
|
[InlineData("(The quick brown fox jumps over the lazy dog")]
|
||||||
|
[InlineData("The quick brown fox jumps over the lazy dog)")]
|
||||||
|
[InlineData("The )(quick brown fox jumps over the lazy dog")]
|
||||||
|
[InlineData("The quick (brown)) fox jumps over the lazy dog")]
|
||||||
|
[InlineData("The quick (brown) fox jumps over the lazy dog)")]
|
||||||
|
[InlineData("(The ))(quick (brown) fox jumps over the lazy dog")]
|
||||||
|
public void BalancedParenTestDoesNotMatch(string input)
|
||||||
|
{
|
||||||
|
Assert.DoesNotMatch($@"^{BalancedParen}$", input);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Theory]
|
||||||
|
[InlineData("The quick brown fox jumps over the lazy dog")]
|
||||||
|
[InlineData("[The quick brown fox jumps over the lazy dog]")]
|
||||||
|
[InlineData("[]The quick brown fox jumps over the lazy dog")]
|
||||||
|
[InlineData("The []quick brown fox jumps over the lazy dog")]
|
||||||
|
[InlineData("The [quick [brown]] fox jumps over the lazy dog")]
|
||||||
|
[InlineData("The [quick [brown] fox jumps over the lazy dog]")]
|
||||||
|
public void BalancedBrackTestMatches(string input)
|
||||||
|
{
|
||||||
|
Assert.Matches($@"^{BalancedBrack}$", input);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Theory]
|
||||||
|
[InlineData("[The quick brown fox jumps over the lazy dog")]
|
||||||
|
[InlineData("The quick brown fox jumps over the lazy dog]")]
|
||||||
|
[InlineData("The ][quick brown fox jumps over the lazy dog")]
|
||||||
|
[InlineData("The quick [brown]] fox jumps over the lazy dog")]
|
||||||
|
[InlineData("The quick [brown] fox jumps over the lazy dog]")]
|
||||||
|
[InlineData("[The ]][quick [brown] fox jumps over the lazy dog")]
|
||||||
|
public void BalancedBrackTestDoesNotMatch(string input)
|
||||||
|
{
|
||||||
|
Assert.DoesNotMatch($@"^{BalancedBrack}$", input);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -85,10 +85,10 @@ public class DefaultParser : IDefaultParser
|
|||||||
ret.Edition = edition;
|
ret.Edition = edition;
|
||||||
}
|
}
|
||||||
|
|
||||||
var isSpecial = type == LibraryType.Comic ? Services.Tasks.Scanner.Parser.Parser.ParseComicSpecial(fileName) : Services.Tasks.Scanner.Parser.Parser.ParseMangaSpecial(fileName);
|
var isSpecial = type == LibraryType.Comic ? Services.Tasks.Scanner.Parser.Parser.IsComicSpecial(fileName) : Services.Tasks.Scanner.Parser.Parser.IsMangaSpecial(fileName);
|
||||||
// We must ensure that we can only parse a special out. As some files will have v20 c171-180+Omake and that
|
// We must ensure that we can only parse a special out. As some files will have v20 c171-180+Omake and that
|
||||||
// could cause a problem as Omake is a special term, but there is valid volume/chapter information.
|
// could cause a problem as Omake is a special term, but there is valid volume/chapter information.
|
||||||
if (ret.Chapters == Services.Tasks.Scanner.Parser.Parser.DefaultChapter && ret.Volumes == Services.Tasks.Scanner.Parser.Parser.DefaultVolume && !string.IsNullOrEmpty(isSpecial))
|
if (ret.Chapters == Services.Tasks.Scanner.Parser.Parser.DefaultChapter && ret.Volumes == Services.Tasks.Scanner.Parser.Parser.DefaultVolume && isSpecial)
|
||||||
{
|
{
|
||||||
ret.IsSpecial = true;
|
ret.IsSpecial = true;
|
||||||
ParseFromFallbackFolders(filePath, rootPath, type, ref ret); // NOTE: This can cause some complications, we should try to be a bit less aggressive to fallback to folder
|
ParseFromFallbackFolders(filePath, rootPath, type, ref ret); // NOTE: This can cause some complications, we should try to be a bit less aggressive to fallback to folder
|
||||||
@ -131,7 +131,7 @@ public class DefaultParser : IDefaultParser
|
|||||||
for (var i = 0; i < fallbackFolders.Count; i++)
|
for (var i = 0; i < fallbackFolders.Count; i++)
|
||||||
{
|
{
|
||||||
var folder = fallbackFolders[i];
|
var folder = fallbackFolders[i];
|
||||||
if (!string.IsNullOrEmpty(Services.Tasks.Scanner.Parser.Parser.ParseMangaSpecial(folder))) continue;
|
if (Services.Tasks.Scanner.Parser.Parser.IsMangaSpecial(folder)) continue;
|
||||||
|
|
||||||
var parsedVolume = type is LibraryType.Manga ? Services.Tasks.Scanner.Parser.Parser.ParseVolume(folder) : Services.Tasks.Scanner.Parser.Parser.ParseComicVolume(folder);
|
var parsedVolume = type is LibraryType.Manga ? Services.Tasks.Scanner.Parser.Parser.ParseVolume(folder) : Services.Tasks.Scanner.Parser.Parser.ParseComicVolume(folder);
|
||||||
var parsedChapter = type is LibraryType.Manga ? Services.Tasks.Scanner.Parser.Parser.ParseChapter(folder) : Services.Tasks.Scanner.Parser.Parser.ParseComicChapter(folder);
|
var parsedChapter = type is LibraryType.Manga ? Services.Tasks.Scanner.Parser.Parser.ParseChapter(folder) : Services.Tasks.Scanner.Parser.Parser.ParseComicChapter(folder);
|
||||||
|
@ -70,6 +70,12 @@ public static class Parser
|
|||||||
private const string Number = @"\d+(\.\d)?";
|
private const string Number = @"\d+(\.\d)?";
|
||||||
private const string NumberRange = Number + @"(-" + Number + @")?";
|
private const string NumberRange = Number + @"(-" + Number + @")?";
|
||||||
|
|
||||||
|
// Some generic reusable regex patterns:
|
||||||
|
// - non-greedy matching of a string where parentheses are balanced
|
||||||
|
public const string BalancedParen = @"(?:[^()]|(?<open>\()|(?<-open>\)))*?(?(open)(?!))";
|
||||||
|
// - non-greedy matching of a string where square brackets are balanced
|
||||||
|
public const string BalancedBrack = @"(?:[^\[\]]|(?<open>\[)|(?<-open>\]))*?(?(open)(?!))";
|
||||||
|
|
||||||
private static readonly Regex[] MangaVolumeRegex = new[]
|
private static readonly Regex[] MangaVolumeRegex = new[]
|
||||||
{
|
{
|
||||||
// Dance in the Vampire Bund v16-17
|
// Dance in the Vampire Bund v16-17
|
||||||
@ -499,16 +505,6 @@ public static class Parser
|
|||||||
MatchOptions, RegexTimeout),
|
MatchOptions, RegexTimeout),
|
||||||
};
|
};
|
||||||
|
|
||||||
private static readonly Regex[] ReleaseGroupRegex = new[]
|
|
||||||
{
|
|
||||||
// [TrinityBAKumA Finella&anon], [BAA]_, [SlowManga&OverloadScans], [batoto]
|
|
||||||
new Regex(@"(?:\[(?<subgroup>(?!\s).+?(?<!\s))\](?:_|-|\s|\.)?)",
|
|
||||||
MatchOptions, RegexTimeout),
|
|
||||||
// (Shadowcat-Empire),
|
|
||||||
// new Regex(@"(?:\[(?<subgroup>(?!\s).+?(?<!\s))\](?:_|-|\s|\.)?)",
|
|
||||||
// MatchOptions),
|
|
||||||
};
|
|
||||||
|
|
||||||
private static readonly Regex[] MangaChapterRegex = new[]
|
private static readonly Regex[] MangaChapterRegex = new[]
|
||||||
{
|
{
|
||||||
// Historys Strongest Disciple Kenichi_v11_c90-98.zip, ...c90.5-100.5
|
// Historys Strongest Disciple Kenichi_v11_c90-98.zip, ...c90.5-100.5
|
||||||
@ -573,65 +569,51 @@ public static class Parser
|
|||||||
MatchOptions, RegexTimeout),
|
MatchOptions, RegexTimeout),
|
||||||
};
|
};
|
||||||
|
|
||||||
private static readonly Regex[] MangaEditionRegex = {
|
private static readonly Regex MangaEditionRegex = new Regex(
|
||||||
// Tenjo Tenge {Full Contact Edition} v01 (2011) (Digital) (ASTC).cbz
|
// Tenjo Tenge {Full Contact Edition} v01 (2011) (Digital) (ASTC).cbz
|
||||||
new Regex(
|
|
||||||
@"(\b|_)(?<Edition>Omnibus(( |_)?Edition)?)(\b|_)?",
|
|
||||||
MatchOptions, RegexTimeout),
|
|
||||||
// To Love Ru v01 Uncensored (Ch.001-007)
|
// To Love Ru v01 Uncensored (Ch.001-007)
|
||||||
new Regex(
|
@"\b(?:Omnibus(?:\s?Edition)?|Uncensored)\b",
|
||||||
@"(\b|_)(?<Edition>Uncensored)(\b|_)",
|
MatchOptions, RegexTimeout
|
||||||
MatchOptions, RegexTimeout),
|
);
|
||||||
};
|
|
||||||
|
|
||||||
private static readonly Regex[] CleanupRegex =
|
// Matches [Complete], release tags like [kmts] but not [ Complete ] or [kmts ]
|
||||||
{
|
private const string TagsInBrackets = $@"\[(?!\s){BalancedBrack}(?<!\s)\]";
|
||||||
// (), {}, []
|
|
||||||
new Regex(
|
|
||||||
@"(?<Cleanup>(\{\}|\[\]|\(\)))",
|
|
||||||
MatchOptions, RegexTimeout),
|
|
||||||
// (Complete)
|
|
||||||
new Regex(
|
|
||||||
@"(?<Cleanup>(\{Complete\}|\[Complete\]|\(Complete\)))",
|
|
||||||
MatchOptions, RegexTimeout),
|
|
||||||
// Anything in parenthesis
|
|
||||||
new Regex(
|
|
||||||
@"\(.*\)",
|
|
||||||
MatchOptions, RegexTimeout),
|
|
||||||
};
|
|
||||||
|
|
||||||
private static readonly Regex[] MangaSpecialRegex =
|
// Matches anything between balanced parentheses, tags between brackets, {} and {Complete}
|
||||||
{
|
private static readonly Regex CleanupRegex = new Regex(
|
||||||
|
$@"(?:\({BalancedParen}\)|{TagsInBrackets}|\{{\}}|\{{Complete\}})",
|
||||||
|
MatchOptions, RegexTimeout
|
||||||
|
);
|
||||||
|
|
||||||
|
// Common regex patterns present in both Comics and Mangas
|
||||||
|
private const string CommonSpecial = @"Specials?|One[- ]?Shot|Extra(?:\sChapter)?(?=\s)|Art Collection|Side Stories|Bonus";
|
||||||
|
|
||||||
|
private static readonly Regex MangaSpecialRegex = new Regex(
|
||||||
|
// All Keywords, does not account for checking if contains volume/chapter identification. Parser.Parse() will handle.
|
||||||
|
$@"\b(?:{CommonSpecial}|Omake)\b",
|
||||||
|
MatchOptions, RegexTimeout
|
||||||
|
);
|
||||||
|
|
||||||
|
private static readonly Regex ComicSpecialRegex = new Regex(
|
||||||
|
// All Keywords, does not account for checking if contains volume/chapter identification. Parser.Parse() will handle.
|
||||||
|
$@"\b(?:{CommonSpecial}|\d.+?\WAnnual|Annual\W\d.+?|Book \d.+?|Compendium \d.+?|Omnibus \d.+?|FCBD \d.+?|Absolute \d.+?|Preview \d.+?|Hors[ -]S[ée]rie|TPB|HS|THS)\b",
|
||||||
|
MatchOptions, RegexTimeout
|
||||||
|
);
|
||||||
|
|
||||||
|
private static readonly Regex EuropeanComicRegex = new Regex(
|
||||||
// All Keywords, does not account for checking if contains volume/chapter identification. Parser.Parse() will handle.
|
// All Keywords, does not account for checking if contains volume/chapter identification. Parser.Parse() will handle.
|
||||||
new Regex(
|
@"\b(?:Bd[-\s]Fr)\b",
|
||||||
@"(?<Special>Specials?|OneShot|One\-Shot|Omake|Extra(?:(\sChapter)?[^\S])|Art Collection|Side( |_)Stories|Bonus)",
|
MatchOptions, RegexTimeout
|
||||||
MatchOptions, RegexTimeout),
|
);
|
||||||
};
|
|
||||||
|
|
||||||
private static readonly Regex[] ComicSpecialRegex =
|
|
||||||
{
|
|
||||||
// All Keywords, does not account for checking if contains volume/chapter identification. Parser.Parse() will handle.
|
|
||||||
new Regex(
|
|
||||||
@"(?<Special>Specials?|OneShot|One\-Shot|\d.+?(\W|_|-)Annual|Annual(\W|_|-)\d.+?|Extra(?:(\sChapter)?[^\S])|Book \d.+?|Compendium \d.+?|Omnibus \d.+?|[_\s\-]TPB[_\s\-]|FCBD \d.+?|Absolute \d.+?|Preview \d.+?|Art Collection|Side(\s|_)Stories|Bonus|Hors Série|(\W|_|-)HS(\W|_|-)|(\W|_|-)THS(\W|_|-))",
|
|
||||||
MatchOptions, RegexTimeout),
|
|
||||||
};
|
|
||||||
|
|
||||||
private static readonly Regex[] EuropeanComicRegex =
|
|
||||||
{
|
|
||||||
// All Keywords, does not account for checking if contains volume/chapter identification. Parser.Parse() will handle.
|
|
||||||
new Regex(
|
|
||||||
@"(?<Special>Bd(\s|_|-)Fr)",
|
|
||||||
MatchOptions, RegexTimeout),
|
|
||||||
};
|
|
||||||
|
|
||||||
// If SP\d+ is in the filename, we force treat it as a special regardless if volume or chapter might have been found.
|
// If SP\d+ is in the filename, we force treat it as a special regardless if volume or chapter might have been found.
|
||||||
private static readonly Regex SpecialMarkerRegex = new Regex(
|
private static readonly Regex SpecialMarkerRegex = new Regex(
|
||||||
@"(?<Special>SP\d+)",
|
@"SP\d+",
|
||||||
MatchOptions, RegexTimeout
|
MatchOptions, RegexTimeout
|
||||||
);
|
);
|
||||||
|
|
||||||
private static readonly Regex EmptySpaceRegex = new Regex(
|
private static readonly Regex EmptySpaceRegex = new Regex(
|
||||||
@"(?!=.+)(\s{2,})(?!=.+)",
|
@"\s{2,}",
|
||||||
MatchOptions, RegexTimeout
|
MatchOptions, RegexTimeout
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -642,6 +624,8 @@ public static class Parser
|
|||||||
|
|
||||||
private static readonly char[] LeadingZeroesTrimChars = new[] { '0' };
|
private static readonly char[] LeadingZeroesTrimChars = new[] { '0' };
|
||||||
|
|
||||||
|
private static readonly char[] SpacesAndSeparators = { '\0', '\t', '\r', ' ', '-', ','};
|
||||||
|
|
||||||
public static MangaFormat ParseFormat(string filePath)
|
public static MangaFormat ParseFormat(string filePath)
|
||||||
{
|
{
|
||||||
if (IsArchive(filePath)) return MangaFormat.Archive;
|
if (IsArchive(filePath)) return MangaFormat.Archive;
|
||||||
@ -653,20 +637,9 @@ public static class Parser
|
|||||||
|
|
||||||
public static string ParseEdition(string filePath)
|
public static string ParseEdition(string filePath)
|
||||||
{
|
{
|
||||||
foreach (var regex in MangaEditionRegex)
|
filePath = ReplaceUnderscores(filePath);
|
||||||
{
|
var match = MangaEditionRegex.Match(filePath);
|
||||||
var matches = regex.Matches(filePath);
|
return match.Success ? match.Value : string.Empty;
|
||||||
foreach (var group in matches.Select(match => match.Groups["Edition"])
|
|
||||||
.Where(group => group.Success && group != Match.Empty))
|
|
||||||
{
|
|
||||||
return group.Value
|
|
||||||
.Replace("{", "").Replace("}", "")
|
|
||||||
.Replace("[", "").Replace("]", "")
|
|
||||||
.Replace("(", "").Replace(")", "");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return string.Empty;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@ -676,39 +649,19 @@ public static class Parser
|
|||||||
/// <returns></returns>
|
/// <returns></returns>
|
||||||
public static bool HasSpecialMarker(string filePath)
|
public static bool HasSpecialMarker(string filePath)
|
||||||
{
|
{
|
||||||
var matches = SpecialMarkerRegex.Matches(filePath);
|
return SpecialMarkerRegex.IsMatch(filePath);
|
||||||
return matches.Select(match => match.Groups["Special"])
|
|
||||||
.Any(group => group.Success && group != Match.Empty);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static string ParseMangaSpecial(string filePath)
|
public static bool IsMangaSpecial(string filePath)
|
||||||
{
|
{
|
||||||
foreach (var regex in MangaSpecialRegex)
|
filePath = ReplaceUnderscores(filePath);
|
||||||
{
|
return MangaSpecialRegex.IsMatch(filePath);
|
||||||
var matches = regex.Matches(filePath);
|
|
||||||
foreach (var group in matches.Select(match => match.Groups["Special"])
|
|
||||||
.Where(group => group.Success && group != Match.Empty))
|
|
||||||
{
|
|
||||||
return group.Value;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return string.Empty;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static string ParseComicSpecial(string filePath)
|
public static bool IsComicSpecial(string filePath)
|
||||||
{
|
{
|
||||||
foreach (var regex in ComicSpecialRegex)
|
filePath = ReplaceUnderscores(filePath);
|
||||||
{
|
return ComicSpecialRegex.IsMatch(filePath);
|
||||||
var matches = regex.Matches(filePath);
|
|
||||||
foreach (var group in matches.Select(match => match.Groups["Special"])
|
|
||||||
.Where(group => group.Success && group != Match.Empty))
|
|
||||||
{
|
|
||||||
return group.Value;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return string.Empty;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static string ParseSeries(string filename)
|
public static string ParseSeries(string filename)
|
||||||
@ -840,73 +793,26 @@ public static class Parser
|
|||||||
|
|
||||||
private static string RemoveEditionTagHolders(string title)
|
private static string RemoveEditionTagHolders(string title)
|
||||||
{
|
{
|
||||||
foreach (var regex in CleanupRegex)
|
title = CleanupRegex.Replace(title, string.Empty);
|
||||||
{
|
|
||||||
var matches = regex.Matches(title);
|
|
||||||
foreach (Match match in matches)
|
|
||||||
{
|
|
||||||
if (match.Success)
|
|
||||||
{
|
|
||||||
title = title.Replace(match.Value, string.Empty).Trim();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach (var regex in MangaEditionRegex)
|
title = MangaEditionRegex.Replace(title, string.Empty);
|
||||||
{
|
|
||||||
var matches = regex.Matches(title);
|
|
||||||
foreach (Match match in matches)
|
|
||||||
{
|
|
||||||
if (match.Success)
|
|
||||||
{
|
|
||||||
title = title.Replace(match.Value, string.Empty).Trim();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return title;
|
return title;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static string RemoveMangaSpecialTags(string title)
|
private static string RemoveMangaSpecialTags(string title)
|
||||||
{
|
{
|
||||||
foreach (var regex in MangaSpecialRegex)
|
return MangaSpecialRegex.Replace(title, string.Empty);
|
||||||
{
|
|
||||||
var matches = regex.Matches(title);
|
|
||||||
foreach (var match in matches.Where(m => m.Success))
|
|
||||||
{
|
|
||||||
title = title.Replace(match.Value, string.Empty).Trim();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return title;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static string RemoveEuropeanTags(string title)
|
private static string RemoveEuropeanTags(string title)
|
||||||
{
|
{
|
||||||
foreach (var regex in EuropeanComicRegex)
|
return EuropeanComicRegex.Replace(title, string.Empty);
|
||||||
{
|
|
||||||
var matches = regex.Matches(title);
|
|
||||||
foreach (var match in matches.Where(m => m.Success))
|
|
||||||
{
|
|
||||||
title = title.Replace(match.Value, string.Empty).Trim();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return title;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static string RemoveComicSpecialTags(string title)
|
private static string RemoveComicSpecialTags(string title)
|
||||||
{
|
{
|
||||||
foreach (var regex in ComicSpecialRegex)
|
return ComicSpecialRegex.Replace(title, string.Empty);
|
||||||
{
|
|
||||||
var matches = regex.Matches(title);
|
|
||||||
foreach (var match in matches.Where(m => m.Success))
|
|
||||||
{
|
|
||||||
title = title.Replace(match.Value, string.Empty).Trim();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return title;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -920,14 +826,14 @@ public static class Parser
|
|||||||
/// <param name="title"></param>
|
/// <param name="title"></param>
|
||||||
/// <param name="isComic"></param>
|
/// <param name="isComic"></param>
|
||||||
/// <returns></returns>
|
/// <returns></returns>
|
||||||
|
|
||||||
public static string CleanTitle(string title, bool isComic = false)
|
public static string CleanTitle(string title, bool isComic = false)
|
||||||
{
|
{
|
||||||
title = RemoveReleaseGroup(title);
|
|
||||||
|
title = ReplaceUnderscores(title);
|
||||||
|
|
||||||
title = RemoveEditionTagHolders(title);
|
title = RemoveEditionTagHolders(title);
|
||||||
|
|
||||||
title = isComic ? RemoveComicSpecialTags(title) : RemoveMangaSpecialTags(title);
|
|
||||||
|
|
||||||
if (isComic)
|
if (isComic)
|
||||||
{
|
{
|
||||||
title = RemoveComicSpecialTags(title);
|
title = RemoveComicSpecialTags(title);
|
||||||
@ -938,34 +844,10 @@ public static class Parser
|
|||||||
title = RemoveMangaSpecialTags(title);
|
title = RemoveMangaSpecialTags(title);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
title = title.Trim(SpacesAndSeparators);
|
||||||
title = title.Replace("_", " ").Trim();
|
|
||||||
if (title.EndsWith("-") || title.EndsWith(","))
|
|
||||||
{
|
|
||||||
title = title.Substring(0, title.Length - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (title.StartsWith("-") || title.StartsWith(","))
|
|
||||||
{
|
|
||||||
title = title.Substring(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
title = EmptySpaceRegex.Replace(title, " ");
|
title = EmptySpaceRegex.Replace(title, " ");
|
||||||
|
|
||||||
return title.Trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static string RemoveReleaseGroup(string title)
|
|
||||||
{
|
|
||||||
foreach (var regex in ReleaseGroupRegex)
|
|
||||||
{
|
|
||||||
var matches = regex.Matches(title);
|
|
||||||
foreach (var match in matches.Where(m => m.Success))
|
|
||||||
{
|
|
||||||
title = title.Replace(match.Value, string.Empty);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return title;
|
return title;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1150,4 +1032,6 @@ public static class Parser
|
|||||||
{
|
{
|
||||||
return FormatTagSpecialKeywords.Contains(comicInfoFormat);
|
return FormatTagSpecialKeywords.Contains(comicInfoFormat);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static string ReplaceUnderscores(string name) => name?.Replace("_", " ");
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user