Parsing Enhancements (#126)

* More cases for parsing regex

* Implemented the ability to parse "Special" keywords.

* Commented out some unit tests

* More parsing cases

* Fixed unit tests

* Fixed typo in build script

* Fixed a bug where, if there was a series with the same name but different capitalization, we wouldn't process its infos.

* Tons of regex updates to handle more cases.

* More regex tweaking to handle as many cases as possible.

* Bad merge caused the comic parser to break. Fixed with some better regex.
This commit is contained in:
Joseph Milazzo 2021-03-29 15:15:49 -05:00 committed by GitHub
parent 3e031ab458
commit d9246b7351
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 118 additions and 39 deletions

View File

@ -55,6 +55,10 @@ namespace API.Tests
[InlineData("Kedouin Makoto - Corpse Party Musume, Chapter 12 [Dametrans][v2]", "0")]
[InlineData("Vagabond_v03", "3")]
[InlineData("Mujaki No Rakune Volume 10.cbz", "10")]
[InlineData("Umineko no Naku Koro ni - Episode 3 - Banquet of the Golden Witch #02.cbz", "3")]
[InlineData("Volume 12 - Janken Boy is Coming!.cbz", "12")]
[InlineData("[dmntsf.net] One Piece - Digital Colored Comics Vol. 20 Ch. 177 - 30 Million vs 81 Million.cbz", "20")]
[InlineData("Gantz.V26.cbz", "26")]
public void ParseVolumeTest(string filename, string expected)
{
Assert.Equal(expected, ParseVolume(filename));
@ -110,8 +114,18 @@ namespace API.Tests
[InlineData("Vagabond_v03", "Vagabond")]
[InlineData("[AN] Mahoutsukai to Deshi no Futekisetsu na Kankei Chp. 1", "Mahoutsukai to Deshi no Futekisetsu na Kankei")]
[InlineData("Beelzebub_Side_Story_02_RHS.zip", "Beelzebub Side Story")]
[InlineData("Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U Chapter 01", "Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U")]
[InlineData("[BAA]_Darker_than_Black_Omake-1.zip", "Darker than Black")]
[InlineData("Baketeriya ch01-05.zip", "Baketeriya")]
[InlineData("[PROzess]Kimi_ha_midara_na_Boku_no_Joou_-_Ch01", "Kimi ha midara na Boku no Joou")]
[InlineData("[SugoiSugoi]_NEEDLESS_Vol.2_-_Disk_The_Informant_5_[ENG].rar", "NEEDLESS")]
[InlineData("Fullmetal Alchemist chapters 101-108.cbz", "Fullmetal Alchemist")]
[InlineData("To Love Ru v09 Uncensored (Ch.071-079).cbz", "To Love Ru")]
[InlineData("[dmntsf.net] One Piece - Digital Colored Comics Vol. 20 Ch. 177 - 30 Million vs 81 Million.cbz", "One Piece")]
//[InlineData("Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U Extra Chapter", "Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U")]
[InlineData("Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U Chapter 01", "Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U")]
[InlineData("Vol03_ch15-22.rar", "")]
[InlineData("Love Hina - Special.cbz", "")] // This has to be a fallback case
[InlineData("Ani-Hina Art Collection.cbz", "")] // This has to be a fallback case
public void ParseSeriesTest(string filename, string expected)
{
Assert.Equal(expected, ParseSeries(filename));
@ -157,6 +171,11 @@ namespace API.Tests
[InlineData("To Love Ru v18 Uncensored (Ch.153-162.5)", "153-162.5")]
[InlineData("[AN] Mahoutsukai to Deshi no Futekisetsu na Kankei Chp. 1", "1")]
[InlineData("Beelzebub_Side_Story_02_RHS.zip", "2")]
[InlineData("[PROzess]Kimi_ha_midara_na_Boku_no_Joou_-_Ch01", "1")]
[InlineData("Fullmetal Alchemist chapters 101-108.cbz", "101-108")]
[InlineData("Umineko no Naku Koro ni - Episode 3 - Banquet of the Golden Witch #02.cbz", "2")]
[InlineData("To Love Ru v09 Uncensored (Ch.071-079).cbz", "71-79")]
[InlineData("Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U Extra Chapter.rar", "0")]
public void ParseChaptersTest(string filename, string expected)
{
Assert.Equal(expected, ParseChapter(filename));
@ -211,6 +230,8 @@ namespace API.Tests
[InlineData("Wotakoi - Love is Hard for Otaku Omnibus v01 (2018) (Digital) (danke-Empire)", "Omnibus")]
[InlineData("To Love Ru v01 Uncensored (Ch.001-007)", "Uncensored")]
[InlineData("Chobits Omnibus Edition v01 [Dark Horse]", "Omnibus Edition")]
[InlineData("[dmntsf.net] One Piece - Digital Colored Comics Vol. 20 Ch. 177 - 30 Million vs 81 Million.cbz", "Digital Colored Comics")]
[InlineData("AKIRA - c003 (v01) [Full Color] [Darkhorse].cbz", "Full Color")]
public void ParseEditionTest(string input, string expected)
{
Assert.Equal(expected, ParseEdition(input));
@ -221,6 +242,8 @@ namespace API.Tests
[InlineData("Beelzebub_Side_Story_02_RHS.zip", false)]
[InlineData("Darker than Black Shikkoku no Hana Special [Simple Scans].zip", true)]
[InlineData("Darker than Black Shikkoku no Hana Fanbook Extra [Simple Scans].zip", true)]
[InlineData("Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U Extra Chapter", true)]
[InlineData("Ani-Hina Art Collection.cbz", true)]
public void ParseMangaSpecialTest(string input, bool expected)
{
Assert.Equal(expected, ParseMangaSpecial(input) != "");

View File

@ -37,11 +37,14 @@ namespace API.Parser
new Regex(
@"(volume )(?<Volume>\d+)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Tower Of God S01 014 (CBT) (digital).cbz
new Regex(
@"(?<Series>.*)(\b|_|)(S(?<Volume>\d+))",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Umineko no Naku Koro ni - Episode 3 - Banquet of the Golden Witch #02.cbz
new Regex(
@"(?<Series>.*)( |_|-)(?:Episode)(?: |_)(?<Volume>\d+(-\d+)?)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
};
@ -55,6 +58,10 @@ namespace API.Parser
new Regex(
@"(?<Series>.*)( - )(?:v|vo|c)\d",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// [dmntsf.net] One Piece - Digital Colored Comics Vol. 20 Ch. 177 - 30 Million vs 81 Million.cbz
new Regex(
@"(?<Series>.*) (\b|_|-)(vol)\.?",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Historys Strongest Disciple Kenichi_v11_c90-98.zip, Killing Bites Vol. 0001 Ch. 0001 - Galactica Scanlations (gb)
new Regex(
@"(?<Series>.*) (\b|_|-)v",
@ -96,7 +103,7 @@ namespace API.Parser
new Regex(
@"(?<Series>.*)( |_)\((c |ch |chapter )",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Black Bullet (This is very loose, keep towards bottom) (?<Series>.*)(_)(v|vo|c|volume)
// Black Bullet (This is very loose, keep towards bottom)
new Regex(
@"(?<Series>.*)(_)(v|vo|c|volume)( |_)\d+",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
@ -106,15 +113,31 @@ namespace API.Parser
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U Chapter 01
new Regex(
@"^(?!Vol)(?<Series>.*)( |_)Chapter( |_)(\d+)", // TODO: This is breaking a ton of cases
@"^(?!Vol)(?<Series>.*)( |_)Chapter( |_)(\d+)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Akiiro Bousou Biyori - 01.jpg, Beelzebub_172_RHS.zip, Cynthia the Mission 29.rar
// [SugoiSugoi]_NEEDLESS_Vol.2_-_Disk_The_Informant_5_[ENG].rar
new Regex(
@"^(?!Vol)(?<Series>.*)( |_|-)(\d+)",
@"^(?<Series>.*)( |_)Vol\.?\d+",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Fullmetal Alchemist chapters 101-108.cbz
new Regex(
@"^(?!vol)(?<Series>.*)( |_)(chapters( |_)?)\d+-?\d*",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Baketeriya ch01-05.zip, Akiiro Bousou Biyori - 01.jpg, Beelzebub_172_RHS.zip, Cynthia the Mission 29.rar
new Regex(
@"^(?!Vol\.?)(?<Series>.*)( |_|-)(?<!-)(ch)?\d+-?\d*", //fails on
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Baketeriya ch01-05.zip
new Regex(
@"^(?!Vol)(?<Series>.*)ch\d+-?\d?",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// [BAA]_Darker_than_Black_Omake-1.zip
new Regex(
@"^(?!Vol)(?<Series>.*)(-)\d+-?\d*", // This catches a lot of stuff ^(?!Vol)(?<Series>.*)( |_)(\d+)
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// [BAA]_Darker_than_Black_c1 (This is very greedy, make sure it's close to last)
new Regex(
@"(?<Series>.*)( |_|-)(c)\d+",
@"^(?!Vol)(?<Series>.*)( |_|-)(ch?)\d+",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
};
@ -130,7 +153,7 @@ namespace API.Parser
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Batman & Wildcat (1 of 3)
new Regex(
@"(?<Series>.*(\d{4})?)( |_)(?:\(\d+ of \d+)",
@"(?<Series>.*(\d{4})?)( |_)(?:\((?<Volume>\d+) of \d+)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Teen Titans v1 001 (1966-02) (digital) (OkC.O.M.P.U.T.O.-Novus)
new Regex(
@ -178,11 +201,11 @@ namespace API.Parser
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Scott Pilgrim 02 - Scott Pilgrim vs. The World (2005)
new Regex(
@"^(?<Series>.*)(?: |_)(?<Volume>\d+)",
@"^(?<Series>.*)(?: |_)(?<!of )(?<Volume>\d+)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Batman & Catwoman - Trail of the Gun 01, Batman & Grendel (1996) 01 - Devil's Bones, Teen Titans v1 001 (1966-02) (digital) (OkC.O.M.P.U.T.O.-Novus)
new Regex(
@"^(?<Series>.*)(?: (?<Volume>\d+))",
@"^(?<Series>.*)(?<!of)(?: (?<Volume>\d+))",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Batman & Robin the Teen Wonder #0
new Regex(
@ -238,11 +261,14 @@ namespace API.Parser
new Regex(
@"v\d+\.(?<Chapter>\d+(?:.\d+|-\d+)?)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Mob Psycho 100
// Umineko no Naku Koro ni - Episode 3 - Banquet of the Golden Witch #02.cbz (Rare case, if causes issue remove)
new Regex(
@"^(?<Series>.*)(?: |_)#(?<Chapter>\d+)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Hinowa ga CRUSH! 018 (2019) (Digital) (LuCaZ).cbz, Hinowa ga CRUSH! 018.5 (2019) (Digital) (LuCaZ).cbz
new Regex(
@"^(?!Vol)(?<Series>.*) (?<!vol\. )(?<Chapter>\d+(?:.\d+|-\d+)?)(?: \(\d{4}\))?",
@"^(?!Vol)(?<Series>.*) (?<!vol\. )(?<Chapter>\d+(?:.\d+|-\d+)?)(?: \(\d{4}\))?(\b|_|-)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Tower Of God S01 014 (CBT) (digital).cbz
new Regex(
@ -256,7 +282,7 @@ namespace API.Parser
new Regex(
@"Chapter(?<Chapter>\d+(-\d+)?)", //(?:.\d+|-\d+)?
RegexOptions.IgnoreCase | RegexOptions.Compiled),
};
private static readonly Regex[] MangaEditionRegex = {
// Tenjo Tenge {Full Contact Edition} v01 (2011) (Digital) (ASTC).cbz
@ -271,6 +297,14 @@ namespace API.Parser
new Regex(
@"(\b|_)(?<Edition>Uncensored)(\b|_)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// [dmntsf.net] One Piece - Digital Colored Comics Vol. 20 Ch. 177 - 30 Million vs 81 Million.cbz
new Regex(
@"(\b|_)(?<Edition>Digital(?: |_)Colored(?: |_)Comics)(\b|_)?",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// AKIRA - c003 (v01) [Full Color] [Darkhorse].cbz
new Regex(
@"(\b|_)(?<Edition>Full(?: |_)Color)(\b|_)?",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
};
private static readonly Regex[] CleanupRegex =
@ -293,7 +327,7 @@ namespace API.Parser
{
// All Keywords, does not account for checking if contains volume/chapter identification. Parser.Parse() will handle.
new Regex(
@"(?<Special>Special|OneShot|One\-Shot|Omake|Extra)",
@"(?<Special>Specials?|OneShot|One\-Shot|Omake|Extra( Chapter)?|Art Collection)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
};
@ -430,7 +464,7 @@ namespace API.Parser
var matches = regex.Matches(filename);
foreach (Match match in matches)
{
if (match.Groups["Volume"] == Match.Empty) continue;
if (!match.Groups["Volume"].Success || match.Groups["Volume"] == Match.Empty) continue;
var value = match.Groups["Volume"].Value;
if (!value.Contains("-")) return RemoveLeadingZeroes(match.Groups["Volume"].Value);
@ -452,7 +486,7 @@ namespace API.Parser
var matches = regex.Matches(filename);
foreach (Match match in matches)
{
if (match.Groups["Volume"] == Match.Empty) continue;
if (!match.Groups["Volume"].Success || match.Groups["Volume"] == Match.Empty) continue;
var value = match.Groups["Volume"].Value;
if (!value.Contains("-")) return RemoveLeadingZeroes(match.Groups["Volume"].Value);
@ -474,20 +508,16 @@ namespace API.Parser
var matches = regex.Matches(filename);
foreach (Match match in matches)
{
if (match.Groups["Chapter"] != Match.Empty)
{
var value = match.Groups["Chapter"].Value;
if (!match.Groups["Chapter"].Success || match.Groups["Chapter"] == Match.Empty) continue;
var value = match.Groups["Chapter"].Value;
if (value.Contains("-"))
{
var tokens = value.Split("-");
var from = RemoveLeadingZeroes(tokens[0]);
var to = RemoveLeadingZeroes(tokens[1]);
return $"{from}-{to}";
}
return RemoveLeadingZeroes(match.Groups["Chapter"].Value);
}
if (!value.Contains("-")) return RemoveLeadingZeroes(match.Groups["Chapter"].Value);
var tokens = value.Split("-");
var from = RemoveLeadingZeroes(tokens[0]);
var to = RemoveLeadingZeroes(tokens[1]);
return $"{@from}-{to}";
}
}
@ -502,7 +532,7 @@ namespace API.Parser
var matches = regex.Matches(filename);
foreach (Match match in matches)
{
if (match.Groups["Chapter"] != Match.Empty)
if (match.Groups["Chapter"].Success && match.Groups["Chapter"] != Match.Empty)
{
var value = match.Groups["Chapter"].Value;
@ -536,6 +566,18 @@ namespace API.Parser
}
}
}
foreach (var regex in MangaEditionRegex)
{
var matches = regex.Matches(title);
foreach (Match match in matches)
{
if (match.Success)
{
title = title.Replace(match.Value, "");
}
}
}
return title;
}

View File

@ -148,13 +148,6 @@ namespace API.Services.Tasks
{
if (parsedSeries == null) throw new ArgumentNullException(nameof(parsedSeries));
// For all parsedSeries, any infos that contain same series name and IsSpecial is true are combined
// foreach (var series in parsedSeries)
// {
// var seriesName = series.Key;
// if (parsedSeries.ContainsKey(seriesName))
// }
// First, remove any series that are not in parsedSeries list
var foundSeries = parsedSeries.Select(s => Parser.Parser.Normalize(s.Key)).ToList();
var missingSeries = library.Series.Where(existingSeries =>
@ -190,7 +183,7 @@ namespace API.Services.Tasks
existingSeries.NormalizedName = Parser.Parser.Normalize(key);
existingSeries.LocalizedName ??= key;
}
// Now, we only have to deal with series that exist on disk. Let's recalculate the volumes for each series
var librarySeries = library.Series.ToList();
Parallel.ForEach(librarySeries, (series) =>
@ -320,7 +313,25 @@ namespace API.Services.Tasks
private void TrackSeries(ParserInfo info)
{
if (info.Series == string.Empty) return;
// Check if normalized info.Series already exists and if so, update info to use that name instead
var normalizedSeries = Parser.Parser.Normalize(info.Series);
var existingName = _scannedSeries.SingleOrDefault(p => Parser.Parser.Normalize(p.Key) == normalizedSeries)
.Key;
if (!string.IsNullOrEmpty(existingName))
{
_logger.LogInformation("Found duplicate parsed infos, merged {Original} into {Merged}", info.Series, existingName);
info.Series = existingName;
}
// TODO: For all parsedSeries, any infos that contain same series name and IsSpecial is true are combined
// foreach (var series in parsedSeries)
// {
// var seriesName = series.Key;
// if (parsedSeries.ContainsKey(seriesName))
// }
_scannedSeries.AddOrUpdate(info.Series, new List<ParserInfo>() {info}, (_, oldValue) =>
{
oldValue ??= new List<ParserInfo>();

View File

@ -71,6 +71,9 @@ Package()
echo "Copying Install information"
cp ../INSTALL.txt "$lOutputFolder"/README.txt
echo "Copying LICENSE"
cp ../LICENSE "$lOutputFolder"/LICENSE.txt
echo "Renaming API -> Kavita"
mv "$lOutputFolder"/API "$lOutputFolder"/Kavita