mirror of
https://github.com/Kareadita/Kavita.git
synced 2025-07-31 14:33:50 -04:00
Parsing Enhancements (#126)
* More cases for parsing regex * Implemented the ability to parse "Special" keywords. * Commented out some unit tests * More parsing cases * Fixed unit tests * Fixed typo in build script * Fixed a bug where if there was a series with same name, but different capitalization, we wouldn't process it's infos. * Tons of regex updates to handle more cases. * More regex tweaking to handle as many cases as possible. * Bad merge caused the comic parser to break. Fixed with some better regex.
This commit is contained in:
parent
3e031ab458
commit
d9246b7351
@ -55,6 +55,10 @@ namespace API.Tests
|
||||
[InlineData("Kedouin Makoto - Corpse Party Musume, Chapter 12 [Dametrans][v2]", "0")]
|
||||
[InlineData("Vagabond_v03", "3")]
|
||||
[InlineData("Mujaki No Rakune Volume 10.cbz", "10")]
|
||||
[InlineData("Umineko no Naku Koro ni - Episode 3 - Banquet of the Golden Witch #02.cbz", "3")]
|
||||
[InlineData("Volume 12 - Janken Boy is Coming!.cbz", "12")]
|
||||
[InlineData("[dmntsf.net] One Piece - Digital Colored Comics Vol. 20 Ch. 177 - 30 Million vs 81 Million.cbz", "20")]
|
||||
[InlineData("Gantz.V26.cbz", "26")]
|
||||
public void ParseVolumeTest(string filename, string expected)
|
||||
{
|
||||
Assert.Equal(expected, ParseVolume(filename));
|
||||
@ -110,8 +114,18 @@ namespace API.Tests
|
||||
[InlineData("Vagabond_v03", "Vagabond")]
|
||||
[InlineData("[AN] Mahoutsukai to Deshi no Futekisetsu na Kankei Chp. 1", "Mahoutsukai to Deshi no Futekisetsu na Kankei")]
|
||||
[InlineData("Beelzebub_Side_Story_02_RHS.zip", "Beelzebub Side Story")]
|
||||
[InlineData("Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U Chapter 01", "Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U")]
|
||||
[InlineData("[BAA]_Darker_than_Black_Omake-1.zip", "Darker than Black")]
|
||||
[InlineData("Baketeriya ch01-05.zip", "Baketeriya")]
|
||||
[InlineData("[PROzess]Kimi_ha_midara_na_Boku_no_Joou_-_Ch01", "Kimi ha midara na Boku no Joou")]
|
||||
[InlineData("[SugoiSugoi]_NEEDLESS_Vol.2_-_Disk_The_Informant_5_[ENG].rar", "NEEDLESS")]
|
||||
[InlineData("Fullmetal Alchemist chapters 101-108.cbz", "Fullmetal Alchemist")]
|
||||
[InlineData("To Love Ru v09 Uncensored (Ch.071-079).cbz", "To Love Ru")]
|
||||
[InlineData("[dmntsf.net] One Piece - Digital Colored Comics Vol. 20 Ch. 177 - 30 Million vs 81 Million.cbz", "One Piece")]
|
||||
//[InlineData("Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U Extra Chapter", "Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U")]
|
||||
[InlineData("Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U Chapter 01", "Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U")]
|
||||
[InlineData("Vol03_ch15-22.rar", "")]
|
||||
[InlineData("Love Hina - Special.cbz", "")] // This has to be a fallback case
|
||||
[InlineData("Ani-Hina Art Collection.cbz", "")] // This has to be a fallback case
|
||||
public void ParseSeriesTest(string filename, string expected)
|
||||
{
|
||||
Assert.Equal(expected, ParseSeries(filename));
|
||||
@ -157,6 +171,11 @@ namespace API.Tests
|
||||
[InlineData("To Love Ru v18 Uncensored (Ch.153-162.5)", "153-162.5")]
|
||||
[InlineData("[AN] Mahoutsukai to Deshi no Futekisetsu na Kankei Chp. 1", "1")]
|
||||
[InlineData("Beelzebub_Side_Story_02_RHS.zip", "2")]
|
||||
[InlineData("[PROzess]Kimi_ha_midara_na_Boku_no_Joou_-_Ch01", "1")]
|
||||
[InlineData("Fullmetal Alchemist chapters 101-108.cbz", "101-108")]
|
||||
[InlineData("Umineko no Naku Koro ni - Episode 3 - Banquet of the Golden Witch #02.cbz", "2")]
|
||||
[InlineData("To Love Ru v09 Uncensored (Ch.071-079).cbz", "71-79")]
|
||||
[InlineData("Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U Extra Chapter.rar", "0")]
|
||||
public void ParseChaptersTest(string filename, string expected)
|
||||
{
|
||||
Assert.Equal(expected, ParseChapter(filename));
|
||||
@ -211,6 +230,8 @@ namespace API.Tests
|
||||
[InlineData("Wotakoi - Love is Hard for Otaku Omnibus v01 (2018) (Digital) (danke-Empire)", "Omnibus")]
|
||||
[InlineData("To Love Ru v01 Uncensored (Ch.001-007)", "Uncensored")]
|
||||
[InlineData("Chobits Omnibus Edition v01 [Dark Horse]", "Omnibus Edition")]
|
||||
[InlineData("[dmntsf.net] One Piece - Digital Colored Comics Vol. 20 Ch. 177 - 30 Million vs 81 Million.cbz", "Digital Colored Comics")]
|
||||
[InlineData("AKIRA - c003 (v01) [Full Color] [Darkhorse].cbz", "Full Color")]
|
||||
public void ParseEditionTest(string input, string expected)
|
||||
{
|
||||
Assert.Equal(expected, ParseEdition(input));
|
||||
@ -221,6 +242,8 @@ namespace API.Tests
|
||||
[InlineData("Beelzebub_Side_Story_02_RHS.zip", false)]
|
||||
[InlineData("Darker than Black Shikkoku no Hana Special [Simple Scans].zip", true)]
|
||||
[InlineData("Darker than Black Shikkoku no Hana Fanbook Extra [Simple Scans].zip", true)]
|
||||
[InlineData("Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U Extra Chapter", true)]
|
||||
[InlineData("Ani-Hina Art Collection.cbz", true)]
|
||||
public void ParseMangaSpecialTest(string input, bool expected)
|
||||
{
|
||||
Assert.Equal(expected, ParseMangaSpecial(input) != "");
|
||||
|
@ -37,11 +37,14 @@ namespace API.Parser
|
||||
new Regex(
|
||||
@"(volume )(?<Volume>\d+)",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
|
||||
// Tower Of God S01 014 (CBT) (digital).cbz
|
||||
new Regex(
|
||||
@"(?<Series>.*)(\b|_|)(S(?<Volume>\d+))",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// Umineko no Naku Koro ni - Episode 3 - Banquet of the Golden Witch #02.cbz
|
||||
new Regex(
|
||||
@"(?<Series>.*)( |_|-)(?:Episode)(?: |_)(?<Volume>\d+(-\d+)?)",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
|
||||
};
|
||||
|
||||
@ -55,6 +58,10 @@ namespace API.Parser
|
||||
new Regex(
|
||||
@"(?<Series>.*)( - )(?:v|vo|c)\d",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// [dmntsf.net] One Piece - Digital Colored Comics Vol. 20 Ch. 177 - 30 Million vs 81 Million.cbz
|
||||
new Regex(
|
||||
@"(?<Series>.*) (\b|_|-)(vol)\.?",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// Historys Strongest Disciple Kenichi_v11_c90-98.zip, Killing Bites Vol. 0001 Ch. 0001 - Galactica Scanlations (gb)
|
||||
new Regex(
|
||||
@"(?<Series>.*) (\b|_|-)v",
|
||||
@ -96,7 +103,7 @@ namespace API.Parser
|
||||
new Regex(
|
||||
@"(?<Series>.*)( |_)\((c |ch |chapter )",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// Black Bullet (This is very loose, keep towards bottom) (?<Series>.*)(_)(v|vo|c|volume)
|
||||
// Black Bullet (This is very loose, keep towards bottom)
|
||||
new Regex(
|
||||
@"(?<Series>.*)(_)(v|vo|c|volume)( |_)\d+",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
@ -106,15 +113,31 @@ namespace API.Parser
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U Chapter 01
|
||||
new Regex(
|
||||
@"^(?!Vol)(?<Series>.*)( |_)Chapter( |_)(\d+)", // TODO: This is breaking a ton of cases
|
||||
@"^(?!Vol)(?<Series>.*)( |_)Chapter( |_)(\d+)",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// Akiiro Bousou Biyori - 01.jpg, Beelzebub_172_RHS.zip, Cynthia the Mission 29.rar
|
||||
// [SugoiSugoi]_NEEDLESS_Vol.2_-_Disk_The_Informant_5_[ENG].rar
|
||||
new Regex(
|
||||
@"^(?!Vol)(?<Series>.*)( |_|-)(\d+)",
|
||||
@"^(?<Series>.*)( |_)Vol\.?\d+",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// Fullmetal Alchemist chapters 101-108.cbz
|
||||
new Regex(
|
||||
@"^(?!vol)(?<Series>.*)( |_)(chapters( |_)?)\d+-?\d*",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// Baketeriya ch01-05.zip, Akiiro Bousou Biyori - 01.jpg, Beelzebub_172_RHS.zip, Cynthia the Mission 29.rar
|
||||
new Regex(
|
||||
@"^(?!Vol\.?)(?<Series>.*)( |_|-)(?<!-)(ch)?\d+-?\d*", //fails on
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// Baketeriya ch01-05.zip
|
||||
new Regex(
|
||||
@"^(?!Vol)(?<Series>.*)ch\d+-?\d?",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// [BAA]_Darker_than_Black_Omake-1.zip
|
||||
new Regex(
|
||||
@"^(?!Vol)(?<Series>.*)(-)\d+-?\d*", // This catches a lot of stuff ^(?!Vol)(?<Series>.*)( |_)(\d+)
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// [BAA]_Darker_than_Black_c1 (This is very greedy, make sure it's close to last)
|
||||
new Regex(
|
||||
@"(?<Series>.*)( |_|-)(c)\d+",
|
||||
@"^(?!Vol)(?<Series>.*)( |_|-)(ch?)\d+",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
};
|
||||
|
||||
@ -130,7 +153,7 @@ namespace API.Parser
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// Batman & Wildcat (1 of 3)
|
||||
new Regex(
|
||||
@"(?<Series>.*(\d{4})?)( |_)(?:\(\d+ of \d+)",
|
||||
@"(?<Series>.*(\d{4})?)( |_)(?:\((?<Volume>\d+) of \d+)",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// Teen Titans v1 001 (1966-02) (digital) (OkC.O.M.P.U.T.O.-Novus)
|
||||
new Regex(
|
||||
@ -178,11 +201,11 @@ namespace API.Parser
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// Scott Pilgrim 02 - Scott Pilgrim vs. The World (2005)
|
||||
new Regex(
|
||||
@"^(?<Series>.*)(?: |_)(?<Volume>\d+)",
|
||||
@"^(?<Series>.*)(?: |_)(?<!of )(?<Volume>\d+)",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// Batman & Catwoman - Trail of the Gun 01, Batman & Grendel (1996) 01 - Devil's Bones, Teen Titans v1 001 (1966-02) (digital) (OkC.O.M.P.U.T.O.-Novus)
|
||||
new Regex(
|
||||
@"^(?<Series>.*)(?: (?<Volume>\d+))",
|
||||
@"^(?<Series>.*)(?<!of)(?: (?<Volume>\d+))",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// Batman & Robin the Teen Wonder #0
|
||||
new Regex(
|
||||
@ -238,11 +261,14 @@ namespace API.Parser
|
||||
new Regex(
|
||||
@"v\d+\.(?<Chapter>\d+(?:.\d+|-\d+)?)",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// Mob Psycho 100
|
||||
// Umineko no Naku Koro ni - Episode 3 - Banquet of the Golden Witch #02.cbz (Rare case, if causes issue remove)
|
||||
new Regex(
|
||||
@"^(?<Series>.*)(?: |_)#(?<Chapter>\d+)",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
|
||||
// Hinowa ga CRUSH! 018 (2019) (Digital) (LuCaZ).cbz, Hinowa ga CRUSH! 018.5 (2019) (Digital) (LuCaZ).cbz
|
||||
new Regex(
|
||||
@"^(?!Vol)(?<Series>.*) (?<!vol\. )(?<Chapter>\d+(?:.\d+|-\d+)?)(?: \(\d{4}\))?",
|
||||
@"^(?!Vol)(?<Series>.*) (?<!vol\. )(?<Chapter>\d+(?:.\d+|-\d+)?)(?: \(\d{4}\))?(\b|_|-)",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// Tower Of God S01 014 (CBT) (digital).cbz
|
||||
new Regex(
|
||||
@ -256,7 +282,7 @@ namespace API.Parser
|
||||
new Regex(
|
||||
@"Chapter(?<Chapter>\d+(-\d+)?)", //(?:.\d+|-\d+)?
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
|
||||
|
||||
};
|
||||
private static readonly Regex[] MangaEditionRegex = {
|
||||
// Tenjo Tenge {Full Contact Edition} v01 (2011) (Digital) (ASTC).cbz
|
||||
@ -271,6 +297,14 @@ namespace API.Parser
|
||||
new Regex(
|
||||
@"(\b|_)(?<Edition>Uncensored)(\b|_)",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// [dmntsf.net] One Piece - Digital Colored Comics Vol. 20 Ch. 177 - 30 Million vs 81 Million.cbz
|
||||
new Regex(
|
||||
@"(\b|_)(?<Edition>Digital(?: |_)Colored(?: |_)Comics)(\b|_)?",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
// AKIRA - c003 (v01) [Full Color] [Darkhorse].cbz
|
||||
new Regex(
|
||||
@"(\b|_)(?<Edition>Full(?: |_)Color)(\b|_)?",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
};
|
||||
|
||||
private static readonly Regex[] CleanupRegex =
|
||||
@ -293,7 +327,7 @@ namespace API.Parser
|
||||
{
|
||||
// All Keywords, does not account for checking if contains volume/chapter identification. Parser.Parse() will handle.
|
||||
new Regex(
|
||||
@"(?<Special>Special|OneShot|One\-Shot|Omake|Extra)",
|
||||
@"(?<Special>Specials?|OneShot|One\-Shot|Omake|Extra( Chapter)?|Art Collection)",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled),
|
||||
};
|
||||
|
||||
@ -430,7 +464,7 @@ namespace API.Parser
|
||||
var matches = regex.Matches(filename);
|
||||
foreach (Match match in matches)
|
||||
{
|
||||
if (match.Groups["Volume"] == Match.Empty) continue;
|
||||
if (!match.Groups["Volume"].Success || match.Groups["Volume"] == Match.Empty) continue;
|
||||
|
||||
var value = match.Groups["Volume"].Value;
|
||||
if (!value.Contains("-")) return RemoveLeadingZeroes(match.Groups["Volume"].Value);
|
||||
@ -452,7 +486,7 @@ namespace API.Parser
|
||||
var matches = regex.Matches(filename);
|
||||
foreach (Match match in matches)
|
||||
{
|
||||
if (match.Groups["Volume"] == Match.Empty) continue;
|
||||
if (!match.Groups["Volume"].Success || match.Groups["Volume"] == Match.Empty) continue;
|
||||
|
||||
var value = match.Groups["Volume"].Value;
|
||||
if (!value.Contains("-")) return RemoveLeadingZeroes(match.Groups["Volume"].Value);
|
||||
@ -474,20 +508,16 @@ namespace API.Parser
|
||||
var matches = regex.Matches(filename);
|
||||
foreach (Match match in matches)
|
||||
{
|
||||
if (match.Groups["Chapter"] != Match.Empty)
|
||||
{
|
||||
var value = match.Groups["Chapter"].Value;
|
||||
if (!match.Groups["Chapter"].Success || match.Groups["Chapter"] == Match.Empty) continue;
|
||||
|
||||
var value = match.Groups["Chapter"].Value;
|
||||
|
||||
if (value.Contains("-"))
|
||||
{
|
||||
var tokens = value.Split("-");
|
||||
var from = RemoveLeadingZeroes(tokens[0]);
|
||||
var to = RemoveLeadingZeroes(tokens[1]);
|
||||
return $"{from}-{to}";
|
||||
}
|
||||
|
||||
return RemoveLeadingZeroes(match.Groups["Chapter"].Value);
|
||||
}
|
||||
if (!value.Contains("-")) return RemoveLeadingZeroes(match.Groups["Chapter"].Value);
|
||||
|
||||
var tokens = value.Split("-");
|
||||
var from = RemoveLeadingZeroes(tokens[0]);
|
||||
var to = RemoveLeadingZeroes(tokens[1]);
|
||||
return $"{@from}-{to}";
|
||||
|
||||
}
|
||||
}
|
||||
@ -502,7 +532,7 @@ namespace API.Parser
|
||||
var matches = regex.Matches(filename);
|
||||
foreach (Match match in matches)
|
||||
{
|
||||
if (match.Groups["Chapter"] != Match.Empty)
|
||||
if (match.Groups["Chapter"].Success && match.Groups["Chapter"] != Match.Empty)
|
||||
{
|
||||
var value = match.Groups["Chapter"].Value;
|
||||
|
||||
@ -536,6 +566,18 @@ namespace API.Parser
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
foreach (var regex in MangaEditionRegex)
|
||||
{
|
||||
var matches = regex.Matches(title);
|
||||
foreach (Match match in matches)
|
||||
{
|
||||
if (match.Success)
|
||||
{
|
||||
title = title.Replace(match.Value, "");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return title;
|
||||
}
|
||||
|
@ -148,13 +148,6 @@ namespace API.Services.Tasks
|
||||
{
|
||||
if (parsedSeries == null) throw new ArgumentNullException(nameof(parsedSeries));
|
||||
|
||||
// For all parsedSeries, any infos that contain same series name and IsSpecial is true are combined
|
||||
// foreach (var series in parsedSeries)
|
||||
// {
|
||||
// var seriesName = series.Key;
|
||||
// if (parsedSeries.ContainsKey(seriesName))
|
||||
// }
|
||||
|
||||
// First, remove any series that are not in parsedSeries list
|
||||
var foundSeries = parsedSeries.Select(s => Parser.Parser.Normalize(s.Key)).ToList();
|
||||
var missingSeries = library.Series.Where(existingSeries =>
|
||||
@ -190,7 +183,7 @@ namespace API.Services.Tasks
|
||||
existingSeries.NormalizedName = Parser.Parser.Normalize(key);
|
||||
existingSeries.LocalizedName ??= key;
|
||||
}
|
||||
|
||||
|
||||
// Now, we only have to deal with series that exist on disk. Let's recalculate the volumes for each series
|
||||
var librarySeries = library.Series.ToList();
|
||||
Parallel.ForEach(librarySeries, (series) =>
|
||||
@ -320,7 +313,25 @@ namespace API.Services.Tasks
|
||||
private void TrackSeries(ParserInfo info)
|
||||
{
|
||||
if (info.Series == string.Empty) return;
|
||||
|
||||
|
||||
// Check if normalized info.Series already exists and if so, update info to use that name instead
|
||||
var normalizedSeries = Parser.Parser.Normalize(info.Series);
|
||||
var existingName = _scannedSeries.SingleOrDefault(p => Parser.Parser.Normalize(p.Key) == normalizedSeries)
|
||||
.Key;
|
||||
if (!string.IsNullOrEmpty(existingName))
|
||||
{
|
||||
_logger.LogInformation("Found duplicate parsed infos, merged {Original} into {Merged}", info.Series, existingName);
|
||||
info.Series = existingName;
|
||||
}
|
||||
|
||||
// TODO: For all parsedSeries, any infos that contain same series name and IsSpecial is true are combined
|
||||
// foreach (var series in parsedSeries)
|
||||
// {
|
||||
// var seriesName = series.Key;
|
||||
// if (parsedSeries.ContainsKey(seriesName))
|
||||
// }
|
||||
|
||||
|
||||
_scannedSeries.AddOrUpdate(info.Series, new List<ParserInfo>() {info}, (_, oldValue) =>
|
||||
{
|
||||
oldValue ??= new List<ParserInfo>();
|
||||
|
Loading…
x
Reference in New Issue
Block a user