Lots of Parsing Enhancements (#120)

* More cases for parsing regex

* Implemented the ability to parse "Special" keywords.

* Commented out some unit tests

* More parsing cases

* Fixed unit tests

* Fixed typo in build script
This commit is contained in:
Joseph Milazzo 2021-03-28 18:00:05 -05:00 committed by GitHub
parent 7e54d332f5
commit 3e031ab458
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 122 additions and 21 deletions

View File

@ -53,6 +53,8 @@ namespace API.Tests
[InlineData("Kodomo no Jikan vol. 1.cbz", "1")]
[InlineData("Kodomo no Jikan vol. 10.cbz", "10")]
[InlineData("Kedouin Makoto - Corpse Party Musume, Chapter 12 [Dametrans][v2]", "0")]
[InlineData("Vagabond_v03", "3")]
[InlineData("Mujaki No Rakune Volume 10.cbz", "10")]
public void ParseVolumeTest(string filename, string expected)
{
Assert.Equal(expected, ParseVolume(filename));
@ -105,6 +107,11 @@ namespace API.Tests
[InlineData("Goblin Slayer Side Story - Year One 025.5", "Goblin Slayer Side Story - Year One")]
[InlineData("Goblin Slayer - Brand New Day 006.5 (2019) (Digital) (danke-Empire)", "Goblin Slayer - Brand New Day")]
[InlineData("Kedouin Makoto - Corpse Party Musume, Chapter 01 [Dametrans][v2]", "Kedouin Makoto - Corpse Party Musume")]
[InlineData("Vagabond_v03", "Vagabond")]
[InlineData("[AN] Mahoutsukai to Deshi no Futekisetsu na Kankei Chp. 1", "Mahoutsukai to Deshi no Futekisetsu na Kankei")]
[InlineData("Beelzebub_Side_Story_02_RHS.zip", "Beelzebub Side Story")]
[InlineData("Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U Chapter 01", "Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U")]
[InlineData("[BAA]_Darker_than_Black_Omake-1.zip", "Darker than Black")]
public void ParseSeriesTest(string filename, string expected)
{
Assert.Equal(expected, ParseSeries(filename));
@ -146,6 +153,10 @@ namespace API.Tests
[InlineData("VanDread-v01-c001[MD].zip", "1")]
[InlineData("Goblin Slayer Side Story - Year One 025.5", "25.5")]
[InlineData("Kedouin Makoto - Corpse Party Musume, Chapter 01", "1")]
[InlineData("To Love Ru v11 Uncensored (Ch.089-097+Omake)", "89-97")]
[InlineData("To Love Ru v18 Uncensored (Ch.153-162.5)", "153-162.5")]
[InlineData("[AN] Mahoutsukai to Deshi no Futekisetsu na Kankei Chp. 1", "1")]
[InlineData("Beelzebub_Side_Story_02_RHS.zip", "2")]
public void ParseChaptersTest(string filename, string expected)
{
Assert.Equal(expected, ParseChapter(filename));
@ -197,10 +208,23 @@ namespace API.Tests
[InlineData("Tenjou Tenge Omnibus", "Omnibus")]
[InlineData("Tenjou Tenge {Full Contact Edition}", "Full Contact Edition")]
[InlineData("Tenjo Tenge {Full Contact Edition} v01 (2011) (Digital) (ASTC).cbz", "Full Contact Edition")]
[InlineData("Wotakoi - Love is Hard for Otaku Omnibus v01 (2018) (Digital) (danke-Empire)", "Omnibus")]
[InlineData("To Love Ru v01 Uncensored (Ch.001-007)", "Uncensored")]
[InlineData("Chobits Omnibus Edition v01 [Dark Horse]", "Omnibus Edition")]
public void ParseEditionTest(string input, string expected)
{
Assert.Equal(expected, ParseEdition(input));
}
[Theory]
[InlineData("Beelzebub Special OneShot - Minna no Kochikame x Beelzebub (2016) [Mangastream].cbz", true)]
[InlineData("Beelzebub_Omake_June_2012_RHS", true)]
[InlineData("Beelzebub_Side_Story_02_RHS.zip", false)]
[InlineData("Darker than Black Shikkoku no Hana Special [Simple Scans].zip", true)]
[InlineData("Darker than Black Shikkoku no Hana Fanbook Extra [Simple Scans].zip", true)]
public void ParseMangaSpecialTest(string input, bool expected)
{
Assert.Equal(expected, ParseMangaSpecial(input) != "");
}
[Theory]
[InlineData("12-14", 12)]
@ -236,6 +260,7 @@ namespace API.Tests
[InlineData("Scott Pilgrim 01 - Scott Pilgrim's Precious Little Life (2004)", "Scott Pilgrim")]
[InlineData("Teen Titans v1 001 (1966-02) (digital) (OkC.O.M.P.U.T.O.-Novus)", "Teen Titans")]
[InlineData("Scott Pilgrim 02 - Scott Pilgrim vs. The World (2005)", "Scott Pilgrim")]
[InlineData("Wolverine - Origins 003 (2006) (digital) (Minutemen-PhD)", "Wolverine - Origins")]
public void ParseComicSeriesTest(string filename, string expected)
{
Assert.Equal(expected, ParseComicSeries(filename));

BIN
API/Data/kavita.db Normal file

Binary file not shown.

View File

@ -14,8 +14,7 @@ namespace API.Parser
private static readonly Regex ImageRegex = new Regex(ImageFileExtensions, RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex MangaFileRegex = new Regex(MangaFileExtensions, RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex XmlRegex = new Regex(XmlRegexExtensions, RegexOptions.IgnoreCase | RegexOptions.Compiled);
//?: is a non-capturing group in C#, else anything in () will be a group
private static readonly Regex[] MangaVolumeRegex = new[]
{
// Dance in the Vampire Bund v16-17
@ -32,11 +31,11 @@ namespace API.Parser
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Killing Bites Vol. 0001 Ch. 0001 - Galactica Scanlations (gb)
new Regex(
@"(vol\.? ?)(?<Volume>0*[1-9]+)",
@"(vol\.? ?)(?<Volume>\d+)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Tonikaku Cawaii [Volume 11].cbz
new Regex(
@"(volume )(?<Volume>0?[1-9]+)",
@"(volume )(?<Volume>\d+)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Tower Of God S01 014 (CBT) (digital).cbz
@ -101,13 +100,21 @@ namespace API.Parser
new Regex(
@"(?<Series>.*)(_)(v|vo|c|volume)( |_)\d+",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Mahoutsukai to Deshi no Futekisetsu na Kankei Chp. 1
new Regex(
@"(?<Series>.*)( |_)(?:Chp.? ?\d+)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U Chapter 01
new Regex(
@"^(?!Vol)(?<Series>.*)( |_)Chapter( |_)(\d+)", // TODO: This is breaking a ton of cases
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Akiiro Bousou Biyori - 01.jpg, Beelzebub_172_RHS.zip, Cynthia the Mission 29.rar
new Regex(
@"^(?!Vol)(?<Series>.*)( |_)(\d+)",
@"^(?!Vol)(?<Series>.*)( |_|-)(\d+)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// [BAA]_Darker_than_Black_c1 (This is very greedy, make sure it's close to last)
new Regex(
@"(?<Series>.*)( |_)(c)\d+",
@"(?<Series>.*)( |_|-)(c)\d+",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
};
@ -223,8 +230,9 @@ namespace API.Parser
private static readonly Regex[] MangaChapterRegex = new[]
{
// Historys Strongest Disciple Kenichi_v11_c90-98.zip, ...c90.5-100.5
new Regex(
@"(c|ch)(\.? ?)(?<Chapter>\d+(?:.\d+|-\d+)?)",
@"(c|ch)(\.? ?)(?<Chapter>(\d+(\.\d)?)-?(\d+(\.\d)?)?)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// [Suihei Kiki]_Kasumi_Otoko_no_Ko_[Taruby]_v1.1.zip
new Regex(
@ -251,13 +259,17 @@ namespace API.Parser
};
private static readonly Regex[] MangaEditionRegex = {
//Tenjo Tenge {Full Contact Edition} v01 (2011) (Digital) (ASTC).cbz
// Tenjo Tenge {Full Contact Edition} v01 (2011) (Digital) (ASTC).cbz
new Regex(
@"(?<Edition>({|\(|\[).* Edition(}|\)|\]))",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Tenjo Tenge {Full Contact Edition} v01 (2011) (Digital) (ASTC).cbz
// Tenjo Tenge {Full Contact Edition} v01 (2011) (Digital) (ASTC).cbz
new Regex(
@"(\b|_)(?<Edition>Omnibus)(\b|_)",
@"(\b|_)(?<Edition>Omnibus(( |_)?Edition)?)(\b|_)?",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// To Love Ru v01 Uncensored (Ch.001-007)
new Regex(
@"(\b|_)(?<Edition>Uncensored)(\b|_)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
};
@ -277,6 +289,14 @@ namespace API.Parser
RegexOptions.IgnoreCase | RegexOptions.Compiled),
};
private static readonly Regex[] MangaSpecialRegex =
{
// All Keywords, does not account for checking if contains volume/chapter identification. Parser.Parse() will handle.
new Regex(
@"(?<Special>Special|OneShot|One\-Shot|Omake|Extra)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
};
/// <summary>
/// Parses information out of a file path. Will fallback to using directory name if Series couldn't be parsed
@ -314,6 +334,13 @@ namespace API.Parser
ret.Series = CleanTitle(ret.Series.Replace(edition, ""));
ret.Edition = edition;
}
var isSpecial = ParseMangaSpecial(fileName);
if (ret.Chapters == "0" && ret.Volumes == "0" && !string.IsNullOrEmpty(isSpecial))
{
ret.IsSpecial = true;
}
return ret.Series == string.Empty ? null : ret;
@ -346,6 +373,23 @@ namespace API.Parser
return string.Empty;
}
public static string ParseMangaSpecial(string filePath)
{
foreach (var regex in MangaSpecialRegex)
{
var matches = regex.Matches(filePath);
foreach (Match match in matches)
{
if (match.Groups["Special"].Success && match.Groups["Special"].Value != string.Empty)
{
return match.Groups["Special"].Value;
}
}
}
return string.Empty;
}
public static string ParseSeries(string filename)
{
foreach (var regex in MangaSeriesRegex)
@ -496,6 +540,25 @@ namespace API.Parser
return title;
}
private static string RemoveSpecialTags(string title)
{
foreach (var regex in MangaSpecialRegex)
{
var matches = regex.Matches(title);
foreach (Match match in matches)
{
if (match.Success)
{
title = title.Replace(match.Value, "");
}
}
}
return title;
}
/// <summary>
/// Translates _ -> spaces, trims front and back of string, removes release groups
/// </summary>
@ -507,6 +570,8 @@ namespace API.Parser
title = RemoveEditionTagHolders(title);
title = RemoveSpecialTags(title);
title = title.Replace("_", " ").Trim();
if (title.EndsWith("-"))
{

View File

@ -24,5 +24,10 @@ namespace API.Parser
/// This can potentially story things like "Omnibus, Color, Full Contact Edition, Extra, Final, etc"
/// </summary>
public string Edition { get; set; } = "";
/// <summary>
/// If the file contains no volume/chapter information and contains Special Keywords <see cref="Parser.MangaSpecialRegex"/>
/// </summary>
public bool IsSpecial { get; set; } = false;
}
}

View File

@ -49,15 +49,15 @@ namespace API.Services.Tasks
{
// NOTE: This solution isn't the best, but it has potential. We need to handle a few other cases so it works great.
return false;
// if (/*_environment.IsProduction() && */!_forceUpdate && Directory.GetLastWriteTime(folder.Path) < folder.LastScanned)
// if (!_forceUpdate && Directory.GetLastWriteTime(folder.Path) < folder.LastScanned)
// {
// _logger.LogDebug($"{folder.Path} hasn't been updated since last scan. Skipping.");
// _logger.LogDebug("{FolderPath} hasn't been modified since last scan. Skipping", folder.Path);
// skippedFolders += 1;
// return true;
// }
//
// return false;
//return false;
}
private void Cleanup()
@ -134,7 +134,6 @@ namespace API.Services.Tasks
if (Task.Run(() => _unitOfWork.Complete()).Result)
{
_logger.LogInformation("Scan completed on {LibraryName}. Parsed {ParsedSeriesCount} series in {ElapsedScanTime} ms", library.Name, series.Keys.Count, sw.ElapsedMilliseconds);
}
else
@ -149,6 +148,13 @@ namespace API.Services.Tasks
{
if (parsedSeries == null) throw new ArgumentNullException(nameof(parsedSeries));
// For all parsedSeries, any infos that contain same series name and IsSpecial is true are combined
// foreach (var series in parsedSeries)
// {
// var seriesName = series.Key;
// if (parsedSeries.ContainsKey(seriesName))
// }
// First, remove any series that are not in parsedSeries list
var foundSeries = parsedSeries.Select(s => Parser.Parser.Normalize(s.Key)).ToList();
var missingSeries = library.Series.Where(existingSeries =>
@ -222,7 +228,7 @@ namespace API.Services.Tasks
series.Volumes.Add(volume);
}
volume.IsSpecial = volume.Number == 0 && infos.All(p => p.Chapters == "0");
volume.IsSpecial = volume.Number == 0 && infos.All(p => p.Chapters == "0" || p.IsSpecial);
_logger.LogDebug("Parsing {SeriesName} - Volume {VolumeNumber}", series.Name, volume.Name);
UpdateChapters(volume, infos);
volume.Pages = volume.Chapters.Sum(c => c.Pages);
@ -314,7 +320,7 @@ namespace API.Services.Tasks
private void TrackSeries(ParserInfo info)
{
if (info.Series == string.Empty) return;
_scannedSeries.AddOrUpdate(info.Series, new List<ParserInfo>() {info}, (_, oldValue) =>
{
oldValue ??= new List<ParserInfo>();

View File

@ -136,7 +136,7 @@ namespace API
applicationLifetime.ApplicationStopping.Register(OnShutdown);
applicationLifetime.ApplicationStarted.Register(() =>
{
Console.WriteLine("Kavita - v0.3.5");
Console.WriteLine("Kavita - v0.3.6");
});
}

View File

@ -65,8 +65,8 @@ Package()
# TODO: Use no-restore? Because Build should have already done it for us
echo "Building"
cd API
echo dotnet publish -c release --self-contained --runtime $runtime -o "$lOutputFolder" --framework $framework
dotnet publish -c release --self-contained --runtime $runtime -o "$lOutputFolder" --framework $framework
echo dotnet publish -c Release --self-contained --runtime $runtime -o "$lOutputFolder" --framework $framework
dotnet publish -c Release --self-contained --runtime $runtime -o "$lOutputFolder" --framework $framework
echo "Copying Install information"
cp ../INSTALL.txt "$lOutputFolder"/README.txt