using System;
using System.Collections.Immutable;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using API.Entities.Enums;

namespace API.Services.Tasks.Scanner.Parser;

/// <summary>
/// Filename parsing helpers: regex tables plus the static methods that apply them to extract
/// series, volume and chapter information and to classify files by extension.
/// </summary>
public static class Parser
{
    /// <summary>Value returned by the chapter parsers when no chapter could be extracted.</summary>
    public const string DefaultChapter = "0";

    /// <summary>Value returned by the volume parsers when no volume could be extracted.</summary>
    public const string DefaultVolume = "0";

    /// <summary>Match timeout applied to every regex built in this class (500 ms).</summary>
    public static readonly TimeSpan RegexTimeout = TimeSpan.FromMilliseconds(500);

    /// <summary>Alternation of image extensions, anchored at the start of the input only.</summary>
    public const string ImageFileExtensions = @"^(\.png|\.jpeg|\.jpg|\.webp|\.gif|\.avif)";

    /// <summary>Unanchored alternation of archive extensions.</summary>
    // NOTE(review): the dot inside "\.tar.gz" is unescaped, and Path.GetExtension yields ".gz" for
    // "x.tar.gz", so that alternative cannot match when the pattern is applied to an extension only.
    public const string ArchiveFileExtensions = @"\.cbz|\.zip|\.rar|\.cbr|\.tar.gz|\.7zip|\.7z|\.cb7|\.cbt";

    private const string BookFileExtensions = @"\.epub|\.pdf";
    private const string XmlRegexExtensions = @"\.xml";

    /// <summary>Filename prefix of macOS metadata ("AppleDouble") files.</summary>
    public const string MacOsMetadataFileStartsWith = @"._";

    /// <summary>Archive, image and book extension patterns joined into a single alternation.</summary>
    public const string SupportedExtensions = ArchiveFileExtensions + "|" + ImageFileExtensions + "|" + BookFileExtensions;

    /// <summary>Options shared by every regex in this class.</summary>
    private const RegexOptions MatchOptions = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.CultureInvariant;

    /// <summary>ComicInfo format values that are looked up (exact match) by <c>HasComicInfoSpecial</c>.</summary>
    // The element type must be spelled out: the non-generic ImmutableArray is a static factory class
    // and cannot be used as a field type.
    private static readonly ImmutableArray<string> FormatTagSpecialKeywords = ImmutableArray.Create(
        "Special", "Reference", "Director's Cut", "Box Set", "Box-Set", "Annual", "Anthology", "Epilogue",
        "One Shot", "One-Shot", "Prologue", "TPB", "Trade Paper Back", "Omnibus", "Compendium", "Absolute",
        "Graphic Novel", "GN", "FCBD");

    /// <summary>Characters stripped from the front of a number by <c>RemoveLeadingZeroes</c>.</summary>
    private static readonly char[] LeadingZeroesTrimChars = new[] { '0' };

    /// <summary>Characters trimmed from both ends of a title by <c>CleanTitle</c>.</summary>
    private static readonly char[] SpacesAndSeparators = { '\0', '\t', '\r', ' ', '-', ','};

    /// <summary>An integer with an optional single decimal digit, e.g. 12 or 12.5.</summary>
    private const string Number = @"\d+(\.\d)?";

    /// <summary>A <see cref="Number"/> optionally followed by "-" and a second <see cref="Number"/>.</summary>
    private const string NumberRange = Number + @"(-" + Number + @")?";

    /// <summary>
    /// non greedy matching of a string where parenthesis are balanced
    /// </summary>
    // Balancing groups: "(?<open>\()" pushes on every "(", "(?<-open>\))" pops on every ")",
    // and "(?(open)(?!))" fails the match if anything is still left on the stack.
    public const string BalancedParen = @"(?:[^()]|(?<open>\()|(?<-open>\)))*?(?(open)(?!))";

    /// <summary>
    /// non greedy matching of a string where square brackets are balanced
    /// </summary>
    // Same push/pop scheme as BalancedParen, applied to "[" and "]".
    public const string BalancedBracket = @"(?:[^\[\]]|(?<open>\[)|(?<-open>\]))*?(?(open)(?!))";

    /// <summary>
    /// Matches [Complete], release tags like [kmts] but not [ Complete ] or [kmts ]
    /// </summary>
private const string TagsInBrackets = $@"\[(?!\s){BalancedBracket}(? /// Common regex patterns present in both Comics and Mangas /// private const string CommonSpecial = @"Specials?|One[- ]?Shot|Extra(?:\sChapter)?(?=\s)|Art Collection|Side Stories|Bonus"; /// /// Matches against font-family css syntax. Does not match if url import has data: starting, as that is binary data /// /// See here for some examples https://developer.mozilla.org/en-US/docs/Web/CSS/@font-face public static readonly Regex FontSrcUrlRegex = new Regex(@"(?(?:src:\s?)?(?:url|local)\((?!data:)" + "(?:[\"']?)" + @"(?!data:))" + "(?(?!data:)[^\"']+?)" + "(?[\"']?" + @"\);?)", MatchOptions, RegexTimeout); /// /// https://developer.mozilla.org/en-US/docs/Web/CSS/@import /// public static readonly Regex CssImportUrlRegex = new Regex("(@import\\s([\"|']|url\\([\"|']))(?[^'\"]+)([\"|']\\)?);", MatchOptions | RegexOptions.Multiline, RegexTimeout); /// /// Misc css image references, like background-image: url(), border-image, or list-style-image /// /// Original prepend: (background|border|list-style)-image:\s?)? public static readonly Regex CssImageUrlRegex = new Regex(@"(url\((?!data:).(?!data:))" + "(?(?!data:)[^\"']*)" + @"(.\))", MatchOptions, RegexTimeout); private static readonly Regex ImageRegex = new Regex(ImageFileExtensions, MatchOptions, RegexTimeout); private static readonly Regex ArchiveFileRegex = new Regex(ArchiveFileExtensions, MatchOptions, RegexTimeout); private static readonly Regex ComicInfoArchiveRegex = new Regex(@"\.cbz|\.cbr|\.cb7|\.cbt", MatchOptions, RegexTimeout); private static readonly Regex XmlRegex = new Regex(XmlRegexExtensions, MatchOptions, RegexTimeout); private static readonly Regex BookFileRegex = new Regex(BookFileExtensions, MatchOptions, RegexTimeout); private static readonly Regex CoverImageRegex = new Regex(@"(? 
/// Recognizes the Special token only /// private static readonly Regex SpecialTokenRegex = new Regex(@"SP\d+", MatchOptions, RegexTimeout); private static readonly Regex[] MangaVolumeRegex = new[] { // Dance in the Vampire Bund v16-17 new Regex( @"(?.*)(\b|_)v(?\d+-?\d+)( |_)", MatchOptions, RegexTimeout), // NEEDLESS_Vol.4_-Simeon_6_v2[SugoiSugoi].rar new Regex( @"(?.*)(\b|_)(?!\[)(vol\.?)(?\d+(-\d+)?)(?!\])", MatchOptions, RegexTimeout), // Historys Strongest Disciple Kenichi_v11_c90-98.zip or Dance in the Vampire Bund v16-17 new Regex( @"(?.*)(\b|_)(?!\[)v(?" + NumberRange + @")(?!\])", MatchOptions, RegexTimeout), // Kodomo no Jikan vol. 10, [dmntsf.net] One Piece - Digital Colored Comics Vol. 20.5-21.5 Ch. 177 new Regex( @"(?.*)(\b|_)(vol\.? ?)(?\d+(\.\d)?(-\d+)?(\.\d)?)", MatchOptions, RegexTimeout), // Killing Bites Vol. 0001 Ch. 0001 - Galactica Scanlations (gb) new Regex( @"(vol\.? ?)(?\d+(\.\d)?)", MatchOptions, RegexTimeout), // Tonikaku Cawaii [Volume 11].cbz new Regex( @"(volume )(?\d+(\.\d)?)", MatchOptions, RegexTimeout), // Tower Of God S01 014 (CBT) (digital).cbz new Regex( @"(?.*)(\b|_|)(S(?\d+))", MatchOptions, RegexTimeout), // vol_001-1.cbz for MangaPy default naming convention new Regex( @"(vol_)(?\d+(\.\d)?)", MatchOptions, RegexTimeout), // Chinese Volume: 第n卷 -> Volume n, 第n册 -> Volume n, 幽游白书完全版 第03卷 天下 or 阿衰online 第1册 new Regex( @"第(?\d+)(卷|册)", MatchOptions, RegexTimeout), // Chinese Volume: 卷n -> Volume n, 册n -> Volume n new Regex( @"(卷|册)(?\d+)", MatchOptions, RegexTimeout), // Korean Volume: 제n화|권|회|장 -> Volume n, n화|권|회|장 -> Volume n, 63권#200.zip -> Volume 63 (no chapter, #200 is just files inside) new Regex( @"제?(?\d+(\.\d)?)(권|회|화|장)", MatchOptions, RegexTimeout), // Korean Season: 시즌n -> Season n, new Regex( @"시즌(?\d+\-?\d+)", MatchOptions, RegexTimeout), // Korean Season: 시즌n -> Season n, n시즌 -> season n new Regex( @"(?\d+(\-|~)?\d+?)시즌", MatchOptions, RegexTimeout), // Korean Season: 시즌n -> Season n, n시즌 -> season n new Regex( 
@"시즌(?\d+(\-|~)?\d+?)", MatchOptions, RegexTimeout), // Japanese Volume: n巻 -> Volume n new Regex( @"(?\d+(?:(\-)\d+)?)巻", MatchOptions, RegexTimeout), // Russian Volume: Том n -> Volume n, Тома n -> Volume new Regex( @"Том(а?)(\.?)(\s|_)?(?\d+(?:(\-)\d+)?)", MatchOptions, RegexTimeout), // Russian Volume: n Том -> Volume n new Regex( @"(\s|_)?(?\d+(?:(\-)\d+)?)(\s|_)Том(а?)", MatchOptions, RegexTimeout), }; private static readonly Regex[] MangaSeriesRegex = new[] { // Russian Volume: Том n -> Volume n, Тома n -> Volume new Regex( @"(?.+?)Том(а?)(\.?)(\s|_)?(?\d+(?:(\-)\d+)?)", MatchOptions, RegexTimeout), // Russian Volume: n Том -> Volume n new Regex( @"(?.+?)(\s|_)?(?\d+(?:(\-)\d+)?)(\s|_)Том(а?)", MatchOptions, RegexTimeout), // Russian Chapter: n Главa -> Chapter n new Regex( @"(?.+?)(?!Том)(?\d+(?:\.\d+|-\d+)?)(\s|_)(Глава|глава|Главы|Глава)", MatchOptions, RegexTimeout), // Russian Chapter: Главы n -> Chapter n new Regex( @"(?.+?)(Глава|глава|Главы|Глава)(\.?)(\s|_)?(?\d+(?:.\d+|-\d+)?)", MatchOptions, RegexTimeout), // Grand Blue Dreaming - SP02 new Regex( @"(?.*)(\b|_|-|\s)(?:sp)\d", MatchOptions, RegexTimeout), // [SugoiSugoi]_NEEDLESS_Vol.2_-_Disk_The_Informant_5_[ENG].rar, Yuusha Ga Shinda! - Vol.tbd Chapter 27.001 V2 Infection ①.cbz new Regex( @"^(?.*)( |_)Vol\.?(\d+|tbd)", MatchOptions, RegexTimeout), // Mad Chimera World - Volume 005 - Chapter 026.cbz (couldn't figure out how to get Volume negative lookaround working on below regex), // The Duke of Death and His Black Maid - Vol. 04 Ch. 
054.5 - V4 Omake new Regex( @"(?.+?)(\s|_|-)+(?:Vol(ume|\.)?(\s|_|-)+\d+)(\s|_|-)+(?:(Ch|Chapter|Ch)\.?)(\s|_|-)+(?\d+)", MatchOptions, RegexTimeout), // Ichiban_Ushiro_no_Daimaou_v04_ch34_[VISCANS].zip, VanDread-v01-c01.zip new Regex( @"(?.*)(\b|_)v(?\d+-?\d*)(\s|_|-)", MatchOptions, RegexTimeout), // Gokukoku no Brynhildr - c001-008 (v01) [TrinityBAKumA], Black Bullet - v4 c17 [batoto] new Regex( @"(?.*)( - )(?:v|vo|c|chapters)\d", MatchOptions, RegexTimeout), // Kedouin Makoto - Corpse Party Musume, Chapter 19 [Dametrans].zip new Regex( @"(?.*)(?:, Chapter )(?\d+)", MatchOptions, RegexTimeout), // Please Go Home, Akutsu-San! - Chapter 038.5 - Volume Announcement.cbz, My Charms Are Wasted on Kuroiwa Medaka - Ch. 37.5 - Volume Extras new Regex( @"(?.+?)(\s|_|-)(?!Vol)(\s|_|-)((?:Chapter)|(?:Ch\.))(\s|_|-)(?\d+)", MatchOptions, RegexTimeout), // [dmntsf.net] One Piece - Digital Colored Comics Vol. 20 Ch. 177 - 30 Million vs 81 Million.cbz new Regex( @"(?.+?):? (\b|_|-)(vol)\.?(\s|-|_)?\d+", MatchOptions, RegexTimeout), // [xPearse] Kyochuu Rettou Chapter 001 Volume 1 [English] [Manga] [Volume Scans] new Regex( @"(?.+?):?(\s|\b|_|-)Chapter(\s|\b|_|-)\d+(\s|\b|_|-)(vol)(ume)", MatchOptions, RegexTimeout), // [xPearse] Kyochuu Rettou Volume 1 [English] [Manga] [Volume Scans] new Regex( @"(?.+?):? (\b|_|-)(vol)(ume)", MatchOptions, RegexTimeout), //Knights of Sidonia c000 (S2 LE BD Omake - BLAME!) [Habanero Scans] new Regex( @"(?.*)(\bc\d+\b)", MatchOptions, RegexTimeout), //Tonikaku Cawaii [Volume 11], Darling in the FranXX - Volume 01.cbz new Regex( @"(?.*)(?: _|-|\[|\()\s?vol(ume)?", MatchOptions, RegexTimeout), // Momo The Blood Taker - Chapter 027 Violent Emotion.cbz, Grand Blue Dreaming - SP02 Extra (2019) (Digital) (danke-Empire).cbz new Regex( @"^(?(?!Vol).+?)(?:(ch(apter|\.)(\b|_|-|\s))|sp)\d", MatchOptions, RegexTimeout), // Historys Strongest Disciple Kenichi_v11_c90-98.zip, Killing Bites Vol. 0001 Ch. 
0001 - Galactica Scanlations (gb) new Regex( @"(?.*) (\b|_|-)(v|ch\.?|c|s)\d+", MatchOptions, RegexTimeout), // Hinowa ga CRUSH! 018 (2019) (Digital) (LuCaZ).cbz new Regex( @"(?.*)\s+(?\d+)\s+(?:\(\d{4}\))\s", MatchOptions, RegexTimeout), // Goblin Slayer - Brand New Day 006.5 (2019) (Digital) (danke-Empire) new Regex( @"(?.*) (-)?(?\d+(?:.\d+|-\d+)?) \(\d{4}\)", MatchOptions, RegexTimeout), // Noblesse - Episode 429 (74 Pages).7z new Regex( @"(?.*)(\s|_)(?:Episode|Ep\.?)(\s|_)(?\d+(?:.\d+|-\d+)?)", MatchOptions, RegexTimeout), // Akame ga KILL! ZERO (2016-2019) (Digital) (LuCaZ) new Regex( @"(?.*)\(\d", MatchOptions, RegexTimeout), // Tonikaku Kawaii (Ch 59-67) (Ongoing) new Regex( @"(?.*)(\s|_)\((c\s|ch\s|chapter\s)", MatchOptions, RegexTimeout), // Fullmetal Alchemist chapters 101-108 new Regex( @"(?.+?)(\s|_|\-)+?chapters(\s|_|\-)+?\d+(\s|_|\-)+?", MatchOptions, RegexTimeout), // It's Witching Time! 001 (Digital) (Anonymous1234) new Regex( @"(?.+?)(\s|_|\-)+?\d+(\s|_|\-)\(", MatchOptions, RegexTimeout), //Ichinensei_ni_Nacchattara_v01_ch01_[Taruby]_v1.1.zip must be before [Suihei Kiki]_Kasumi_Otoko_no_Ko_[Taruby]_v1.1.zip // due to duplicate version identifiers in file. new Regex( @"(?.*)(v|s)\d+(-\d+)?(_|\s)", MatchOptions, RegexTimeout), //[Suihei Kiki]_Kasumi_Otoko_no_Ko_[Taruby]_v1.1.zip new Regex( @"(?.*)(v|s)\d+(-\d+)?", MatchOptions, RegexTimeout), // Black Bullet (This is very loose, keep towards bottom) new Regex( @"(?.*)(_)(v|vo|c|volume)( |_)\d+", MatchOptions, RegexTimeout), // [Hidoi]_Amaenaideyo_MS_vol01_chp02.rar new Regex( @"(?.*)( |_)(vol\d+)?( |_)(?:Chp\.? ?\d+)", MatchOptions, RegexTimeout), // Mahoutsukai to Deshi no Futekisetsu na Kankei Chp. 1 new Regex( @"(?.*)( |_)(?:Chp.? 
?\d+)", MatchOptions, RegexTimeout), // Corpse Party -The Anthology- Sachikos game of love Hysteric Birthday 2U Chapter 01 new Regex( @"^(?!Vol)(?.*)( |_)Chapter( |_)(\d+)", MatchOptions, RegexTimeout), // Fullmetal Alchemist chapters 101-108.cbz new Regex( @"^(?!vol)(?.*)( |_)(chapters( |_)?)\d+-?\d*", MatchOptions, RegexTimeout), // Umineko no Naku Koro ni - Episode 1 - Legend of the Golden Witch #1 new Regex( @"^(?!Vol\.?)(?.*)( |_|-)(?.*)ch\d+-?\d?", MatchOptions, RegexTimeout), // Magi - Ch.252-005.cbz new Regex( @"(?.*)( ?- ?)Ch\.\d+-?\d*", MatchOptions, RegexTimeout), // [BAA]_Darker_than_Black_Omake-1, Bleach 001-002, Kodoja #001 (March 2016) new Regex( @"^(?!Vol)(?!Chapter)(?.+?)(-|_|\s|#)\d+(-\d+)?", MatchOptions, RegexTimeout), // Baketeriya ch01-05.zip, Akiiro Bousou Biyori - 01.jpg, Beelzebub_172_RHS.zip, Cynthia the Mission 29.rar, A Compendium of Ghosts - 031 - The Third Story_ Part 12 (Digital) (Cobalt001) new Regex( @"^(?!Vol\.?)(?!Chapter)(?.+?)(\s|_|-)(?.*)( |_|-)(ch?)\d+", MatchOptions, RegexTimeout), // Japanese Volume: n巻 -> Volume n new Regex( @"(?.+?)第(?\d+(?:(\-)\d+)?)巻", MatchOptions, RegexTimeout), }; private static readonly Regex[] ComicSeriesRegex = new[] { // Russian Volume: Том n -> Volume n, Тома n -> Volume new Regex( @"(?.+?)Том(а?)(\.?)(\s|_)?(?\d+(?:(\-)\d+)?)", MatchOptions, RegexTimeout), // Russian Volume: n Том -> Volume n new Regex( @"(?.+?)(\s|_)?(?\d+(?:(\-)\d+)?)(\s|_)Том(а?)", MatchOptions, RegexTimeout), // Russian Chapter: n Главa -> Chapter n new Regex( @"(?.+?)(?!Том)(?\d+(?:\.\d+|-\d+)?)(\s|_)(Глава|глава|Главы|Глава)", MatchOptions, RegexTimeout), // Russian Chapter: Главы n -> Chapter n new Regex( @"(?.+?)(Глава|глава|Главы|Глава)(\.?)(\s|_)?(?\d+(?:.\d+|-\d+)?)", MatchOptions, RegexTimeout), // Tintin - T22 Vol 714 pour Sydney new Regex( @"(?.+?)\s?(\b|_|-)\s?((vol|tome|t)\.?)(?\d+(-\d+)?)", MatchOptions, RegexTimeout), // Invincible Vol 01 Family matters (2005) (Digital) new Regex( 
@"(?.+?)(\b|_)((vol|tome|t)\.?)(\s|_)(?\d+(-\d+)?)", MatchOptions, RegexTimeout), // Batman Beyond 2.0 001 (2013) new Regex( @"^(?.+?\S\.\d) (?\d+)", MatchOptions, RegexTimeout), // 04 - Asterix the Gladiator (1964) (Digital-Empire) (WebP by Doc MaKS) new Regex( @"^(?\d+)\s(-\s|_)(?.*(\d{4})?)( |_)(\(|\d+)", MatchOptions, RegexTimeout), // 01 Spider-Man & Wolverine 01.cbr new Regex( @"^(?\d+)\s(?:-\s)(?.*) (\d+)?", MatchOptions, RegexTimeout), // Batman & Wildcat (1 of 3) new Regex( @"(?.*(\d{4})?)( |_)(?:\((?\d+) of \d+)", MatchOptions, RegexTimeout), // Teen Titans v1 001 (1966-02) (digital) (OkC.O.M.P.U.T.O.-Novus), Aldebaran-Antares-t6 new Regex( @"^(?.+?)(?: |_|-)(v|t)\d+", MatchOptions, RegexTimeout), // Amazing Man Comics chapter 25 new Regex( @"^(?.+?)(?: |_)c(hapter) \d+", MatchOptions, RegexTimeout), // Amazing Man Comics issue #25 new Regex( @"^(?.+?)(?: |_)i(ssue) #\d+", MatchOptions, RegexTimeout), // Batman Wayne Family Adventures - Ep. 001 - Moving In new Regex( @"^(?.+?)(\s|_|-)(?:Ep\.?)(\s|_|-)+\d+", MatchOptions, RegexTimeout), // Batgirl Vol.2000 #57 (December, 2004) new Regex( @"^(?.+?)Vol\.?\s?#?(?:\d+)", MatchOptions, RegexTimeout), // Batman & Robin the Teen Wonder #0 new Regex( @"^(?.*)(?: |_)#\d+", MatchOptions, RegexTimeout), // Batman & Catwoman - Trail of the Gun 01, Batman & Grendel (1996) 01 - Devil's Bones, Teen Titans v1 001 (1966-02) (digital) (OkC.O.M.P.U.T.O.-Novus) new Regex( @"^(?.+?)(?: \d+)", MatchOptions, RegexTimeout), // Scott Pilgrim 02 - Scott Pilgrim vs. 
The World (2005) new Regex( @"^(?.+?)(?: |_)(?\d+)", MatchOptions, RegexTimeout), // The First Asterix Frieze (WebP by Doc MaKS) new Regex( @"^(?.*)(?: |_)(?!\(\d{4}|\d{4}-\d{2}\))\(", MatchOptions, RegexTimeout), // spawn-123, spawn-chapter-123 (from https://github.com/Girbons/comics-downloader) new Regex( @"^(?.+?)-(chapter-)?(?\d+)", MatchOptions, RegexTimeout), // MUST BE LAST: Batman & Daredevil - King of New York new Regex( @"^(?.*)", MatchOptions, RegexTimeout), }; private static readonly Regex[] ComicVolumeRegex = new[] { // Teen Titans v1 001 (1966-02) (digital) (OkC.O.M.P.U.T.O.-Novus) new Regex( @"^(?.+?)(?: |_)(t|v)(?" + NumberRange + @")", MatchOptions, RegexTimeout), // Batgirl Vol.2000 #57 (December, 2004) new Regex( @"^(?.+?)(?:\s|_)(v|vol|tome|t)\.?(\s|_)?(?\d+)", MatchOptions, RegexTimeout), // Chinese Volume: 第n卷 -> Volume n, 第n册 -> Volume n, 幽游白书完全版 第03卷 天下 or 阿衰online 第1册 new Regex( @"第(?\d+)(卷|册)", MatchOptions, RegexTimeout), // Chinese Volume: 卷n -> Volume n, 册n -> Volume n new Regex( @"(卷|册)(?\d+)", MatchOptions, RegexTimeout), // Korean Volume: 제n권 -> Volume n, n권 -> Volume n, 63권#200.zip new Regex( @"제?(?\d+)권", MatchOptions, RegexTimeout), // Japanese Volume: n巻 -> Volume n new Regex( @"(?\d+(?:(\-)\d+)?)巻", MatchOptions, RegexTimeout), // Russian Volume: Том n -> Volume n, Тома n -> Volume new Regex( @"Том(а?)(\.?)(\s|_)?(?\d+(?:(\-)\d+)?)", MatchOptions, RegexTimeout), // Russian Volume: n Том -> Volume n new Regex( @"(\s|_)?(?\d+(?:(\-)\d+)?)(\s|_)Том(а?)", MatchOptions, RegexTimeout), }; private static readonly Regex[] ComicChapterRegex = new[] { // Batman & Wildcat (1 of 3) new Regex( @"(?.*(\d{4})?)( |_)(?:\((?\d+) of \d+)", MatchOptions, RegexTimeout), // Batman Beyond 04 (of 6) (1999) new Regex( @"(?.+?)(?\d+)(\s|_|-)?\(of", MatchOptions, RegexTimeout), // Batman Beyond 2.0 001 (2013) new Regex( @"^(?.+?\S\.\d) (?\d+)", MatchOptions, RegexTimeout), // Teen Titans v1 001 (1966-02) (digital) (OkC.O.M.P.U.T.O.-Novus) new Regex( 
@"^(?.+?)(?: |_)v(?\d+)(?: |_)(c? ?)(?(\d+(\.\d)?)-?(\d+(\.\d)?)?)(c? ?)", MatchOptions, RegexTimeout), // Batman & Robin the Teen Wonder #0 new Regex( @"^(?.+?)(?:\s|_)#(?\d+)", MatchOptions, RegexTimeout), // Batman 2016 - Chapter 01, Batman 2016 - Issue 01, Batman 2016 - Issue #01 new Regex( @"^(?.+?)((c(hapter)?)|issue)(_|\s)#?(?(\d+(\.\d)?)-?(\d+(\.\d)?)?)", MatchOptions, RegexTimeout), // Invincible 070.5 - Invincible Returns 1 (2010) (digital) (Minutemen-InnerDemons).cbr new Regex( @"^(?.+?)(?:\s|_)(c? ?(chapter)?)(?(\d+(\.\d)?)-?(\d+(\.\d)?)?)(c? ?)-", MatchOptions, RegexTimeout), // Batgirl Vol.2000 #57 (December, 2004) new Regex( @"^(?.+?)(?:vol\.?\d+)\s#(?\d+)", MatchOptions, RegexTimeout), // Russian Chapter: Главы n -> Chapter n new Regex( @"(Глава|глава|Главы|Глава)(\.?)(\s|_)?(?\d+(?:.\d+|-\d+)?)", MatchOptions, RegexTimeout), // Russian Chapter: n Главa -> Chapter n new Regex( @"(?!Том)(?\d+(?:\.\d+|-\d+)?)(\s|_)(Глава|глава|Главы|Глава)", MatchOptions, RegexTimeout), // Batman & Catwoman - Trail of the Gun 01, Batman & Grendel (1996) 01 - Devil's Bones, Teen Titans v1 001 (1966-02) (digital) (OkC.O.M.P.U.T.O.-Novus) new Regex( @"^(?.+?)(?: (?\d+))", MatchOptions, RegexTimeout), // Saga 001 (2012) (Digital) (Empire-Zone) new Regex( @"(?.+?)(?: |_)(c? 
?)(?(\d+(\.\d)?)-?(\d+(\.\d)?)?)\s\(\d{4}", MatchOptions, RegexTimeout), // Amazing Man Comics chapter 25 new Regex( @"^(?!Vol)(?.+?)( |_)c(hapter)( |_)(?\d*)", MatchOptions, RegexTimeout), // Amazing Man Comics issue #25 new Regex( @"^(?!Vol)(?.+?)( |_)i(ssue)( |_) #(?\d*)", MatchOptions, RegexTimeout), // spawn-123, spawn-chapter-123 (from https://github.com/Girbons/comics-downloader) new Regex( @"^(?.+?)-(chapter-)?(?\d+)", MatchOptions, RegexTimeout), }; private static readonly Regex[] MangaChapterRegex = new[] { // Historys Strongest Disciple Kenichi_v11_c90-98.zip, ...c90.5-100.5 new Regex( @"(\b|_)(c|ch)(\.?\s?)(?(\d+(\.\d)?)-?(\d+(\.\d)?)?)", MatchOptions, RegexTimeout), // [Suihei Kiki]_Kasumi_Otoko_no_Ko_[Taruby]_v1.1.zip new Regex( @"v\d+\.(\s|_)(?\d+(?:.\d+|-\d+)?)", MatchOptions, RegexTimeout), // Umineko no Naku Koro ni - Episode 3 - Banquet of the Golden Witch #02.cbz (Rare case, if causes issue remove) new Regex( @"^(?.*)(?: |_)#(?\d+)", MatchOptions, RegexTimeout), // Green Worldz - Chapter 027, Kimi no Koto ga Daidaidaidaidaisuki na 100-nin no Kanojo Chapter 11-10 new Regex( @"^(?!Vol)(?.*)\s?(?\d+(?:\.?[\d-]+)?)", MatchOptions, RegexTimeout), // Russian Chapter: Главы n -> Chapter n new Regex( @"(Глава|глава|Главы|Глава)(\.?)(\s|_)?(?\d+(?:.\d+|-\d+)?)", MatchOptions, RegexTimeout), // Hinowa ga CRUSH! 018 (2019) (Digital) (LuCaZ).cbz, Hinowa ga CRUSH! 018.5 (2019) (Digital) (LuCaZ).cbz new Regex( @"^(?.+?)(?\d+(?:\.\d+|-\d+)?)(?:\s\(\d{4}\))?(\b|_|-)", MatchOptions, RegexTimeout), // Tower Of God S01 014 (CBT) (digital).cbz new Regex( @"(?.*)\sS(?\d+)\s(?\d+(?:.\d+|-\d+)?)", MatchOptions, RegexTimeout), // Beelzebub_01_[Noodles].zip, Beelzebub_153b_RHS.zip new Regex( @"^((?!v|vo|vol|Volume).)*(\s|_)(?\.?\d+(?:.\d+|-\d+)?)(?b)?(\s|_|\[|\()", MatchOptions, RegexTimeout), // Yumekui-Merry_DKThias_Chapter21.zip new Regex( @"Chapter(?\d+(-\d+)?)", //(?:.\d+|-\d+)? 
MatchOptions, RegexTimeout), // [Hidoi]_Amaenaideyo_MS_vol01_chp02.rar new Regex( @"(?.*)(\s|_)(vol\d+)?(\s|_)Chp\.? ?(?\d+)", MatchOptions, RegexTimeout), // Vol 1 Chapter 2 new Regex( @"(?((vol|volume|v))?(\s|_)?\.?\d+)(\s|_)(Chp|Chapter)\.?(\s|_)?(?\d+)", MatchOptions, RegexTimeout), // Chinese Chapter: 第n话 -> Chapter n, 【TFO汉化&Petit汉化】迷你偶像漫画第25话 new Regex( @"第(?\d+)话", MatchOptions, RegexTimeout), // Korean Chapter: 제n화 -> Chapter n, 가디언즈 오브 갤럭시 죽음의 보석.E0008.7화#44 new Regex( @"제?(?\d+\.?\d+)(회|화|장)", MatchOptions, RegexTimeout), // Korean Chapter: 第10話 -> Chapter n, [ハレム]ナナとカオル ~高校生のSMごっこ~ 第1話 new Regex( @"第?(?\d+(?:\.\d+|-\d+)?)話", MatchOptions, RegexTimeout), // Russian Chapter: n Главa -> Chapter n new Regex( @"(?!Том)(?\d+(?:\.\d+|-\d+)?)(\s|_)(Глава|глава|Главы|Глава)", MatchOptions, RegexTimeout), }; private static readonly Regex MangaEditionRegex = new Regex( // Tenjo Tenge {Full Contact Edition} v01 (2011) (Digital) (ASTC).cbz // To Love Ru v01 Uncensored (Ch.001-007) @"\b(?:Omnibus(?:\s?Edition)?|Uncensored)\b", MatchOptions, RegexTimeout ); // Matches anything between balanced parenthesis, tags between brackets, {} and {Complete} private static readonly Regex CleanupRegex = new Regex( $@"(?:\({BalancedParen}\)|{TagsInBrackets}|\{{\}}|\{{Complete\}})", MatchOptions, RegexTimeout ); private static readonly Regex MangaSpecialRegex = new Regex( // All Keywords, does not account for checking if contains volume/chapter identification. Parser.Parse() will handle. $@"\b(?:{CommonSpecial}|Omake)\b", MatchOptions, RegexTimeout ); private static readonly Regex ComicSpecialRegex = new Regex( // All Keywords, does not account for checking if contains volume/chapter identification. Parser.Parse() will handle. 
$@"\b(?:{CommonSpecial}|\d.+?(\W|-|^)Annual|Annual(\W|-|$)|Book \d.+?|Compendium(\W|-|$|\s.+?)|Omnibus(\W|-|$|\s.+?)|FCBD \d.+?|Absolute(\W|-|$|\s.+?)|Preview(\W|-|$|\s.+?)|Hors[ -]S[ée]rie|TPB|HS|THS)\b", MatchOptions, RegexTimeout ); private static readonly Regex EuropeanComicRegex = new Regex( // All Keywords, does not account for checking if contains volume/chapter identification. Parser.Parse() will handle. @"\b(?:Bd[-\s]Fr)\b", MatchOptions, RegexTimeout ); // If SP\d+ is in the filename, we force treat it as a special regardless if volume or chapter might have been found. private static readonly Regex SpecialMarkerRegex = new Regex( @"SP\d+", MatchOptions, RegexTimeout ); private static readonly Regex EmptySpaceRegex = new Regex( @"\s{2,}", MatchOptions, RegexTimeout ); public static MangaFormat ParseFormat(string filePath) { if (IsArchive(filePath)) return MangaFormat.Archive; if (IsImage(filePath)) return MangaFormat.Image; if (IsEpub(filePath)) return MangaFormat.Epub; if (IsPdf(filePath)) return MangaFormat.Pdf; return MangaFormat.Unknown; } public static string ParseEdition(string filePath) { filePath = ReplaceUnderscores(filePath); var match = MangaEditionRegex.Match(filePath); return match.Success ? match.Value : string.Empty; } /// /// If the file has SP marker. 
/// <returns><c>true</c> when <paramref name="filePath"/> contains an SP token followed by digits (case-insensitive).</returns>
    public static bool HasSpecialMarker(string filePath)
    {
        return SpecialMarkerRegex.IsMatch(filePath);
    }

    /// <summary>
    /// Tests the name against the manga special keywords. Underscores are treated as spaces first.
    /// </summary>
    /// <param name="filePath">Path or filename to inspect; null or empty is treated as an empty string.</param>
    /// <returns><c>true</c> when a manga special keyword is present.</returns>
    public static bool IsMangaSpecial(string filePath)
    {
        return MangaSpecialRegex.IsMatch(ReplaceUnderscores(filePath));
    }

    /// <summary>
    /// Tests the name against the comic special keywords. Underscores are treated as spaces first.
    /// </summary>
    /// <param name="filePath">Path or filename to inspect; null or empty is treated as an empty string.</param>
    /// <returns><c>true</c> when a comic special keyword is present.</returns>
    public static bool IsComicSpecial(string filePath)
    {
        return ComicSpecialRegex.IsMatch(ReplaceUnderscores(filePath));
    }

    /// <summary>
    /// Extracts the series name from a manga filename.
    /// </summary>
    /// <param name="filename">Filename to parse.</param>
    /// <returns>The cleaned series name, or an empty string when no pattern yields a Series group.</returns>
    public static string ParseSeries(string filename)
    {
        return ParseSeriesWith(MangaSeriesRegex, filename, false);
    }

    /// <summary>
    /// Extracts the series name from a comic filename.
    /// </summary>
    /// <param name="filename">Filename to parse.</param>
    /// <returns>The cleaned series name, or an empty string when no pattern yields a Series group.</returns>
    public static string ParseComicSeries(string filename)
    {
        return ParseSeriesWith(ComicSeriesRegex, filename, true);
    }

    /// <summary>
    /// Tries each pattern in order and returns the cleaned value of the first successful "Series" group.
    /// </summary>
    private static string ParseSeriesWith(Regex[] regexes, string filename, bool isComic)
    {
        foreach (var regex in regexes)
        {
            var series = regex.Matches(filename)
                .Select(match => match.Groups["Series"])
                .FirstOrDefault(group => group.Success);

            if (series != null) return CleanTitle(series.Value, isComic);
        }

        return string.Empty;
    }

    /// <summary>
    /// Extracts the volume (or volume range) from a manga filename.
    /// </summary>
    /// <param name="filename">Filename to parse.</param>
    /// <returns>The volume without leading zeroes, or <see cref="DefaultVolume"/> when nothing matched.</returns>
    public static string ParseVolume(string filename)
    {
        return ParseNumberGroup(MangaVolumeRegex, filename, "Volume", DefaultVolume);
    }

    /// <summary>
    /// Extracts the volume (or volume range) from a comic filename.
    /// </summary>
    /// <param name="filename">Filename to parse.</param>
    /// <returns>The volume without leading zeroes, or <see cref="DefaultVolume"/> when nothing matched.</returns>
    public static string ParseComicVolume(string filename)
    {
        return ParseNumberGroup(ComicVolumeRegex, filename, "Volume", DefaultVolume);
    }

    /// <summary>
    /// Tries each pattern in order and formats the first successful capture of <paramref name="groupName"/>.
    /// A successful "Part" group in the same match marks the value as a partial entry (see <see cref="AddChapterPart"/>).
    /// </summary>
    private static string ParseNumberGroup(Regex[] regexes, string filename, string groupName, string defaultValue)
    {
        foreach (var regex in regexes)
        {
            foreach (var groups in regex.Matches(filename).Select(match => match.Groups))
            {
                var captured = groups[groupName];
                if (!captured.Success) continue;

                return FormatValue(captured.Value, groups["Part"].Success);
            }
        }

        return defaultValue;
    }

    /// <summary>
    /// Normalizes a captured number or "from-to" range: strips leading zeroes and, when
    /// <paramref name="hasPart"/> is set, appends ".5" to the single value (or to the upper bound of a range).
    /// </summary>
    /// <remarks>A value that does not split into exactly two tokens on '-' yields only its first token.</remarks>
    private static string FormatValue(string value, bool hasPart)
    {
        if (!value.Contains('-'))
        {
            return RemoveLeadingZeroes(hasPart ? AddChapterPart(value) : value);
        }

        var tokens = value.Split("-");
        var from = RemoveLeadingZeroes(tokens[0]);
        if (tokens.Length != 2) return from;

        var to = RemoveLeadingZeroes(hasPart ? AddChapterPart(tokens[1]) : tokens[1]);
        return $"{from}-{to}";
    }

    /// <summary>
    /// Extracts the chapter (or chapter range) from a manga filename.
    /// </summary>
    /// <param name="filename">Filename to parse.</param>
    /// <returns>The chapter without leading zeroes, or <see cref="DefaultChapter"/> when nothing matched.</returns>
    public static string ParseChapter(string filename)
    {
        return ParseNumberGroup(MangaChapterRegex, filename, "Chapter", DefaultChapter);
    }

    /// <summary>
    /// Appends ".5" to a value that does not already contain a decimal point.
    /// </summary>
    private static string AddChapterPart(string value)
    {
        return value.Contains('.') ? value : $"{value}.5";
    }

    /// <summary>
    /// Extracts the chapter (or chapter range) from a comic filename.
    /// </summary>
    /// <param name="filename">Filename to parse.</param>
    /// <returns>The chapter without leading zeroes, or <see cref="DefaultChapter"/> when nothing matched.</returns>
    public static string ParseComicChapter(string filename)
    {
        return ParseNumberGroup(ComicChapterRegex, filename, "Chapter", DefaultChapter);
    }

    /// <summary>Removes everything matched by the cleanup regex, then every edition keyword.</summary>
    private static string RemoveEditionTagHolders(string title)
    {
        title = CleanupRegex.Replace(title, string.Empty);
        title = MangaEditionRegex.Replace(title, string.Empty);
        return title;
    }

    /// <summary>Removes manga special keywords from the title.</summary>
    private static string RemoveMangaSpecialTags(string title)
    {
        return MangaSpecialRegex.Replace(title, string.Empty);
    }

    /// <summary>Removes European comic markers from the title.</summary>
    private static string RemoveEuropeanTags(string title)
    {
        return EuropeanComicRegex.Replace(title, string.Empty);
    }

    /// <summary>Removes comic special keywords from the title.</summary>
    private static string RemoveComicSpecialTags(string title)
    {
        return ComicSpecialRegex.Replace(title, string.Empty);
    }

    /// <summary>
    /// Translates _ -> spaces, trims front and back of string, removes release groups
    /// <example>
    /// Hippos_the_Great [Digital], -> Hippos the Great
    /// </example>
    /// </summary>
    /// <param name="title">Raw title; null or empty is treated as an empty string.</param>
    /// <param name="isComic">When true the comic special/European tags are stripped, otherwise the manga special tags.</param>
    /// <returns>The title with tags removed, separators trimmed from both ends and whitespace runs collapsed to one space.</returns>
    public static string CleanTitle(string title, bool isComic = false)
    {
        title = ReplaceUnderscores(title);
        title = RemoveEditionTagHolders(title);

        if (isComic)
        {
            title = RemoveComicSpecialTags(title);
            title = RemoveEuropeanTags(title);
        }
        else
        {
            title = RemoveMangaSpecialTags(title);
        }

        title = title.Trim(SpacesAndSeparators);
        return EmptySpaceRegex.Replace(title, " ");
    }

    /// <summary>
    /// Pads the start of a number string with 0's so ordering works fine if there are over 100 items.
    /// Handles ranges (ie 4-8) -> (004-008).
    /// </summary>
    /// <param name="number">An integer, or two integers separated by '-'. Only the first two tokens of a range are used.</param>
    /// <returns>A zero padded number</returns>
    /// <exception cref="FormatException">A token is not a valid integer (for example "1.5").</exception>
    public static string PadZeros(string number)
    {
        if (!number.Contains('-')) return PerformPadding(number);

        var tokens = number.Split("-");
        return $"{PerformPadding(tokens[0])}-{PerformPadding(tokens[1])}";
    }

    /// <summary>
    /// Left-pads values below 100 to three digits; larger values are returned exactly as passed in.
    /// </summary>
    private static string PerformPadding(string number)
    {
        var num = int.Parse(number);
        return num switch
        {
            < 10 => "00" + num,
            < 100 => "0" + num,
            _ => number
        };
    }

    /// <summary>
    /// Strips leading '0' characters.
    /// </summary>
    /// <param name="title">Text to trim.</param>
    /// <returns>The trimmed text, or "0" when nothing is left after trimming.</returns>
    public static string RemoveLeadingZeroes(string title)
    {
        var ret = title.TrimStart(LeadingZeroesTrimChars);
        return string.IsNullOrEmpty(ret) ? "0" : ret;
    }

    /// <summary>Tests the file extension against the archive extension pattern.</summary>
    public static bool IsArchive(string filePath)
    {
        return ArchiveFileRegex.IsMatch(Path.GetExtension(filePath));
    }

    /// <summary>Tests the file extension against the comic archive pattern (.cbz, .cbr, .cb7, .cbt).</summary>
    public static bool IsComicInfoExtension(string filePath)
    {
        return ComicInfoArchiveRegex.IsMatch(Path.GetExtension(filePath));
    }

    /// <summary>Tests the file extension against the book extension pattern (.epub, .pdf).</summary>
    public static bool IsBook(string filePath)
    {
        return BookFileRegex.IsMatch(Path.GetExtension(filePath));
    }

    /// <summary>
    /// Tests the file extension against the image extension pattern.
    /// Input that itself starts with '.' is never treated as an image.
    /// </summary>
    public static bool IsImage(string filePath)
    {
        return !filePath.StartsWith('.') && ImageRegex.IsMatch(Path.GetExtension(filePath));
    }

    /// <summary>Tests the file extension against the xml extension pattern.</summary>
    public static bool IsXml(string filePath)
    {
        return XmlRegex.IsMatch(Path.GetExtension(filePath));
    }

    /// <summary>
    /// Returns the smallest number in a value such as "3" or "1.5-4".
    /// </summary>
    /// <param name="range">Digits, '.' and '-' only.</param>
    /// <returns>The minimum, or 0 when <paramref name="range"/> is not a well-formed number or range.</returns>
    public static float MinNumberFromRange(string range)
    {
        return TryParseRange(range, out var values) ? values.Min() : 0.0f;
    }

    /// <summary>
    /// Returns the largest number in a value such as "3" or "1.5-4".
    /// </summary>
    /// <param name="range">Digits, '.' and '-' only.</param>
    /// <returns>The maximum, or 0 when <paramref name="range"/> is not a well-formed number or range.</returns>
    public static float MaxNumberFromRange(string range)
    {
        return TryParseRange(range, out var values) ? values.Max() : 0.0f;
    }

    /// <summary>
    /// Splits a range on '-' and parses every token as a float.
    /// </summary>
    /// <returns><c>false</c> when the input is null/empty, contains other characters, or any token fails to parse.</returns>
    private static bool TryParseRange(string range, out float[] values)
    {
        values = Array.Empty<float>();
        if (string.IsNullOrEmpty(range)) return false;

        try
        {
            if (!Regex.IsMatch(range, @"^[\d\-.]+$", MatchOptions, RegexTimeout)) return false;
        }
        catch (RegexMatchTimeoutException)
        {
            // Best effort: callers receive the 0 fallback rather than an exception.
            return false;
        }

        var tokens = range.Split('-');
        var parsed = new float[tokens.Length];
        for (var i = 0; i < tokens.Length; i++)
        {
            // Filenames always use '.' as the decimal separator, so parsing must not follow the
            // server culture (a comma-decimal culture would otherwise read "1.5" as 15).
            if (!float.TryParse(tokens[i], System.Globalization.NumberStyles.Float,
                    System.Globalization.CultureInfo.InvariantCulture, out parsed[i]))
            {
                return false;
            }
        }

        values = parsed;
        return true;
    }

    /// <summary>
    /// Removes every character matched by the normalize regex, then trims and lower-cases the result.
    /// </summary>
    public static string Normalize(string name)
    {
        // NOTE(review): ToLower() follows the current culture (e.g. Turkish dotless i). Left unchanged
        // because the result is presumably persisted and compared - confirm before switching to ToLowerInvariant().
        return NormalizeRegex.Replace(name, string.Empty).Trim().ToLower();
    }

    /// <summary>
    /// Responsible for preparing special title for rendering to the UI. Replaces _ with ' ' and strips out SP\d+
    /// </summary>
    /// <param name="name">Raw special filename.</param>
    /// <returns>
    /// The cleaned title with everything from the last '.' onwards removed, or <paramref name="name"/>
    /// itself when it is null/empty or when cleaning leaves nothing.
    /// </returns>
    public static string CleanSpecialTitle(string name)
    {
        if (string.IsNullOrEmpty(name)) return name;

        var cleaned = SpecialTokenRegex.Replace(name.Replace('_', ' '), string.Empty).Trim();

        // A dot at index 0 is kept: there would be no title left in front of it.
        var extensionStart = cleaned.LastIndexOf('.');
        if (extensionStart > 0)
        {
            cleaned = cleaned.Substring(0, extensionStart).Trim();
        }

        return string.IsNullOrEmpty(cleaned) ? name : cleaned;
    }

    /// <summary>
    /// Tests whether the file is a cover image such that: contains "cover", is named "folder", and is an image
    /// </summary>
    /// <remarks>If the path has "backcover" in it, it will be ignored</remarks>
    /// <param name="filename">Filename with extension</param>
    /// <returns><c>true</c> when the file is an image and its name matches the cover pattern.</returns>
    public static bool IsCoverImage(string filename)
    {
        return IsImage(filename) && CoverImageRegex.IsMatch(filename);
    }

    /// <summary>
    /// Validates that a Path doesn't start with certain blacklisted folders, like __MACOSX, @Recently-Snapshot, etc and that if a full path, the filename
    /// doesn't start with ._, which is a metadata file on MACOSX.
    /// </summary>
    /// <param name="path">Path to inspect.</param>
    /// <returns><c>true</c> when the path should be skipped.</returns>
    public static bool HasBlacklistedFolderInPath(string path)
    {
        // Ordinal comparison: these are literal folder/file markers, not linguistic text.
        return path.Contains("__MACOSX")
               || path.StartsWith("@Recently-Snapshot", StringComparison.Ordinal)
               || path.StartsWith("@recycle", StringComparison.Ordinal)
               || path.StartsWith(MacOsMetadataFileStartsWith, StringComparison.Ordinal)
               || Path.GetFileName(path).StartsWith(MacOsMetadataFileStartsWith, StringComparison.Ordinal)
               || path.Contains(".qpkg")
               || path.StartsWith("#recycle", StringComparison.Ordinal)
               || path.Contains(".caltrash");
    }

    /// <summary>Tests whether the extension is exactly ".epub" (case-insensitive).</summary>
    public static bool IsEpub(string filePath)
    {
        return Path.GetExtension(filePath).Equals(".epub", StringComparison.InvariantCultureIgnoreCase);
    }

    /// <summary>Tests whether the extension is exactly ".pdf" (case-insensitive).</summary>
    public static bool IsPdf(string filePath)
    {
        return Path.GetExtension(filePath).Equals(".pdf", StringComparison.InvariantCultureIgnoreCase);
    }

    /// <summary>
    /// Cleans an author's name
    /// </summary>
    /// <remarks>If the author is Last, First, this will not reverse</remarks>
    /// <param name="author">Raw author name.</param>
    /// <returns>The trimmed name, or an empty string for null/empty input.</returns>
    public static string CleanAuthor(string author)
    {
        return string.IsNullOrEmpty(author) ? string.Empty : author.Trim();
    }

    /// <summary>
    /// Cleans user query string input
    /// </summary>
    /// <param name="query">Possibly URL-escaped query text.</param>
    /// <returns>The unescaped, trimmed query with '%' and ':' removed, or an empty string for null/empty input.</returns>
    public static string CleanQuery(string query)
    {
        // Uri.UnescapeDataString throws on null; mirror CleanAuthor and return empty instead.
        if (string.IsNullOrEmpty(query)) return string.Empty;

        return Uri.UnescapeDataString(query).Trim()
            .Replace(@"%", string.Empty)
            .Replace(":", string.Empty);
    }

    /// <summary>
    /// Normalizes the slashes in a path to be <see cref="Path.AltDirectorySeparatorChar"/>
    /// </summary>
    /// <example>/manga/1\1 -> /manga/1/1</example>
    /// <param name="path">Path to normalize.</param>
    /// <returns>The normalized path, or an empty string for null/empty input.</returns>
    public static string NormalizePath(string? path)
    {
        if (string.IsNullOrEmpty(path)) return string.Empty;

        // Path.DirectorySeparatorChar is '/' on non-Windows hosts, so backslashes have to be replaced
        // explicitly for the documented example to hold on every platform.
        return path.Replace(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar)
            .Replace('\\', Path.AltDirectorySeparatorChar)
            .Replace(@"//", Path.AltDirectorySeparatorChar + string.Empty);
    }

    /// <summary>
    /// Checks against a set of strings to validate if a ComicInfo.Format should receive special treatment
    /// </summary>
    /// <param name="comicInfoFormat">Format value to look up (exact, case-sensitive match).</param>
    /// <returns><c>true</c> when the value is one of the special keywords.</returns>
    public static bool HasComicInfoSpecial(string comicInfoFormat)
    {
        return FormatTagSpecialKeywords.Contains(comicInfoFormat);
    }

    /// <summary>Replaces every '_' with a space; null or empty input yields an empty string.</summary>
    private static string ReplaceUnderscores(string name)
    {
        return string.IsNullOrEmpty(name) ? string.Empty : name.Replace('_', ' ');
    }

    /// <summary>
    /// Pulls the first referenced file out of css url(...) text.
    /// </summary>
    /// <param name="fileUrl">Text containing one or more css url(...) references.</param>
    /// <returns>The first captured filename that has no '?' in it, or <c>null</c> when there is none.</returns>
    public static string? ExtractFilename(string fileUrl)
    {
        foreach (Match match in CssImageUrlRegex.Matches(fileUrl))
        {
            if (!match.Success) continue;

            // NOTE: This is failing for //localhost:5000/api/book/29919/book-resources?file=OPS/images/tick1.jpg
            var importFile = match.Groups["Filename"].Value;
            if (!importFile.Contains("?")) return importFile;
        }

        return null;
    }
}