diff --git a/API.Benchmark/EpubBenchmark.cs b/API.Benchmark/EpubBenchmark.cs deleted file mode 100644 index 1d47889b1..000000000 --- a/API.Benchmark/EpubBenchmark.cs +++ /dev/null @@ -1,105 +0,0 @@ -using System; -using System.Linq; -using System.Text.RegularExpressions; -using System.Threading.Tasks; -using API.Services; -using BenchmarkDotNet.Attributes; -using BenchmarkDotNet.Order; -using HtmlAgilityPack; -using VersOne.Epub; - -namespace API.Benchmark; - -[StopOnFirstError] -[MemoryDiagnoser] -[RankColumn] -[Orderer(SummaryOrderPolicy.FastestToSlowest)] -[SimpleJob(launchCount: 1, warmupCount: 5, invocationCount: 20)] -public class EpubBenchmark -{ - private const string FilePath = @"E:\Books\Invaders of the Rokujouma\Invaders of the Rokujouma - Volume 01.epub"; - private readonly Regex _wordRegex = new Regex(@"\b\w+\b", RegexOptions.Compiled | RegexOptions.IgnoreCase); - - [Benchmark] - public async Task GetWordCount_PassByRef() - { - using var book = await EpubReader.OpenBookAsync(FilePath, BookService.BookReaderOptions); - foreach (var bookFile in book.Content.Html.Values) - { - await GetBookWordCount_PassByRef(bookFile); - } - } - - [Benchmark] - public async Task GetBookWordCount_SumEarlier() - { - using var book = await EpubReader.OpenBookAsync(FilePath, BookService.BookReaderOptions); - foreach (var bookFile in book.Content.Html.Values) - { - await GetBookWordCount_SumEarlier(bookFile); - } - } - - [Benchmark] - public async Task GetBookWordCount_Regex() - { - using var book = await EpubReader.OpenBookAsync(FilePath, BookService.BookReaderOptions); - foreach (var bookFile in book.Content.Html.Values) - { - await GetBookWordCount_Regex(bookFile); - } - } - - private int GetBookWordCount_PassByString(string fileContents) - { - var doc = new HtmlDocument(); - doc.LoadHtml(fileContents); - var delimiter = new char[] {' '}; - - return doc.DocumentNode.SelectNodes("//body//text()[not(parent::script)]") - .Select(node => node.InnerText) - .Select(text => text.Split(delimiter, StringSplitOptions.RemoveEmptyEntries) - .Where(s => char.IsLetter(s[0]))) - .Select(words => words.Count()) - .Where(wordCount => wordCount > 0) - .Sum(); - } - - private async Task GetBookWordCount_PassByRef(EpubContentFileRef bookFile) - { - var doc = new HtmlDocument(); - doc.LoadHtml(await bookFile.ReadContentAsTextAsync()); - var delimiter = new char[] {' '}; - - var textNodes = doc.DocumentNode.SelectNodes("//body//text()[not(parent::script)]"); - if (textNodes == null) return 0; - return textNodes.Select(node => node.InnerText) - .Select(text => text.Split(delimiter, StringSplitOptions.RemoveEmptyEntries) - .Where(s => char.IsLetter(s[0]))) - .Select(words => words.Count()) - .Where(wordCount => wordCount > 0) - .Sum(); - } - - private async Task GetBookWordCount_SumEarlier(EpubContentFileRef bookFile) - { - var doc = new HtmlDocument(); - doc.LoadHtml(await bookFile.ReadContentAsTextAsync()); - - return doc.DocumentNode.SelectNodes("//body//text()[not(parent::script)]") - .DefaultIfEmpty() - .Select(node => node.InnerText.Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(s => char.IsLetter(s[0]))) - .Sum(words => words.Count()); - } - - private async Task GetBookWordCount_Regex(EpubContentFileRef bookFile) - { - var doc = new HtmlDocument(); - doc.LoadHtml(await bookFile.ReadContentAsTextAsync()); - - - return doc.DocumentNode.SelectNodes("//body//text()[not(parent::script)]") - .Sum(node => _wordRegex.Matches(node.InnerText).Count); - } -} diff --git a/API/API.csproj b/API/API.csproj index 2eaa5f9ff..4ef864443 100644 --- a/API/API.csproj +++ b/API/API.csproj @@ -104,7 +104,7 @@ - + diff --git a/API/Controllers/BookController.cs b/API/Controllers/BookController.cs index 62cbcd436..1e7026cfd 100644 --- a/API/Controllers/BookController.cs +++ b/API/Controllers/BookController.cs @@ -98,9 +98,9 @@ public class BookController : BaseApiController using var book = await EpubReader.OpenBookAsync(chapter.Files.ElementAt(0).FilePath, BookService.BookReaderOptions); var key = BookService.CoalesceKeyForAnyFile(book, file); - if (!book.Content.AllFiles.ContainsKey(key)) return BadRequest("File was not found in book"); + if (!book.Content.AllFiles.Local.ContainsKey(key)) return BadRequest("File was not found in book"); - var bookFile = book.Content.AllFiles[key]; + var bookFile = book.Content.AllFiles.Local[key]; var content = await bookFile.ReadContentAsBytesAsync(); var contentType = BookService.GetContentType(bookFile.ContentType); diff --git a/API/Services/BookService.cs b/API/Services/BookService.cs index 669e97282..23919e3bf 100644 --- a/API/Services/BookService.cs +++ b/API/Services/BookService.cs @@ -11,7 +11,6 @@ using API.Data.Metadata; using API.DTOs.Reader; using API.Entities; using API.Entities.Enums; -using API.Helpers; using API.Services.Tasks.Scanner.Parser; using Docnet.Core; using Docnet.Core.Converters; @@ -176,20 +175,20 @@ public class BookService : IBookService // @Import statements will be handled by browser, so we must inline the css into the original file that request it, so they can be Scoped var prepend = filename.Length > 0 ? filename.Replace(Path.GetFileName(filename), string.Empty) : string.Empty; var importBuilder = new StringBuilder(); - //foreach (Match match in Tasks.Scanner.Parser.Parser.CssImportUrlRegex().Matches(stylesheetHtml)) + foreach (Match match in Parser.CssImportUrlRegex.Matches(stylesheetHtml)) { if (!match.Success) continue; var importFile = match.Groups["Filename"].Value; - var key = CleanContentKeys(importFile); + var key = CleanContentKeys(importFile); // Validate if CoalesceKey works well here if (!key.Contains(prepend)) { key = prepend + key; } - if (!book.Content.AllFiles.ContainsKey(key)) continue; + if (!book.Content.AllFiles.Local.ContainsKey(key)) continue; - var bookFile = book.Content.AllFiles[key]; + var bookFile = book.Content.AllFiles.Local[key]; var content = await bookFile.ReadContentAsBytesAsync(); importBuilder.Append(Encoding.UTF8.GetString(content)); } @@ -258,7 +257,7 @@ public class BookService : IBookService var importFile = match.Groups["Filename"].Value; var key = CleanContentKeys(importFile); - if (!book.Content.AllFiles.ContainsKey(key)) continue; + if (!book.Content.AllFiles.Local.ContainsKey(key)) continue; stylesheetHtml = stylesheetHtml.Replace(importFile, apiBase + key); } @@ -308,9 +307,9 @@ public class BookService : IBookService /// private static string GetKeyForImage(EpubBookRef book, string imageFile) { - if (book.Content.Images.ContainsKey(imageFile)) return imageFile; + if (book.Content.Images.Local.ContainsKey(imageFile)) return imageFile; - var correctedKey = book.Content.Images.Keys.SingleOrDefault(s => s.EndsWith(imageFile)); + var correctedKey = book.Content.Images.Local.Keys.SingleOrDefault(s => s.EndsWith(imageFile)); if (correctedKey != null) { imageFile = correctedKey; @@ -319,7 +318,7 @@ public class BookService : IBookService { // There are cases where the key is defined static like OEBPS/Images/1-4.jpg but reference is ../Images/1-4.jpg correctedKey = - book.Content.Images.Keys.SingleOrDefault(s => s.EndsWith(imageFile.Replace("..", string.Empty))); + book.Content.Images.Local.Keys.SingleOrDefault(s => s.EndsWith(imageFile.Replace("..", string.Empty))); if (correctedKey != null) { imageFile = correctedKey; @@ -373,9 +372,9 @@ public class BookService : IBookService var key = CleanContentKeys(styleLinks.Attributes["href"].Value); // Some epubs are malformed the key in content.opf might be: content/resources/filelist_0_0.xml but the actual html links to resources/filelist_0_0.xml // In this case, we will do a search for the key that ends with - if (!book.Content.Css.ContainsKey(key)) + if (!book.Content.Css.Local.ContainsKey(key)) { - var correctedKey = book.Content.Css.Keys.SingleOrDefault(s => s.EndsWith(key)); + var correctedKey = book.Content.Css.Local.Keys.SingleOrDefault(s => s.EndsWith(key)); if (correctedKey == null) { _logger.LogError("Epub is Malformed, key: {Key} is not matching OPF file", key); @@ -387,10 +386,10 @@ public class BookService : IBookService try { - var cssFile = book.Content.Css[key]; + var cssFile = book.Content.Css.Local[key]; var styleContent = await ScopeStyles(await cssFile.ReadContentAsync(), apiBase, - cssFile.FileName, book); + cssFile.FilePath, book); if (styleContent != null) { body.PrependChild(HtmlNode.CreateNode($"")); @@ -422,16 +421,19 @@ public class BookService : IBookService } var (year, month, day) = GetPublicationDate(publicationDate); + var summary = epubBook.Schema.Package.Metadata.Descriptions.FirstOrDefault(); var info = new ComicInfo { - Summary = epubBook.Schema.Package.Metadata.Description, - Publisher = string.Join(",", epubBook.Schema.Package.Metadata.Publishers), + Summary = string.IsNullOrEmpty(summary?.Description) ? string.Empty : summary.Description, + Publisher = string.Join(",", epubBook.Schema.Package.Metadata.Publishers.Select(p => p.Publisher)), Month = month, Day = day, Year = year, Title = epubBook.Title, - Genre = string.Join(",", epubBook.Schema.Package.Metadata.Subjects.Select(s => s.ToLower().Trim())), - LanguageISO = ValidateLanguage(epubBook.Schema.Package.Metadata.Languages.FirstOrDefault()) + Genre = string.Join(",", epubBook.Schema.Package.Metadata.Subjects.Select(s => s.Subject.ToLower().Trim())), + LanguageISO = ValidateLanguage(epubBook.Schema.Package.Metadata.Languages + .Select(l => l.Language) + .FirstOrDefault()) }; ComicInfo.CleanComicInfo(info); @@ -484,21 +486,20 @@ public class BookService : IBookService // These look to be genres from https://manual.calibre-ebook.com/sub_groups.html or can be "series" break; case "role": - if (!metadataItem.Scheme.Equals("marc:relators")) break; + if (metadataItem.Scheme != null && !metadataItem.Scheme.Equals("marc:relators")) break; - var creatorId = metadataItem.Refines.Replace("#", string.Empty); - var person = epubBook.Schema.Package.Metadata.Creators.SingleOrDefault(c => c.Id == creatorId); + var creatorId = metadataItem.Refines?.Replace("#", string.Empty); + var person = epubBook.Schema.Package.Metadata.Creators + .SingleOrDefault(c => c.Id == creatorId); if (person == null) break; PopulatePerson(metadataItem, info, person); break; case "title-type": - break; - // This is currently not possible until VersOne update's to allow EPUB 3 Title to have attributes (3.3 update) if (!metadataItem.Content.Equals("collection")) break; - var titleId = metadataItem.Refines.Replace("#", string.Empty); - var readingListElem = epubBook.Schema.Package.Metadata.MetaItems.FirstOrDefault(item => - item.Name == "dc:title" && item.Id == titleId); + var titleId = metadataItem.Refines?.Replace("#", string.Empty); + var readingListElem = epubBook.Schema.Package.Metadata.Titles + .FirstOrDefault(item => item.Id == titleId); if (readingListElem == null) break; var count = epubBook.Schema.Package.Metadata.MetaItems @@ -507,13 +508,13 @@ public class BookService : IBookService if (count == null || count.Content == "0") { // Treat this as a Collection - info.StoryArc += "," + readingListElem.Content; + info.SeriesGroup += (string.IsNullOrEmpty(info.StoryArc) ? string.Empty : ",") + readingListElem.Title.Replace(",", "_"); } else { // Treat as a reading list - info.AlternateSeries += "," + readingListElem.Content; - info.AlternateNumber += "," + count.Content; + info.AlternateSeries += (string.IsNullOrEmpty(info.AlternateSeries) ? string.Empty : ",") + readingListElem.Title.Replace(",", "_"); + info.AlternateNumber += (string.IsNullOrEmpty(info.AlternateNumber) ? string.Empty : ",") + count.Content; } break; @@ -686,7 +687,7 @@ public class BookService : IBookService foreach (var contentFileRef in await book.GetReadingOrderAsync()) { if (contentFileRef.ContentType != EpubContentType.XHTML_1_1) continue; - dict.Add(contentFileRef.FileName, pageCount); + dict.Add(contentFileRef.FilePath, pageCount); // FileName -> FilePath pageCount += 1; } @@ -860,13 +861,13 @@ public class BookService : IBookService if (mappings.ContainsKey(CleanContentKeys(key))) return key; // Fallback to searching for key (bad epub metadata) - var correctedKey = book.Content.Html.Keys.FirstOrDefault(s => s.EndsWith(key)); + var correctedKey = book.Content.Html.Local.Keys.FirstOrDefault(s => s.EndsWith(key)); if (!string.IsNullOrEmpty(correctedKey)) { key = correctedKey; } - var stepsBack = CountParentDirectory(book.Content.NavigationHtmlFile.FileName); + var stepsBack = CountParentDirectory(book.Content.NavigationHtmlFile?.FilePath); // FileName -> FilePath if (mappings.TryGetValue(key, out _)) { return key; @@ -884,13 +885,13 @@ public class BookService : IBookService public static string CoalesceKeyForAnyFile(EpubBookRef book, string key) { - if (book.Content.AllFiles.ContainsKey(key)) return key; + if (book.Content.AllFiles.Local.ContainsKey(key)) return key; var cleanedKey = CleanContentKeys(key); - if (book.Content.AllFiles.ContainsKey(cleanedKey)) return cleanedKey; + if (book.Content.AllFiles.Local.ContainsKey(cleanedKey)) return cleanedKey; // Fallback to searching for key (bad epub metadata) - var correctedKey = book.Content.AllFiles.Keys.SingleOrDefault(s => s.EndsWith(key)); + var correctedKey = book.Content.AllFiles.Local.Keys.SingleOrDefault(s => s.EndsWith(key)); if (!string.IsNullOrEmpty(correctedKey)) { key = correctedKey; @@ -913,42 +914,45 @@ public class BookService : IBookService var navItems = await book.GetNavigationAsync(); var chaptersList = new List(); - foreach (var navigationItem in navItems) + if (navItems != null) { - if (navigationItem.NestedItems.Count == 0) + foreach (var navigationItem in navItems) { - CreateToCChapter(book, navigationItem, Array.Empty(), chaptersList, mappings); - continue; - } - - var nestedChapters = new List(); - - foreach (var nestedChapter in navigationItem.NestedItems.Where(n => n.Link != null)) - { - var key = CoalesceKey(book, mappings, nestedChapter.Link.ContentFileName); - if (mappings.TryGetValue(key, out var mapping)) + if (navigationItem.NestedItems.Count == 0) { - nestedChapters.Add(new BookChapterItem - { - Title = nestedChapter.Title, - Page = mapping, - Part = nestedChapter.Link.Anchor ?? string.Empty, - Children = new List() - }); + CreateToCChapter(book, navigationItem, Array.Empty(), chaptersList, mappings); + continue; } - } - CreateToCChapter(book, navigationItem, nestedChapters, chaptersList, mappings); + var nestedChapters = new List(); + + foreach (var nestedChapter in navigationItem.NestedItems.Where(n => n.Link != null)) + { + var key = CoalesceKey(book, mappings, nestedChapter.Link?.ContentFileName); + if (mappings.TryGetValue(key, out var mapping)) + { + nestedChapters.Add(new BookChapterItem + { + Title = nestedChapter.Title, + Page = mapping, + Part = nestedChapter.Link?.Anchor ?? string.Empty, + Children = new List() + }); + } + } + + CreateToCChapter(book, navigationItem, nestedChapters, chaptersList, mappings); + } } if (chaptersList.Count != 0) return chaptersList; // Generate from TOC from links (any point past this, Kavita is generating as a TOC doesn't exist) - var tocPage = book.Content.Html.Keys.FirstOrDefault(k => k.ToUpper().Contains("TOC")); + var tocPage = book.Content.Html.Local.Keys.FirstOrDefault(k => k.ToUpper().Contains("TOC")); if (tocPage == null) return chaptersList; // Find all anchor tags, for each anchor we get inner text, to lower then title case on UI. Get href and generate page content var doc = new HtmlDocument(); - var content = await book.Content.Html[tocPage].ReadContentAsync(); + var content = await book.Content.Html.Local[tocPage].ReadContentAsync(); doc.LoadHtml(content); var anchors = doc.DocumentNode.SelectNodes("//a"); if (anchors == null) return chaptersList; @@ -1129,8 +1133,8 @@ public class BookService : IBookService { // Try to get the cover image from OPF file, if not set, try to parse it from all the files, then result to the first one. var coverImageContent = epubBook.Content.Cover - ?? epubBook.Content.Images.Values.FirstOrDefault(file => Parser.IsCoverImage(file.FileName)) - ?? epubBook.Content.Images.Values.FirstOrDefault(); + ?? epubBook.Content.Images.Local.Values.FirstOrDefault(file => Parser.IsCoverImage(file.FilePath)) // FileName -> FilePath + ?? epubBook.Content.Images.Local.Values.FirstOrDefault(); if (coverImageContent == null) return string.Empty; using var stream = coverImageContent.GetContentStream(); @@ -1200,12 +1204,6 @@ public class BookService : IBookService } // Remove comments from CSS - // body = CssComment().Replace(body, string.Empty); - // - // body = WhiteSpace1().Replace(body, "#"); - // body = WhiteSpace2().Replace(body, string.Empty); - // body = WhiteSpace3().Replace(body, " "); - // body = WhiteSpace4().Replace(body, "$1"); body = Regex.Replace(body, @"/\*[\d\D]*?\*/", string.Empty, RegexOptions.None, Parser.RegexTimeout); body = Regex.Replace(body, @"[a-zA-Z]+#", "#", RegexOptions.None, Parser.RegexTimeout); @@ -1221,7 +1219,6 @@ public class BookService : IBookService //Swallow exception. Some css don't have style rules ending in ';' } - //body = UnitPadding().Replace(body, "$1"); body = Regex.Replace(body, @"([\s:]0)(px|pt|%|em)", "$1", RegexOptions.None, Parser.RegexTimeout); @@ -1230,7 +1227,7 @@ public class BookService : IBookService private void LogBookErrors(EpubBookRef book, EpubContentFileRef contentFileRef, HtmlDocument doc) { - _logger.LogError("{FilePath} has an invalid html file (Page {PageName})", book.FilePath, contentFileRef.FileName); + _logger.LogError("{FilePath} has an invalid html file (Page {PageName})", book.FilePath, contentFileRef.Key); foreach (var error in doc.ParseErrors) { _logger.LogError("Line {LineNumber}, Reason: {Reason}", error.Line, error.Reason); diff --git a/API/Services/Tasks/Metadata/WordCountAnalyzerService.cs b/API/Services/Tasks/Metadata/WordCountAnalyzerService.cs index b4687d749..cec383905 100644 --- a/API/Services/Tasks/Metadata/WordCountAnalyzerService.cs +++ b/API/Services/Tasks/Metadata/WordCountAnalyzerService.cs @@ -172,7 +172,7 @@ public class WordCountAnalyzerService : IWordCountAnalyzerService { using var book = await EpubReader.OpenBookAsync(filePath, BookService.BookReaderOptions); - var totalPages = book.Content.Html.Values; + var totalPages = book.Content.Html.Local.Values; foreach (var bookPage in totalPages) { var progress = Math.Max(0F, @@ -238,10 +238,10 @@ public class WordCountAnalyzerService : IWordCountAnalyzerService } - private static async Task GetWordCountFromHtml(EpubContentFileRef bookFile) + private static async Task GetWordCountFromHtml(EpubLocalTextContentFileRef bookFile) { var doc = new HtmlDocument(); - doc.LoadHtml(await bookFile.ReadContentAsTextAsync()); + doc.LoadHtml(await bookFile.ReadContentAsync()); var textNodes = doc.DocumentNode.SelectNodes("//body//text()[not(parent::script)]"); if (textNodes == null) return 0; diff --git a/openapi.json b/openapi.json index 36eb05fcf..8bdce79c3 100644 --- a/openapi.json +++ b/openapi.json @@ -7,7 +7,7 @@ "name": "GPL-3.0", "url": "https://github.com/Kareadita/Kavita/blob/develop/LICENSE" }, - "version": "0.7.2.10" + "version": "0.7.2.11" }, "servers": [ {