using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Web;
using API.Data.Metadata;
using API.DTOs.Reader;
using API.Entities;
using API.Entities.Enums;
using API.Services.Tasks.Scanner.Parser;
using Docnet.Core;
using Docnet.Core.Converters;
using Docnet.Core.Models;
using Docnet.Core.Readers;
using ExCSS;
using HtmlAgilityPack;
using Kavita.Common;
using Microsoft.Extensions.Logging;
using Microsoft.IO;
using SixLabors.ImageSharp;
using SixLabors.ImageSharp.PixelFormats;
using VersOne.Epub;
using VersOne.Epub.Options;

namespace API.Services;

/// <summary>
/// Operations for reading epub and pdf files: page counts, cover images, metadata, and
/// rewriting epub pages/stylesheets so they can be served through the book API.
/// </summary>
public interface IBookService
{
    int GetNumberOfPages(string filePath);
    string GetCoverImage(string fileFilePath, string fileName, string outputDirectory, bool saveAsWebP = false);
    ComicInfo? GetComicInfo(string filePath);
    ParserInfo? ParseInfo(string filePath);
    /// <summary>
    /// Scopes styles to .book-content and rewrites resource urls to go through the passed apiBase
    /// </summary>
    /// <param name="stylesheetHtml">Raw css text</param>
    /// <param name="apiBase">Url prefix that resource references are rewritten to</param>
    /// <param name="filename">If the stylesheetHtml contains Import statements, when scoping the filename, scope needs to be wrt filepath.</param>
    /// <param name="book">Book Reference, needed for if you expect Import statements</param>
    /// <returns>The scoped, whitespace-stripped css</returns>
    Task<string> ScopeStyles(string stylesheetHtml, string apiBase, string filename, EpubBookRef book);
    /// <summary>
    /// Extracts a PDF file's pages as images to an target directory
    /// </summary>
    /// <remarks>This method relies on Docnet which has explicit patches from Kavita for ARM support. This should only be used with Tachiyomi</remarks>
    /// <param name="fileFilePath">Path of the pdf file</param>
    /// <param name="targetDirectory">Where the files will be extracted to. If doesn't exist, will be created.</param>
    void ExtractPdfImages(string fileFilePath, string targetDirectory);
    Task<ICollection<BookChapterItem>> GenerateTableOfContents(Chapter chapter);
    Task<string> GetBookPage(int page, int chapterId, string cachedEpubPath, string baseUrl);
    Task<Dictionary<string, int>> CreateKeyToPageMappingAsync(EpubBookRef book);
}

/// <summary>
/// Default <see cref="IBookService"/> implementation backed by VersOne.Epub (epub) and Docnet (pdf).
/// </summary>
public class BookService : IBookService
{
    private readonly ILogger<BookService> _logger;
    private readonly IDirectoryService _directoryService;
    private readonly IImageService _imageService;
    private readonly IMediaErrorService _mediaErrorService;
    private readonly StylesheetParser _cssParser = new ();
    private static readonly RecyclableMemoryStreamManager StreamManager = new ();
    /// <summary>Selector that every css rule of a book is prefixed with</summary>
    private const string CssScopeClass = ".book-content";
    private const string BookApiUrl = "book-resources?file=";
    public static readonly EpubReaderOptions BookReaderOptions = new()
    {
        PackageReaderOptions = new PackageReaderOptions
        {
            IgnoreMissingToc = true
        }
    };

    public BookService(ILogger<BookService> logger, IDirectoryService directoryService, IImageService imageService,
        IMediaErrorService mediaErrorService)
    {
        _logger = logger;
        _directoryService = directoryService;
        _imageService = imageService;
        _mediaErrorService = mediaErrorService;
    }

    /// <summary>
    /// True when the anchor's href contains a fragment (#) and the anchor is neither removed from the
    /// tab order (tabindex -1) nor marked with role="presentation".
    /// </summary>
    private static bool HasClickableHrefPart(HtmlNode anchor)
    {
        return anchor.GetAttributeValue("href", string.Empty).Contains("#")
               && anchor.GetAttributeValue("tabindex", string.Empty) != "-1"
               && anchor.GetAttributeValue("role", string.Empty) != "presentation";
    }

    /// <summary>
    /// Maps an epub content type to the MIME type string; unknown types map to application/octet-stream.
    /// </summary>
    public static string GetContentType(EpubContentType type)
    {
        return type switch
        {
            EpubContentType.IMAGE_GIF => "image/gif",
            EpubContentType.IMAGE_PNG => "image/png",
            EpubContentType.IMAGE_JPEG => "image/jpeg",
            EpubContentType.FONT_OPENTYPE => "font/otf",
            EpubContentType.FONT_TRUETYPE => "font/ttf",
            EpubContentType.IMAGE_SVG => "image/svg+xml",
            _ => "application/octet-stream"
        };
    }

    /// <summary>
    /// Rewrites a single anchor so navigation is expressed through kavita-page / kavita-part attributes.
    /// Anchors whose target is not in <paramref name="mappings"/> and that have no clickable fragment
    /// are made to open in a new tab instead.
    /// </summary>
    /// <param name="anchor">Node to rewrite; ignored unless it is an &lt;a&gt;</param>
    /// <param name="mappings">File key -> page number</param>
    /// <param name="currentPage">Page used for fragment-only links that do not resolve to a mapped file</param>
    private static void UpdateLinks(HtmlNode anchor, Dictionary<string, int> mappings, int currentPage)
    {
        if (anchor.Name != "a") return;

        var href = anchor.GetAttributeValue("href", string.Empty);
        var hrefParts = CleanContentKeys(href).Split("#");
        // Some keys get uri encoded when parsed, so replace any of those characters with original
        var mappingKey = HttpUtility.UrlDecode(hrefParts[0]);

        if (!mappings.TryGetValue(mappingKey, out var mappedPage))
        {
            if (HasClickableHrefPart(anchor))
            {
                var part = hrefParts.Length > 1 ? hrefParts[1] : href;
                anchor.Attributes.Add("kavita-page", $"{currentPage}");
                anchor.Attributes.Add("kavita-part", part);
                anchor.Attributes.Remove("href");
                anchor.Attributes.Add("href", "javascript:void(0)");
            }
            else
            {
                anchor.Attributes.Add("target", "_blank");
                anchor.Attributes.Add("rel", "noreferrer noopener");
            }

            return;
        }

        anchor.Attributes.Add("kavita-page", $"{mappedPage}");
        if (hrefParts.Length > 1)
        {
            anchor.Attributes.Add("kavita-part", hrefParts[1]);
        }

        anchor.Attributes.Remove("href");
        anchor.Attributes.Add("href", "javascript:void(0)");
    }

    /// <summary>
    /// Scopes styles to .book-content and rewrites import, font and image urls to go through the passed apiBase
    /// </summary>
    /// <param name="stylesheetHtml">Raw css text</param>
    /// <param name="apiBase">Url prefix that resource references are rewritten to</param>
    /// <param name="filename">If the stylesheetHtml contains Import statements, when scoping the filename, scope needs to be wrt filepath.</param>
    /// <param name="book">Book Reference, needed for if you expect Import statements</param>
    /// <returns>The scoped, whitespace-stripped css; empty string when there is nothing to scope</returns>
    public async Task<string> ScopeStyles(string stylesheetHtml, string apiBase, string filename, EpubBookRef book)
    {
        // @Import statements will be handled by browser, so we must inline the css into the original file that request it, so they can be Scoped
        var prepend = filename.Length > 0 ? filename.Replace(Path.GetFileName(filename), string.Empty) : string.Empty;
        var importBuilder = new StringBuilder();
        foreach (Match match in Parser.CssImportUrlRegex.Matches(stylesheetHtml))
        {
            if (!match.Success) continue;

            var importFile = match.Groups["Filename"].Value;
            var key = CleanContentKeys(importFile);
            if (!key.Contains(prepend))
            {
                key = prepend + key;
            }
            if (!book.Content.AllFiles.ContainsKey(key)) continue;

            var bookFile = book.Content.AllFiles[key];
            var content = await bookFile.ReadContentAsBytesAsync();
            importBuilder.Append(Encoding.UTF8.GetString(content));
        }

        stylesheetHtml = stylesheetHtml.Insert(0, importBuilder.ToString());

        EscapeCssImportReferences(ref stylesheetHtml, apiBase, prepend);

        EscapeFontFamilyReferences(ref stylesheetHtml, apiBase, prepend);

        // Check if there are any background images and rewrite those urls
        EscapeCssImageReferences(ref stylesheetHtml, apiBase, book);

        var styleContent = RemoveWhiteSpaceFromStylesheets(stylesheetHtml);

        styleContent = styleContent.Replace("body", CssScopeClass);

        if (string.IsNullOrEmpty(styleContent)) return string.Empty;

        var stylesheet = await _cssParser.ParseAsync(styleContent);
        foreach (var styleRule in stylesheet.StyleRules)
        {
            // The body rule was already turned into the scope class itself by the Replace above
            if (styleRule.Selector.Text == CssScopeClass) continue;

            if (styleRule.Selector.Text.Contains(','))
            {
                // Selector lists need the scope class in front of every individual selector
                styleRule.Text = styleRule.Text.Replace(styleRule.SelectorText,
                    string.Join(", ",
                        styleRule.Selector.Text.Split(",").Select(s => $"{CssScopeClass} " + s)));
                continue;
            }
            styleRule.Text = $"{CssScopeClass} " + styleRule.Text;
        }
        return RemoveWhiteSpaceFromStylesheets(stylesheet.ToCss());
    }

    /// <summary>
    /// Prefixes every file matched by <c>Parser.CssImportUrlRegex</c> with apiBase + prepend.
    /// </summary>
    private static void EscapeCssImportReferences(ref string stylesheetHtml, string apiBase, string prepend)
    {
        foreach (Match match in Parser.CssImportUrlRegex.Matches(stylesheetHtml))
        {
            if (!match.Success) continue;
            var importFile = match.Groups["Filename"].Value;
            stylesheetHtml = stylesheetHtml.Replace(importFile, apiBase + prepend + importFile);
        }
    }

    /// <summary>
    /// Prefixes every file matched by <c>Parser.FontSrcUrlRegex</c> with apiBase + prepend.
    /// </summary>
    private static void EscapeFontFamilyReferences(ref string stylesheetHtml, string apiBase, string prepend)
    {
        foreach (Match match in Parser.FontSrcUrlRegex.Matches(stylesheetHtml))
        {
            if (!match.Success) continue;
            var importFile = match.Groups["Filename"].Value;
            stylesheetHtml = stylesheetHtml.Replace(importFile, apiBase + prepend + importFile);
        }
    }

    /// <summary>
    /// Rewrites every file matched by <c>Parser.CssImageUrlRegex</c> to apiBase + cleaned key, but only
    /// when the cleaned key exists in the book; other references are left untouched.
    /// </summary>
    private static void EscapeCssImageReferences(ref string stylesheetHtml, string apiBase, EpubBookRef book)
    {
        var matches = Parser.CssImageUrlRegex.Matches(stylesheetHtml);
        foreach (Match match in matches)
        {
            if (!match.Success) continue;

            var importFile = match.Groups["Filename"].Value;
            var key = CleanContentKeys(importFile);
            if (!book.Content.AllFiles.ContainsKey(key)) continue;

            stylesheetHtml = stylesheetHtml.Replace(importFile, apiBase + key);
        }
    }

    /// <summary>
    /// Points image sources (src or xlink:href) at the book API and tags the images with the
    /// reader's scaling classes.
    /// </summary>
    private static void ScopeImages(HtmlDocument doc, EpubBookRef book, string apiBase)
    {
        // Only the first node set that exists is processed: img, else image, else svg
        var images = doc.DocumentNode.SelectNodes("//img")
                     ?? doc.DocumentNode.SelectNodes("//image")
                     ?? doc.DocumentNode.SelectNodes("//svg");

        if (images == null) return;

        // NOTE(review): the container class is always applied to the parent of the FIRST image,
        // not to each image's own parent - kept as-is, confirm this is what the reader css expects.
        var parent = images.First().ParentNode;

        foreach (var image in images)
        {
            string? key = null;
            if (image.Attributes["src"] != null)
            {
                key = "src";
            }
            else if (image.Attributes["xlink:href"] != null)
            {
                key = "xlink:href";
            }

            if (string.IsNullOrEmpty(key)) continue;

            var imageFile = GetKeyForImage(book, image.Attributes[key].Value);
            image.Attributes.Remove(key);
            // UrlEncode here to transform ../ into an escaped version, which avoids blocking on nginx
            image.Attributes.Add(key, $"{apiBase}" + HttpUtility.UrlEncode(imageFile));

            // Add a custom class that the reader uses to ensure images stay within reader
            parent.AddClass("kavita-scale-width-container");
            image.AddClass("kavita-scale-width");
        }
    }

    /// <summary>
    /// Returns the image key associated with the file. Contains some basic fallback logic.
    /// </summary>
    /// <param name="book">Book whose image keys are searched</param>
    /// <param name="imageFile">Reference as written in the page</param>
    /// <returns>The matching key of the book, or <paramref name="imageFile"/> unchanged when nothing matches</returns>
    private static string GetKeyForImage(EpubBookRef book, string imageFile)
    {
        if (book.Content.Images.ContainsKey(imageFile)) return imageFile;

        var correctedKey = book.Content.Images.Keys.SingleOrDefault(s => s.EndsWith(imageFile));
        if (correctedKey != null)
        {
            imageFile = correctedKey;
        }
        else if (imageFile.StartsWith(".."))
        {
            // There are cases where the key is defined static like OEBPS/Images/1-4.jpg but reference is ../Images/1-4.jpg
            correctedKey =
                book.Content.Images.Keys.SingleOrDefault(s => s.EndsWith(imageFile.Replace("..", string.Empty)));
            if (correctedKey != null)
            {
                imageFile = correctedKey;
            }
        }

        return imageFile;
    }

    /// <summary>
    /// Produces the html that is sent to the reader: the body's inner html, wrapped in a div that
    /// carries the html+body classes when the html node declares any.
    /// </summary>
    private static string PrepareFinalHtml(HtmlDocument doc, HtmlNode body)
    {
        // Check if any classes on the html node (some r2l books do this) and move them to body tag for scoping
        var htmlNode = doc.DocumentNode.SelectSingleNode("//html");
        if (htmlNode == null || !htmlNode.Attributes.Contains("class")) return body.InnerHtml;

        var bodyClasses = body.Attributes.Contains("class") ? body.Attributes["class"].Value : string.Empty;
        var classes = htmlNode.Attributes["class"].Value + " " + bodyClasses;
        body.Attributes.Add("class", $"{classes}");
        // I actually need the body tag itself for the classes, so i will create a div and put the body stuff there.
        var combinedClasses = body.Attributes["class"].Value;
        return $"<div class=\"{combinedClasses}\">{body.InnerHtml}</div>";
    }

    /// <summary>
    /// Runs <see cref="UpdateLinks"/> over every anchor of the document.
    /// </summary>
    private static void RewriteAnchors(int page, HtmlDocument doc, Dictionary<string, int> mappings)
    {
        var anchors = doc.DocumentNode.SelectNodes("//a");
        if (anchors == null) return;

        foreach (var anchor in anchors)
        {
            UpdateLinks(anchor, mappings, page);
        }
    }

    /// <summary>
    /// Scopes the document's style tags and linked stylesheets and prepends the result to the body
    /// as style nodes.
    /// </summary>
    private async Task InlineStyles(HtmlDocument doc, EpubBookRef book, string apiBase, HtmlNode body)
    {
        var inlineStyles = doc.DocumentNode.SelectNodes("//style");
        if (inlineStyles != null)
        {
            foreach (var inlineStyle in inlineStyles)
            {
                var styleContent = await ScopeStyles(inlineStyle.InnerHtml, apiBase, "", book);
                body.PrependChild(HtmlNode.CreateNode($"<style>{styleContent}</style>"));
            }
        }

        var styleNodes = doc.DocumentNode.SelectNodes("/html/head/link");
        if (styleNodes == null) return;

        foreach (var styleLinks in styleNodes)
        {
            var key = CleanContentKeys(styleLinks.Attributes["href"].Value);
            // Some epubs are malformed the key in content.opf might be: content/resources/filelist_0_0.xml but the actual html links to resources/filelist_0_0.xml
            // In this case, we will do a search for the key that ends with
            if (!book.Content.Css.ContainsKey(key))
            {
                var correctedKey = book.Content.Css.Keys.SingleOrDefault(s => s.EndsWith(key));
                if (correctedKey == null)
                {
                    _logger.LogError("Epub is Malformed, key: {Key} is not matching OPF file", key);
                    continue;
                }

                key = correctedKey;
            }

            try
            {
                var cssFile = book.Content.Css[key];

                var styleContent = await ScopeStyles(await cssFile.ReadContentAsync(), apiBase,
                    cssFile.FileName, book);
                if (styleContent != null)
                {
                    body.PrependChild(HtmlNode.CreateNode($"<style>{styleContent}</style>"));
                }
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "There was an error reading css file for inlining likely due to a key mismatch in metadata");
                await _mediaErrorService.ReportMediaIssueAsync(book.FilePath, MediaErrorProducer.BookService,
                    "There was an error reading css file for inlining likely due to a key mismatch in metadata", ex);
            }
        }
    }

    /// <summary>
    /// Builds a <see cref="ComicInfo"/> from the epub package metadata (including calibre and
    /// EPUB 3 collection meta items).
    /// </summary>
    /// <param name="filePath">Path to the book</param>
    /// <returns>The metadata, or null for pdf files, invalid files, or when reading the epub fails</returns>
    public ComicInfo? GetComicInfo(string filePath)
    {
        if (!IsValidFile(filePath) || Parser.IsPdf(filePath)) return null;

        try
        {
            using var epubBook = EpubReader.OpenBook(filePath, BookReaderOptions);
            var publicationDate =
                epubBook.Schema.Package.Metadata.Dates.FirstOrDefault(pDate => pDate.Event == "publication")?.Date;

            if (string.IsNullOrEmpty(publicationDate))
            {
                publicationDate = epubBook.Schema.Package.Metadata.Dates.FirstOrDefault()?.Date;
            }

            var (year, month, day) = GetPublicationDate(publicationDate);

            var info = new ComicInfo
            {
                Summary = epubBook.Schema.Package.Metadata.Description,
                Writer = string.Join(",", epubBook.Schema.Package.Metadata.Creators.Select(c => Parser.CleanAuthor(c.Creator))),
                Publisher = string.Join(",", epubBook.Schema.Package.Metadata.Publishers),
                Month = month,
                Day = day,
                Year = year,
                Title = epubBook.Title,
                Genre = string.Join(",", epubBook.Schema.Package.Metadata.Subjects.Select(s => s.ToLower().Trim())),
                LanguageISO = ValidateLanguage(epubBook.Schema.Package.Metadata.Languages.FirstOrDefault())
            };
            ComicInfo.CleanComicInfo(info);

            // Parse tags not exposed via Library
            foreach (var metadataItem in epubBook.Schema.Package.Metadata.MetaItems)
            {
                // EPUB 2 and 3
                switch (metadataItem.Name)
                {
                    case "calibre:rating":
                        // Culture-invariant and non-throwing: a malformed rating must not discard
                        // the rest of the metadata through the outer catch.
                        if (float.TryParse(metadataItem.Content, NumberStyles.Float | NumberStyles.AllowThousands,
                                CultureInfo.InvariantCulture, out var rating))
                        {
                            info.UserRating = rating;
                        }
                        break;
                    case "calibre:title_sort":
                        info.TitleSort = metadataItem.Content;
                        break;
                    case "calibre:series":
                        info.Series = metadataItem.Content;
                        info.SeriesSort = metadataItem.Content;
                        break;
                    case "calibre:series_index":
                        info.Volume = metadataItem.Content;
                        break;
                }

                // EPUB 3.2+ only
                switch (metadataItem.Property)
                {
                    case "group-position":
                        info.Volume = metadataItem.Content;
                        break;
                    case "belongs-to-collection":
                        info.Series = metadataItem.Content;
                        info.SeriesSort = metadataItem.Content;
                        break;
                    case "collection-type":
                        // These look to be genres from https://manual.calibre-ebook.com/sub_groups.html or can be "series"
                        break;
                }
            }

            var hasVolumeInSeries = !Parser.ParseVolume(info.Title)
                .Equals(Parser.DefaultVolume);

            // Null/empty check comes first so Series is never dereferenced when it is missing
            if (string.IsNullOrEmpty(info.Volume) && hasVolumeInSeries
                && (string.IsNullOrEmpty(info.Series) || !info.Series.Equals(info.Title)))
            {
                // This is likely a light novel for which we can set series from parsed title
                info.Series = Parser.ParseSeries(info.Title);
                info.Volume = Parser.ParseVolume(info.Title);
            }

            return info;
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "[GetComicInfo] There was an exception parsing metadata");
            _mediaErrorService.ReportMediaIssue(filePath, MediaErrorProducer.BookService,
                "There was an exception parsing metadata", ex);
        }

        return null;
    }

    /// <summary>
    /// Splits a publication date string into year/month/day. A value that is not a parseable date
    /// but is exactly 4 characters long is tried as a bare year. Unparsed components are 0.
    /// </summary>
    private static (int year, int month, int day) GetPublicationDate(string? publicationDate)
    {
        var year = 0;
        var month = 0;
        var day = 0;

        if (DateTime.TryParse(publicationDate, out var date))
        {
            year = date.Year;
            month = date.Month;
            day = date.Day;
        }
        else if (!string.IsNullOrEmpty(publicationDate) && publicationDate.Length == 4)
        {
            int.TryParse(publicationDate, out year);
        }

        return (year, month, day);
    }

    /// <summary>
    /// Normalizes a language code through <see cref="CultureInfo.GetCultureInfo(string)"/>; returns
    /// an empty string for null/empty input or when the lookup throws.
    /// </summary>
    private static string ValidateLanguage(string? language)
    {
        if (string.IsNullOrEmpty(language)) return string.Empty;

        try
        {
            return CultureInfo.GetCultureInfo(language).ToString();
        }
        catch (Exception)
        {
            return string.Empty;
        }
    }

    /// <summary>
    /// Checks that the file exists and that <c>Parser.IsBook</c> accepts it; logs a warning otherwise.
    /// </summary>
    private bool IsValidFile(string filePath)
    {
        if (!File.Exists(filePath))
        {
            _logger.LogWarning("[BookService] Book {EpubFile} could not be found", filePath);
            return false;
        }

        if (Parser.IsBook(filePath)) return true;

        _logger.LogWarning("[BookService] Book {EpubFile} is not a valid EPUB/PDF", filePath);
        return false;
    }

    /// <summary>
    /// Returns the pdf page count or the number of entries in the epub reading order.
    /// </summary>
    /// <param name="filePath">Path to the book</param>
    /// <returns>Number of pages; 0 when the file is invalid or reading it fails</returns>
    public int GetNumberOfPages(string filePath)
    {
        if (!IsValidFile(filePath)) return 0;

        try
        {
            if (Parser.IsPdf(filePath))
            {
                using var docReader = DocLib.Instance.GetDocReader(filePath, new PageDimensions(1080, 1920));
                return docReader.GetPageCount();
            }

            using var epubBook = EpubReader.OpenBook(filePath, BookReaderOptions);
            return epubBook.GetReadingOrder().Count;
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "[BookService] There was an exception getting number of pages, defaulting to 0");
            _mediaErrorService.ReportMediaIssue(filePath, MediaErrorProducer.BookService,
                "There was an exception getting number of pages, defaulting to 0", ex);
        }

        return 0;
    }

    /// <summary>
    /// Expands self-closing script and title tags into explicit open/close pairs.
    /// </summary>
    /// <remarks>NOTE(review): both patterns and replacements were reconstructed - confirm against the upstream file.</remarks>
    private static string EscapeTags(string content)
    {
        content = Regex.Replace(content, @"<script(.*)(/>)", "<script$1></script>", RegexOptions.None, Parser.RegexTimeout);
        content = Regex.Replace(content, @"<title(.*)(/>)", "<title$1></title>", RegexOptions.None, Parser.RegexTimeout);
        return content;
    }

    /// <summary>
    /// Removes every "../" from the key
    /// </summary>
    /// <param name="key">Content key or href</param>
    /// <returns>The key without any "../" segments</returns>
    public static string CleanContentKeys(string key)
    {
        return key.Replace("../", string.Empty);
    }

    /// <summary>
    /// Maps the file name of every XHTML file in the reading order to its zero-based position
    /// among the XHTML files.
    /// </summary>
    public async Task<Dictionary<string, int>> CreateKeyToPageMappingAsync(EpubBookRef book)
    {
        var dict = new Dictionary<string, int>();
        var pageCount = 0;
        foreach (var contentFileRef in await book.GetReadingOrderAsync())
        {
            if (contentFileRef.ContentType != EpubContentType.XHTML_1_1) continue;
            dict.Add(contentFileRef.FileName, pageCount);
            pageCount += 1;
        }

        return dict;
    }

    /// <summary>
    /// Parses out Title from book. Chapters and Volumes will always be "0". If there is any exception reading book (malformed books)
    /// then null is returned. This expects only an epub file
    /// </summary>
    /// <param name="filePath">Path to the epub</param>
    /// <returns>Parsed info, or null when the file is not an epub or cannot be opened</returns>
    public ParserInfo? ParseInfo(string filePath)
    {
        if (!Parser.IsEpub(filePath)) return null;

        try
        {
            using var epubBook = EpubReader.OpenBook(filePath, BookReaderOptions);

            // <meta content="The Dark Tower" name="calibre:series"/>
            // <meta content="Wolves of the Calla" name="calibre:title_sort"/>
            // If all three are present, we can take that over dc:title and format as:
            // Series = The Dark Tower, Volume = 5, Filename as "Wolves of the Calla"
            // In addition, the following can exist and should parse as a series (EPUB 3.2 spec)
            // <meta property="belongs-to-collection" id="c01">
            //   The Lord of the Rings
            // </meta>
            // <meta refines="#c01" property="collection-type">set</meta>
            // <meta refines="#c01" property="group-position">2</meta>
            try
            {
                var seriesIndex = string.Empty;
                var series = string.Empty;
                var specialName = string.Empty;

                foreach (var metadataItem in epubBook.Schema.Package.Metadata.MetaItems)
                {
                    // EPUB 2 and 3
                    switch (metadataItem.Name)
                    {
                        case "calibre:series_index":
                            seriesIndex = metadataItem.Content;
                            break;
                        case "calibre:series":
                            series = metadataItem.Content;
                            break;
                        case "calibre:title_sort":
                            specialName = metadataItem.Content;
                            break;
                    }

                    // EPUB 3.2+ only
                    switch (metadataItem.Property)
                    {
                        case "group-position":
                            seriesIndex = metadataItem.Content;
                            break;
                        case "belongs-to-collection":
                            series = metadataItem.Content;
                            break;
                        case "collection-type":
                            // These look to be genres from https://manual.calibre-ebook.com/sub_groups.html or can be "series"
                            break;
                    }
                }

                if (!string.IsNullOrEmpty(series) && !string.IsNullOrEmpty(seriesIndex))
                {
                    if (string.IsNullOrEmpty(specialName))
                    {
                        specialName = epubBook.Title;
                    }
                    var info = new ParserInfo
                    {
                        Chapters = Parser.DefaultChapter,
                        Edition = string.Empty,
                        Format = MangaFormat.Epub,
                        Filename = Path.GetFileName(filePath),
                        Title = specialName?.Trim() ?? string.Empty,
                        FullFilePath = filePath,
                        IsSpecial = false,
                        Series = series.Trim(),
                        SeriesSort = series.Trim(),
                        Volumes = seriesIndex
                    };

                    return info;
                }
            }
            catch (Exception ex)
            {
                // Best effort only: fall through to the title-based result below, but leave a trace
                _logger.LogDebug(ex, "[BookService] Could not read series metadata from {FileName}, falling back to the title", filePath);
            }

            return new ParserInfo
            {
                Chapters = Parser.DefaultChapter,
                Edition = string.Empty,
                Format = MangaFormat.Epub,
                Filename = Path.GetFileName(filePath),
                Title = epubBook.Title.Trim(),
                FullFilePath = filePath,
                IsSpecial = false,
                Series = epubBook.Title.Trim(),
                Volumes = Parser.DefaultVolume,
            };
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "[BookService] There was an exception when opening epub book: {FileName}", filePath);
            _mediaErrorService.ReportMediaIssue(filePath, MediaErrorProducer.BookService,
                "There was an exception when opening epub book", ex);
        }

        return null;
    }

    /// <summary>
    /// Extracts a pdf into images to a target directory. Uses multi-threaded implementation since docnet is slow normally.
    /// </summary>
    /// <param name="fileFilePath">Path of the pdf file</param>
    /// <param name="targetDirectory">Output directory; created when missing. Files are named Page-{n}.png</param>
    public void ExtractPdfImages(string fileFilePath, string targetDirectory)
    {
        _directoryService.ExistOrCreate(targetDirectory);

        using var docReader = DocLib.Instance.GetDocReader(fileFilePath, new PageDimensions(1080, 1920));
        var pages = docReader.GetPageCount();
        Parallel.For(0, pages, pageNumber =>
        {
            using var stream = StreamManager.GetStream("BookService.GetPdfPage");
            GetPdfPage(docReader, pageNumber, stream);
            using var fileStream = File.Create(Path.Combine(targetDirectory, "Page-" + pageNumber + ".png"));
            stream.Seek(0, SeekOrigin.Begin);
            stream.CopyTo(fileStream);
        });
    }

    /// <summary>
    /// Responsible to scope all the css, links, tags, etc to prepare a self contained html file for the page
    /// </summary>
    /// <param name="doc">Html Doc that will be appended to</param>
    /// <param name="book">Underlying epub</param>
    /// <param name="apiBase">API Url for file loading to pass through</param>
    /// <param name="body">Body element from the epub</param>
    /// <param name="mappings">Epub mappings</param>
    /// <param name="page">Page number we are loading</param>
    /// <returns>The scoped html of the page</returns>
    public async Task<string> ScopePage(HtmlDocument doc, EpubBookRef book, string apiBase, HtmlNode body, Dictionary<string, int> mappings, int page)
    {
        await InlineStyles(doc, book, apiBase, body);

        RewriteAnchors(page, doc, mappings);

        ScopeImages(doc, book, apiBase);

        return PrepareFinalHtml(doc, body);
    }

    /// <summary>
    /// Tries to find the correct key by applying cleaning and remapping if the epub has bad data. Only works for HTML files.
    /// </summary>
    /// <param name="book">Book whose html keys are searched as a fallback</param>
    /// <param name="mappings">File key -> page number</param>
    /// <param name="key">Key as referenced by the epub</param>
    /// <returns>A key usable against <paramref name="mappings"/> when one can be found, otherwise the input key</returns>
    private static string CoalesceKey(EpubBookRef book, IDictionary<string, int> mappings, string key)
    {
        // Hand back the cleaned key: it is the one that exists in mappings, and callers use the
        // result for their lookup (the raw "../" form would miss).
        var cleanedKey = CleanContentKeys(key);
        if (mappings.ContainsKey(cleanedKey)) return cleanedKey;

        // Fallback to searching for key (bad epub metadata)
        var correctedKey = book.Content.Html.Keys.FirstOrDefault(s => s.EndsWith(cleanedKey));
        if (!string.IsNullOrEmpty(correctedKey))
        {
            key = correctedKey;
        }

        return key;
    }

    /// <summary>
    /// Resolves a key against all files of the book: exact match, then cleaned match, then a unique
    /// key that ends with the input.
    /// </summary>
    /// <returns>The resolved key, or the input key when nothing matches</returns>
    public static string CoalesceKeyForAnyFile(EpubBookRef book, string key)
    {
        if (book.Content.AllFiles.ContainsKey(key)) return key;

        var cleanedKey = CleanContentKeys(key);
        if (book.Content.AllFiles.ContainsKey(cleanedKey)) return cleanedKey;

        // Fallback to searching for key (bad epub metadata)
        var correctedKey = book.Content.AllFiles.Keys.SingleOrDefault(s => s.EndsWith(key));
        if (!string.IsNullOrEmpty(correctedKey))
        {
            key = correctedKey;
        }

        return key;
    }

    /// <summary>
    /// This will return a list of mappings from ID -> page num. ID will be the xhtml key and page num will be the reading order
    /// this is used to rewrite anchors in the book text so that we always load properly in our reader.
    /// </summary>
    /// <param name="chapter">Chapter with at least one file</param>
    /// <returns>Table of contents from the epub navigation, or generated from a TOC page's anchors when the navigation yields nothing</returns>
    public async Task<ICollection<BookChapterItem>> GenerateTableOfContents(Chapter chapter)
    {
        using var book = await EpubReader.OpenBookAsync(chapter.Files.ElementAt(0).FilePath, BookReaderOptions);
        var mappings = await CreateKeyToPageMappingAsync(book);

        var navItems = await book.GetNavigationAsync();
        var chaptersList = new List<BookChapterItem>();

        foreach (var navigationItem in navItems)
        {
            if (navigationItem.NestedItems.Count == 0)
            {
                CreateToCChapter(navigationItem, Array.Empty<BookChapterItem>(), chaptersList, mappings);
                continue;
            }

            var nestedChapters = new List<BookChapterItem>();

            foreach (var nestedChapter in navigationItem.NestedItems.Where(n => n.Link != null))
            {
                var key = CoalesceKey(book, mappings, nestedChapter.Link.ContentFileName);
                if (mappings.TryGetValue(key, out var nestedPage))
                {
                    nestedChapters.Add(new BookChapterItem
                    {
                        Title = nestedChapter.Title,
                        Page = nestedPage,
                        Part = nestedChapter.Link.Anchor ?? string.Empty,
                        Children = new List<BookChapterItem>()
                    });
                }
            }

            CreateToCChapter(navigationItem, nestedChapters, chaptersList, mappings);
        }

        if (chaptersList.Count != 0) return chaptersList;

        // Generate from TOC from links (any point past this, Kavita is generating as a TOC doesn't exist)
        var tocPage = book.Content.Html.Keys.FirstOrDefault(k => k.Contains("TOC", StringComparison.OrdinalIgnoreCase));
        if (tocPage == null) return chaptersList;

        // Find all anchor tags, for each anchor we get inner text, to lower then title case on UI. Get href and generate page content
        var doc = new HtmlDocument();
        var content = await book.Content.Html[tocPage].ReadContentAsync();
        doc.LoadHtml(content);
        var anchors = doc.DocumentNode.SelectNodes("//a");
        if (anchors == null) return chaptersList;

        foreach (var anchor in anchors)
        {
            if (!anchor.Attributes.Contains("href")) continue;

            var hrefParts = anchor.Attributes["href"].Value.Split("#");
            var key = CoalesceKey(book, mappings, hrefParts[0]);
            if (string.IsNullOrEmpty(key) || !mappings.TryGetValue(key, out var anchorPage)) continue;

            var part = hrefParts.Length > 1 ? hrefParts[1] : string.Empty;
            chaptersList.Add(new BookChapterItem
            {
                Title = anchor.InnerText,
                Page = anchorPage,
                Part = part,
                Children = new List<BookChapterItem>()
            });
        }

        return chaptersList;
    }

    /// <summary>
    /// This returns a single page within the epub book. All html will be rewritten to be scoped within our reader,
    /// all css is scoped, etc.
    /// </summary>
    /// <param name="page">The requested page</param>
    /// <param name="chapterId">The chapterId</param>
    /// <param name="cachedEpubPath">The path to the cached epub file</param>
    /// <param name="baseUrl">The API base for Kavita, to rewrite urls to so we load though our endpoint</param>
    /// <returns>Full epub HTML Page, scoped to Kavita's reader</returns>
    /// <exception cref="KavitaException">All exceptions throw this</exception>
    public async Task<string> GetBookPage(int page, int chapterId, string cachedEpubPath, string baseUrl)
    {
        using var book = await EpubReader.OpenBookAsync(cachedEpubPath, BookReaderOptions);
        var mappings = await CreateKeyToPageMappingAsync(book);
        var apiBase = baseUrl + "book/" + chapterId + "/" + BookApiUrl;

        var counter = 0;
        var doc = new HtmlDocument {OptionFixNestedTags = true};

        var bookPages = await book.GetReadingOrderAsync();
        try
        {
            foreach (var contentFileRef in bookPages)
            {
                if (page != counter)
                {
                    counter++;
                    continue;
                }

                var content = await contentFileRef.ReadContentAsync();
                if (contentFileRef.ContentType != EpubContentType.XHTML_1_1) return content;

                // In more cases than not, due to this being XML not HTML, we need to escape the script tags.
                content = EscapeTags(content);

                doc.LoadHtml(content);
                var body = doc.DocumentNode.SelectSingleNode("//body");

                if (body == null)
                {
                    if (doc.ParseErrors.Any())
                    {
                        LogBookErrors(book, contentFileRef, doc);
                        throw new KavitaException("The file is malformed! Cannot read.");
                    }
                    _logger.LogError("{FilePath} has no body tag! Generating one for support. Book may be skewed", book.FilePath);
                    doc.DocumentNode.SelectSingleNode("/html").AppendChild(HtmlNode.CreateNode("<body></body>"));
                    body = doc.DocumentNode.SelectSingleNode("/html/body");
                }

                return await ScopePage(doc, book, apiBase, body, mappings, page);
            }
        }
        catch (Exception ex)
        {
            // Every failure (including the malformed-file exception above) is reported here and
            // surfaces to the caller as the generic KavitaException below.
            _logger.LogError(ex, "There was an issue reading one of the pages for {Book}", book.FilePath);
            await _mediaErrorService.ReportMediaIssueAsync(book.FilePath, MediaErrorProducer.BookService,
                "There was an issue reading one of the pages for", ex);
        }

        throw new KavitaException("Could not find the appropriate html for that page");
    }

    /// <summary>
    /// Adds a table of contents entry for the navigation item. Items without a link become a group
    /// that points at its first nested chapter's page; items with a link are only added when the
    /// link resolves to a mapped page.
    /// </summary>
    private static void CreateToCChapter(EpubNavigationItemRef navigationItem, IList<BookChapterItem> nestedChapters,
        ICollection<BookChapterItem> chaptersList, IReadOnlyDictionary<string, int> mappings)
    {
        if (navigationItem.Link == null)
        {
            var item = new BookChapterItem
            {
                Title = navigationItem.Title,
                Children = nestedChapters
            };
            if (nestedChapters.Count > 0)
            {
                item.Page = nestedChapters[0].Page;
            }

            chaptersList.Add(item);
            return;
        }

        var groupKey = CleanContentKeys(navigationItem.Link.ContentFileName);
        if (mappings.TryGetValue(groupKey, out var groupPage))
        {
            chaptersList.Add(new BookChapterItem
            {
                Title = navigationItem.Title,
                Page = groupPage,
                Children = nestedChapters
            });
        }
    }

    /// <summary>
    /// Extracts the cover image to covers directory and returns file path back
    /// </summary>
    /// <param name="fileFilePath">Path of the epub or pdf</param>
    /// <param name="fileName">Name of the new file.</param>
    /// <param name="outputDirectory">Where to output the file, defaults to covers directory</param>
    /// <param name="saveAsWebP">When saving the file, use WebP encoding instead of PNG</param>
    /// <returns>The value returned by the image service, or an empty string when no cover could be produced</returns>
    public string GetCoverImage(string fileFilePath, string fileName, string outputDirectory, bool saveAsWebP = false)
    {
        if (!IsValidFile(fileFilePath)) return string.Empty;

        if (Parser.IsPdf(fileFilePath))
        {
            return GetPdfCoverImage(fileFilePath, fileName, outputDirectory, saveAsWebP);
        }

        try
        {
            // Opened inside the try so a malformed epub falls back to "no cover" instead of throwing
            using var epubBook = EpubReader.OpenBook(fileFilePath, BookReaderOptions);

            // Try to get the cover image from OPF file, if not set, try to parse it from all the files, then result to the first one.
            var coverImageContent = epubBook.Content.Cover
                                    ?? epubBook.Content.Images.Values.FirstOrDefault(file => Parser.IsCoverImage(file.FileName))
                                    ?? epubBook.Content.Images.Values.FirstOrDefault();

            if (coverImageContent == null) return string.Empty;
            using var stream = coverImageContent.GetContentStream();

            return _imageService.WriteCoverThumbnail(stream, fileName, outputDirectory, saveAsWebP);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "[BookService] There was a critical error and prevented thumbnail generation on {BookFile}. Defaulting to no cover image", fileFilePath);
            _mediaErrorService.ReportMediaIssue(fileFilePath, MediaErrorProducer.BookService,
                "There was a critical error and prevented thumbnail generation", ex);
        }

        return string.Empty;
    }

    /// <summary>
    /// Renders the first page of the pdf and writes it as the cover thumbnail.
    /// </summary>
    /// <returns>The value returned by the image service, or an empty string for an empty pdf or on failure</returns>
    private string GetPdfCoverImage(string fileFilePath, string fileName, string outputDirectory, bool saveAsWebP)
    {
        try
        {
            using var docReader = DocLib.Instance.GetDocReader(fileFilePath, new PageDimensions(1080, 1920));
            if (docReader.GetPageCount() == 0) return string.Empty;

            using var stream = StreamManager.GetStream("BookService.GetPdfPage");
            GetPdfPage(docReader, 0, stream);

            return _imageService.WriteCoverThumbnail(stream, fileName, outputDirectory, saveAsWebP);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "[BookService] There was a critical error and prevented thumbnail generation on {BookFile}. Defaulting to no cover image", fileFilePath);
            _mediaErrorService.ReportMediaIssue(fileFilePath, MediaErrorProducer.BookService,
                "There was a critical error and prevented thumbnail generation", ex);
        }

        return string.Empty;
    }

    /// <summary>
    /// Returns an image raster of a page within a PDF
    /// </summary>
    /// <param name="docReader">Open reader of the pdf</param>
    /// <param name="pageNumber">Zero-based page index</param>
    /// <param name="stream">Receives the page as PNG; positioned at the start on return</param>
    private static void GetPdfPage(IDocReader docReader, int pageNumber, Stream stream)
    {
        using var pageReader = docReader.GetPageReader(pageNumber);
        var rawBytes = pageReader.GetImage(new NaiveTransparencyRemover());
        var width = pageReader.GetPageWidth();
        var height = pageReader.GetPageHeight();
        // NOTE(review): pixel format reconstructed as Bgra32 - confirm against the upstream file
        using var image = Image.LoadPixelData<Bgra32>(rawBytes, width, height);

        stream.Seek(0, SeekOrigin.Begin);
        image.SaveAsPng(stream);
        stream.Seek(0, SeekOrigin.Begin);
    }

    /// <summary>
    /// Minifies css: strips comments, collapses whitespace, drops the element name in front of id
    /// selectors, removes the last semicolon of a rule and the unit of zero values.
    /// </summary>
    /// <param name="body">Css text</param>
    /// <returns>The minified css; empty string for null/empty input</returns>
    private static string RemoveWhiteSpaceFromStylesheets(string body)
    {
        if (string.IsNullOrEmpty(body))
        {
            return string.Empty;
        }

        // Remove comments from CSS
        body = Regex.Replace(body, @"/\*[\d\D]*?\*/", string.Empty, RegexOptions.None, Parser.RegexTimeout);

        body = Regex.Replace(body, @"[a-zA-Z]+#", "#", RegexOptions.None, Parser.RegexTimeout);
        body = Regex.Replace(body, @"[\n\r]+\s*", string.Empty, RegexOptions.None, Parser.RegexTimeout);
        body = Regex.Replace(body, @"\s+", " ", RegexOptions.None, Parser.RegexTimeout);
        body = Regex.Replace(body, @"\s?([:,;{}])\s?", "$1", RegexOptions.None, Parser.RegexTimeout);

        // Some css don't have style rules ending in ';' - Replace is simply a no-op for those
        body = body.Replace(";}", "}");

        body = Regex.Replace(body, @"([\s:]0)(px|pt|%|em)", "$1", RegexOptions.None, Parser.RegexTimeout);

        return body;
    }

    /// <summary>
    /// Logs the file that failed to parse followed by every parse error of the document.
    /// </summary>
    private void LogBookErrors(EpubBookRef book, EpubContentFileRef contentFileRef, HtmlDocument doc)
    {
        _logger.LogError("{FilePath} has an invalid html file (Page {PageName})", book.FilePath, contentFileRef.FileName);
        foreach (var error in doc.ParseErrors)
        {
            _logger.LogError("Line {LineNumber}, Reason: {Reason}", error.Line, error.Reason);
        }
    }
}