using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Web;
using API.Data.Metadata;
using API.Entities.Enums;
using API.Parser;
using Docnet.Core;
using Docnet.Core.Converters;
using Docnet.Core.Models;
using Docnet.Core.Readers;
using ExCSS;
using HtmlAgilityPack;
using Microsoft.Extensions.Logging;
using Microsoft.IO;
using SixLabors.ImageSharp;
using SixLabors.ImageSharp.PixelFormats;
using VersOne.Epub;
using Image = SixLabors.ImageSharp.Image;

namespace API.Services
{
    /// <summary>
    /// Operations on book files (EPUB and PDF): page counting, cover extraction, metadata parsing,
    /// stylesheet scoping and PDF page extraction.
    /// </summary>
    public interface IBookService
    {
        /// <summary>
        /// Returns the number of pages in the book, or 0 when the file is invalid or cannot be read.
        /// </summary>
        /// <param name="filePath">Path to the EPUB or PDF file.</param>
        int GetNumberOfPages(string filePath);

        /// <summary>
        /// Generates a cover thumbnail for the book.
        /// </summary>
        /// <param name="fileFilePath">Path to the EPUB or PDF file.</param>
        /// <param name="fileName">Name of the new file.</param>
        /// <returns>The value produced by the image service, or an empty string when no cover could be generated.</returns>
        string GetCoverImage(string fileFilePath, string fileName);

        /// <summary>
        /// Maps the file name of every XHTML content file in the reading order to its zero-based page index.
        /// </summary>
        /// <param name="book">Opened book reference.</param>
        Task<Dictionary<string, int>> CreateKeyToPageMappingAsync(EpubBookRef book);

        /// <summary>
        /// Scopes styles to .book-content and rewrites import/image urls so they are prefixed with the passed apiBase.
        /// </summary>
        /// <param name="stylesheetHtml">The raw stylesheet content.</param>
        /// <param name="apiBase">Prefix prepended to urls that reference files inside the book.</param>
        /// <param name="filename">If the stylesheetHtml contains Import statements, when scoping the filename, scope needs to be wrt filepath.</param>
        /// <param name="book">Book Reference, needed for if you expect Import statements</param>
        /// <returns>The scoped, whitespace-minimised stylesheet, or an empty string when there is nothing to scope.</returns>
        Task<string> ScopeStyles(string stylesheetHtml, string apiBase, string filename, EpubBookRef book);

        /// <summary>
        /// Reads metadata from an EPUB file. Returns null for invalid files, PDFs, or when reading fails.
        /// </summary>
        /// <param name="filePath">Path to the book file.</param>
        ComicInfo GetComicInfo(string filePath);

        /// <summary>
        /// Parses series/title information out of an EPUB file. Returns null for non-EPUB files or when the book cannot be opened.
        /// </summary>
        /// <param name="filePath">Path to the book file.</param>
        ParserInfo ParseInfo(string filePath);

        /// <summary>
        /// Extracts a PDF file's pages as images to an target directory
        /// </summary>
        /// <param name="fileFilePath">Path to the PDF file.</param>
        /// <param name="targetDirectory">Where the files will be extracted to. If doesn't exist, will be created.</param>
        void ExtractPdfImages(string fileFilePath, string targetDirectory);
    }

    /// <summary>
    /// Default <see cref="IBookService"/> implementation backed by VersOne.Epub (EPUB) and Docnet (PDF).
    /// </summary>
    public class BookService : IBookService
    {
        private readonly ILogger<BookService> _logger;
        private readonly IDirectoryService _directoryService;
        private readonly IImageService _imageService;
        private readonly StylesheetParser _cssParser = new ();
        private static readonly RecyclableMemoryStreamManager StreamManager = new ();

        /// <summary>
        /// Selector every style rule gets scoped under by <see cref="ScopeStyles"/>.
        /// </summary>
        private const string CssScopeClass = ".book-content";

        public BookService(ILogger<BookService> logger, IDirectoryService directoryService, IImageService imageService)
        {
            _logger = logger;
            _directoryService = directoryService;
            _imageService = imageService;
        }

        /// <summary>
        /// True when the anchor's href contains a fragment (#) and the anchor is neither removed from
        /// the tab order (tabindex="-1") nor marked role="presentation".
        /// </summary>
        private static bool HasClickableHrefPart(HtmlNode anchor)
        {
            return anchor.GetAttributeValue("href", string.Empty).Contains("#")
                   && anchor.GetAttributeValue("tabindex", string.Empty) != "-1"
                   && anchor.GetAttributeValue("role", string.Empty) != "presentation";
        }

        /// <summary>
        /// Maps an EPUB content type to a MIME type string. Unknown types map to application/octet-stream.
        /// </summary>
        /// <param name="type">Content type reported by the EPUB library.</param>
        public static string GetContentType(EpubContentType type)
        {
            return type switch
            {
                EpubContentType.IMAGE_GIF => "image/gif",
                EpubContentType.IMAGE_PNG => "image/png",
                EpubContentType.IMAGE_JPEG => "image/jpeg",
                EpubContentType.FONT_OPENTYPE => "font/otf",
                EpubContentType.FONT_TRUETYPE => "font/ttf",
                EpubContentType.IMAGE_SVG => "image/svg+xml",
                _ => "application/octet-stream"
            };
        }

        /// <summary>
        /// Rewrites an anchor so that in-book navigation is expressed through kavita-page / kavita-part
        /// attributes instead of a real href. Anchors that map to nothing in the book and have no
        /// clickable fragment are made to open in a new tab. Nodes that are not anchors are left untouched.
        /// </summary>
        /// <param name="anchor">The node to rewrite.</param>
        /// <param name="mappings">Content file key to page number, see <see cref="CreateKeyToPageMappingAsync"/>.</param>
        /// <param name="currentPage">Page the anchor lives on; used for same-page fragment links.</param>
        public static void UpdateLinks(HtmlNode anchor, Dictionary<string, int> mappings, int currentPage)
        {
            if (anchor.Name != "a") return;

            var href = anchor.GetAttributeValue("href", string.Empty);
            var hrefParts = CleanContentKeys(href).Split("#");

            // Some keys get uri encoded when parsed, so replace any of those characters with original
            var mappingKey = HttpUtility.UrlDecode(hrefParts[0]);

            if (mappings.TryGetValue(mappingKey, out var mappedPage))
            {
                anchor.Attributes.Add("kavita-page", $"{mappedPage}");
                if (hrefParts.Length > 1)
                {
                    anchor.Attributes.Add("kavita-part", hrefParts[1]);
                }

                ReplaceHrefWithNoOp(anchor);
                return;
            }

            if (HasClickableHrefPart(anchor))
            {
                // The target file is unknown, so treat the fragment as pointing into the current page.
                var part = hrefParts.Length > 1 ? hrefParts[1] : href;
                anchor.Attributes.Add("kavita-page", $"{currentPage}");
                anchor.Attributes.Add("kavita-part", part);
                ReplaceHrefWithNoOp(anchor);
                return;
            }

            anchor.Attributes.Add("target", "_blank");
            anchor.Attributes.Add("rel", "noreferrer noopener");
        }

        /// <summary>
        /// Swaps the anchor's href for a no-op javascript url so the browser does not navigate.
        /// </summary>
        private static void ReplaceHrefWithNoOp(HtmlNode anchor)
        {
            anchor.Attributes.Remove("href");
            anchor.Attributes.Add("href", "javascript:void(0)");
        }

        /// <summary>
        /// Inlines @import-ed stylesheets found in the book, prefixes import and image urls with
        /// <paramref name="apiBase"/>, and scopes every style rule under <c>.book-content</c>.
        /// </summary>
        /// <param name="stylesheetHtml">The raw stylesheet content.</param>
        /// <param name="apiBase">Prefix prepended to urls that reference files inside the book.</param>
        /// <param name="filename">Path of the stylesheet inside the book; its directory part is used to resolve imports.</param>
        /// <param name="book">Book reference used to look up imported files and images.</param>
        /// <returns>The scoped stylesheet, or an empty string when there is no content.</returns>
        public async Task<string> ScopeStyles(string stylesheetHtml, string apiBase, string filename, EpubBookRef book)
        {
            // @Import statements will be handled by browser, so we must inline the css into the original file that request it, so they can be
            // Scoped
            var prepend = GetDirectoryPrefix(filename);
            var importBuilder = new StringBuilder();

            foreach (Match match in Parser.Parser.CssImportUrlRegex.Matches(stylesheetHtml))
            {
                if (!match.Success) continue;

                var importFile = match.Groups["Filename"].Value;
                var key = CleanContentKeys(importFile);
                if (!key.Contains(prepend))
                {
                    key = prepend + key;
                }

                if (!book.Content.AllFiles.TryGetValue(key, out var bookFile)) continue;

                var content = await bookFile.ReadContentAsBytesAsync();
                importBuilder.Append(Encoding.UTF8.GetString(content));
            }

            stylesheetHtml = stylesheetHtml.Insert(0, importBuilder.ToString());

            // Re-scan after inlining so imports pulled in by the inlined files are rewritten as well.
            foreach (Match match in Parser.Parser.CssImportUrlRegex.Matches(stylesheetHtml))
            {
                if (!match.Success) continue;

                var importFile = match.Groups["Filename"].Value;
                stylesheetHtml = stylesheetHtml.Replace(importFile, apiBase + prepend + importFile);
            }

            // Check if there are any background images and rewrite those urls
            EscapeCssImageReferences(ref stylesheetHtml, apiBase, book);

            var styleContent = RemoveWhiteSpaceFromStylesheets(stylesheetHtml);
            if (string.IsNullOrEmpty(styleContent)) return string.Empty;

            // NOTE(review): this replaces every occurrence of the text "body", not only the body selector.
            styleContent = styleContent.Replace("body", CssScopeClass);

            var stylesheet = await _cssParser.ParseAsync(styleContent);
            foreach (var styleRule in stylesheet.StyleRules)
            {
                // The former body rule already is the scope itself.
                if (styleRule.Selector.Text == CssScopeClass) continue;

                if (styleRule.Selector.Text.Contains(","))
                {
                    // Each selector of a comma separated list needs its own scope prefix.
                    var scopedSelectors = string.Join(", ",
                        styleRule.Selector.Text.Split(",").Select(s => $"{CssScopeClass} " + s));
                    styleRule.Text = styleRule.Text.Replace(styleRule.SelectorText, scopedSelectors);
                    continue;
                }

                styleRule.Text = $"{CssScopeClass} " + styleRule.Text;
            }

            return RemoveWhiteSpaceFromStylesheets(stylesheet.ToCss());
        }

        /// <summary>
        /// Returns <paramref name="filename"/> with its file-name component removed, or an empty string
        /// when the filename is null or empty.
        /// </summary>
        private static string GetDirectoryPrefix(string filename)
        {
            if (string.IsNullOrEmpty(filename)) return string.Empty;

            var name = Path.GetFileName(filename);

            // string.Replace throws on an empty search value (e.g. filename ends with a separator).
            return string.IsNullOrEmpty(name) ? filename : filename.Replace(name, string.Empty);
        }

        /// <summary>
        /// Prefixes every image url in the stylesheet that resolves to a file inside the book with
        /// <paramref name="apiBase"/>. Urls that do not match a book file are left alone.
        /// </summary>
        private static void EscapeCssImageReferences(ref string stylesheetHtml, string apiBase, EpubBookRef book)
        {
            var matches = Parser.Parser.CssImageUrlRegex.Matches(stylesheetHtml);
            foreach (Match match in matches)
            {
                if (!match.Success) continue;

                var importFile = match.Groups["Filename"].Value;
                var key = CleanContentKeys(importFile);
                if (!book.Content.AllFiles.ContainsKey(key)) continue;

                stylesheetHtml = stylesheetHtml.Replace(importFile, apiBase + key);
            }
        }

        /// <summary>
        /// Reads summary, authors, publishers, publication date, title, genres and selected calibre
        /// tags from an EPUB file.
        /// </summary>
        /// <param name="filePath">Path to the book file.</param>
        /// <returns>The metadata, or null for invalid files, PDFs, or when the book cannot be read.</returns>
        public ComicInfo GetComicInfo(string filePath)
        {
            if (!IsValidFile(filePath) || Parser.Parser.IsPdf(filePath)) return null;

            try
            {
                using var epubBook = EpubReader.OpenBook(filePath);
                var metadata = epubBook.Schema.Package.Metadata;

                var publicationDate = metadata.Dates.FirstOrDefault(date => date.Event == "publication")?.Date;

                // A malformed date must not discard the rest of the metadata, and parsing should not
                // depend on the server's culture.
                var hasDate = DateTime.TryParse(publicationDate, CultureInfo.InvariantCulture,
                    DateTimeStyles.None, out var published);

                var info = new ComicInfo()
                {
                    // TODO: Summary is in html, we need to turn it into string
                    Summary = metadata.Description,
                    Writer = string.Join(",", metadata.Creators.Select(c => Parser.Parser.CleanAuthor(c.Creator))),
                    Publisher = string.Join(",", metadata.Publishers),
                    Month = hasDate ? published.Month : 0,
                    Year = hasDate ? published.Year : 0,
                    Title = epubBook.Title,
                    Genre = string.Join(",", metadata.Subjects.Select(s => s.ToLowerInvariant().Trim())),
                };

                // Parse tags not exposed via Library
                foreach (var metadataItem in metadata.MetaItems)
                {
                    switch (metadataItem.Name)
                    {
                        case "calibre:rating":
                            if (float.TryParse(metadataItem.Content, NumberStyles.Float,
                                    CultureInfo.InvariantCulture, out var rating))
                            {
                                info.UserRating = rating;
                            }
                            break;
                        case "calibre:title_sort":
                            info.TitleSort = metadataItem.Content;
                            break;
                    }
                }

                return info;
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "[GetComicInfo] There was an exception getting metadata");
            }

            return null;
        }

        /// <summary>
        /// Checks that the file exists and is recognised as a book by the parser; logs a warning otherwise.
        /// </summary>
        private bool IsValidFile(string filePath)
        {
            if (!File.Exists(filePath))
            {
                _logger.LogWarning("[BookService] Book {EpubFile} could not be found", filePath);
                return false;
            }

            if (Parser.Parser.IsBook(filePath)) return true;

            _logger.LogWarning("[BookService] Book {EpubFile} is not a valid EPUB/PDF", filePath);
            return false;
        }

        /// <summary>
        /// Counts pages: PDF pages for PDFs, HTML content files for EPUBs.
        /// </summary>
        /// <param name="filePath">Path to the book file.</param>
        /// <returns>The page count, or 0 when the file is invalid or reading fails.</returns>
        public int GetNumberOfPages(string filePath)
        {
            if (!IsValidFile(filePath)) return 0;

            try
            {
                if (Parser.Parser.IsPdf(filePath))
                {
                    using var docReader = DocLib.Instance.GetDocReader(filePath, new PageDimensions(1080, 1920));
                    return docReader.GetPageCount();
                }

                using var epubBook = EpubReader.OpenBook(filePath);
                return epubBook.Content.Html.Count;
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "[BookService] There was an exception getting number of pages, defaulting to 0");
            }

            return 0;
        }

        /// <summary>
        /// Expands self-closing script and title tags into an explicit open/close pair.
        /// </summary>
        /// <param name="content">Markup to fix up.</param>
        public static string EscapeTags(string content)
        {
            content = Regex.Replace(content, @"<script(.*)(/>)", "<script$1></script>");
            content = Regex.Replace(content, @"<title(.*)(/>)", "<title$1></title>");
            return content;
        }

        /// <summary>
        /// Removes every "../" segment from a content key.
        /// </summary>
        /// <param name="key">Relative path as found in the book's markup or css.</param>
        public static string CleanContentKeys(string key)
        {
            return key.Replace("../", string.Empty);
        }

        /// <summary>
        /// Maps the file name of every XHTML content file in the reading order to its zero-based page index.
        /// Non-XHTML entries are skipped and do not consume a page number.
        /// </summary>
        /// <param name="book">Opened book reference.</param>
        public async Task<Dictionary<string, int>> CreateKeyToPageMappingAsync(EpubBookRef book)
        {
            var dict = new Dictionary<string, int>();
            var pageCount = 0;
            foreach (var contentFileRef in await book.GetReadingOrderAsync())
            {
                if (contentFileRef.ContentType != EpubContentType.XHTML_1_1) continue;

                dict.Add(contentFileRef.FileName, pageCount);
                pageCount += 1;
            }

            return dict;
        }

        /// <summary>
        /// Parses out Title from book. Chapters and Volumes will always be "0". If there is any exception reading book (malformed books)
        /// then null is returned. This expects only an epub file
        /// </summary>
        /// <param name="filePath">Path to the EPUB file.</param>
        /// <returns>Parsed info, or null when the file is not an EPUB or cannot be opened.</returns>
        public ParserInfo ParseInfo(string filePath)
        {
            if (!Parser.Parser.IsEpub(filePath)) return null;

            try
            {
                using var epubBook = EpubReader.OpenBook(filePath);

                // <meta content="5.0" name="calibre:series_index"/>
                // <meta content="The Dark Tower" name="calibre:series"/>
                // <meta content="Wolves of the Calla" name="calibre:title_sort"/>
                // If all three are present, we can take that over dc:title and format as:
                // Series = The Dark Tower, Volume = 5, Filename as "Wolves of the Calla"
                // In addition, the following can exist and should parse as a series (EPUB 3.2 spec)
                // <meta property="belongs-to-collection" id="c01">
                //   The Lord of the Rings
                // </meta>
                // <meta refines="#c01" property="collection-type">set</meta>
                // <meta refines="#c01" property="group-position">2</meta>
                try
                {
                    var seriesIndex = string.Empty;
                    var series = string.Empty;
                    var specialName = string.Empty;
                    var groupPosition = string.Empty;
                    var titleSort = string.Empty;

                    foreach (var metadataItem in epubBook.Schema.Package.Metadata.MetaItems)
                    {
                        // EPUB 2 and 3
                        switch (metadataItem.Name)
                        {
                            case "calibre:series_index":
                                seriesIndex = metadataItem.Content;
                                break;
                            case "calibre:series":
                                series = metadataItem.Content;
                                break;
                            case "calibre:title_sort":
                                specialName = metadataItem.Content;
                                titleSort = metadataItem.Content;
                                break;
                        }

                        // EPUB 3.2+ only
                        switch (metadataItem.Property)
                        {
                            case "group-position":
                                seriesIndex = metadataItem.Content;
                                break;
                            case "belongs-to-collection":
                                series = metadataItem.Content;
                                break;
                            case "collection-type":
                                groupPosition = metadataItem.Content;
                                break;
                        }
                    }

                    var hasSeriesMetadata = !string.IsNullOrEmpty(series) && !string.IsNullOrEmpty(seriesIndex);
                    var hasTitleOrCollection = !string.IsNullOrEmpty(specialName)
                                               || groupPosition.Equals("series")
                                               || groupPosition.Equals("set");

                    if (hasSeriesMetadata && hasTitleOrCollection)
                    {
                        if (string.IsNullOrEmpty(specialName))
                        {
                            specialName = epubBook.Title;
                        }

                        var info = new ParserInfo()
                        {
                            Chapters = Parser.Parser.DefaultChapter,
                            Edition = string.Empty,
                            Format = MangaFormat.Epub,
                            Filename = Path.GetFileName(filePath),
                            Title = specialName?.Trim(),
                            FullFilePath = filePath,
                            IsSpecial = false,
                            Series = series.Trim(),
                            Volumes = seriesIndex
                        };

                        // Don't set titleSort if the book belongs to a group
                        // NOTE(review): seriesIndex is always non-empty inside this branch, so this
                        // condition can never be true as written - confirm the intended behaviour.
                        if (!string.IsNullOrEmpty(titleSort) && string.IsNullOrEmpty(seriesIndex))
                        {
                            info.SeriesSort = titleSort;
                        }

                        return info;
                    }
                }
                catch (Exception ex)
                {
                    // Best effort only: fall through to the title-based info below.
                    _logger.LogDebug(ex, "[BookService] Could not read series metadata from {FileName}, falling back to title", filePath);
                }

                return new ParserInfo()
                {
                    Chapters = Parser.Parser.DefaultChapter,
                    Edition = string.Empty,
                    Format = MangaFormat.Epub,
                    Filename = Path.GetFileName(filePath),
                    Title = epubBook.Title.Trim(),
                    FullFilePath = filePath,
                    IsSpecial = false,
                    Series = epubBook.Title.Trim(),
                    Volumes = Parser.Parser.DefaultVolume,
                };
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "[BookService] There was an exception when opening epub book: {FileName}", filePath);
            }

            return null;
        }

        /// <summary>
        /// Extracts a pdf into images to a target directory. Uses multi-threaded implementation since docnet is slow normally.
        /// Each page is written as "Page-{pageNumber}.png", numbered from 0.
        /// </summary>
        /// <param name="fileFilePath">Path to the PDF file.</param>
        /// <param name="targetDirectory">Where the files will be extracted to; created through the directory service if needed.</param>
        public void ExtractPdfImages(string fileFilePath, string targetDirectory)
        {
            _directoryService.ExistOrCreate(targetDirectory);

            using var docReader = DocLib.Instance.GetDocReader(fileFilePath, new PageDimensions(1080, 1920));
            var pages = docReader.GetPageCount();

            Parallel.For(0, pages, pageNumber =>
            {
                using var stream = StreamManager.GetStream("BookService.GetPdfPage");
                GetPdfPage(docReader, pageNumber, stream);

                using var fileStream = File.Create(Path.Combine(targetDirectory, "Page-" + pageNumber + ".png"));
                stream.Seek(0, SeekOrigin.Begin);
                stream.CopyTo(fileStream);
            });
        }

        /// <summary>
        /// Extracts the cover image to covers directory and returns file path back
        /// </summary>
        /// <param name="fileFilePath">Path to the EPUB or PDF file.</param>
        /// <param name="fileName">Name of the new file.</param>
        /// <returns>The value returned by the image service, or an empty string when no cover could be produced.</returns>
        public string GetCoverImage(string fileFilePath, string fileName)
        {
            if (!IsValidFile(fileFilePath)) return string.Empty;

            if (Parser.Parser.IsPdf(fileFilePath))
            {
                return GetPdfCoverImage(fileFilePath, fileName);
            }

            try
            {
                // Opened inside the try so a malformed epub results in "no cover" rather than an exception.
                using var epubBook = EpubReader.OpenBook(fileFilePath);

                // Try to get the cover image from OPF file, if not set, try to parse it from all the files, then result to the first one.
                var coverImageContent = epubBook.Content.Cover
                                        ?? epubBook.Content.Images.Values.FirstOrDefault(file => Parser.Parser.IsCoverImage(file.FileName))
                                        ?? epubBook.Content.Images.Values.FirstOrDefault();

                if (coverImageContent == null) return string.Empty;

                using var stream = coverImageContent.GetContentStream();
                return _imageService.WriteCoverThumbnail(stream, fileName);
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "[BookService] There was a critical error and prevented thumbnail generation on {BookFile}. Defaulting to no cover image", fileFilePath);
            }

            return string.Empty;
        }

        /// <summary>
        /// Renders the first page of a PDF and hands it to the image service as the cover.
        /// Returns an empty string for PDFs without pages or when rendering fails.
        /// </summary>
        private string GetPdfCoverImage(string fileFilePath, string fileName)
        {
            try
            {
                using var docReader = DocLib.Instance.GetDocReader(fileFilePath, new PageDimensions(1080, 1920));
                if (docReader.GetPageCount() == 0) return string.Empty;

                using var stream = StreamManager.GetStream("BookService.GetPdfPage");
                GetPdfPage(docReader, 0, stream);

                return _imageService.WriteCoverThumbnail(stream, fileName);
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "[BookService] There was a critical error and prevented thumbnail generation on {BookFile}. Defaulting to no cover image", fileFilePath);
            }

            return string.Empty;
        }

        /// <summary>
        /// Renders one PDF page as a PNG into <paramref name="stream"/> and rewinds the stream to its start.
        /// </summary>
        /// <param name="docReader">Open document reader.</param>
        /// <param name="pageNumber">Zero-based page index.</param>
        /// <param name="stream">Destination stream; written from position 0.</param>
        private static void GetPdfPage(IDocReader docReader, int pageNumber, Stream stream)
        {
            using var pageReader = docReader.GetPageReader(pageNumber);
            var rawBytes = pageReader.GetImage(new NaiveTransparencyRemover());
            var width = pageReader.GetPageWidth();
            var height = pageReader.GetPageHeight();

            // The image owns pooled pixel buffers, so dispose it once the png has been written.
            using var image = Image.LoadPixelData<Bgra32>(rawBytes, width, height);

            stream.Seek(0, SeekOrigin.Begin);
            image.SaveAsPng(stream);
            stream.Seek(0, SeekOrigin.Begin);
        }

        /// <summary>
        /// Minimises a stylesheet: strips comments, drops element names in front of id selectors,
        /// collapses whitespace, removes the trailing semicolon of each rule and the unit of zero values.
        /// </summary>
        /// <param name="body">Stylesheet text; null or empty yields an empty string.</param>
        private static string RemoveWhiteSpaceFromStylesheets(string body)
        {
            if (string.IsNullOrEmpty(body))
            {
                return string.Empty;
            }

            // Remove comments from CSS
            body = Regex.Replace(body, @"/\*[\d\D]*?\*/", string.Empty);

            body = Regex.Replace(body, @"[a-zA-Z]+#", "#");
            body = Regex.Replace(body, @"[\n\r]+\s*", string.Empty);
            body = Regex.Replace(body, @"\s+", " ");
            body = Regex.Replace(body, @"\s?([:,;{}])\s?", "$1");

            // Replace is a no-op when no rule ends in ";", so no exception handling is needed here.
            body = body.Replace(";}", "}");

            body = Regex.Replace(body, @"([\s:]0)(px|pt|%|em)", "$1");

            return body;
        }
    }
}