Kavita/API/Services/BookService.cs
Joe Milazzo ccb6ac9705
Expanded Metadata for EPUBs (#1965)
* Fixed a bug breaking ability to save server settings

* Explicitly capture more people roles from Epubs, else fallback to how we do it now. It seems to be getting called twice and 2nd time is overriding data. Not sure why

* Refactored the code to clean it up

* Added support for generating collections or reading list based on dc:title and collection title-type with an optional display-seq.

* ReadingList/Collection support can't be done until VersOne supports. https://github.com/vers-one/EpubReader/issues/81

* Double include author for epub parsing and let the People code handle removing duplicates.
2023-05-07 10:43:21 -07:00

1180 lines
47 KiB
C#

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Web;
using API.Data.Metadata;
using API.DTOs.Reader;
using API.Entities;
using API.Entities.Enums;
using API.Services.Tasks.Scanner.Parser;
using Docnet.Core;
using Docnet.Core.Converters;
using Docnet.Core.Models;
using Docnet.Core.Readers;
using ExCSS;
using HtmlAgilityPack;
using Kavita.Common;
using Microsoft.Extensions.Logging;
using Microsoft.IO;
using SixLabors.ImageSharp;
using SixLabors.ImageSharp.PixelFormats;
using VersOne.Epub;
using VersOne.Epub.Options;
using VersOne.Epub.Schema;
namespace API.Services;
public interface IBookService
{
int GetNumberOfPages(string filePath);
string GetCoverImage(string fileFilePath, string fileName, string outputDirectory, bool saveAsWebP = false);
ComicInfo? GetComicInfo(string filePath);
ParserInfo? ParseInfo(string filePath);
/// <summary>
/// Scopes styles to .reading-section and replaces img src to the passed apiBase
/// </summary>
/// <param name="stylesheetHtml"></param>
/// <param name="apiBase"></param>
/// <param name="filename">If the stylesheetHtml contains Import statements, when scoping the filename, scope needs to be wrt filepath.</param>
/// <param name="book">Book Reference, needed for if you expect Import statements</param>
/// <returns></returns>
Task<string> ScopeStyles(string stylesheetHtml, string apiBase, string filename, EpubBookRef book);
/// <summary>
/// Extracts a PDF file's pages as images to an target directory
/// </summary>
/// <remarks>This method relies on Docnet which has explicit patches from Kavita for ARM support. This should only be used with Tachiyomi</remarks>
/// <param name="fileFilePath"></param>
/// <param name="targetDirectory">Where the files will be extracted to. If doesn't exist, will be created.</param>
void ExtractPdfImages(string fileFilePath, string targetDirectory);
Task<ICollection<BookChapterItem>> GenerateTableOfContents(Chapter chapter);
Task<string> GetBookPage(int page, int chapterId, string cachedEpubPath, string baseUrl);
Task<Dictionary<string, int>> CreateKeyToPageMappingAsync(EpubBookRef book);
}
public class BookService : IBookService
{
private readonly ILogger<BookService> _logger;
private readonly IDirectoryService _directoryService;
private readonly IImageService _imageService;
private readonly IMediaErrorService _mediaErrorService;
private readonly StylesheetParser _cssParser = new ();
private static readonly RecyclableMemoryStreamManager StreamManager = new ();
private const string CssScopeClass = ".book-content";
private const string BookApiUrl = "book-resources?file=";
public static readonly EpubReaderOptions BookReaderOptions = new()
{
PackageReaderOptions = new PackageReaderOptions
{
IgnoreMissingToc = true
}
};
public BookService(ILogger<BookService> logger, IDirectoryService directoryService, IImageService imageService, IMediaErrorService mediaErrorService)
{
_logger = logger;
_directoryService = directoryService;
_imageService = imageService;
_mediaErrorService = mediaErrorService;
}
private static bool HasClickableHrefPart(HtmlNode anchor)
{
return anchor.GetAttributeValue("href", string.Empty).Contains("#")
&& anchor.GetAttributeValue("tabindex", string.Empty) != "-1"
&& anchor.GetAttributeValue("role", string.Empty) != "presentation";
}
public static string GetContentType(EpubContentType type)
{
string contentType;
switch (type)
{
case EpubContentType.IMAGE_GIF:
contentType = "image/gif";
break;
case EpubContentType.IMAGE_PNG:
contentType = "image/png";
break;
case EpubContentType.IMAGE_JPEG:
contentType = "image/jpeg";
break;
case EpubContentType.FONT_OPENTYPE:
contentType = "font/otf";
break;
case EpubContentType.FONT_TRUETYPE:
contentType = "font/ttf";
break;
case EpubContentType.IMAGE_SVG:
contentType = "image/svg+xml";
break;
default:
contentType = "application/octet-stream";
break;
}
return contentType;
}
private static void UpdateLinks(HtmlNode anchor, Dictionary<string, int> mappings, int currentPage)
{
if (anchor.Name != "a") return;
var hrefParts = CleanContentKeys(anchor.GetAttributeValue("href", string.Empty))
.Split("#");
// Some keys get uri encoded when parsed, so replace any of those characters with original
var mappingKey = HttpUtility.UrlDecode(hrefParts[0]);
if (!mappings.ContainsKey(mappingKey))
{
if (HasClickableHrefPart(anchor))
{
var part = hrefParts.Length > 1
? hrefParts[1]
: anchor.GetAttributeValue("href", string.Empty);
anchor.Attributes.Add("kavita-page", $"{currentPage}");
anchor.Attributes.Add("kavita-part", part);
anchor.Attributes.Remove("href");
anchor.Attributes.Add("href", "javascript:void(0)");
}
else
{
anchor.Attributes.Add("target", "_blank");
anchor.Attributes.Add("rel", "noreferrer noopener");
}
return;
}
var mappedPage = mappings[mappingKey];
anchor.Attributes.Add("kavita-page", $"{mappedPage}");
if (hrefParts.Length > 1)
{
anchor.Attributes.Add("kavita-part",
hrefParts[1]);
}
anchor.Attributes.Remove("href");
anchor.Attributes.Add("href", "javascript:void(0)");
}
/// <summary>
/// Scopes styles to .reading-section and replaces img src to the passed apiBase
/// </summary>
/// <param name="stylesheetHtml"></param>
/// <param name="apiBase"></param>
/// <param name="filename">If the stylesheetHtml contains Import statements, when scoping the filename, scope needs to be wrt filepath.</param>
/// <param name="book">Book Reference, needed for if you expect Import statements</param>
/// <returns></returns>
public async Task<string> ScopeStyles(string stylesheetHtml, string apiBase, string filename, EpubBookRef book)
{
// @Import statements will be handled by browser, so we must inline the css into the original file that request it, so they can be Scoped
var prepend = filename.Length > 0 ? filename.Replace(Path.GetFileName(filename), string.Empty) : string.Empty;
var importBuilder = new StringBuilder();
//foreach (Match match in Tasks.Scanner.Parser.Parser.CssImportUrlRegex().Matches(stylesheetHtml))
foreach (Match match in Parser.CssImportUrlRegex.Matches(stylesheetHtml))
{
if (!match.Success) continue;
var importFile = match.Groups["Filename"].Value;
var key = CleanContentKeys(importFile);
if (!key.Contains(prepend))
{
key = prepend + key;
}
if (!book.Content.AllFiles.ContainsKey(key)) continue;
var bookFile = book.Content.AllFiles[key];
var content = await bookFile.ReadContentAsBytesAsync();
importBuilder.Append(Encoding.UTF8.GetString(content));
}
stylesheetHtml = stylesheetHtml.Insert(0, importBuilder.ToString());
EscapeCssImportReferences(ref stylesheetHtml, apiBase, prepend);
EscapeFontFamilyReferences(ref stylesheetHtml, apiBase, prepend);
// Check if there are any background images and rewrite those urls
EscapeCssImageReferences(ref stylesheetHtml, apiBase, book);
var styleContent = RemoveWhiteSpaceFromStylesheets(stylesheetHtml);
styleContent = styleContent.Replace("body", CssScopeClass);
if (string.IsNullOrEmpty(styleContent)) return string.Empty;
var stylesheet = await _cssParser.ParseAsync(styleContent);
foreach (var styleRule in stylesheet.StyleRules)
{
if (styleRule.Selector.Text == CssScopeClass) continue;
if (styleRule.Selector.Text.Contains(','))
{
styleRule.Text = styleRule.Text.Replace(styleRule.SelectorText,
string.Join(", ",
styleRule.Selector.Text.Split(",").Select(s => $"{CssScopeClass} " + s)));
continue;
}
styleRule.Text = $"{CssScopeClass} " + styleRule.Text;
}
return RemoveWhiteSpaceFromStylesheets(stylesheet.ToCss());
}
private static void EscapeCssImportReferences(ref string stylesheetHtml, string apiBase, string prepend)
{
//foreach (Match match in Tasks.Scanner.Parser.Parser.CssImportUrlRegex().Matches(stylesheetHtml))
foreach (Match match in Parser.CssImportUrlRegex.Matches(stylesheetHtml))
{
if (!match.Success) continue;
var importFile = match.Groups["Filename"].Value;
stylesheetHtml = stylesheetHtml.Replace(importFile, apiBase + prepend + importFile);
}
}
private static void EscapeFontFamilyReferences(ref string stylesheetHtml, string apiBase, string prepend)
{
//foreach (Match match in Tasks.Scanner.Parser.Parser.FontSrcUrlRegex().Matches(stylesheetHtml))
foreach (Match match in Parser.FontSrcUrlRegex.Matches(stylesheetHtml))
{
if (!match.Success) continue;
var importFile = match.Groups["Filename"].Value;
stylesheetHtml = stylesheetHtml.Replace(importFile, apiBase + prepend + importFile);
}
}
private static void EscapeCssImageReferences(ref string stylesheetHtml, string apiBase, EpubBookRef book)
{
//var matches = Tasks.Scanner.Parser.Parser.CssImageUrlRegex().Matches(stylesheetHtml);
var matches = Parser.CssImageUrlRegex.Matches(stylesheetHtml);
foreach (Match match in matches)
{
if (!match.Success) continue;
var importFile = match.Groups["Filename"].Value;
var key = CleanContentKeys(importFile);
if (!book.Content.AllFiles.ContainsKey(key)) continue;
stylesheetHtml = stylesheetHtml.Replace(importFile, apiBase + key);
}
}
private static void ScopeImages(HtmlDocument doc, EpubBookRef book, string apiBase)
{
var images = doc.DocumentNode.SelectNodes("//img")
?? doc.DocumentNode.SelectNodes("//image") ?? doc.DocumentNode.SelectNodes("//svg");
if (images == null) return;
var parent = images.First().ParentNode;
foreach (var image in images)
{
string? key = null;
if (image.Attributes["src"] != null)
{
key = "src";
}
else if (image.Attributes["xlink:href"] != null)
{
key = "xlink:href";
}
if (string.IsNullOrEmpty(key)) continue;
var imageFile = GetKeyForImage(book, image.Attributes[key].Value);
image.Attributes.Remove(key);
// UrlEncode here to transform ../ into an escaped version, which avoids blocking on nginx
image.Attributes.Add(key, $"{apiBase}" + HttpUtility.UrlEncode(imageFile));
// Add a custom class that the reader uses to ensure images stay within reader
parent.AddClass("kavita-scale-width-container");
image.AddClass("kavita-scale-width");
}
}
/// <summary>
/// Returns the image key associated with the file. Contains some basic fallback logic.
/// </summary>
/// <param name="book"></param>
/// <param name="imageFile"></param>
/// <returns></returns>
private static string GetKeyForImage(EpubBookRef book, string imageFile)
{
if (book.Content.Images.ContainsKey(imageFile)) return imageFile;
var correctedKey = book.Content.Images.Keys.SingleOrDefault(s => s.EndsWith(imageFile));
if (correctedKey != null)
{
imageFile = correctedKey;
}
else if (imageFile.StartsWith(".."))
{
// There are cases where the key is defined static like OEBPS/Images/1-4.jpg but reference is ../Images/1-4.jpg
correctedKey =
book.Content.Images.Keys.SingleOrDefault(s => s.EndsWith(imageFile.Replace("..", string.Empty)));
if (correctedKey != null)
{
imageFile = correctedKey;
}
}
return imageFile;
}
private static string PrepareFinalHtml(HtmlDocument doc, HtmlNode body)
{
// Check if any classes on the html node (some r2l books do this) and move them to body tag for scoping
var htmlNode = doc.DocumentNode.SelectSingleNode("//html");
if (htmlNode == null || !htmlNode.Attributes.Contains("class")) return body.InnerHtml;
var bodyClasses = body.Attributes.Contains("class") ? body.Attributes["class"].Value : string.Empty;
var classes = htmlNode.Attributes["class"].Value + " " + bodyClasses;
body.Attributes.Add("class", $"{classes}");
// I actually need the body tag itself for the classes, so i will create a div and put the body stuff there.
return $"<div class=\"{body.Attributes["class"].Value}\">{body.InnerHtml}</div>";
}
private static void RewriteAnchors(int page, HtmlDocument doc, Dictionary<string, int> mappings)
{
var anchors = doc.DocumentNode.SelectNodes("//a");
if (anchors == null) return;
foreach (var anchor in anchors)
{
UpdateLinks(anchor, mappings, page);
}
}
private async Task InlineStyles(HtmlDocument doc, EpubBookRef book, string apiBase, HtmlNode body)
{
var inlineStyles = doc.DocumentNode.SelectNodes("//style");
if (inlineStyles != null)
{
foreach (var inlineStyle in inlineStyles)
{
var styleContent = await ScopeStyles(inlineStyle.InnerHtml, apiBase, "", book);
body.PrependChild(HtmlNode.CreateNode($"<style>{styleContent}</style>"));
}
}
var styleNodes = doc.DocumentNode.SelectNodes("/html/head/link");
if (styleNodes != null)
{
foreach (var styleLinks in styleNodes)
{
var key = CleanContentKeys(styleLinks.Attributes["href"].Value);
// Some epubs are malformed the key in content.opf might be: content/resources/filelist_0_0.xml but the actual html links to resources/filelist_0_0.xml
// In this case, we will do a search for the key that ends with
if (!book.Content.Css.ContainsKey(key))
{
var correctedKey = book.Content.Css.Keys.SingleOrDefault(s => s.EndsWith(key));
if (correctedKey == null)
{
_logger.LogError("Epub is Malformed, key: {Key} is not matching OPF file", key);
continue;
}
key = correctedKey;
}
try
{
var cssFile = book.Content.Css[key];
var styleContent = await ScopeStyles(await cssFile.ReadContentAsync(), apiBase,
cssFile.FileName, book);
if (styleContent != null)
{
body.PrependChild(HtmlNode.CreateNode($"<style>{styleContent}</style>"));
}
}
catch (Exception ex)
{
_logger.LogError(ex, "There was an error reading css file for inlining likely due to a key mismatch in metadata");
await _mediaErrorService.ReportMediaIssueAsync(book.FilePath, MediaErrorProducer.BookService,
"There was an error reading css file for inlining likely due to a key mismatch in metadata", ex);
}
}
}
}
public ComicInfo? GetComicInfo(string filePath)
{
if (!IsValidFile(filePath) || Parser.IsPdf(filePath)) return null;
try
{
using var epubBook = EpubReader.OpenBook(filePath, BookReaderOptions);
var publicationDate =
epubBook.Schema.Package.Metadata.Dates.FirstOrDefault(pDate => pDate.Event == "publication")?.Date;
if (string.IsNullOrEmpty(publicationDate))
{
publicationDate = epubBook.Schema.Package.Metadata.Dates.FirstOrDefault()?.Date;
}
var (year, month, day) = GetPublicationDate(publicationDate);
var info = new ComicInfo
{
Summary = epubBook.Schema.Package.Metadata.Description,
Publisher = string.Join(",", epubBook.Schema.Package.Metadata.Publishers),
Month = month,
Day = day,
Year = year,
Title = epubBook.Title,
Genre = string.Join(",", epubBook.Schema.Package.Metadata.Subjects.Select(s => s.ToLower().Trim())),
LanguageISO = ValidateLanguage(epubBook.Schema.Package.Metadata.Languages.FirstOrDefault())
};
ComicInfo.CleanComicInfo(info);
// Parse tags not exposed via Library
foreach (var metadataItem in epubBook.Schema.Package.Metadata.MetaItems)
{
// EPUB 2 and 3
switch (metadataItem.Name)
{
case "calibre:rating":
info.UserRating = float.Parse(metadataItem.Content);
break;
case "calibre:title_sort":
info.TitleSort = metadataItem.Content;
break;
case "calibre:series":
info.Series = metadataItem.Content;
info.SeriesSort = metadataItem.Content;
break;
case "calibre:series_index":
info.Volume = metadataItem.Content;
break;
}
// EPUB 3.2+ only
switch (metadataItem.Property)
{
case "group-position":
info.Volume = metadataItem.Content;
break;
case "belongs-to-collection":
info.Series = metadataItem.Content;
info.SeriesSort = metadataItem.Content;
break;
case "collection-type":
// These look to be genres from https://manual.calibre-ebook.com/sub_groups.html or can be "series"
break;
case "role":
if (!metadataItem.Scheme.Equals("marc:relators")) break;
var creatorId = metadataItem.Refines.Replace("#", string.Empty);
var person = epubBook.Schema.Package.Metadata.Creators.SingleOrDefault(c => c.Id == creatorId);
if (person == null) break;
PopulatePerson(metadataItem, info, person);
break;
case "title-type":
break;
// This is currently not possible until VersOne update's to allow EPUB 3 Title to have attributes
if (!metadataItem.Content.Equals("collection")) break;
var titleId = metadataItem.Refines.Replace("#", string.Empty);
var readingListElem = epubBook.Schema.Package.Metadata.MetaItems.FirstOrDefault(item =>
item.Name == "dc:title" && item.Id == titleId);
if (readingListElem == null) break;
var count = epubBook.Schema.Package.Metadata.MetaItems
.FirstOrDefault(item =>
item.Property == "display-seq" && item.Refines == metadataItem.Refines);
if (count == null || count.Content == "0")
{
// Treat this as a Collection
info.StoryArc += "," + readingListElem.Content;
}
else
{
// Treat as a reading list
info.AlternateSeries += "," + readingListElem.Content;
info.AlternateNumber += "," + count.Content;
}
break;
}
}
// Include regular Writer as well, for cases where there is no special tag
info.Writer = string.Join(",",
epubBook.Schema.Package.Metadata.Creators.Select(c => Parser.CleanAuthor(c.Creator)));
var hasVolumeInSeries = !Parser.ParseVolume(info.Title)
.Equals(Parser.DefaultVolume);
if (string.IsNullOrEmpty(info.Volume) && hasVolumeInSeries && (!info.Series.Equals(info.Title) || string.IsNullOrEmpty(info.Series)))
{
// This is likely a light novel for which we can set series from parsed title
info.Series = Parser.ParseSeries(info.Title);
info.Volume = Parser.ParseVolume(info.Title);
}
return info;
}
catch (Exception ex)
{
_logger.LogWarning(ex, "[GetComicInfo] There was an exception parsing metadata");
_mediaErrorService.ReportMediaIssue(filePath, MediaErrorProducer.BookService,
"There was an exception parsing metadata", ex);
}
return null;
}
private static void PopulatePerson(EpubMetadataMeta metadataItem, ComicInfo info, EpubMetadataCreator person)
{
switch (metadataItem.Content)
{
case "art":
case "artist":
info.CoverArtist += AppendAuthor(person);
return;
case "aut":
case "author":
info.Writer += AppendAuthor(person);
return;
case "pbl":
case "publisher":
info.Publisher += AppendAuthor(person);
return;
case "trl":
case "translator":
info.Translator += AppendAuthor(person);
return;
case "edt":
case "editor":
info.Editor += AppendAuthor(person);
return;
case "ill":
case "illustrator":
info.Letterer += AppendAuthor(person);
return;
case "clr":
case "colorist":
info.Colorist += AppendAuthor(person);
return;
}
}
private static string AppendAuthor(EpubMetadataCreator person)
{
return Parser.CleanAuthor(person.Creator) + ",";
}
private static (int year, int month, int day) GetPublicationDate(string publicationDate)
{
var dateParsed = DateTime.TryParse(publicationDate, out var date);
var year = 0;
var month = 0;
var day = 0;
switch (dateParsed)
{
case true:
year = date.Year;
month = date.Month;
day = date.Day;
break;
case false when !string.IsNullOrEmpty(publicationDate) && publicationDate.Length == 4:
int.TryParse(publicationDate, out year);
break;
}
return (year, month, day);
}
private static string ValidateLanguage(string? language)
{
if (string.IsNullOrEmpty(language)) return string.Empty;
try
{
return CultureInfo.GetCultureInfo(language).ToString();
}
catch (Exception)
{
return string.Empty;
}
}
private bool IsValidFile(string filePath)
{
if (!File.Exists(filePath))
{
_logger.LogWarning("[BookService] Book {EpubFile} could not be found", filePath);
return false;
}
if (Parser.IsBook(filePath)) return true;
_logger.LogWarning("[BookService] Book {EpubFile} is not a valid EPUB/PDF", filePath);
return false;
}
public int GetNumberOfPages(string filePath)
{
if (!IsValidFile(filePath)) return 0;
try
{
if (Parser.IsPdf(filePath))
{
using var docReader = DocLib.Instance.GetDocReader(filePath, new PageDimensions(1080, 1920));
return docReader.GetPageCount();
}
using var epubBook = EpubReader.OpenBook(filePath, BookReaderOptions);
return epubBook.GetReadingOrder().Count;
}
catch (Exception ex)
{
_logger.LogWarning(ex, "[BookService] There was an exception getting number of pages, defaulting to 0");
_mediaErrorService.ReportMediaIssue(filePath, MediaErrorProducer.BookService,
"There was an exception getting number of pages, defaulting to 0", ex);
}
return 0;
}
private static string EscapeTags(string content)
{
// content = StartingScriptTag().Replace(content, "<script$1></script>");
// content = StartingTitleTag().Replace(content, "<title$1></title>");
content = Regex.Replace(content, @"<script(.*)(/>)", "<script$1></script>", RegexOptions.None, Parser.RegexTimeout);
content = Regex.Replace(content, @"<title(.*)(/>)", "<title$1></title>", RegexOptions.None, Parser.RegexTimeout);
return content;
}
/// <summary>
/// Removes all leading ../
/// </summary>
/// <param name="key"></param>
/// <returns></returns>
public static string CleanContentKeys(string key)
{
return key.Replace("../", string.Empty);
}
public async Task<Dictionary<string, int>> CreateKeyToPageMappingAsync(EpubBookRef book)
{
var dict = new Dictionary<string, int>();
var pageCount = 0;
foreach (var contentFileRef in await book.GetReadingOrderAsync())
{
if (contentFileRef.ContentType != EpubContentType.XHTML_1_1) continue;
dict.Add(contentFileRef.FileName, pageCount);
pageCount += 1;
}
return dict;
}
/// <summary>
/// Parses out Title from book. Chapters and Volumes will always be "0". If there is any exception reading book (malformed books)
/// then null is returned. This expects only an epub file
/// </summary>
/// <param name="filePath"></param>
/// <returns></returns>
public ParserInfo? ParseInfo(string filePath)
{
if (!Parser.IsEpub(filePath)) return null;
try
{
using var epubBook = EpubReader.OpenBook(filePath, BookReaderOptions);
// <meta content="The Dark Tower" name="calibre:series"/>
// <meta content="Wolves of the Calla" name="calibre:title_sort"/>
// If all three are present, we can take that over dc:title and format as:
// Series = The Dark Tower, Volume = 5, Filename as "Wolves of the Calla"
// In addition, the following can exist and should parse as a series (EPUB 3.2 spec)
// <meta property="belongs-to-collection" id="c01">
// The Lord of the Rings
// </meta>
// <meta refines="#c01" property="collection-type">set</meta>
// <meta refines="#c01" property="group-position">2</meta>
try
{
var seriesIndex = string.Empty;
var series = string.Empty;
var specialName = string.Empty;
foreach (var metadataItem in epubBook.Schema.Package.Metadata.MetaItems)
{
// EPUB 2 and 3
switch (metadataItem.Name)
{
case "calibre:series_index":
seriesIndex = metadataItem.Content;
break;
case "calibre:series":
series = metadataItem.Content;
break;
case "calibre:title_sort":
specialName = metadataItem.Content;
break;
}
// EPUB 3.2+ only
switch (metadataItem.Property)
{
case "group-position":
seriesIndex = metadataItem.Content;
break;
case "belongs-to-collection":
series = metadataItem.Content;
break;
case "collection-type":
// These look to be genres from https://manual.calibre-ebook.com/sub_groups.html or can be "series"
break;
}
}
if (!string.IsNullOrEmpty(series) && !string.IsNullOrEmpty(seriesIndex))
{
if (string.IsNullOrEmpty(specialName))
{
specialName = epubBook.Title;
}
var info = new ParserInfo
{
Chapters = Parser.DefaultChapter,
Edition = string.Empty,
Format = MangaFormat.Epub,
Filename = Path.GetFileName(filePath),
Title = specialName?.Trim() ?? string.Empty,
FullFilePath = filePath,
IsSpecial = false,
Series = series.Trim(),
SeriesSort = series.Trim(),
Volumes = seriesIndex
};
return info;
}
}
catch (Exception)
{
// Swallow exception
}
return new ParserInfo
{
Chapters = Parser.DefaultChapter,
Edition = string.Empty,
Format = MangaFormat.Epub,
Filename = Path.GetFileName(filePath),
Title = epubBook.Title.Trim(),
FullFilePath = filePath,
IsSpecial = false,
Series = epubBook.Title.Trim(),
Volumes = Parser.DefaultVolume,
};
}
catch (Exception ex)
{
_logger.LogWarning(ex, "[BookService] There was an exception when opening epub book: {FileName}", filePath);
_mediaErrorService.ReportMediaIssue(filePath, MediaErrorProducer.BookService,
"There was an exception when opening epub book", ex);
}
return null;
}
/// <summary>
/// Extracts a pdf into images to a target directory. Uses multi-threaded implementation since docnet is slow normally.
/// </summary>
/// <param name="fileFilePath"></param>
/// <param name="targetDirectory"></param>
public void ExtractPdfImages(string fileFilePath, string targetDirectory)
{
_directoryService.ExistOrCreate(targetDirectory);
using var docReader = DocLib.Instance.GetDocReader(fileFilePath, new PageDimensions(1080, 1920));
var pages = docReader.GetPageCount();
Parallel.For(0, pages, pageNumber =>
{
using var stream = StreamManager.GetStream("BookService.GetPdfPage");
GetPdfPage(docReader, pageNumber, stream);
using var fileStream = File.Create(Path.Combine(targetDirectory, "Page-" + pageNumber + ".png"));
stream.Seek(0, SeekOrigin.Begin);
stream.CopyTo(fileStream);
});
}
/// <summary>
/// Responsible to scope all the css, links, tags, etc to prepare a self contained html file for the page
/// </summary>
/// <param name="doc">Html Doc that will be appended to</param>
/// <param name="book">Underlying epub</param>
/// <param name="apiBase">API Url for file loading to pass through</param>
/// <param name="body">Body element from the epub</param>
/// <param name="mappings">Epub mappings</param>
/// <param name="page">Page number we are loading</param>
/// <returns></returns>
public async Task<string> ScopePage(HtmlDocument doc, EpubBookRef book, string apiBase, HtmlNode body, Dictionary<string, int> mappings, int page)
{
await InlineStyles(doc, book, apiBase, body);
RewriteAnchors(page, doc, mappings);
ScopeImages(doc, book, apiBase);
return PrepareFinalHtml(doc, body);
}
/// <summary>
/// Tries to find the correct key by applying cleaning and remapping if the epub has bad data. Only works for HTML files.
/// </summary>
/// <param name="book"></param>
/// <param name="mappings"></param>
/// <param name="key"></param>
/// <returns></returns>
private static string CoalesceKey(EpubBookRef book, IDictionary<string, int> mappings, string key)
{
if (mappings.ContainsKey(CleanContentKeys(key))) return key;
// Fallback to searching for key (bad epub metadata)
var correctedKey = book.Content.Html.Keys.FirstOrDefault(s => s.EndsWith(key));
if (!string.IsNullOrEmpty(correctedKey))
{
key = correctedKey;
}
return key;
}
public static string CoalesceKeyForAnyFile(EpubBookRef book, string key)
{
if (book.Content.AllFiles.ContainsKey(key)) return key;
var cleanedKey = CleanContentKeys(key);
if (book.Content.AllFiles.ContainsKey(cleanedKey)) return cleanedKey;
// Fallback to searching for key (bad epub metadata)
var correctedKey = book.Content.AllFiles.Keys.SingleOrDefault(s => s.EndsWith(key));
if (!string.IsNullOrEmpty(correctedKey))
{
key = correctedKey;
}
return key;
}
/// <summary>
/// This will return a list of mappings from ID -> page num. ID will be the xhtml key and page num will be the reading order
/// this is used to rewrite anchors in the book text so that we always load properly in our reader.
/// </summary>
/// <param name="chapter">Chapter with at least one file</param>
/// <returns></returns>
public async Task<ICollection<BookChapterItem>> GenerateTableOfContents(Chapter chapter)
{
using var book = await EpubReader.OpenBookAsync(chapter.Files.ElementAt(0).FilePath, BookReaderOptions);
var mappings = await CreateKeyToPageMappingAsync(book);
var navItems = await book.GetNavigationAsync();
var chaptersList = new List<BookChapterItem>();
foreach (var navigationItem in navItems)
{
if (navigationItem.NestedItems.Count == 0)
{
CreateToCChapter(navigationItem, Array.Empty<BookChapterItem>(), chaptersList, mappings);
continue;
}
var nestedChapters = new List<BookChapterItem>();
foreach (var nestedChapter in navigationItem.NestedItems.Where(n => n.Link != null))
{
var key = CoalesceKey(book, mappings, nestedChapter.Link.ContentFileName);
if (mappings.ContainsKey(key))
{
nestedChapters.Add(new BookChapterItem
{
Title = nestedChapter.Title,
Page = mappings[key],
Part = nestedChapter.Link.Anchor ?? string.Empty,
Children = new List<BookChapterItem>()
});
}
}
CreateToCChapter(navigationItem, nestedChapters, chaptersList, mappings);
}
if (chaptersList.Count != 0) return chaptersList;
// Generate from TOC from links (any point past this, Kavita is generating as a TOC doesn't exist)
var tocPage = book.Content.Html.Keys.FirstOrDefault(k => k.ToUpper().Contains("TOC"));
if (tocPage == null) return chaptersList;
// Find all anchor tags, for each anchor we get inner text, to lower then title case on UI. Get href and generate page content
var doc = new HtmlDocument();
var content = await book.Content.Html[tocPage].ReadContentAsync();
doc.LoadHtml(content);
var anchors = doc.DocumentNode.SelectNodes("//a");
if (anchors == null) return chaptersList;
foreach (var anchor in anchors)
{
if (!anchor.Attributes.Contains("href")) continue;
var key = CoalesceKey(book, mappings, anchor.Attributes["href"].Value.Split("#")[0]);
if (string.IsNullOrEmpty(key) || !mappings.ContainsKey(key)) continue;
var part = string.Empty;
if (anchor.Attributes["href"].Value.Contains('#'))
{
part = anchor.Attributes["href"].Value.Split("#")[1];
}
chaptersList.Add(new BookChapterItem
{
Title = anchor.InnerText,
Page = mappings[key],
Part = part,
Children = new List<BookChapterItem>()
});
}
return chaptersList;
}
/// <summary>
/// This returns a single page within the epub book. All html will be rewritten to be scoped within our reader,
/// all css is scoped, etc.
/// </summary>
/// <param name="page">The requested page</param>
/// <param name="chapterId">The chapterId</param>
/// <param name="cachedEpubPath">The path to the cached epub file</param>
/// <param name="baseUrl">The API base for Kavita, to rewrite urls to so we load though our endpoint</param>
/// <returns>Full epub HTML Page, scoped to Kavita's reader</returns>
/// <exception cref="KavitaException">All exceptions throw this</exception>
public async Task<string> GetBookPage(int page, int chapterId, string cachedEpubPath, string baseUrl)
{
using var book = await EpubReader.OpenBookAsync(cachedEpubPath, BookReaderOptions);
var mappings = await CreateKeyToPageMappingAsync(book);
var apiBase = baseUrl + "book/" + chapterId + "/" + BookApiUrl;
var counter = 0;
var doc = new HtmlDocument {OptionFixNestedTags = true};
var bookPages = await book.GetReadingOrderAsync();
try
{
foreach (var contentFileRef in bookPages)
{
if (page != counter)
{
counter++;
continue;
}
var content = await contentFileRef.ReadContentAsync();
if (contentFileRef.ContentType != EpubContentType.XHTML_1_1) return content;
// In more cases than not, due to this being XML not HTML, we need to escape the script tags.
content = EscapeTags(content);
doc.LoadHtml(content);
var body = doc.DocumentNode.SelectSingleNode("//body");
if (body == null)
{
if (doc.ParseErrors.Any())
{
LogBookErrors(book, contentFileRef, doc);
throw new KavitaException("The file is malformed! Cannot read.");
}
_logger.LogError("{FilePath} has no body tag! Generating one for support. Book may be skewed", book.FilePath);
doc.DocumentNode.SelectSingleNode("/html").AppendChild(HtmlNode.CreateNode("<body></body>"));
body = doc.DocumentNode.SelectSingleNode("/html/body");
}
return await ScopePage(doc, book, apiBase, body, mappings, page);
}
} catch (Exception ex)
{
_logger.LogError(ex, "There was an issue reading one of the pages for {Book}", book.FilePath);
await _mediaErrorService.ReportMediaIssueAsync(book.FilePath, MediaErrorProducer.BookService,
"There was an issue reading one of the pages for", ex);
}
throw new KavitaException("Could not find the appropriate html for that page");
}
private static void CreateToCChapter(EpubNavigationItemRef navigationItem, IList<BookChapterItem> nestedChapters,
ICollection<BookChapterItem> chaptersList, IReadOnlyDictionary<string, int> mappings)
{
if (navigationItem.Link == null)
{
var item = new BookChapterItem
{
Title = navigationItem.Title,
Children = nestedChapters
};
if (nestedChapters.Count > 0)
{
item.Page = nestedChapters[0].Page;
}
chaptersList.Add(item);
}
else
{
var groupKey = CleanContentKeys(navigationItem.Link.ContentFileName);
if (mappings.ContainsKey(groupKey))
{
chaptersList.Add(new BookChapterItem
{
Title = navigationItem.Title,
Page = mappings[groupKey],
Children = nestedChapters
});
}
}
}
/// <summary>
/// Extracts the cover image to covers directory and returns file path back
/// </summary>
/// <param name="fileFilePath"></param>
/// <param name="fileName">Name of the new file.</param>
/// <param name="outputDirectory">Where to output the file, defaults to covers directory</param>
/// <param name="saveAsWebP">When saving the file, use WebP encoding instead of PNG</param>
/// <returns></returns>
public string GetCoverImage(string fileFilePath, string fileName, string outputDirectory, bool saveAsWebP = false)
{
if (!IsValidFile(fileFilePath)) return string.Empty;
if (Parser.IsPdf(fileFilePath))
{
return GetPdfCoverImage(fileFilePath, fileName, outputDirectory, saveAsWebP);
}
using var epubBook = EpubReader.OpenBook(fileFilePath, BookReaderOptions);
try
{
// Try to get the cover image from OPF file, if not set, try to parse it from all the files, then result to the first one.
var coverImageContent = epubBook.Content.Cover
?? epubBook.Content.Images.Values.FirstOrDefault(file => Parser.IsCoverImage(file.FileName))
?? epubBook.Content.Images.Values.FirstOrDefault();
if (coverImageContent == null) return string.Empty;
using var stream = coverImageContent.GetContentStream();
return _imageService.WriteCoverThumbnail(stream, fileName, outputDirectory, saveAsWebP);
}
catch (Exception ex)
{
_logger.LogWarning(ex, "[BookService] There was a critical error and prevented thumbnail generation on {BookFile}. Defaulting to no cover image", fileFilePath);
_mediaErrorService.ReportMediaIssue(fileFilePath, MediaErrorProducer.BookService,
"There was a critical error and prevented thumbnail generation", ex);
}
return string.Empty;
}
private string GetPdfCoverImage(string fileFilePath, string fileName, string outputDirectory, bool saveAsWebP)
{
try
{
using var docReader = DocLib.Instance.GetDocReader(fileFilePath, new PageDimensions(1080, 1920));
if (docReader.GetPageCount() == 0) return string.Empty;
using var stream = StreamManager.GetStream("BookService.GetPdfPage");
GetPdfPage(docReader, 0, stream);
return _imageService.WriteCoverThumbnail(stream, fileName, outputDirectory, saveAsWebP);
}
catch (Exception ex)
{
_logger.LogWarning(ex,
"[BookService] There was a critical error and prevented thumbnail generation on {BookFile}. Defaulting to no cover image",
fileFilePath);
_mediaErrorService.ReportMediaIssue(fileFilePath, MediaErrorProducer.BookService,
"There was a critical error and prevented thumbnail generation", ex);
}
return string.Empty;
}
/// <summary>
/// Returns an image raster of a page within a PDF
/// </summary>
/// <param name="docReader"></param>
/// <param name="pageNumber"></param>
/// <param name="stream"></param>
private static void GetPdfPage(IDocReader docReader, int pageNumber, Stream stream)
{
using var pageReader = docReader.GetPageReader(pageNumber);
var rawBytes = pageReader.GetImage(new NaiveTransparencyRemover());
var width = pageReader.GetPageWidth();
var height = pageReader.GetPageHeight();
var image = Image.LoadPixelData<Bgra32>(rawBytes, width, height);
stream.Seek(0, SeekOrigin.Begin);
image.SaveAsPng(stream);
stream.Seek(0, SeekOrigin.Begin);
}
private static string RemoveWhiteSpaceFromStylesheets(string body)
{
if (string.IsNullOrEmpty(body))
{
return string.Empty;
}
// Remove comments from CSS
// body = CssComment().Replace(body, string.Empty);
//
// body = WhiteSpace1().Replace(body, "#");
// body = WhiteSpace2().Replace(body, string.Empty);
// body = WhiteSpace3().Replace(body, " ");
// body = WhiteSpace4().Replace(body, "$1");
body = Regex.Replace(body, @"/\*[\d\D]*?\*/", string.Empty, RegexOptions.None, Parser.RegexTimeout);
body = Regex.Replace(body, @"[a-zA-Z]+#", "#", RegexOptions.None, Parser.RegexTimeout);
body = Regex.Replace(body, @"[\n\r]+\s*", string.Empty, RegexOptions.None, Parser.RegexTimeout);
body = Regex.Replace(body, @"\s+", " ", RegexOptions.None, Parser.RegexTimeout);
body = Regex.Replace(body, @"\s?([:,;{}])\s?", "$1", RegexOptions.None, Parser.RegexTimeout);
try
{
body = body.Replace(";}", "}");
}
catch (Exception)
{
//Swallow exception. Some css don't have style rules ending in ';'
}
//body = UnitPadding().Replace(body, "$1");
body = Regex.Replace(body, @"([\s:]0)(px|pt|%|em)", "$1", RegexOptions.None, Parser.RegexTimeout);
return body;
}
private void LogBookErrors(EpubBookRef book, EpubContentFileRef contentFileRef, HtmlDocument doc)
{
_logger.LogError("{FilePath} has an invalid html file (Page {PageName})", book.FilePath, contentFileRef.FileName);
foreach (var error in doc.ParseErrors)
{
_logger.LogError("Line {LineNumber}, Reason: {Reason}", error.Line, error.Reason);
}
}
}