Epub 3.2 Collection Tag support (#308)

* Hooked up series parsing for collections based on the EPUB 3.2 spec, and fixed improperly handled self-closing tags in EPUBs (the content is XML, but we parse it with an HTML parser). * Fixed a bug with src:url URL replacement by switching to a much cleaner regex
2025-07-09 03:04:19 -04:00 · 2021-06-15 09:51:37 -05:00 · 2021-06-15 09:51:37 -05:00 · d02d2d3cb5
commit d02d2d3cb5
parent 584348c6ad
3 changed files with 44 additions and 6 deletions
--- a/API/Controllers/BookController.cs
+++ b/API/Controllers/BookController.cs
@ -186,6 +186,9 @@ namespace API.Controllers
                    var content = await contentFileRef.ReadContentAsync();
                    if (contentFileRef.ContentType != EpubContentType.XHTML_1_1) return Ok(content);
                    
+                    // The content is XML, not HTML, so self-closing <script/> and <title/> tags must be expanded into explicit open/close pairs before it is loaded as HTML.
+                    content = BookService.EscapeTags(content);
+                    
                    doc.LoadHtml(content);
                    var body = doc.DocumentNode.SelectSingleNode("//body");

--- a/API/Parser/Parser.cs
+++ b/API/Parser/Parser.cs
@ -15,7 +15,7 @@ namespace API.Parser
        public const string ArchiveFileExtensions = @"\.cbz|\.zip|\.rar|\.cbr|\.tar.gz|\.7zip|\.7z|.cb7";
        public const string BookFileExtensions = @"\.epub";
        public const string ImageFileExtensions = @"^(\.png|\.jpeg|\.jpg)";
-        public static readonly Regex FontSrcUrlRegex = new Regex("(src:url\\(\"?'?)([a-z0-9/\\._]+)(\"?'?\\))", RegexOptions.IgnoreCase | RegexOptions.Compiled);
+        public static readonly Regex FontSrcUrlRegex = new Regex(@"(src:url\(.{1})" + "([^\"']*)" + @"(.{1}\))", RegexOptions.IgnoreCase | RegexOptions.Compiled);
        public static readonly Regex CssImportUrlRegex = new Regex("(@import\\s[\"|'])(?<Filename>[\\w\\d/\\._-]+)([\"|'];?)", RegexOptions.IgnoreCase | RegexOptions.Compiled);

        private static readonly string XmlRegexExtensions = @"\.xml";
--- a/API/Services/BookService.cs
+++ b/API/Services/BookService.cs
@ -23,7 +23,7 @@ namespace API.Services

        private const int ThumbnailWidth = 320; // 153w x 230h
        private readonly StylesheetParser _cssParser = new ();
-      
+
        public BookService(ILogger<BookService> logger)
        {
            _logger = logger;
@ -204,6 +204,13 @@ namespace API.Services
            return 0;
        }

+        /// <summary>
+        /// Expands self-closing <c>&lt;script .../&gt;</c> and <c>&lt;title .../&gt;</c> elements into explicit
+        /// open/close pairs (e.g. <c>&lt;script src="a.js"&gt;&lt;/script&gt;</c>). These elements are valid in
+        /// XHTML, but the content is later loaded with an HTML parser, which presumably does not treat
+        /// the trailing <c>/&gt;</c> as closing the element.
+        /// </summary>
+        /// <param name="content">Raw XHTML markup. Null or empty input is returned unchanged.</param>
+        /// <returns>The markup with self-closing script and title elements expanded.</returns>
+        public static string EscapeTags(string content)
+        {
+            if (string.IsNullOrEmpty(content)) return content;
+
+            // (\s[^>]*)? keeps the match inside a single tag: a greedy ".*" would run to the LAST "/>" on the
+            // line and swallow any tags in between. Requiring whitespace (or "/>") right after the tag name
+            // also prevents matching longer names such as <titlepage/>.
+            content = Regex.Replace(content, @"<script(\s[^>]*)?/>", "<script$1></script>");
+            content = Regex.Replace(content, @"<title(\s[^>]*)?/>", "<title$1></title>");
+            return content;
+        }
+
        public static string CleanContentKeys(string key)
        {
            return key.Replace("../", string.Empty);
@ -241,14 +248,23 @@ namespace API.Services
                // <meta content="Wolves of the Calla" name="calibre:title_sort"/>
                // If all three are present, we can take that over dc:title and format as:
                // Series = The Dark Tower, Volume = 5, Filename as "Wolves of the Calla"
+                // In addition, the following can exist and should parse as a series (EPUB 3.2 spec)
+                // <meta property="belongs-to-collection" id="c01">
+                //   The Lord of the Rings
+                // </meta>
+                // <meta refines="#c01" property="collection-type">set</meta>
+                // <meta refines="#c01" property="group-position">2</meta>
                try
                {
-                    string seriesIndex = string.Empty;
-                    string series = string.Empty;
-                    string specialName = string.Empty;
+                    var seriesIndex = string.Empty;
+                    var series = string.Empty;
+                    var specialName = string.Empty;
+                    var groupPosition = string.Empty;
+
                    
                    foreach (var metadataItem in epubBook.Schema.Package.Metadata.MetaItems)
                    {
+                        // EPUB 2 and 3
                        switch (metadataItem.Name)
                        {
                            case "calibre:series_index":
@ -261,10 +277,29 @@ namespace API.Services
                                specialName = metadataItem.Content;
                                break;
                        }
+
+                        // EPUB 3.2+ only
+                        switch (metadataItem.Property)
+                        {
+                            case "group-position":
+                                seriesIndex = metadataItem.Content;
+                                break;
+                            case "belongs-to-collection":
+                                series = metadataItem.Content;
+                                break;
+                            case "collection-type":
+                                groupPosition = metadataItem.Content;
+                                break;
+                        }
                    }

-                    if (!string.IsNullOrEmpty(series) && !string.IsNullOrEmpty(seriesIndex) && !string.IsNullOrEmpty(specialName))
+                    if (!string.IsNullOrEmpty(series) && !string.IsNullOrEmpty(seriesIndex) &&
+                        (!string.IsNullOrEmpty(specialName) || groupPosition.Equals("series") || groupPosition.Equals("set")))
                    {
+                        if (string.IsNullOrEmpty(specialName))
+                        {
+                            specialName = epubBook.Title;
+                        }
                        return new ParserInfo()
                        {
                            Chapters = "0",