From c2d52efbcd011ce93f594f91c4be93985aff8d9e Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 22 Feb 2025 13:50:41 +0530
Subject: [PATCH] Implement unkepublify

---
 src/calibre/ebooks/oeb/polish/container.py    |   7 +-
 src/calibre/ebooks/oeb/polish/kepubify.py     | 139 +++++++++++++-----
 .../ebooks/oeb/polish/tests/kepubify.py       |  41 ++++--
 3 files changed, 132 insertions(+), 55 deletions(-)

diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py
index b09d2a5e39..6edd901211 100644
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@@ -1577,7 +1577,7 @@ class AZW3Container(Container):
 # }}}
 
 
-def get_container(path, log=None, tdir=None, tweak_mode=False) -> Container:
+def get_container(path, log=None, tdir=None, tweak_mode=False, ebook_cls=None) -> Container:
     if log is None:
         log = default_log
     try:
@@ -1585,8 +1585,9 @@ def get_container(path, log=None, tdir=None, tweak_mode=False) -> Container:
     except Exception:
         isdir = False
     own_tdir = not tdir
-    ebook_cls = (AZW3Container if path.rpartition('.')[-1].lower() in {'azw3', 'mobi', 'original_azw3', 'original_mobi'} and not isdir
-            else EpubContainer)
+    if ebook_cls is None:
+        ebook_cls = (AZW3Container if path.rpartition('.')[-1].lower() in {'azw3', 'mobi', 'original_azw3', 'original_mobi'} and not isdir
+                else EpubContainer)
     if own_tdir:
         tdir = PersistentTemporaryDirectory(f'_{ebook_cls.book_type}_container')
     try:
diff --git a/src/calibre/ebooks/oeb/polish/kepubify.py b/src/calibre/ebooks/oeb/polish/kepubify.py
index 8b9d1636d3..c2de0086e7 100644
--- a/src/calibre/ebooks/oeb/polish/kepubify.py
+++ b/src/calibre/ebooks/oeb/polish/kepubify.py
@@ -16,16 +16,17 @@ import os
 import re
 import sys
 from concurrent.futures import ThreadPoolExecutor
+from functools import lru_cache
 from typing import NamedTuple
 
-from css_parser import parseString as parse_css_string
-from css_parser.css import CSSRule
+from css_parser import CSSParser
+from css_parser.css import CSSComment, CSSPageRule, CSSRule
 from lxml import etree
 
 from calibre.ebooks.metadata import authors_to_string
 from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, XHTML, XPath, escape_cdata
 from calibre.ebooks.oeb.parse_utils import barename, merge_multiple_html_heads_and_bodies
-from calibre.ebooks.oeb.polish.container import Container, get_container
+from calibre.ebooks.oeb.polish.container import Container, EpubContainer, get_container
 from calibre.ebooks.oeb.polish.cover import find_cover_image, find_cover_image3, find_cover_page
 from calibre.ebooks.oeb.polish.parsing import parse
 from calibre.ebooks.oeb.polish.tts import lang_for_elem
@@ -40,6 +41,7 @@ INNER_DIV_ID = 'book-inner'
 KOBO_SPAN_CLASS = 'koboSpan'
 DUMMY_TITLE_PAGE_NAME = 'kobo-title-page-generated-by-calibre'
 DUMMY_COVER_IMAGE_NAME = 'kobo-cover-image-generated-by-calibre'
+CSS_COMMENT_COOKIE = 'calibre-removed-css-for-kobo'
 SKIPPED_TAGS = frozenset((
     '', 'script', 'style', 'atom', 'pre', 'audio', 'video', 'svg', 'math'
 ))
@@ -49,14 +51,21 @@ BLOCK_TAGS = frozenset((
 KOBO_CSS = 'div#book-inner { margin-top: 0; margin-bottom: 0; }'
 
 
+@lru_cache(2)
+def css_parser() -> CSSParser:
+    return CSSParser(validate=False)
+
+
 class Options(NamedTuple):
     extra_css: str = KOBO_CSS
     remove_widows_and_orphans: bool = False
     remove_at_page_rules: bool = False
 
+    for_removal: bool = False
+
     @property
     def needs_stylesheet_processing(self) -> bool:
-        return self.remove_at_page_rules or self.remove_widows_and_orphans
+        return self.remove_at_page_rules or self.remove_widows_and_orphans or self.for_removal
 
 
 def outer_html(node):
@@ -232,37 +241,68 @@ def serialize_html(root) -> bytes:
     return b"<?xml version='1.0' encoding='utf-8'?>\n" + ans.encode('utf-8')
 
 
+def nest_css_comments(text: str) -> str:
+    return text.replace('*/', '*\u200c/')
+
+
 def process_stylesheet(css: str, opts: Options) -> str:
-    sheet = parse_css_string(css)
-    removals = []
+    has_comment_cookie = CSS_COMMENT_COOKIE in css
+    if opts.for_removal and not has_comment_cookie:
+        return css  # avoid expensive parse
+    sheet = css_parser().parseString(css)
     changed = False
+    page_rules = []
+    if has_comment_cookie:
+        for i, rule in enumerate(sheet.cssRules):
+            if rule.type == CSSRule.STYLE_RULE:
+                s = rule.style
+                for q in ('widows', 'orphans'):
+                    if (prop := s.getProperty(f'-{CSS_COMMENT_COOKIE}-{q}')) is not None:
+                        prop.name = q
+                        changed = True
+            elif rule.type == CSSRule.COMMENT:
+                if rule.cssText.startswith(f'/* {CSS_COMMENT_COOKIE}: '):
+                    page_rules.append((i, rule))
+                    changed = True
+    for i, cr in page_rules:
+        pr = CSSPageRule()
+        pr.cssText = cr.cssText[len(f'/* {CSS_COMMENT_COOKIE}: '):-3]
+        sheet.deleteRule(i)
+        sheet.insertRule(pr, i)
+
+    if opts.for_removal:
+        return sheet.cssText if changed else css
+
     for i, rule in enumerate(sheet.cssRules):
-        if rule.type == CSSRule.PAGE_RULE:
-            if opts.remove_at_page_rules:
-                removals.append(i)
-        elif rule.type == CSSRule.STYLE_RULE:
+        if rule.type == CSSRule.STYLE_RULE:
             if opts.remove_widows_and_orphans:
                 s = rule.style
-                if s.removeProperty('widows'):
-                    changed = True
-                if s.removeProperty('orphans'):
-                    changed = True
-    for i in reversed(removals):
-        sheet.cssRules.pop(i)
-        changed = True
-    if changed:
-        css = sheet.cssText
-    return css
+                for q in ('widows', 'orphans'):
+                    if (prop := s.getProperty(q)) is not None:
+                        changed = True
+                        prop.name = f'-{CSS_COMMENT_COOKIE}-{q}'
+        elif rule.type == CSSRule.PAGE_RULE:
+            if opts.remove_at_page_rules:
+                changed = True
+                page_rules.append((i, rule))
+
+    for i, pr in page_rules:
+        comment = CSSComment(f'/* {CSS_COMMENT_COOKIE}: {nest_css_comments(pr.cssText)} */')
+        sheet.deleteRule(i)
+        sheet.insertRule(comment, i)
+    return sheet.cssText if changed else css
 
 
 def kepubify_parsed_html(root, opts: Options, metadata_lang: str = 'en'):
     remove_kobo_markup_from_html(root)
-    merge_multiple_html_heads_and_bodies(root)
+    if not opts.for_removal:
+        merge_multiple_html_heads_and_bodies(root)
     if opts.needs_stylesheet_processing:
         for style in XPath('//h:style')(root):
             if (style.get('type') or 'text/css') == 'text/css' and style.text:
                 style.text = process_stylesheet(style.text, opts)
-    add_kobo_markup_to_html(root, opts, metadata_lang)
+    if not opts.for_removal:
+        add_kobo_markup_to_html(root, opts, metadata_lang)
 
 
 def kepubify_html_data(raw: str | bytes, opts: Options = Options(), metadata_lang: str = 'en'):
@@ -375,19 +415,7 @@ def process_path(path: str, metadata_lang: str, opts: Options, media_type: str)
         process_stylesheet_path(path, opts)
 
 
-def kepubify_container(container: Container, opts: Options, max_workers: int = 0) -> None:
-    remove_dummy_title_page(container)
-    remove_dummy_cover_image(container)
-    metadata_lang = container.mi.language
-    cover_image_name = find_cover_image(container) or find_cover_image3(container)
-    mi = container.mi
-    if not cover_image_name:
-        from calibre.ebooks.covers import generate_cover
-        cdata = generate_cover(mi)
-        cover_image_name = container.add_file(f'{DUMMY_COVER_IMAGE_NAME}.jpeg', cdata, modify_name_if_needed=True)
-    container.apply_unique_properties(cover_image_name, 'cover-image')
-    if not find_cover_page(container) and not first_spine_item_is_probably_title_page(container):
-        add_dummy_title_page(container, cover_image_name, mi)
+def do_work_in_parallel(container: Container, opts: Options, metadata_lang: str, max_workers: int) -> None:
     names_that_need_work = tuple(name for name, mt in container.mime_map.items() if mt in OEB_DOCS or mt in OEB_STYLES)
     num_workers = calculate_number_of_workers(names_that_need_work, container, max_workers)
     paths = tuple(map(container.name_to_abspath, names_that_need_work))
@@ -403,6 +431,30 @@ def kepubify_container(container: Container, opts: Options, max_workers: int = 0
                 future.result()
 
 
+def unkepubify_container(container: Container, max_workers: int = 0) -> None:
+    remove_dummy_cover_image(container)
+    remove_dummy_title_page(container)
+    opts = Options(for_removal=True)
+    metadata_lang = container.mi.language
+    do_work_in_parallel(container, opts, metadata_lang, max_workers)
+
+
+def kepubify_container(container: Container, opts: Options, max_workers: int = 0) -> None:
+    remove_dummy_title_page(container)
+    remove_dummy_cover_image(container)
+    metadata_lang = container.mi.language
+    cover_image_name = find_cover_image(container) or find_cover_image3(container)
+    mi = container.mi
+    if not cover_image_name:
+        from calibre.ebooks.covers import generate_cover
+        cdata = generate_cover(mi)
+        cover_image_name = container.add_file(f'{DUMMY_COVER_IMAGE_NAME}.jpeg', cdata, modify_name_if_needed=True)
+    container.apply_unique_properties(cover_image_name, 'cover-image')
+    if not find_cover_page(container) and not first_spine_item_is_probably_title_page(container):
+        add_dummy_title_page(container, cover_image_name, mi)
+    do_work_in_parallel(container, opts, metadata_lang, max_workers)
+
+
 def kepubify_path(path, outpath='', max_workers=0, allow_overwrite=False, opts: Options = Options()):
     container = get_container(path, tweak_mode=True)
     kepubify_container(container, opts, max_workers=max_workers)
@@ -411,7 +463,20 @@ def kepubify_path(path, outpath='', max_workers=0, allow_overwrite=False, opts:
     c = 0
     while not allow_overwrite and outpath == path:
         c += 1
-        outpath = f'{base} - {c}.kepub'
+        outpath = f'{base}-{c}.kepub'
+    container.commit(outpath=outpath)
+    return outpath
+
+
+def unkepubify_path(path, outpath='', max_workers=0, allow_overwrite=False):
+    container = get_container(path, tweak_mode=True, ebook_cls=EpubContainer)
+    unkepubify_container(container, max_workers)
+    base, ext = os.path.splitext(path)
+    outpath = outpath or base + '.epub'
+    c = 0
+    while not allow_overwrite and outpath == path:
+        c += 1
+        outpath = f'{base}-{c}.epub'
     container.commit(outpath=outpath)
     return outpath
 
@@ -427,7 +492,7 @@ def make_options(
 ) -> Options:
     remove_widows_and_orphans = remove_at_page_rules = False
     if extra_css:
-        sheet = parse_css_string(extra_css)
+        sheet = css_parser().parseString(extra_css)
         for rule in sheet.cssRules:
             if rule.type == CSSRule.PAGE_RULE:
                 remove_at_page_rules = True
diff --git a/src/calibre/ebooks/oeb/polish/tests/kepubify.py b/src/calibre/ebooks/oeb/polish/tests/kepubify.py
index f56f87be6c..66e59ed84e 100644
--- a/src/calibre/ebooks/oeb/polish/tests/kepubify.py
+++ b/src/calibre/ebooks/oeb/polish/tests/kepubify.py
@@ -2,15 +2,18 @@
 # License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>
 
 
+from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES
 from calibre.ebooks.oeb.polish.container import get_container
 from calibre.ebooks.oeb.polish.kepubify import (
+    CSS_COMMENT_COOKIE,
     DUMMY_COVER_IMAGE_NAME,
     DUMMY_TITLE_PAGE_NAME,
     Options,
     kepubify_html_data,
+    kepubify_parsed_html,
     kepubify_path,
-    remove_kobo_markup_from_html,
     serialize_html,
+    unkepubify_path,
 )
 from calibre.ebooks.oeb.polish.parsing import parse
 from calibre.ebooks.oeb.polish.tests.base import BaseTest, get_book_for_kepubify
@@ -18,22 +21,31 @@ from calibre.ebooks.oeb.polish.tests.base import BaseTest, get_book_for_kepubify
 
 class KepubifyTests(BaseTest):
 
-    def test_kepubify_container(self):
+    def test_kepubify(self):
         def b(has_cover=True, epub_version='3'):
             path = get_book_for_kepubify(has_cover=has_cover, epub_version=epub_version)
-            opts = Options()
-            opts = opts._replace(remove_widows_and_orphans=True)
-            opts = opts._replace(remove_at_page_rules=True)
+            opts = Options()._replace(remove_widows_and_orphans=True, remove_at_page_rules=True)
             outpath = kepubify_path(path, opts=opts, allow_overwrite=True)
             c = get_container(outpath, tweak_mode=True)
             spine_names = tuple(n for n, is_linear in c.spine_names)
             cname = 'titlepage.xhtml' if has_cover else f'{DUMMY_TITLE_PAGE_NAME}.xhtml'
             self.assertEqual(spine_names, (cname, 'index_split_000.html', 'index_split_001.html'))
             ps = c.open('page_styles.css', 'r').read()
-            for q in ('@page', 'widows', 'orphans'):
-                self.assertNotIn(q, ps)
+            for q in (f'{CSS_COMMENT_COOKIE}: @page', f'-{CSS_COMMENT_COOKIE}-widows', f'-{CSS_COMMENT_COOKIE}-orphans'):
+                self.assertIn(q, ps)
             cimage = ('cover.png',) if has_cover else (f'{DUMMY_COVER_IMAGE_NAME}.jpeg',)
             self.assertEqual(cimage, tuple(c.manifest_items_with_property('cover-image')))
+            # unkepubify
+            outpath = unkepubify_path(outpath)
+            expected = get_container(path, tweak_mode=True)
+            actual = get_container(outpath, tweak_mode=True)
+            self.assertEqual(
+                tuple(expected.manifest_items_with_property('cover-image')), tuple(actual.manifest_items_with_property('cover-image')))
+            self.assertEqual(tuple(expected.mime_map), tuple(actual.mime_map))
+            for name, mt in expected.mime_map.items():
+                if mt in OEB_DOCS or mt in OEB_STYLES or name.endswith('.opf'):
+                    self.assertEqual(expected.open(name, 'rb').read(), actual.open(name, 'rb').read())
+
         for has_cover in (True, False):
             for epub_version in '23':
                 b(has_cover, epub_version)
@@ -87,19 +99,18 @@ div#book-inner { margin-top: 0; margin-bottom: 0; }</style></head><body><div id=
             '<div><script><![CDATA[1 < 2 & 3]]></script></div>',
 
             # CSS filtering
-            '<div><style>@page { margin: 13px; }\ndiv { color: red; widows: 12 }</style>Something something</div>':
-            '<div><style>div {\n  color: red;\n}</style><span class="koboSpan" id="kobo.1.1">Something something</span></div>'
+            '<div><style>@page {\n  margin: 13px;\n}\ndiv {\n  widows: 12;\n  color: red;\n}</style>Some</div>':
+            f'<div><style>/* {CSS_COMMENT_COOKIE}: @page {{\n  margin: 13px;\n}} */\n'
+            f'div {{\n  -{CSS_COMMENT_COOKIE}-widows: 12;\n  color: red;\n}}</style>'
+            '<span class="koboSpan" id="kobo.1.1">Some</span></div>'
         }.items():
-            opts = Options()
-            opts = opts._replace(remove_widows_and_orphans=True)
-            opts = opts._replace(remove_at_page_rules=True)
+            opts = Options()._replace(remove_widows_and_orphans=True, remove_at_page_rules=True)
             root = kepubify_html_data(src, opts)
             actual = serialize_html(root).decode('utf-8')
             actual = actual[len(prefix):-len(suffix)]
             self.assertEqual(expected, actual)
-            if '@page' in src:
-                continue
             expected = serialize_html(parse(src)).decode('utf-8')
-            remove_kobo_markup_from_html(root)
+            opts = opts._replace(for_removal=True)
+            kepubify_parsed_html(root, opts)
             actual = serialize_html(root).decode('utf-8')
             self.assertEqual(expected, actual)