From 7bc40302760f4db20cdd48f1df867a8fca69c1e1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 21 Feb 2025 22:41:04 +0530
Subject: [PATCH] kepubify: Implement CSS filtering

---
 src/calibre/ebooks/oeb/polish/kepubify.py     | 61 +++++++++++++++++--
 .../ebooks/oeb/polish/tests/kepubify.py       | 13 +++-
 2 files changed, 67 insertions(+), 7 deletions(-)

diff --git a/src/calibre/ebooks/oeb/polish/kepubify.py b/src/calibre/ebooks/oeb/polish/kepubify.py
index cb6ef6ef3c..c97eec01a1 100644
--- a/src/calibre/ebooks/oeb/polish/kepubify.py
+++ b/src/calibre/ebooks/oeb/polish/kepubify.py
@@ -23,7 +23,7 @@ from css_parser.css import CSSRule
 from lxml import etree
 
 from calibre.ebooks.metadata import authors_to_string
-from calibre.ebooks.oeb.base import OEB_DOCS, XHTML, XPath, escape_cdata
+from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, XHTML, XPath, escape_cdata
 from calibre.ebooks.oeb.parse_utils import barename, merge_multiple_html_heads_and_bodies
 from calibre.ebooks.oeb.polish.container import Container, get_container
 from calibre.ebooks.oeb.polish.cover import find_cover_image, find_cover_image3, find_cover_page
@@ -53,6 +53,10 @@ class Options(NamedTuple):
     remove_widows_and_orphans: bool = False
     remove_at_page_rules: bool = False
 
+    @property
+    def needs_stylesheet_processing(self) -> bool:  # True when at least one CSS filtering option is enabled
+        return self.remove_at_page_rules or self.remove_widows_and_orphans
+
 
 def outer_html(node):
     return etree.tostring(node, encoding='unicode', with_tail=False)
@@ -227,9 +231,36 @@ def serialize_html(root) -> bytes:
     return b"<?xml version='1.0' encoding='utf-8'?>\n" + ans.encode('utf-8')
 
 
+def process_stylesheet(css: str, opts: Options) -> str:  # returns css itself (same object) when nothing was removed
+    sheet = parse_css_string(css)
+    removals = []  # indices of top-level @page rules, deleted only after the scan finishes
+    changed = False
+    for i, rule in enumerate(sheet.cssRules):  # only top-level rules are visited
+        if rule.type == CSSRule.PAGE_RULE:
+            if opts.remove_at_page_rules:
+                removals.append(i)
+        elif rule.type == CSSRule.STYLE_RULE:
+            if opts.remove_widows_and_orphans:
+                s = rule.style
+                if s.removeProperty('widows'):  # a truthy result is taken to mean a declaration was removed
+                    changed = True
+                if s.removeProperty('orphans'):
+                    changed = True
+    for i in reversed(removals):  # highest index first, so earlier pops do not shift pending indices
+        sheet.cssRules.pop(i)
+        changed = True
+    if changed:
+        css = sheet.cssText  # NOTE(review): css_parser may return bytes here despite the -> str annotation - confirm
+    return css
+
+
 def kepubify_parsed_html(root, opts: Options, metadata_lang: str = 'en'):
     remove_kobo_markup_from_html(root)
     merge_multiple_html_heads_and_bodies(root)
+    if opts.needs_stylesheet_processing:  # filter inline <style> sheets before the Kobo markup is added
+        for style in XPath('//h:style')(root):
+            if (style.get('type') or 'text/css') == 'text/css' and style.text:  # a missing/empty type counts as text/css
+                style.text = process_stylesheet(style.text, opts)
     add_kobo_markup_to_html(root, opts, metadata_lang)
 
 
@@ -319,6 +350,24 @@ def first_spine_item_is_probably_title_page(container: Container) -> bool:
     return False
 
 
+def process_stylesheet_path(path: str, opts: Options) -> None:  # Filter the CSS file at path in place; untouched if unchanged
+    if opts.needs_stylesheet_processing:
+        with open(path, 'r+b') as f:
+            css = f.read().decode()  # decoded as UTF-8, the bytes.decode() default
+            ncss = process_stylesheet(css, opts)
+            if ncss is not css:  # identity check: process_stylesheet returns css itself when nothing changed
+                f.seek(0)
+                f.truncate()
+                f.write(ncss if isinstance(ncss, bytes) else ncss.encode('utf-8'))  # binary-mode file: never write str
+
+
+def process_path(path: str, metadata_lang: str, opts: Options, media_type: str) -> None:  # Dispatch one file on its media type
+    if media_type in OEB_DOCS:
+        kepubify_html_path(path, metadata_lang, opts)
+    elif media_type in OEB_STYLES:
+        process_stylesheet_path(path, opts)  # files of any other media type are left untouched
+
+
 def kepubify_container(container: Container, opts: Options, max_workers: int = 0) -> None:
     remove_dummy_title_page(container)
     metadata_lang = container.mi.language
@@ -327,15 +376,17 @@ def kepubify_container(container: Container, opts: Options, max_workers: int = 0
         container.apply_unique_properties(cover_image_name, 'cover-image')
     if not find_cover_page(container) and not first_spine_item_is_probably_title_page(container):
         add_dummy_title_page(container, cover_image_name)
-    names_that_need_work = tuple(name for name, mt in container.mime_map.items() if mt in OEB_DOCS)
+    names_that_need_work = tuple(name for name, mt in container.mime_map.items() if mt in OEB_DOCS or mt in OEB_STYLES)
     num_workers = calculate_number_of_workers(names_that_need_work, container, max_workers)
     paths = tuple(map(container.name_to_abspath, names_that_need_work))
     if num_workers < 2:
-        for path in paths:
-            kepubify_html_path(path, metadata_lang, opts)
+        for name in names_that_need_work:
+            process_path(container.name_to_abspath(name), metadata_lang, opts, container.mime_map[name])
     else:
         with ThreadPoolExecutor(max_workers=num_workers) as executor:
-            futures = tuple(executor.submit(kepubify_html_path, path, metadata_lang, opts) for path in paths)
+            futures = tuple(executor.submit(
+                process_path, container.name_to_abspath(name), metadata_lang, opts, container.mime_map[name])
+                            for name in names_that_need_work)
             for future in futures:
                 future.result()
 
diff --git a/src/calibre/ebooks/oeb/polish/tests/kepubify.py b/src/calibre/ebooks/oeb/polish/tests/kepubify.py
index 623bf2a0c8..e70dff9a02 100644
--- a/src/calibre/ebooks/oeb/polish/tests/kepubify.py
+++ b/src/calibre/ebooks/oeb/polish/tests/kepubify.py
@@ -2,7 +2,7 @@
 # License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>
 
 
-from calibre.ebooks.oeb.polish.kepubify import kepubify_html_data, remove_kobo_markup_from_html, serialize_html
+from calibre.ebooks.oeb.polish.kepubify import Options, kepubify_html_data, remove_kobo_markup_from_html, serialize_html
 from calibre.ebooks.oeb.polish.parsing import parse
 from calibre.ebooks.oeb.polish.tests.base import BaseTest
 
@@ -56,11 +56,20 @@ div#book-inner { margin-top: 0; margin-bottom: 0; }</style></head><body><div id=
             '<p><span class="koboSpan" id="kobo.1.1">A&#160;nbsp;&#160;</span></p>',
             '<div><script>1 < 2 & 3</script>':  # escaping with cdata note that kepubify doesnt do this
             '<div><script><![CDATA[1 < 2 & 3]]></script></div>',
+
+            # CSS filtering
+            '<div><style>@page { margin: 13px; }\ndiv { color: red; widows: 12 }</style>Something something</div>':
+            '<div><style>div {\n  color: red;\n}</style><span class="koboSpan" id="kobo.1.1">Something something</span></div>'
         }.items():
-            root = kepubify_html_data(src)
+            opts = Options()
+            opts = opts._replace(remove_widows_and_orphans=True)
+            opts = opts._replace(remove_at_page_rules=True)
+            root = kepubify_html_data(src, opts)
             actual = serialize_html(root).decode('utf-8')
             actual = actual[len(prefix):-len(suffix)]
             self.assertEqual(expected, actual)
+            if '@page' in src:
+                continue
             expected = serialize_html(parse(src)).decode('utf-8')
             remove_kobo_markup_from_html(root)
             actual = serialize_html(root).decode('utf-8')