KF8 Output: Handle all CSS correctly

2025-07-09 03:04:10 -04:00 · 2012-04-17 22:20:50 +05:30 · 2012-04-17 22:20:50 +05:30 · 38340559ac
commit 38340559ac
parent 2b4f74b86d
3 changed files with 69 additions and 11 deletions
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@ -364,7 +364,7 @@ def count_set_bits(num):
        num >>= 1
    return ans

-def to_base(num, base=32):
+def to_base(num, base=32, min_num_digits=None):
    digits = string.digits + string.ascii_uppercase
    sign = 1 if num >= 0 else -1
    if num == 0: return '0'
@ -373,6 +373,8 @@ def to_base(num, base=32):
    while num:
        ans.append(digits[(num % base)])
        num //= base
+    if min_num_digits is not None and len(ans) < min_num_digits:
+        ans.extend('0'*(min_num_digits - len(ans)))
    if sign < 0:
        ans.append('-')
    ans.reverse()
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@ -9,14 +9,22 @@ __docformat__ = 'restructuredtext en'

 import copy
 from functools import partial
+from collections import defaultdict

 import cssutils
+from lxml import etree

-from calibre import isbytestring
-from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath)
+from calibre import isbytestring, force_unicode
+from calibre.ebooks.mobi.utils import to_base
+from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
+        extract, XHTML)

 XML_DOCS = OEB_DOCS | {SVG_MIME}

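+# References to record numbers in KF8 are stored as base-32 encoded integers,
+# left-padded with zeros to at least 4 digits (to_base() returns a bare '0'
+# for 0, so that one value is not padded)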
+to_ref = partial(to_base, base=32, min_num_digits=4)
+
 class KF8Writer(object):

    def __init__(self, oeb, opts, resources):
@ -24,10 +32,10 @@ class KF8Writer(object):
        self.used_images = set()
        self.resources = resources
        self.dup_data()
+        self.flows = [None] # First flow item is reserved for the text

        self.replace_resource_links()
-
-        self.create_pieces()
+        self.extract_css_into_flows()

    def dup_data(self):
        ''' Duplicate data so that any changes we make to markup/CSS only
@ -57,12 +65,13 @@ class KF8Writer(object):
            idx = self.resources.item_map.get(ref, None)
            if idx is not None:
                is_image = self.resources.records[idx-1][:4] not in {b'FONT'}
+                idx = to_ref(idx)
                if is_image:
                    self.used_images.add(ref)
-                    return 'kindle:embed:%04d?mime=%s'%(idx,
+                    return 'kindle:embed:%s?mime=%s'%(idx,
                            self.resources.mime_map[ref])
                else:
-                    return 'kindle:embed:%04d'%idx
+                    return 'kindle:embed:%s'%idx
            return oref

        for item in self.oeb.manifest:
@ -90,11 +99,44 @@ class KF8Writer(object):
                replacer = partial(pointer, item)
                cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)

+    def extract_css_into_flows(self):
+        ''' Move all CSS out of the markup and into self.flows.
+
+        The text of every stylesheet in the manifest and of every non-empty
+        inline <style> element in the spine is appended to self.flows.
+        <link href> elements that resolve to a manifest stylesheet are
+        re-pointed at kindle:flow:XXXX?mime=text/css, where XXXX is the
+        to_ref() encoding of the index of that CSS in self.flows. Each inline
+        <style> is replaced in place by a <link> of the same form; empty
+        <style> elements are simply removed. '''
+        inlines = defaultdict(list) # Ensure identical <style>s not repeated
+        sheets = {} # Stylesheet href -> index of its CSS in self.flows

-    def create_pieces(self):
-        self.flows = [None] # First flow item is reserved for the text
+        for item in self.oeb.manifest:
+            if item.media_type in OEB_STYLES:
+                data = self.data(item).cssText
+                # Take the index before appending: self.flows starts out as
+                # [None], so len(self.flows) is where this entry will land
+                sheets[item.href] = len(self.flows)
+                self.flows.append(force_unicode(data, 'utf-8'))

        for item in self.oeb.spine:
            root = self.data(item)
-            root
+            # NOTE(review): presumably skips spine items whose data is not a
+            # parsed tree -- confirm against self.data()
+            if not hasattr(root, 'xpath'): continue
+
+            # Re-point links to known stylesheets at the matching flow
+            for link in XPath('//h:link[@href]')(root):
+                href = item.abshref(link.get('href'))
+                idx = sheets.get(href, None)
+                if idx is not None:
+                    idx = to_ref(idx)
+                    link.set('href', 'kindle:flow:%s?mime=text/css'%idx)
+
+            # Swap each inline <style> for a <link>; its href is filled in
+            # below, once the flow index of the CSS is known
+            for tag in XPath('//h:style')(root):
+                p = tag.getparent()
+                idx = p.index(tag)
+                raw = tag.text
+                if not raw or not raw.strip():
+                    extract(tag)
+                    continue
+                repl = etree.Element(XHTML('link'), type='text/css',
+                        rel='stylesheet')
+                p.insert(idx, repl)
+                extract(tag)
+                inlines[raw].append(repl)
+
+        for raw, elems in inlines.iteritems():
+            # As above, the reference must be computed before the append
+            idx = to_ref(len(self.flows))
+            self.flows.append(raw)
+            for link in elems:
+                link.set('href', 'kindle:flow:%s?mime=text/css'%idx)
+

--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -357,7 +357,21 @@ def urlnormalize(href):
    parts = (urlquote(part) for part in parts)
    return urlunparse(parts)

-
+def extract(elem):
+    """
+    Detach ``elem``, together with its text and children, from its parent.
+
+    Tail text is preserved: it is appended to the tail of the preceding
+    sibling, or to the parent's text when there is no preceding sibling.
+    An element without a parent is left untouched.
+    """
+    container = elem.getparent()
+    if container is None:
+        return
+    trailing = elem.tail
+    if trailing:
+        sibling = elem.getprevious()
+        if sibling is not None:
+            sibling.tail = (sibling.tail or '') + trailing
+        else:
+            container.text = (container.text or '') + trailing
+    container.remove(elem)

 class DummyHandler(logging.Handler):