Conversion: Performance improvement for books that have many HTML files that all include the same CSS stylesheets

The conversion engine now flattens a given sequence of stylesheets only once and reuses the flattened rules for subsequent HTML files that include the same sheets. Merge branch 'master' of https://github.com/princesse-framboise/calibre
2025-07-09 03:04:10 -04:00 · 2019-06-14 10:10:05 +05:30 · 2019-06-14 10:10:05 +05:30 · ca4fd1f381
commit ca4fd1f381
parent b2f299de88 a2418620bf
1 changed files with 96 additions and 70 deletions
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@ -95,6 +95,90 @@ def test_media_ok():
    assert media_ok('screen, (device-width:10px)')
    assert not media_ok('screen and (device-width:10px)')

+class StylizerRules(object):
+
+    def __init__(self, opts, profile, stylesheets):
+        self.opts, self.profile, self.stylesheets = opts, profile, stylesheets
+
+        index = 0
+        self.rules = []
+        self.page_rule = {}
+        self.font_face_rules = []
+        for sheet_index, stylesheet in enumerate(stylesheets):
+            href = stylesheet.href
+            for rule in stylesheet.cssRules:
+                if rule.type == rule.MEDIA_RULE:
+                    if media_ok(rule.media.mediaText):
+                        for subrule in rule.cssRules:
+                            self.rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0))
+                            index += 1
+                else:
+                    self.rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0))
+                    index = index + 1
+        self.rules.sort(key=itemgetter(0))  # sort by specificity
+
+    def flatten_rule(self, rule, href, index, is_user_agent_sheet=False):
+        results = []
+        sheet_index = 0 if is_user_agent_sheet else 1
+        if isinstance(rule, CSSStyleRule):
+            style = self.flatten_style(rule.style)
+            for selector in rule.selectorList:
+                specificity = (sheet_index,) + selector.specificity + (index,)
+                text = selector.selectorText
+                selector = list(selector.seq)
+                results.append((specificity, selector, style, text, href))
+        elif isinstance(rule, CSSPageRule):
+            style = self.flatten_style(rule.style)
+            self.page_rule.update(style)
+        elif isinstance(rule, CSSFontFaceRule):
+            if rule.style.length > 1:
+                # Ignore the meaningless font face rules generated by the
+                # benighted MS Word that contain only a font-family declaration
+                # and nothing else
+                self.font_face_rules.append(rule)
+        return results
+
+    def flatten_style(self, cssstyle):
+        style = {}
+        for prop in cssstyle:
+            name = prop.name
+            normalizer = normalizers.get(name, None)
+            if normalizer is not None:
+                style.update(normalizer(name, prop.cssValue))
+            elif name == 'text-align':
+                style['text-align'] = self._apply_text_align(prop.value)
+            else:
+                style[name] = prop.value
+        if 'font-size' in style:
+            size = style['font-size']
+            if size == 'normal':
+                size = 'medium'
+            if size == 'smallest':
+                size = 'xx-small'
+            if size in FONT_SIZE_NAMES:
+                style['font-size'] = "%.1frem" % (self.profile.fnames[size] / float(self.profile.fbase))
+        if '-epub-writing-mode' in style:
+            for x in ('-webkit-writing-mode', 'writing-mode'):
+                style[x] = style.get(x, style['-epub-writing-mode'])
+        return style
+
+    def _apply_text_align(self, text):
+        if text in ('left', 'justify') and self.opts.change_justification in ('left', 'justify'):
+            text = self.opts.change_justification
+        return text
+
+    def same_rules(self, opts, profile, stylesheets):
+        if self.opts != opts:
+            # opts are unlikely to differ between calls, but better safe than sorry
+            return False
+        if self.profile != profile:
+            return False
+        if len(self.stylesheets) != len(stylesheets):
+            return False
+        for index, stylesheet in enumerate(self.stylesheets):
+            if stylesheet != stylesheets[index]:
+                return False
+        return True

 class Stylizer(object):
    STYLESHEETS = WeakKeyDictionary()
@ -133,7 +217,6 @@ class Stylizer(object):

        parser = CSSParser(fetcher=self._fetch_css_file,
                log=logging.getLogger('calibre.css'))
-        self.font_face_rules = []
        for elem in style_tags:
            if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES and media_ok(elem.get('media'))):
                text = elem.text if elem.text else u''
@ -200,29 +283,22 @@ class Stylizer(object):
                    self.logger.exception('Failed to parse %s, ignoring.'%w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)
-        rules = []
-        index = 0
-        self.stylesheets = set()
-        self.page_rule = {}
-        for sheet_index, stylesheet in enumerate(stylesheets):
-            href = stylesheet.href
-            self.stylesheets.add(href)
-            for rule in stylesheet.cssRules:
-                if rule.type == rule.MEDIA_RULE:
-                    if media_ok(rule.media.mediaText):
-                        for subrule in rule.cssRules:
-                            rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0))
-                            index += 1
-                else:
-                    rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0))
-                    index = index + 1
-        rules.sort(key=itemgetter(0))  # sort by specificity
-        self.rules = rules
+
+        # Cache the flattened rules, page rule and font face rules on the oeb object,
+        # regenerating them only when opts, profile or stylesheets differ from the cached ones
+        if (not hasattr(self.oeb, 'stylizer_rules')) \
+            or not self.oeb.stylizer_rules.same_rules(self.opts, self.profile, stylesheets):
+            self.oeb.stylizer_rules = StylizerRules(self.opts, self.profile, stylesheets)
+        self.rules = self.oeb.stylizer_rules.rules
+        self.page_rule = self.oeb.stylizer_rules.page_rule
+        self.font_face_rules = self.oeb.stylizer_rules.font_face_rules
+        self.flatten_style = self.oeb.stylizer_rules.flatten_style
+
        self._styles = {}
        pseudo_pat = re.compile(u':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
        select = Select(tree, ignore_inappropriate_pseudo_classes=True)

-        for _, _, cssdict, text, _ in rules:
+        for _, _, cssdict, text, _ in self.rules:
            fl = pseudo_pat.search(text)
            try:
                matches = tuple(select(text))
@ -301,56 +377,6 @@ class Stylizer(object):
            data = data.encode('utf-8')
        return ('utf-8', data)

-    def flatten_rule(self, rule, href, index, is_user_agent_sheet=False):
-        results = []
-        sheet_index = 0 if is_user_agent_sheet else 1
-        if isinstance(rule, CSSStyleRule):
-            style = self.flatten_style(rule.style)
-            for selector in rule.selectorList:
-                specificity = (sheet_index,) + selector.specificity + (index,)
-                text = selector.selectorText
-                selector = list(selector.seq)
-                results.append((specificity, selector, style, text, href))
-        elif isinstance(rule, CSSPageRule):
-            style = self.flatten_style(rule.style)
-            self.page_rule.update(style)
-        elif isinstance(rule, CSSFontFaceRule):
-            if rule.style.length > 1:
-                # Ignore the meaningless font face rules generated by the
-                # benighted MS Word that contain only a font-family declaration
-                # and nothing else
-                self.font_face_rules.append(rule)
-        return results
-
-    def flatten_style(self, cssstyle):
-        style = {}
-        for prop in cssstyle:
-            name = prop.name
-            normalizer = normalizers.get(name, None)
-            if normalizer is not None:
-                style.update(normalizer(name, prop.cssValue))
-            elif name == 'text-align':
-                style['text-align'] = self._apply_text_align(prop.value)
-            else:
-                style[name] = prop.value
-        if 'font-size' in style:
-            size = style['font-size']
-            if size == 'normal':
-                size = 'medium'
-            if size == 'smallest':
-                size = 'xx-small'
-            if size in FONT_SIZE_NAMES:
-                style['font-size'] = "%.1frem" % (self.profile.fnames[size] / float(self.profile.fbase))
-        if '-epub-writing-mode' in style:
-            for x in ('-webkit-writing-mode', 'writing-mode'):
-                style[x] = style.get(x, style['-epub-writing-mode'])
-        return style
-
-    def _apply_text_align(self, text):
-        if text in ('left', 'justify') and self.opts.change_justification in ('left', 'justify'):
-            text = self.opts.change_justification
-        return text
-
    def style(self, element):
        try:
            return self._styles[element]