Conversion: Performance improvement for books that have many HTML files that all include the same CSS stylesheets

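The conversion engine now flattens a given sequence of stylesheets only
once and reuses the result for as long as subsequent HTML files include
the same sheets (with the same options and profile).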

Merge branch 'master' of https://github.com/princesse-framboise/calibre
This commit is contained in:
Kovid Goyal 2019-06-14 10:10:05 +05:30
commit ca4fd1f381
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -95,6 +95,90 @@ def test_media_ok():
assert media_ok('screen, (device-width:10px)') assert media_ok('screen, (device-width:10px)')
assert not media_ok('screen and (device-width:10px)') assert not media_ok('screen and (device-width:10px)')
class StylizerRules(object):
    """Flattened CSS rules for one ordered sequence of stylesheets.

    An instance is built from ``opts``, ``profile`` and ``stylesheets``.
    ``same_rules()`` reports whether it was built from equal inputs, so a
    caller can keep an instance around and reuse it instead of flattening
    the same sheets again.

    Attributes set by ``__init__``:
      rules            -- list of ``(specificity, selector, style, text, href)``
                          tuples, sorted by ``specificity``
      page_rule        -- dict accumulated from the flattened style of
                          every page rule encountered
      font_face_rules  -- the font-face rule objects that were kept
    """

    def __init__(self, opts, profile, stylesheets):
        """Flatten every rule of ``stylesheets``.

        Each stylesheet must expose ``href`` and ``cssRules``. The first
        sheet in the sequence is flattened as the user agent sheet.
        """
        self.opts, self.profile, self.stylesheets = opts, profile, stylesheets
        self.rules = []
        self.page_rule = {}
        self.font_face_rules = []

        # Running counter over flattened rules; flatten_rule() appends it to
        # each specificity tuple so that source order breaks ties.
        index = 0
        for sheet_index, stylesheet in enumerate(stylesheets):
            href = stylesheet.href
            is_user_agent_sheet = sheet_index == 0
            for rule in stylesheet.cssRules:
                if rule.type == rule.MEDIA_RULE:
                    # Only descend into media blocks accepted by media_ok();
                    # rules of a rejected block do not consume an index.
                    if media_ok(rule.media.mediaText):
                        for subrule in rule.cssRules:
                            self.rules.extend(self.flatten_rule(
                                subrule, href, index, is_user_agent_sheet=is_user_agent_sheet))
                            index += 1
                else:
                    self.rules.extend(self.flatten_rule(
                        rule, href, index, is_user_agent_sheet=is_user_agent_sheet))
                    index += 1
        self.rules.sort(key=itemgetter(0))  # sort by specificity

    def flatten_rule(self, rule, href, index, is_user_agent_sheet=False):
        """Flatten a single CSS rule.

        Style rules yield one ``(specificity, selector, style, text, href)``
        tuple per selector; all selectors of a rule share the same ``style``
        dict. Page rules are merged into ``self.page_rule`` and font-face
        rules may be appended to ``self.font_face_rules``; both return an
        empty list, as does any other kind of rule.
        """
        results = []
        # User agent sheet rules get a leading 0 so they sort before all
        # other rules regardless of selector specificity.
        sheet_index = 0 if is_user_agent_sheet else 1
        if isinstance(rule, CSSStyleRule):
            style = self.flatten_style(rule.style)
            for selector in rule.selectorList:
                specificity = (sheet_index,) + selector.specificity + (index,)
                text = selector.selectorText
                results.append((specificity, list(selector.seq), style, text, href))
        elif isinstance(rule, CSSPageRule):
            self.page_rule.update(self.flatten_style(rule.style))
        elif isinstance(rule, CSSFontFaceRule):
            if rule.style.length > 1:
                # Ignore the meaningless font face rules generated by the
                # benighted MS Word that contain only a font-family declaration
                # and nothing else
                self.font_face_rules.append(rule)
        return results

    def flatten_style(self, cssstyle):
        """Return the declarations of ``cssstyle`` as a plain dict.

        Properties with an entry in ``normalizers`` are replaced by whatever
        that normalizer returns, ``text-align`` goes through
        ``_apply_text_align()``, everything else is copied verbatim.
        """
        style = {}
        for prop in cssstyle:
            name = prop.name
            normalizer = normalizers.get(name, None)
            if normalizer is not None:
                style.update(normalizer(name, prop.cssValue))
            elif name == 'text-align':
                style['text-align'] = self._apply_text_align(prop.value)
            else:
                style[name] = prop.value
        if 'font-size' in style:
            size = style['font-size']
            # Map the two non-standard keywords onto standard size names.
            if size == 'normal':
                size = 'medium'
            if size == 'smallest':
                size = 'xx-small'
            if size in FONT_SIZE_NAMES:
                # Named sizes become rem values relative to the profile's
                # base font size.
                style['font-size'] = "%.1frem" % (self.profile.fnames[size] / float(self.profile.fbase))
        if '-epub-writing-mode' in style:
            # Mirror the prefixed value onto the other spellings unless they
            # were declared explicitly.
            for x in ('-webkit-writing-mode', 'writing-mode'):
                style[x] = style.get(x, style['-epub-writing-mode'])
        return style

    def _apply_text_align(self, text):
        """Return ``opts.change_justification`` in place of ``text`` when
        both are one of 'left'/'justify'; otherwise return ``text``."""
        if text in ('left', 'justify') and self.opts.change_justification in ('left', 'justify'):
            text = self.opts.change_justification
        return text

    def same_rules(self, opts, profile, stylesheets):
        """Return True if this instance was built from equal ``opts`` and
        ``profile`` and a pairwise-equal, same-length ``stylesheets``."""
        if self.opts != opts:
            # it's unlikely to happen, but better safe than sorry
            return False
        if self.profile != profile:
            return False
        if len(self.stylesheets) != len(stylesheets):
            return False
        return not any(
            mine != theirs for mine, theirs in zip(self.stylesheets, stylesheets))
class Stylizer(object): class Stylizer(object):
STYLESHEETS = WeakKeyDictionary() STYLESHEETS = WeakKeyDictionary()
@ -133,7 +217,6 @@ class Stylizer(object):
parser = CSSParser(fetcher=self._fetch_css_file, parser = CSSParser(fetcher=self._fetch_css_file,
log=logging.getLogger('calibre.css')) log=logging.getLogger('calibre.css'))
self.font_face_rules = []
for elem in style_tags: for elem in style_tags:
if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES and media_ok(elem.get('media'))): if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES and media_ok(elem.get('media'))):
text = elem.text if elem.text else u'' text = elem.text if elem.text else u''
@ -200,29 +283,22 @@ class Stylizer(object):
self.logger.exception('Failed to parse %s, ignoring.'%w) self.logger.exception('Failed to parse %s, ignoring.'%w)
self.logger.debug('Bad css: ') self.logger.debug('Bad css: ')
self.logger.debug(x) self.logger.debug(x)
rules = []
index = 0 # using oeb to store the rules, page rule and font face rules
self.stylesheets = set() # and generating them again if opts, profile or stylesheets are different
self.page_rule = {} if (not hasattr(self.oeb, 'stylizer_rules')) \
for sheet_index, stylesheet in enumerate(stylesheets): or not self.oeb.stylizer_rules.same_rules(self.opts, self.profile, stylesheets):
href = stylesheet.href self.oeb.stylizer_rules = StylizerRules(self.opts, self.profile, stylesheets)
self.stylesheets.add(href) self.rules = self.oeb.stylizer_rules.rules
for rule in stylesheet.cssRules: self.page_rule = self.oeb.stylizer_rules.page_rule
if rule.type == rule.MEDIA_RULE: self.font_face_rules = self.oeb.stylizer_rules.font_face_rules
if media_ok(rule.media.mediaText): self.flatten_style = self.oeb.stylizer_rules.flatten_style
for subrule in rule.cssRules:
rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0))
index += 1
else:
rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0))
index = index + 1
rules.sort(key=itemgetter(0)) # sort by specificity
self.rules = rules
self._styles = {} self._styles = {}
pseudo_pat = re.compile(u':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I) pseudo_pat = re.compile(u':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
select = Select(tree, ignore_inappropriate_pseudo_classes=True) select = Select(tree, ignore_inappropriate_pseudo_classes=True)
for _, _, cssdict, text, _ in rules: for _, _, cssdict, text, _ in self.rules:
fl = pseudo_pat.search(text) fl = pseudo_pat.search(text)
try: try:
matches = tuple(select(text)) matches = tuple(select(text))
@ -301,56 +377,6 @@ class Stylizer(object):
data = data.encode('utf-8') data = data.encode('utf-8')
return ('utf-8', data) return ('utf-8', data)
def flatten_rule(self, rule, href, index, is_user_agent_sheet=False):
    """Flatten one CSS rule into a list of rule tuples.

    A style rule produces one ``(specificity, selector, style, text, href)``
    tuple per selector, all sharing a single flattened style dict. A page
    rule is merged into ``self.page_rule`` and a font-face rule may be
    recorded in ``self.font_face_rules``; those, and any other kind of
    rule, produce an empty list.
    """
    if isinstance(rule, CSSStyleRule):
        # Leading 0 for the user agent sheet, 1 for every other sheet.
        origin = 0 if is_user_agent_sheet else 1
        declarations = self.flatten_style(rule.style)
        flattened = []
        for sel in rule.selectorList:
            rank = (origin,) + sel.specificity + (index,)
            sel_text = sel.selectorText
            flattened.append((rank, list(sel.seq), declarations, sel_text, href))
        return flattened

    if isinstance(rule, CSSPageRule):
        self.page_rule.update(self.flatten_style(rule.style))
    elif isinstance(rule, CSSFontFaceRule) and rule.style.length > 1:
        # Font face rules with a single declaration are skipped: MS Word
        # emits meaningless ones that contain only a font-family
        # declaration and nothing else.
        self.font_face_rules.append(rule)
    return []
def flatten_style(self, cssstyle):
    """Convert the declarations in ``cssstyle`` to a plain dict.

    A property that has an entry in ``normalizers`` is replaced by the
    result of that normalizer; ``text-align`` is passed through
    ``_apply_text_align()``; any other property is stored as-is.
    Afterwards named font sizes are rewritten as rem values and an
    ``-epub-writing-mode`` value is copied to its other spellings.
    """
    flat = {}
    for declaration in cssstyle:
        key = declaration.name
        expand = normalizers.get(key, None)
        if expand is not None:
            flat.update(expand(key, declaration.cssValue))
            continue
        if key == 'text-align':
            flat['text-align'] = self._apply_text_align(declaration.value)
        else:
            flat[key] = declaration.value

    if 'font-size' in flat:
        keyword = flat['font-size']
        # Two non-standard keywords are mapped onto standard size names.
        if keyword == 'normal':
            keyword = 'medium'
        elif keyword == 'smallest':
            keyword = 'xx-small'
        if keyword in FONT_SIZE_NAMES:
            ratio = self.profile.fnames[keyword] / float(self.profile.fbase)
            flat['font-size'] = "%.1frem" % ratio

    if '-epub-writing-mode' in flat:
        # Explicitly declared values win over the mirrored one.
        for alias in ('-webkit-writing-mode', 'writing-mode'):
            flat.setdefault(alias, flat['-epub-writing-mode'])
    return flat
def _apply_text_align(self, text):
if text in ('left', 'justify') and self.opts.change_justification in ('left', 'justify'):
text = self.opts.change_justification
return text
def style(self, element): def style(self, element):
try: try:
return self._styles[element] return self._styles[element]