Implement gathering of data for CSS report

2025-07-09 03:04:10 -04:00 · 2015-01-22 14:08:13 +05:30 · 2015-01-22 14:08:13 +05:30 · 01bab1b8de
commit 01bab1b8de
parent 69a0e6ba05
1 changed files with 109 additions and 3 deletions
--- a/src/calibre/ebooks/oeb/polish/report.py
+++ b/src/calibre/ebooks/oeb/polish/report.py
@@ -6,10 +6,13 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
-import posixpath, os, time, types
+import posixpath, os, time, types, re
 from collections import namedtuple, defaultdict, Counter
 from calibre import prepare_string_for_xml
 from calibre.ebooks.oeb.base import XPath
 from calibre.ebooks.oeb.polish.container import OEB_DOCS, OEB_STYLES, OEB_FONTS
 from calibre.ebooks.oeb.polish.css import build_selector, PSEUDO_PAT, MIN_SPACE_RE
 from calibre.ebooks.oeb.polish.spell import get_all_words
 from calibre.utils.icu import numeric_sort_key, ord_string, safe_chr
 from calibre.utils.magick.draw import identify
@@ -115,14 +118,117 @@ def chars_data(container, book_locale):
    for i, (codepoint, usage) in enumerate(chars.iteritems()):
        yield Char(i, safe_chr(codepoint), codepoint, sorted(usage, key=sort_key), counter[codepoint])
 # A style rule: its selector text plus the RuleLocation where it is declared.
 CSSRule = namedtuple('CSSRule', ['selector', 'location'])
 # Where a rule lives: the name of the file containing it and its line/column.
 RuleLocation = namedtuple('RuleLocation', ['file_name', 'line', 'column'])
 # An element matched by a rule: its rendered opening tag and its source line.
 MatchLocation = namedtuple('MatchLocation', ['tag', 'sourceline'])
 def css_data(container, book_locale):
    """Collect, for every CSS rule used by a spine document, the elements it matches.

    Stylesheet files and inline ``<style>`` elements of spine HTML files are
    parsed with tinycss. Each rule's selector is then evaluated against every
    HTML file that links to (or contains) the sheet the rule comes from,
    following ``@import`` rules into other stylesheets of the book.

    :param container: The book container. Its ``mime_map``, ``spine_names``,
        ``raw_data()``, ``parsed()``, ``href_to_name()`` and ``exists()``
        members are used.
    :param book_locale: Unused; accepted so that all ``*_data`` functions can
        be called with the same arguments.
    :return: A list of ``(CSSRule, [(html_file_name, [MatchLocation, ...]), ...])``
        tuples. Only rules with at least one match, and only the files they
        match in, are included.
    """
    import tinycss
    from tinycss.css21 import RuleSet, ImportRule

    def css_rules(file_name, rules, sourceline=0):
        # Flatten parsed rules into a list of CSSRule objects (for rule sets)
        # and names of imported stylesheets (for @import rules). Rules nested
        # inside other at-rules (anything exposing .rules) are flattened too.
        ans = []
        for rule in rules:
            if isinstance(rule, RuleSet):
                selector = rule.selector.as_css()
                ans.append(CSSRule(selector, RuleLocation(file_name, sourceline + rule.line, rule.column)))
            elif isinstance(rule, ImportRule):
                import_name = container.href_to_name(rule.uri, file_name)
                if import_name and container.exists(import_name):
                    ans.append(import_name)
            elif getattr(rule, 'rules', False):
                ans.extend(css_rules(file_name, rule.rules, sourceline))
        return ans

    parser = tinycss.make_full_parser()
    importable_sheets = {}
    html_sheets = {}
    spine_names = {name for name, is_linear in container.spine_names}
    style_path, link_path = XPath('//h:style'), XPath('//h:link/@href')
    for name, mt in container.mime_map.iteritems():
        if mt in OEB_STYLES:
            importable_sheets[name] = css_rules(name, parser.parse_stylesheet(container.raw_data(name)).rules)
        elif mt in OEB_DOCS and name in spine_names:
            html_sheets[name] = []
            for style in style_path(container.parsed(name)):
                if style.get('type', 'text/css') == 'text/css' and style.text:
                    # Parse the text of this <style> element only, not the
                    # markup of the whole document.
                    css_text = style.text
                    if isinstance(css_text, bytes):
                        css_text = css_text.decode('utf-8', 'replace')
                    # Both style.sourceline and tinycss line numbers are
                    # 1-based, so subtract one to avoid counting the first
                    # line twice.
                    offset = (style.sourceline or 1) - 1
                    html_sheets[name].append(
                        css_rules(name, parser.parse_stylesheet(css_text).rules, offset))

    rule_map = defaultdict(lambda : defaultdict(list))
    pseudo_pat = re.compile(PSEUDO_PAT, re.I)
    cache = {}

    def rules_in_sheet(sheet, import_chain=()):
        # Yield every CSSRule in sheet, descending into imported sheets.
        # import_chain holds the names of the sheets currently being expanded
        # so that circular @import chains terminate.
        for rule in sheet:
            if isinstance(rule, CSSRule):
                yield rule
            elif rule not in import_chain:
                imported = importable_sheets.get(rule)
                if imported is not None:
                    for imported_rule in rules_in_sheet(imported, import_chain + (rule,)):
                        yield imported_rule

    def sheets_for_html(name, root):
        # Yield the parsed stylesheets referenced by <link href> in root
        for href in link_path(root):
            tname = container.href_to_name(href, name)
            sheet = importable_sheets.get(tname)
            if sheet is not None:
                yield sheet

    def tag_text(elem):
        # Render the opening tag of elem, without namespace, for display
        tag = elem.tag.rpartition('}')[-1]
        if elem.attrib:
            attribs = ' '.join('%s="%s"' % (k, prepare_string_for_xml(elem.get(k, ''), True)) for k in elem.keys())
            return '<%s %s>' % (tag, attribs)
        return '<%s>' % tag

    def matches_for_selector(selector, root):
        # Pseudo-classes/elements are stripped before the selector is compiled
        selector = pseudo_pat.sub('', selector)
        selector = MIN_SPACE_RE.sub(r'\1', selector)
        try:
            xp = cache[(True, selector)]
        except KeyError:
            xp = cache[(True, selector)] = build_selector(selector)
        try:
            matches = xp(root)
        except Exception:
            # Best effort: a selector that cannot be evaluated matches nothing
            return ()
        if not matches:
            # Retry ignoring case before concluding there are no matches
            try:
                xp = cache[(False, selector)]
            except KeyError:
                xp = cache[(False, selector)] = build_selector(selector, case_sensitive=False)
            try:
                matches = xp(root)
            except Exception:
                return ()
        return (MatchLocation(tag_text(elem), elem.sourceline) for elem in matches)

    for name, inline_sheets in html_sheets.iteritems():
        root = container.parsed(name)
        for sheet in list(sheets_for_html(name, root)) + inline_sheets:
            # Go through rules_in_sheet() so that @import entries (plain
            # names, not CSSRule objects) are resolved rather than used as
            # if they were rules.
            for rule in rules_in_sheet(sheet):
                rule_map[rule][name].extend(matches_for_selector(rule.selector, root))

    ans = []
    for rule, loc_map in rule_map.iteritems():
        la = [(name, locations) for name, locations in loc_map.iteritems() if locations]
        if la:
            ans.append((rule, la))
    return ans
 def gather_data(container, book_locale):
    """Run every report data gatherer and record how long each one took.

    For each report name, the module-level function ``<name>_data`` is looked
    up and called with ``(container, book_locale)``.

    :return: A ``(data, timing)`` tuple of dicts, both keyed by report name.
        ``data`` maps to the gatherer's result, ``timing`` to the elapsed
        wall-clock seconds.
    """
    timing = {}
    data = {}
-    for x in 'files images words chars'.split():
+    for x in 'files chars images words css'.split():
        st = time.time()
        # Gatherers are found by naming convention: files_data, chars_data, ...
        data[x] = globals()[x + '_data'](container, book_locale)
        if isinstance(data[x], types.GeneratorType):
            # Exhaust generators here so the measured time covers their work
            data[x] = tuple(data[x])
        timing[x] = time.time() - st
    return data, timing