Stop using Qt WebKit to calculate font usage statistics

2025-07-08 10:44:09 -04:00 · 2016-04-13 10:54:16 +05:30 · 2016-04-13 10:54:16 +05:30 · 33e23e50ab
commit 33e23e50ab
parent 9a2dc518ad
3 changed files with 248 additions and 396 deletions
--- a/src/calibre/ebooks/oeb/polish/font_stats.coffee
+++ b/src/calibre/ebooks/oeb/polish/font_stats.coffee
@ -1,127 +0,0 @@
-#!/usr/bin/env coffee
-# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
-
-###
- Copyright 2013, Kovid Goyal <kovid at kovidgoyal.net>
- Released under the GPLv3 License
-###
-
-
-if window?.calibre_utils
-    log = window.calibre_utils.log
-
-font_dict = (style, computed=false) ->
-    if computed
-        fams = []
-        family = style.getPropertyCSSValue('font-family')
-        if family.cssValueType == CSSValue.CSS_PRIMITIVE_VALUE
-            fams.push(family.getStringValue())
-        else
-            for f in family
-                fams.push(f.getStringValue())
-    else
-        fams = style.getPropertyValue('font-family')
-    return {
-        'font-family':fams,
-        'font-weight':style.getPropertyValue('font-weight'),
-        'font-style':style.getPropertyValue('font-style'),
-        'font-stretch':style.getPropertyValue('font-stretch'),
-        'text-transform':style.getPropertyValue('text-transform'),
-        'font-variant':style.getPropertyValue('font-variant'),
-    }
-
-font_usage = (node) ->
-    style = window.getComputedStyle(node, null)
-    ans = font_dict(style, true)
-    text = []
-    for child in node.childNodes
-        if child.nodeType == Node.TEXT_NODE
-            text.push(child.nodeValue)
-    ans['text'] = text
-    return ans
-
-process_sheet = (sheet, font_faces) ->
-    for rule in sheet.cssRules
-        if rule.type == rule.FONT_FACE_RULE
-            process_font_face_rule(rule, font_faces)
-        else if rule.type == rule.IMPORT_RULE and rule.styleSheet
-            process_sheet(rule.styleSheet, font_faces)
-
-process_font_face_rule = (rule, font_faces) ->
-    fd = font_dict(rule.style)
-    fd['src'] = rule.style.getPropertyValue('src')
-    font_faces.push(fd)
-
-fl_pat = /:{1,2}(first-letter|first-line)/i
-
-process_sheet_for_pseudo = (sheet, rules) ->
-    for rule in sheet.cssRules
-        if rule.type == rule.STYLE_RULE
-            st = rule.selectorText
-            m = fl_pat.exec(st)
-            if m
-                pseudo = m[1].toLowerCase()
-                ff = rule.style.getPropertyValue('font-family')
-                if ff
-                    process_style_rule(st, rule.style, rules, pseudo)
-        else if rule.type == rule.IMPORT_RULE and rule.styleSheet
-            process_sheet_for_pseudo(rule.styleSheet, rules)
-
-process_style_rule = (selector_text, style, rules, pseudo) ->
-    selector_text = selector_text.replace(fl_pat, '')
-    fd = font_dict(style)
-    for element in document.querySelectorAll(selector_text)
-        text = element.innerText
-        if text
-            rules.push([fd, text, pseudo])
-
-class FontStats
-    # This class is a namespace to expose functions via the
-    # window.font_stats object.
-
-    constructor: () ->
-        if not this instanceof arguments.callee
-            throw new Error('FontStats constructor called as function')
-
-    get_font_face_rules: () ->
-        font_faces = []
-        for sheet in document.styleSheets
-            process_sheet(sheet, font_faces)
-        py_bridge.value = font_faces
-
-    get_font_usage: () ->
-        ans = []
-        busage = font_usage(document.body)
-        if busage != null
-            ans.push(busage)
-        for node in document.body.getElementsByTagName('*')
-            usage = font_usage(node)
-            if usage != null
-                ans.push(usage)
-        py_bridge.value = ans
-
-    get_pseudo_element_font_usage: () ->
-        ans = []
-        for sheet in document.styleSheets
-            process_sheet_for_pseudo(sheet, ans)
-        py_bridge.value = ans
-
-    get_font_families: () ->
-        ans = {}
-        for node in document.getElementsByTagName('*')
-            rules = document.defaultView.getMatchedCSSRules(node, '')
-            if rules
-                for rule in rules
-                    style = rule.style
-                    family = style.getPropertyValue('font-family')
-                    if family
-                        ans[family] = true
-            if node.getAttribute('style')
-                family = node.style.getPropertyValue('font-family')
-                if family
-                    ans[family] = true
-        py_bridge.value = ans
-
-if window?
-    window.font_stats = new FontStats()
-
--- a/src/calibre/ebooks/oeb/polish/stats.py
+++ b/src/calibre/ebooks/oeb/polish/stats.py
@ -7,19 +7,15 @@ __license__   = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import json, sys, os, logging
-from urllib import unquote
-from collections import defaultdict
+import sys
+from functools import partial

+from lxml.etree import tostring
 import regex
-from cssutils import CSSParser
-from PyQt5.Qt import (pyqtProperty, QEventLoop, Qt, QSize, QTimer,
-                      pyqtSlot)
-from PyQt5.QtWebKitWidgets import QWebPage, QWebView

-from calibre.constants import iswindows
-from calibre.ebooks.oeb.display.webview import load_html
-from calibre.gui2 import must_use_qt
+from calibre.ebooks.oeb.base import XHTML
+from calibre.ebooks.oeb.polish.cascade import iterrules, resolve_styles, iterdeclaration
+from calibre.utils.icu import ord_string, safe_chr

 def normalize_font_properties(font):
    w = font.get('font-weight', None)
@ -102,286 +98,200 @@ def get_matching_rules(rules, font):
            return m
    return []

-def parse_font_families(parser, raw):
-    style = parser.parseStyle('font-family:' + raw, validate=False).getProperty('font-family')
-    for x in style.propertyValue:
-        x = x.value
-        if x:
-            yield x
+def get_css_text(elem, resolve_pseudo_property, which='before'):
+    text = resolve_pseudo_property(elem, which, 'content')[0].value
+    if text and len(text) > 2 and text[0] == '"' and text[-1] == '"':
+        return text[1:-1]
+    return ''

-def get_pseudo_element_font_usage(pseudo_element_font_usage, first_letter_pat, parser):
+caps_variants = {'smallcaps', 'small-caps', 'all-small-caps', 'petite-caps', 'all-petite-caps', 'unicase'}
+
+def get_element_text(elem, resolve_property, resolve_pseudo_property, capitalize_pat, for_pseudo=None):
    ans = []
-    for font_dict, text, pseudo in pseudo_element_font_usage:
-        text = text.strip()
-        if pseudo == 'first-letter':
-            prefix = first_letter_pat.match(text)
-            if prefix is not None:
-                text = prefix + text[len(prefix):].lstrip()[:1]
-            else:
-                text = text[:1]
-        if text:
-            font = font_dict.copy()
-            font['text'] = text
-            font['font-family'] = list(parse_font_families(parser, font['font-family']))
-            ans.append(font)
-
+    before = get_css_text(elem, resolve_pseudo_property)
+    if before:
+        ans.append(before)
+    if for_pseudo is not None:
+        ans.append(tostring(elem, method='text', encoding=unicode, with_tail=False))
+    else:
+        if elem.text:
+            ans.append(elem.text)
+        for child in elem.iterchildren():
+            t = getattr(child, 'tail', '')
+            if t:
+                ans.append(t)
+    after = get_css_text(elem, resolve_pseudo_property, 'after')
+    if after:
+        ans.append(after)
+    ans = ''.join(ans)
+    if for_pseudo is not None:
+        tt = resolve_pseudo_property(elem, for_pseudo, 'text-transform')[0].value
+        fv = resolve_pseudo_property(elem, for_pseudo, 'font-variant')[0].value
+    else:
+        tt = resolve_property(elem, 'text-transform')[0].value
+        fv = resolve_property(elem, 'font-variant')[0].value
+    if fv in caps_variants:
+        ans += icu_upper(ans)
+    if tt != 'none':
+        if tt == 'uppercase':
+            ans = icu_upper(ans)
+        elif tt == 'lowercase':
+            ans = icu_lower(ans)
+        elif tt == 'capitalize':
+            m = capitalize_pat.search(ans)
+            if m is not None:
+                ans += icu_upper(m.group())
    return ans

-class Page(QWebPage):  # {{{
+def get_font_dict(elem, resolve_property, pseudo=None):
+    ans = {}
+    if pseudo is None:
+        ff = resolve_property(elem, 'font-family')
+    else:
+        ff = resolve_property(elem, pseudo, 'font-family')
+    ans['font-family'] = tuple(x.value for x in ff)
+    for p in 'weight', 'style', 'stretch':
+        p = 'font-' + p
+        rp = resolve_property(elem, p) if pseudo is None else resolve_property(elem, pseudo, p)
+        ans[p] = type('')(rp[0].value)
+    normalize_font_properties(ans)
+    return ans

-    def __init__(self, log):
-        self.log = log
-        QWebPage.__init__(self)
-        self.js = None
-        self.evaljs = self.mainFrame().evaluateJavaScript
-        self.bridge_value = None
-        nam = self.networkAccessManager()
-        nam.setNetworkAccessible(nam.NotAccessible)
-        self.longjs_counter = 0
+bad_fonts = {'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'sansserif', 'inherit'}
+exclude_chars = frozenset(ord_string('\n\r\t'))
+skip_tags = {XHTML(x) for x in 'script style title meta link'.split()}
+font_keys = {'font-weight', 'font-style', 'font-stretch', 'font-family'}

-    def javaScriptConsoleMessage(self, msg, lineno, msgid):
-        self.log(u'JS:', unicode(msg))
-
-    def javaScriptAlert(self, frame, msg):
-        self.log(unicode(msg))
-
-    @pyqtSlot(result=bool)
-    def shouldInterruptJavaScript(self):
-        if self.longjs_counter < 5:
-            self.log('Long running javascript, letting it proceed')
-            self.longjs_counter += 1
-            return False
-        self.log.warn('Long running javascript, aborting it')
-        return True
-
-    def _pass_json_value_getter(self):
-        val = json.dumps(self.bridge_value)
-        return val
-
-    def _pass_json_value_setter(self, value):
-        # Qt WebKit in Qt 4.x adds extra null bytes to the end of the string
-        # if the JSON contains non-BMP characters
-        self.bridge_value = json.loads(unicode(value).rstrip('\0'))
-
-    _pass_json_value = pyqtProperty(str, fget=_pass_json_value_getter,
-            fset=_pass_json_value_setter)
-
-    def load_js(self):
-        self.longjs_counter = 0
-        if self.js is None:
-            from calibre.utils.resources import compiled_coffeescript
-            self.js = compiled_coffeescript('ebooks.oeb.display.utils')
-            self.js += compiled_coffeescript('ebooks.oeb.polish.font_stats')
-        self.mainFrame().addToJavaScriptWindowObject("py_bridge", self)
-        self.evaljs(self.js)
-        self.evaljs('''
-        Object.defineProperty(py_bridge, 'value', {
-               get : function() { return JSON.parse(this._pass_json_value); },
-               set : function(val) { this._pass_json_value = JSON.stringify(val); }
-        });
-        ''')
-# }}}
+def prepare_font_rule(cssdict):
+    cssdict['font-family'] = frozenset(cssdict['font-family'][:1])
+    cssdict['width'] = widths[cssdict['font-stretch']]
+    cssdict['weight'] = int(cssdict['font-weight'])

 class StatsCollector(object):

+    first_letter_pat = capitalize_pat = None
+
    def __init__(self, container, do_embed=False):
-        self.container = container
-        self.log = self.logger = container.log
-        self.do_embed = do_embed
-        must_use_qt()
-        self.parser = CSSParser(loglevel=logging.CRITICAL, log=logging.getLogger('calibre.css'))
-        self.first_letter_pat = regex.compile(r'^[\p{Ps}\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}]+', regex.VERSION1 | regex.UNICODE)
-        self.capitalize_pat = regex.compile(r'[\p{L}\p{N}]', regex.VERSION1 | regex.UNICODE)
+        if self.first_letter_pat is None:
+            StatsCollector.first_letter_pat = self.first_letter_pat = regex.compile(
+                r'^[\p{Ps}\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}]+', regex.VERSION1 | regex.UNICODE)
+            StatsCollector.capitalize_pat = self.capitalize_pat = regex.compile(
+                r'[\p{L}\p{N}]', regex.VERSION1 | regex.UNICODE)

-        self.loop = QEventLoop()
-        self.view = QWebView()
-        self.page = Page(self.log)
-        self.view.setPage(self.page)
-        self.page.setViewportSize(QSize(1200, 1600))
+        self.collect_font_stats(container, do_embed)

-        self.view.loadFinished.connect(self.collect,
-                type=Qt.QueuedConnection)
+    def collect_font_face_rules(self, container, processed, spine_name, sheet, sheet_name):
+        if sheet_name in processed:
+            sheet_rules = processed[sheet_name]
+        else:
+            sheet_rules = []
+            if sheet_name != spine_name:
+                processed[sheet_name] = sheet_rules
+            for rule, base_name, rule_index in iterrules(container, sheet_name, rules=sheet, rule_type='FONT_FACE_RULE'):
+                cssdict = {}
+                for prop in iterdeclaration(rule.style):
+                    if prop.name == 'font-family':
+                        cssdict['font-family'] = [icu_lower(x.value) for x in prop.propertyValue]
+                    elif prop.name.startswith('font-'):
+                        cssdict[prop.name] = prop.propertyValue[0].value
+                    elif prop.name == 'src':
+                        for val in prop.propertyValue:
+                            x = val.value
+                            fname = container.href_to_name(x, sheet_name)
+                            if container.has_name(fname):
+                                cssdict['src'] = fname
+                                break
+                        else:
+                            container.log.warn('The @font-face rule refers to a font file that does not exist in the book: %s' % prop.propertyValue.cssText)
+                if 'src' not in cssdict:
+                    continue
+                ff = cssdict.get('font-family')
+                if not ff or ff[0] in bad_fonts:
+                    continue
+                normalize_font_properties(cssdict)
+                prepare_font_rule(cssdict)
+                sheet_rules.append(cssdict)
+        self.font_rule_map[spine_name].extend(sheet_rules)

-        self.render_queue = list(container.spine_items)
+    def get_element_font_usage(self, elem, resolve_property, resolve_pseudo_property, font_face_rules, do_embed, font_usage_map, font_spec):
+        text = get_element_text(elem, resolve_property, resolve_pseudo_property, self.capitalize_pat)
+        if not text:
+            return
+
+        def update_usage_for_embed(font, chars):
+            if not do_embed:
+                return
+            ff = [icu_lower(x) for x in font.get('font-family', ())]
+            if ff and ff[0] not in bad_fonts:
+                key = frozenset(((k, ff[0] if k == 'font-family' else v) for k, v in font.iteritems() if k in font_keys))
+                val = font_usage_map.get(key)
+                if val is None:
+                    val = font_usage_map[key] = {'text': set()}
+                    for k in font_keys:
+                        val[k] = font[k][0] if k == 'font-family' else font[k]
+                val['text'] |= chars
+            for ff in font.get('font-family', ()):
+                if ff and icu_lower(ff) not in bad_fonts:
+                    font_spec.add(ff)
+
+        font = get_font_dict(elem, resolve_property)
+        chars = frozenset(ord_string(text)) - exclude_chars
+        update_usage_for_embed(font, chars)
+        for rule in get_matching_rules(font_face_rules, font):
+            self.font_stats[rule['src']] |= chars
+        q = resolve_pseudo_property(elem, 'first-letter', 'font-family', abort_on_missing=True)
+        if q is not None:
+            font = get_font_dict(elem, resolve_pseudo_property, pseudo='first-letter')
+            text = get_element_text(elem, resolve_property, resolve_pseudo_property, self.capitalize_pat, for_pseudo='first-letter')
+            m = self.first_letter_pat.search(text.lstrip())
+            if m is not None:
+                chars = frozenset(ord_string(m.group())) - exclude_chars
+                update_usage_for_embed(font, chars)
+                for rule in get_matching_rules(font_face_rules, font):
+                    self.font_stats[rule['src']] |= chars
+        q = resolve_pseudo_property(elem, 'first-line', 'font-family', abort_on_missing=True)
+        if q is not None:
+            font = get_font_dict(elem, resolve_pseudo_property, pseudo='first-letter')
+            text = get_element_text(elem, resolve_property, resolve_pseudo_property, self.capitalize_pat, for_pseudo='first-line')
+            chars = frozenset(ord_string(text)) - exclude_chars
+            update_usage_for_embed(font, chars)
+            for rule in get_matching_rules(font_face_rules, font):
+                self.font_stats[rule['src']] |= chars
+
+    def get_font_usage(self, container, spine_name, resolve_property, resolve_pseudo_property, font_face_rules, do_embed):
+        root = container.parsed(spine_name)
+        for body in root.iterchildren(XHTML('body')):
+            for elem in body.iter('*'):
+                if elem.tag not in skip_tags:
+                    self.get_element_font_usage(
+                        elem, resolve_property, resolve_pseudo_property, font_face_rules, do_embed,
+                        self.font_usage_map[spine_name], self.font_spec_map[spine_name])
+
+    def collect_font_stats(self, container, do_embed=False):
        self.font_stats = {}
        self.font_usage_map = {}
        self.font_spec_map = {}
        self.font_rule_map = {}
        self.all_font_rules = {}

-        QTimer.singleShot(0, self.render_book)
+        processed_sheets = {}
+        for name, is_linear in container.spine_names:
+            self.font_rule_map[name] = font_face_rules = []
+            resolve_property, resolve_pseudo_property, select = resolve_styles(container, name, sheet_callback=partial(
+                self.collect_font_face_rules, container, processed_sheets, name))

-        if self.loop.exec_() == 1:
-            raise Exception('Failed to gather statistics from book, see log for details')
+            for rule in font_face_rules:
+                self.all_font_rules[rule['src']] = rule
+                if rule['src'] not in self.font_stats:
+                    self.font_stats[rule['src']] = set()

-    def log_exception(self, *args):
-        orig = self.log.filter_level
-        try:
-            self.log.filter_level = self.log.DEBUG
-            self.log.exception(*args)
-        finally:
-            self.log.filter_level = orig
-
-    def render_book(self):
-        try:
-            if not self.render_queue:
-                self.loop.exit()
-            else:
-                self.render_next()
-        except:
-            self.log_exception('Rendering failed')
-            self.loop.exit(1)
-
-    def render_next(self):
-        item = unicode(self.render_queue.pop(0))
-        self.current_item = item
-        load_html(item, self.view)
-
-    def collect(self, ok):
-        if not ok:
-            self.log.error('Failed to render document: %s'%self.container.relpath(self.current_item))
-            self.loop.exit(1)
-            return
-        try:
-            self.page.load_js()
-            self.collect_font_stats()
-        except:
-            self.log_exception('Failed to collect font stats from: %s'%self.container.relpath(self.current_item))
-            self.loop.exit(1)
-            return
-
-        self.render_book()
-
-    def href_to_name(self, href, warn_name):
-        if not href.startswith('file://'):
-            self.log.warn('Non-local URI in', warn_name, ':', href, 'ignoring')
-            return None
-        src = href[len('file://'):]
-        if iswindows and len(src) > 2 and (src[0], src[2]) == ('/', ':'):
-            src = src[1:]
-        src = src.replace('/', os.sep)
-        src = unquote(src)
-        name = self.container.abspath_to_name(src)
-        if not self.container.has_name(name):
-            self.log.warn('Missing resource', href, 'in', warn_name,
-                          'ignoring')
-            return None
-        return name
-
-    def collect_font_stats(self):
-        self.page.evaljs('window.font_stats.get_font_face_rules()')
-        font_face_rules = self.page.bridge_value
-        if not isinstance(font_face_rules, list):
-            raise Exception('Unknown error occurred while reading font-face rules')
-
-        # Weed out invalid font-face rules
-        rules = []
-        import tinycss
-        parser = tinycss.make_full_parser()
-        for rule in font_face_rules:
-            ff = rule.get('font-family', None)
-            if not ff:
-                continue
-            style = self.parser.parseStyle('font-family:%s'%ff, validate=False)
-            ff = [x.value for x in
-                  style.getProperty('font-family').propertyValue]
-            if not ff or ff[0] == 'inherit':
-                continue
-            rule['font-family'] = frozenset(icu_lower(f) for f in ff)
-            src = rule.get('src', None)
-            if not src:
-                continue
-            try:
-                tokens = parser.parse_stylesheet('@font-face { src: %s }' % src).rules[0].declarations[0].value
-            except Exception:
-                self.log.warn('Failed to parse @font-family src: %s' % src)
-                continue
-            for token in tokens:
-                if token.type == 'URI':
-                    uv = token.value
-                    if uv:
-                        sn = self.href_to_name(uv, '@font-face rule')
-                        if sn is not None:
-                            rule['src'] = sn
-                            break
-            else:
-                self.log.warn('The @font-face rule refers to a font file that does not exist in the book: %s' % src)
-                continue
-            normalize_font_properties(rule)
-            rule['width'] = widths[rule['font-stretch']]
-            rule['weight'] = int(rule['font-weight'])
-            rules.append(rule)
-
-        if not rules and not self.do_embed:
-            return
-
-        self.font_rule_map[self.container.abspath_to_name(self.current_item)] = rules
-        for rule in rules:
-            self.all_font_rules[rule['src']] = rule
-
-        for rule in rules:
-            if rule['src'] not in self.font_stats:
-                self.font_stats[rule['src']] = set()
-
-        self.page.evaljs('window.font_stats.get_font_usage()')
-        font_usage = self.page.bridge_value
-        if not isinstance(font_usage, list):
-            raise Exception('Unknown error occurred while reading font usage')
-        self.page.evaljs('window.font_stats.get_pseudo_element_font_usage()')
-        pseudo_element_font_usage = self.page.bridge_value
-        if not isinstance(pseudo_element_font_usage, list):
-            raise Exception('Unknown error occurred while reading pseudo element font usage')
-        font_usage += get_pseudo_element_font_usage(pseudo_element_font_usage, self.first_letter_pat, self.parser)
-        exclude = {'\n', '\r', '\t'}
-        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = fu = defaultdict(dict)
-        bad_fonts = {'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'sansserif', 'inherit'}
-        for font in font_usage:
-            text = set()
-            for t in font['text']:
-                tt = (font['text-transform'] or '').lower()
-                if tt != 'none':
-                    if tt == 'uppercase':
-                        t = icu_upper(t)
-                    elif tt == 'lowercase':
-                        t = icu_lower(t)
-                    elif tt == 'capitalize':
-                        m = self.capitalize_pat.search(t)
-                        if m is not None:
-                            t += icu_upper(m.group())
-                fv = (font['font-variant'] or '').lower()
-                if fv in {'smallcaps', 'small-caps', 'all-small-caps', 'petite-caps', 'all-petite-caps', 'unicase'}:
-                    t += icu_upper(t)  # for renderers that try to fake small-caps by using small normal caps
-                text |= frozenset(t)
-            text.difference_update(exclude)
-            if not text:
-                continue
-            normalize_font_properties(font)
-            for rule in get_matching_rules(rules, font):
-                self.font_stats[rule['src']] |= text
-            if self.do_embed:
-                ff = [icu_lower(x) for x in font.get('font-family', [])]
-                if ff and ff[0] not in bad_fonts:
-                    keys = {'font-weight', 'font-style', 'font-stretch', 'font-family'}
-                    key = frozenset(((k, ff[0] if k == 'font-family' else v) for k, v in font.iteritems() if k in keys))
-                    val = fu[key]
-                    if not val:
-                        val.update({k:(font[k][0] if k == 'font-family' else font[k]) for k in keys})
-                        val['text'] = set()
-                    val['text'] |= text
-        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = dict(fu)
-
-        if self.do_embed:
-            self.page.evaljs('window.font_stats.get_font_families()')
-            font_families = self.page.bridge_value
-            if not isinstance(font_families, dict):
-                raise Exception('Unknown error occurred while reading font families')
-            self.font_spec_map[self.container.abspath_to_name(self.current_item)] = fs = set()
-            for font_dict, text, pseudo in pseudo_element_font_usage:
-                font_families[font_dict['font-family']] = True
-            for raw in font_families.iterkeys():
-                for x in parse_font_families(self.parser, raw):
-                    if x.lower() not in bad_fonts:
-                        fs.add(x)
+            self.font_usage_map[name] = {}
+            self.font_spec_map[name] = set()
+            self.get_font_usage(container, name, resolve_property, resolve_pseudo_property, font_face_rules, do_embed)
+        self.font_stats = {k:{safe_chr(x) for x in v} for k, v in self.font_stats.iteritems()}
+        for fum in self.font_usage_map.itervalues():
+            for v in fum.itervalues():
+                v['text'] = {safe_chr(x) for x in v['text']}

 if __name__ == '__main__':
    from calibre.ebooks.oeb.polish.container import get_container
--- a/src/calibre/ebooks/oeb/polish/tests/cascade.py
+++ b/src/calibre/ebooks/oeb/polish/tests/cascade.py
@ -12,6 +12,7 @@ from calibre.constants import iswindows
 from calibre.ebooks.oeb.base import OEB_STYLES, OEB_DOCS
 from calibre.ebooks.oeb.polish.cascade import iterrules, resolve_styles, DEFAULTS
 from calibre.ebooks.oeb.polish.container import ContainerBase, href_to_name
+from calibre.ebooks.oeb.polish.stats import StatsCollector, font_keys, normalize_font_properties, prepare_font_rule
 from calibre.ebooks.oeb.polish.tests.base import BaseTest
 from calibre.utils.logging import Log, Stream

@ -45,6 +46,12 @@ class VirtualContainer(ContainerBase):
                self.parsed_cache[name] = self.files[name]
        return self.parsed_cache[name]

+    @property
+    def spine_names(self):
+        for name in sorted(self.mime_map):
+            if self.mime_map[name] in OEB_DOCS:
+                yield name, True
+
 class CascadeTest(BaseTest):

    def test_iterrules(self):
@ -131,3 +138,65 @@ class CascadeTest(BaseTest):
        t('p', 'before', 'font-weight', 'bold')
        t('p', 'first-letter', 'content')
        t('p', 'first-letter', 'content', abort_on_missing=True)
+
+    def test_font_stats(self):
+        embeds = '@font-face { font-family: X; src: url(X.otf) }\n@font-face { font-family: X; src: url(XB.otf); font-weight: bold }'
+        def get_stats(html, *fonts):
+            styles = []
+            html = '<html><head><link href="styles.css"></head><body>{}</body></html>'.format(html)
+            files = {'index.html':html, 'X.otf':b'xxx', 'XB.otf': b'xbxb'}
+            for font in fonts:
+                styles.append('@font-face {')
+                for k, v in font.iteritems():
+                    if k == 'src':
+                        files[v] = b'xxx'
+                        v = 'url(%s)' % v
+                    styles.append('%s : %s;' % (k, v))
+                styles.append('}\n')
+            html = '<html><head><link href="styles.css"></head><body>{}</body></html>'.format(html)
+            files['styles.css'] = embeds + '\n'.join(styles)
+            c = VirtualContainer(files)
+            return StatsCollector(c, do_embed=True)
+
+        def font(family, weight=None, style=None):
+            f = {}
+            if weight is not None:
+                f['font-weight'] = weight
+            if style is not None:
+                f['font-style'] = style
+            f = normalize_font_properties(f)
+            f['font-family'] = [family]
+            return f
+
+        def font_rule(src, *args, **kw):
+            ans = font(*args, **kw)
+            ans['font-family'] = list(map(icu_lower, ans['font-family']))
+            prepare_font_rule(ans)
+            ans['src'] = src
+            return ans
+
+        def fkey(*args, **kw):
+            f = font(*args, **kw)
+            f['font-family'] = icu_lower(f['font-family'][0])
+            return frozenset((k, v) for k, v in f.iteritems() if k in font_keys)
+
+        def fu(text, *args, **kw):
+            key = fkey(*args, **kw)
+            val = font(*args, **kw)
+            val['text'] = set(text)
+            val['font-family'] = val['font-family'][0]
+            return key, val
+
+        s = get_stats('<p style="font-family: X">abc<b>d\nef</b><i>ghi</i></p><p style="font-family: U">u</p>')
+        # The normal font must include ghi as it will be used to simulate
+        # italic by most rendering engines when the italic font is missing
+        self.assertEqual(s.font_stats, {'XB.otf':set('def'), 'X.otf':set('abcghi')})
+        self.assertEqual(s.font_spec_map, {'index.html':set('XU')})
+        self.assertEqual(s.all_font_rules, {'X.otf':font_rule('X.otf', 'X'), 'XB.otf':font_rule('XB.otf', 'X', 'bold')})
+        self.assertEqual(set(s.font_rule_map), {'index.html'})
+        self.assertEqual(s.font_rule_map['index.html'], [font_rule('X.otf', 'X'), font_rule('XB.otf', 'X', 'bold')])
+        self.assertEqual(set(s.font_usage_map), {'index.html'})
+        self.assertEqual(s.font_usage_map['index.html'], dict([fu('abc', 'X'), fu('def', 'X', weight='bold'), fu('ghi', 'X', style='italic'), fu('u', 'U')]))
+
+        s = get_stats('<p style="font-family: X; text-transform:uppercase">abc</p><b style="font-family: X; font-variant: small-caps">d\nef</b>')
+        self.assertEqual(s.font_stats, {'XB.otf':set('defDEF'), 'X.otf':set('ABC')})