ebook-polish: Collect font usage stats

2025-12-10 15:15:03 -05:00 · 2013-02-03 16:45:07 +05:30 · 2013-02-03 16:45:07 +05:30 · bf96abd838
commit bf96abd838
parent c0c6a177aa
5 changed files with 269 additions and 11 deletions
--- a/resources/compiled_coffeescript.zip
+++ b/resources/compiled_coffeescript.zip
--- a/setup/resources.py
+++ b/setup/resources.py
@ -26,7 +26,7 @@ def get_opts_from_parser(parser):
 class Coffee(Command): # {{{

    description = 'Compile coffeescript files into javascript'
-    COFFEE_DIRS = ('ebooks/oeb/display',)
+    COFFEE_DIRS = ('ebooks/oeb/display', 'ebooks/oeb/polish')

    def add_options(self, parser):
        parser.add_option('--watch', '-w', action='store_true', default=False,
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@ -7,7 +7,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import os, posixpath, logging, sys, hashlib, uuid
+import os, logging, sys, hashlib, uuid
 from urllib import unquote as urlunquote

 from lxml import etree
@ -56,7 +56,7 @@ class Container(object):
                # Special case if we have stumbled onto the opf
                if path == opfpath:
                    self.opf_name = name
-                    self.opf_dir = posixpath.dirname(path)
+                    self.opf_dir = os.path.dirname(path)
                    self.mime_map[name] = guess_type('a.opf')[0]

        # Update mime map with data from the OPF
@ -66,13 +66,25 @@ class Container(object):
            href = item.get('href')
            self.mime_map[self.href_to_name(href)] = item.get('media-type')

+    def abspath_to_name(self, fullpath):
+        '''
+        Convert a filesystem path into a name (i.e. a path relative to
+        self.root with POSIX separators).
+        '''
+        return self.relpath(os.path.abspath(fullpath)).replace(os.sep, '/')

    def href_to_name(self, href, base=None):
+        '''
+        Convert an href (relative to base) to a name (i.e. a path
+        relative to self.root with POSIX separators).
+
+        base must be an absolute path with OS separators or None, in which case
+        the href is interpreted relative to the dir containing the OPF.
+        '''
        if base is None:
            base = self.opf_dir
+        # Drop any #fragment first, then undo the URL percent-encoding
        href = urlunquote(href.partition('#')[0])
-        fullpath = posixpath.abspath(posixpath.join(base, href))
-        return self.relpath(fullpath)
+        # hrefs always use '/', so split on it and re-join with OS separators
+        fullpath = os.path.join(base, *href.split('/'))
+        return self.abspath_to_name(fullpath)
+
+    def has_name(self, name):
+        ''' Return True if name is present in self.name_path_map. '''
+        return name in self.name_path_map

    def relpath(self, path):
        return relpath(path, self.root)
@ -345,10 +357,14 @@ class AZW3Container(Container):
        super(AZW3Container, self).__init__(tdir, opf_path, log)
        self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts}

+def get_container(path, log=None):
+    '''
+    Return a container for the book at path.
+
+    An AZW3Container is used when the file extension is azw3 or mobi
+    (compared case-insensitively), an EpubContainer otherwise. When log is
+    None, default_log is used.
+    '''
+    if log is None:
+        log = default_log
+    extension = path.rpartition('.')[-1].lower()
+    if extension in {'azw3', 'mobi'}:
+        container_class = AZW3Container
+    else:
+        container_class = EpubContainer
+    return container_class(path, log)
+
 if __name__ == '__main__':
-    f = sys.argv[-1]
-    ebook = (AZW3Container if f.rpartition('.')[-1].lower() in {'azw3', 'mobi'}
-            else EpubContainer)(f, default_log)
+    # Open the book named by the last command line argument and print each
+    # of its spine items as a path relative to the container root
+    ebook = get_container(sys.argv[-1])
    for s in ebook.spine_items:
        print (ebook.relpath(s))

--- a/src/calibre/ebooks/oeb/polish/font_stats.coffee
+++ b/src/calibre/ebooks/oeb/polish/font_stats.coffee
@ -0,0 +1,72 @@
+#!/usr/bin/env coffee
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+###
+ Copyright 2013, Kovid Goyal <kovid at kovidgoyal.net>
+ Released under the GPLv3 License
+###
+
+
+# Pick up the log function from calibre_utils, if it has been loaded
+if window?.calibre_utils
+    log = window.calibre_utils.log
+
+# Collect the font related properties of a style declaration into a plain
+# object. When computed is true, font-family is read via
+# getPropertyCSSValue() and returned as a list of family names; otherwise it
+# is the raw string returned by getPropertyValue().
+font_dict = (style, computed=false) ->
+    if computed
+        fams = []
+        family = style.getPropertyCSSValue('font-family')
+        if family.cssValueType == CSSValue.CSS_PRIMITIVE_VALUE
+            # A single family name
+            fams.push(family.getStringValue())
+        else
+            # A list of family names
+            for f in family
+                fams.push(f.getStringValue())
+    else
+        fams = style.getPropertyValue('font-family')
+    return {
+        'font-family':fams,
+        'font-weight':style.getPropertyValue('font-weight'),
+        'font-style':style.getPropertyValue('font-style'),
+        'font-stretch':style.getPropertyValue('font-stretch'),
+    }
+
+# Return the computed font properties of node (see font_dict) together with
+# the contents of its direct child text nodes, stored as a list under the
+# key 'text'. Text in descendant elements is not included.
+font_usage = (node) ->
+    style = window.getComputedStyle(node, null)
+    ans = font_dict(style, true)
+    text = []
+    for child in node.childNodes
+        if child.nodeType == Node.TEXT_NODE
+            text.push(child.nodeValue)
+    ans['text'] = text
+    return ans
+
+class FontStats
+    # This class is a namespace to expose functions via the
+    # window.font_stats object. Results are handed back to the caller by
+    # assigning them to py_bridge.value.
+
+    constructor: () ->
+        # The parentheses are required: `not this instanceof X` parses as
+        # `(not this) instanceof X`, which is always false, so the check
+        # would never fire.
+        if not (this instanceof arguments.callee)
+            throw new Error('FontStats constructor called as function')
+
+    # Store a list with one font_dict (plus its 'src' value) for every
+    # @font-face rule found in the document's stylesheets.
+    get_font_face_rules: () ->
+        font_faces = []
+        for sheet in document.styleSheets
+            for rule in sheet.cssRules
+                if rule.type == rule.FONT_FACE_RULE
+                    fd = font_dict(rule.style)
+                    fd['src'] = rule.style.getPropertyValue('src')
+                    font_faces.push(fd)
+        py_bridge.value = font_faces
+
+    # Store a list with the font usage (see font_usage) of the body element
+    # and of every element inside it.
+    get_font_usage: () ->
+        ans = []
+        busage = font_usage(document.body)
+        if busage != null
+            ans.push(busage)
+        for node in document.body.getElementsByTagName('*')
+            usage = font_usage(node)
+            if usage != null
+                ans.push(usage)
+        py_bridge.value = ans
+
+# Publish the API as window.font_stats, but only when a window object exists
+if window?
+    window.font_stats = new FontStats()
+
--- a/src/calibre/ebooks/oeb/polish/stats.py
+++ b/src/calibre/ebooks/oeb/polish/stats.py
@ -7,19 +7,107 @@ __license__   = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import json
+import json, sys, os
+from urllib import unquote

+from cssutils import parseStyle
 from PyQt4.Qt import (QWebPage, pyqtProperty, QString, QEventLoop, QWebView,
                      Qt, QSize, QTimer)

+from calibre.constants import iswindows
 from calibre.ebooks.oeb.display.webview import load_html
 from calibre.gui2 import must_use_qt

-class Page(QWebPage):
+def normalize_font_properties(font):
+    '''
+    Normalize, in place, the font-weight, font-style and font-stretch
+    entries of the dictionary font.
+
+    font-weight becomes one of the strings '100' to '900' ('normal' maps to
+    '400', 'bold' to '700', anything unrecognized to '400'). Missing or
+    unrecognized font-style and font-stretch values become 'normal'.
+    '''
+    weight = font.get('font-weight', None)
+    if weight != 0 and not weight:
+        weight = 'normal'
+    weight = unicode(weight)
+    if weight == 'normal':
+        weight = '400'
+    elif weight == 'bold':
+        weight = '700'
+    valid_weights = {'100', '200', '300', '400', '500', '600', '700',
+                     '800', '900'}
+    font['font-weight'] = weight if weight in valid_weights else '400'
+
+    # The remaining properties share the same rule: keep a known keyword,
+    # otherwise fall back to 'normal'
+    keyword_properties = (
+        ('font-style', {'normal', 'italic', 'oblique'}),
+        ('font-stretch', {'normal', 'ultra-condensed', 'extra-condensed',
+                          'condensed', 'semi-condensed', 'semi-expanded',
+                          'expanded', 'extra-expanded', 'ultra-expanded'}),
+    )
+    for prop, allowed in keyword_properties:
+        value = font.get(prop, None)
+        font[prop] = value if value in allowed else 'normal'
+
+# Map every font-stretch keyword to its position in the list below, which
+# runs from 'ultra-condensed' (0) through 'normal' (4) to 'ultra-expanded' (8)
+widths = {x:i for i, x in enumerate(( 'ultra-condensed',
+        'extra-condensed', 'condensed', 'semi-condensed', 'normal',
+        'semi-expanded', 'expanded', 'extra-expanded', 'ultra-expanded'
+        ))}
+
+def get_matching_rules(rules, font):
+    '''
+    Return the @font-face rules from rules that best match the font
+    properties in font. Candidates are narrowed successively by family,
+    stretch, style and weight.
+
+    rules is a list of dicts, each with a 'font-family' set of lower-cased
+    family names, integer 'width' and 'weight' entries and optionally a
+    'font-style'. font is a dict of font properties; it is normalized in
+    place by normalize_font_properties(). Returns a (possibly empty) list of
+    rules.
+    '''
+    normalize_font_properties(font)
+
+    # Filter on family (rules are examined last to first). The set of
+    # requested families does not depend on the rule, so build it only once.
+    ff = frozenset(icu_lower(x) for x in font.get('font-family', None) or ())
+    matches = [rule for rule in reversed(rules)
+               if ff.intersection(rule['font-family'])]
+    if not matches:
+        return []
+
+    # Filter on font stretch: keep the rules nearest to the requested width,
+    # preferring narrower ones for normal/condensed requests and wider ones
+    # for expanded requests
+    width = widths[font.get('font-stretch', 'normal')]
+
+    min_dist = min(abs(width-f['width']) for f in matches)
+    nearest = [f for f in matches if abs(width-f['width']) ==
+        min_dist]
+    if width <= 4:
+        lmatches = [f for f in nearest if f['width'] <= width]
+    else:
+        lmatches = [f for f in nearest if f['width'] >= width]
+    matches = (lmatches or nearest)
+
+    # Filter on font-style
+    fs = font.get('font-style', 'normal')
+    order = {
+            'oblique':['oblique', 'italic', 'normal'],
+            'normal':['normal', 'oblique', 'italic']
+        }.get(fs, ['italic', 'oblique', 'normal'])
+    for q in order:
+        m = [f for f in matches if f.get('font-style', 'normal') == q]
+        if m:
+            matches = m
+            break
+
+    # Filter on font weight. The fallback ranges previously had their stop
+    # and step arguments swapped (xrange(fw+100, 100, 1000)), which made the
+    # "heavier" range empty so heavier weights were never tried.
+    fw = int(font.get('font-weight', '400'))
+    lighter = list(xrange(fw-100, 0, -100))
+    heavier = list(xrange(fw+100, 1000, 100))
+    if fw == 400:
+        q = [400, 500, 300, 200, 100, 600, 700, 800, 900]
+    elif fw == 500:
+        q = [500, 400, 300, 200, 100, 600, 700, 800, 900]
+    elif fw < 400:
+        q = [fw] + lighter + heavier
+    else:
+        q = [fw] + heavier + lighter
+    for wt in q:
+        m = [f for f in matches if f['weight'] == wt]
+        if m:
+            return m
+    return []
+
+class Page(QWebPage): # {{{

    def __init__(self, log):
        self.log = log
        QWebPage.__init__(self)
+        # Cache for the compiled javascript, filled in lazily by load_js()
+        self.js = None
+        # Shortcut for evaluating javascript in the main frame of this page
+        self.evaljs = self.mainFrame().evaluateJavaScript
+        # NOTE(review): presumably updated by the _pass_json_value setter when
+        # javascript assigns py_bridge.value -- confirm against the setter
+        self.bridge_value = None

    def javaScriptConsoleMessage(self, msg, lineno, msgid):
        self.log(u'JS:', unicode(msg))
@ -40,6 +128,23 @@ class Page(QWebPage):
    _pass_json_value = pyqtProperty(QString, fget=_pass_json_value_getter,
            fset=_pass_json_value_setter)

+    def load_js(self):
+        '''
+        Inject the javascript needed for collecting font stats into the main
+        frame and expose this object to it as window.py_bridge.
+
+        The compiled coffeescript is loaded once and cached in self.js; the
+        injection itself is repeated on every call.
+        '''
+        if self.js is None:
+            from calibre.utils.resources import compiled_coffeescript
+            self.js = compiled_coffeescript('ebooks.oeb.display.utils')
+            self.js += compiled_coffeescript('ebooks.oeb.polish.font_stats')
+        self.mainFrame().addToJavaScriptWindowObject("py_bridge", self)
+        self.evaljs(self.js)
+        # Give py_bridge a 'value' property that round-trips arbitrary data as
+        # JSON through the _pass_json_value string property
+        self.evaljs('''
+        py_bridge.__defineGetter__('value', function() {
+            return JSON.parse(this._pass_json_value);
+        });
+        py_bridge.__defineSetter__('value', function(val) {
+            this._pass_json_value = JSON.stringify(val);
+        });
+        ''')
+# }}}
+
 class StatsCollector(object):

    def __init__(self, container):
@ -85,6 +190,7 @@ class StatsCollector(object):
            self.loop.exit(1)
            return
        try:
+            self.page.load_js()
            self.collect_font_stats()
        except:
            self.log.exception('Failed to collect font stats from: %s'%self.container.relpath(self.current_item))
@ -94,6 +200,70 @@ class StatsCollector(object):
        self.render_book()

    def collect_font_stats(self):
-        pass
+        '''
+        Record, in self.font_stats, which characters of the currently loaded
+        page are rendered with each font referenced by a valid @font-face
+        rule. self.font_stats maps the name of the font file to the set of
+        characters that use it.
+
+        Raises an Exception if the javascript side does not return a list.
+        '''
+        self.page.evaljs('window.font_stats.get_font_face_rules()')
+        font_face_rules = self.page.bridge_value
+        if not isinstance(font_face_rules, list):
+            raise Exception('Unknown error occurred while reading font-face rules')

+        # Weed out invalid font-face rules
+        rules = []
+        for rule in font_face_rules:
+            ff = rule.get('font-family', None)
+            if not ff: continue
+            style = parseStyle('font-family:%s'%ff, validate=False)
+            ff = [x.value for x in
+                  style.getProperty('font-family').propertyValue]
+            if not ff or ff[0] == 'inherit':
+                continue
+            rule['font-family'] = frozenset(icu_lower(f) for f in ff)
+            src = rule.get('src', None)
+            if not src: continue
+            # Parse src as if it were a background-image to extract the URI
+            style = parseStyle('background-image:%s'%src, validate=False)
+            try:
+                uri = style.getProperty('background-image').propertyValue[0].uri
+            except (AttributeError, IndexError):
+                # The first value is not a url() (for example local(...)) or
+                # could not be parsed. Skip just this rule rather than
+                # failing the stats collection for the whole file.
+                self.log.warn('Unknown src in @font-face: %r'%src)
+                continue
+            src = uri
+            if not src.startswith('file://'):
+                self.log.warn('Unknown URI in @font-face: %r'%src)
+                continue
+            # Convert the file:// URL into a local filesystem path
+            src = src[len('file://'):]
+            if iswindows and src.startswith('/'):
+                src = src[1:]
+            src = src.replace('/', os.sep)
+            src = unquote(src)
+            name = self.container.abspath_to_name(src)
+            if not self.container.has_name(name):
+                self.log.warn('Font %r referenced in @font-face rule not found'
+                              %name)
+                continue
+            rule['src'] = name
+            normalize_font_properties(rule)
+            rule['width'] = widths[rule['font-stretch']]
+            rule['weight'] = int(rule['font-weight'])
+            rules.append(rule)
+
+        if not rules:
+            return
+
+        # Make sure every referenced font has an entry, even if unused
+        for rule in rules:
+            self.font_stats.setdefault(rule['src'], set())
+
+        self.page.evaljs('window.font_stats.get_font_usage()')
+        font_usage = self.page.bridge_value
+        if not isinstance(font_usage, list):
+            raise Exception('Unknown error occurred while reading font usage')
+        exclude = {'\n', '\r', '\t'}
+        for font in font_usage:
+            text = set()
+            for t in font['text']:
+                text |= frozenset(t)
+            text.difference_update(exclude)
+            if not text: continue
+            for rule in get_matching_rules(rules, font):
+                self.font_stats[rule['src']] |= text
+
+if __name__ == '__main__':
+    # Collect the font stats for the book named by the last command line
+    # argument, with debug logging enabled, and print them
+    from calibre.ebooks.oeb.polish.container import get_container
+    from calibre.utils.logging import default_log
+    default_log.filter_level = default_log.DEBUG
+    ebook = get_container(sys.argv[-1], default_log)
+    print (StatsCollector(ebook).font_stats)