From bf96abd8384a845b45c20c91b42eb020eeae0e40 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 3 Feb 2013 16:45:07 +0530 Subject: [PATCH] ebook-polish: Collect font usage stats --- resources/compiled_coffeescript.zip | Bin 64128 -> 67342 bytes setup/resources.py | 2 +- src/calibre/ebooks/oeb/polish/container.py | 30 ++- .../ebooks/oeb/polish/font_stats.coffee | 72 +++++++ src/calibre/ebooks/oeb/polish/stats.py | 176 +++++++++++++++++- 5 files changed, 269 insertions(+), 11 deletions(-) create mode 100644 src/calibre/ebooks/oeb/polish/font_stats.coffee diff --git a/resources/compiled_coffeescript.zip b/resources/compiled_coffeescript.zip index 8948c66f6928d134184026510400bfc14ed2274c..7edbd43dc949175cf277bdbc8334c8b2f00910f5 100644 GIT binary patch delta 2023 zcmah~PfQ$D7+;nx&<&-OKg%vyc$&BibeW3S8X>5NtRW)Z!ivTX8JC^6yHkdlab}jf zAq2I-G(9w$uX^aA5fi=HXnN~GJoezh_UKJ-?L|B^1upo$_htr{wQ-Y~ee=EF_wV<9 z@4ZhR*I&OL8cn}lAL7{>G#~o_&Cwa zn&!C^KOpAHX*L}TZ(!W@8rMS?mds}G?4pQI-0aN6i>!Md|@~ZKNCS9qVWisoy{XGoEtZ_b=n-Z&7`6XS<-!K4o%w3i_#SZ; zpc+qBXukAR-$8dhZAah+c+u;)tn9_!T*6%5Agx8aQKdPZDKlSEH69ap(<2CzGE?toidf#FNDOo@r4=H5%egR6Y_+un=8+h6nHt!oX?w?7#C Z`3e90aKEjkDbN&nFBk}Hh3eQ1{txwFpg#Zr delta 156 zcmeC{V`=!xy!nH+Z184pp7rvZdFvOnPM^xkC^!8y3nTwzoqCSxr&t+9rvGALlw%SO zo-AK4I$fWY(Uj?J28cb`M~QEGCo3b*^m{-(Ooxhqg7VX4*%-B$_SJwD7<6(>PXqCD nyTJVW)7Yl31@UWNgZb~jbBP6bv$8P&0Xq + Released under the GPLv3 License +### + + +if window?.calibre_utils + log = window.calibre_utils.log + +font_dict = (style, computed=false) -> + if computed + fams = [] + family = style.getPropertyCSSValue('font-family') + if family.cssValueType == CSSValue.CSS_PRIMITIVE_VALUE + fams.push(family.getStringValue()) + else + for f in family + fams.push(f.getStringValue()) + else + fams = style.getPropertyValue('font-family') + return { + 'font-family':fams, + 'font-weight':style.getPropertyValue('font-weight'), + 'font-style':style.getPropertyValue('font-style'), + 'font-stretch':style.getPropertyValue('font-stretch'), + } + +font_usage = (node) -> + style = window.getComputedStyle(node, null) + ans = font_dict(style, true) + text = [] + for child in node.childNodes + if child.nodeType == Node.TEXT_NODE + text.push(child.nodeValue) + ans['text'] = text + return ans + +class FontStats + # This class is a namespace to expose functions via the + # window.font_stats object. + + constructor: () -> + if not this instanceof arguments.callee + throw new Error('FontStats constructor called as function') + + get_font_face_rules: () -> + font_faces = [] + for sheet in document.styleSheets + for rule in sheet.cssRules + if rule.type == rule.FONT_FACE_RULE + fd = font_dict(rule.style) + fd['src'] = rule.style.getPropertyValue('src') + font_faces.push(fd) + py_bridge.value = font_faces + + get_font_usage: () -> + ans = [] + busage = font_usage(document.body) + if busage != null + ans.push(busage) + for node in document.body.getElementsByTagName('*') + usage = font_usage(node) + if usage != null + ans.push(usage) + py_bridge.value = ans + +if window? + window.font_stats = new FontStats() + diff --git a/src/calibre/ebooks/oeb/polish/stats.py b/src/calibre/ebooks/oeb/polish/stats.py index 5203a97b62..6cba5d3c87 100644 --- a/src/calibre/ebooks/oeb/polish/stats.py +++ b/src/calibre/ebooks/oeb/polish/stats.py @@ -7,19 +7,107 @@ __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import json +import json, sys, os +from urllib import unquote +from cssutils import parseStyle from PyQt4.Qt import (QWebPage, pyqtProperty, QString, QEventLoop, QWebView, Qt, QSize, QTimer) +from calibre.constants import iswindows from calibre.ebooks.oeb.display.webview import load_html from calibre.gui2 import must_use_qt -class Page(QWebPage): +def normalize_font_properties(font): + w = font.get('font-weight', None) + if not w and w != 0: + w = 'normal' + w = unicode(w) + w = {'normal':'400', 'bold':'700'}.get(w, w) + if w not in {'100', '200', '300', '400', '500', '600', '700', + '800', '900'}: + w = '400' + font['font-weight'] = w + + val = font.get('font-style', None) + if val not in {'normal', 'italic', 'oblique'}: + val = 'normal' + font['font-style'] = val + + val = font.get('font-stretch', None) + if val not in {'normal', 'ultra-condensed', 'extra-condensed', 'condensed', + 'semi-condensed', 'semi-expanded', 'expanded', + 'extra-expanded', 'ultra-expanded'}: + val = 'normal' + font['font-stretch'] = val + +widths = {x:i for i, x in enumerate(( 'ultra-condensed', + 'extra-condensed', 'condensed', 'semi-condensed', 'normal', + 'semi-expanded', 'expanded', 'extra-expanded', 'ultra-expanded' + ))} + +def get_matching_rules(rules, font): + normalize_font_properties(font) + matches = [] + + # Filter on family + for rule in reversed(rules): + ff = frozenset(icu_lower(x) for x in font.get('font-family', [])) + if ff.intersection(rule['font-family']): + matches.append(rule) + if not matches: + return [] + + # Filter on font stretch + width = widths[font.get('font-stretch', 'normal')] + + min_dist = min(abs(width-f['width']) for f in matches) + nearest = [f for f in matches if abs(width-f['width']) == + min_dist] + if width <= 4: + lmatches = [f for f in nearest if f['width'] <= width] + else: + lmatches = [f for f in nearest if f['width'] >= width] + matches = (lmatches or nearest) + + # Filter on font-style + fs = font.get('font-style', 'normal') + order = { + 'oblique':['oblique', 'italic', 'normal'], + 'normal':['normal', 'oblique', 'italic'] + }.get(fs, ['italic', 'oblique', 'normal']) + for q in order: + m = [f for f in matches if f.get('font-style', 'normal') == q] + if m: + matches = m + break + + # Filter on font weight + fw = int(font.get('font-weight', '400')) + if fw == 400: + q = [400, 500, 300, 200, 100, 600, 700, 800, 900] + elif fw == 500: + q = [500, 400, 300, 200, 100, 600, 700, 800, 900] + elif fw < 400: + q = [fw] + list(xrange(fw-100, -100, -100)) + list(xrange(fw+100, + 100, 1000)) + else: + q = [fw] + list(xrange(fw+100, 100, 1000)) + list(xrange(fw-100, + -100, -100)) + for wt in q: + m = [f for f in matches if f['weight'] == wt] + if m: + return m + return [] + +class Page(QWebPage): # {{{ def __init__(self, log): self.log = log QWebPage.__init__(self) + self.js = None + self.evaljs = self.mainFrame().evaluateJavaScript + self.bridge_value = None def javaScriptConsoleMessage(self, msg, lineno, msgid): self.log(u'JS:', unicode(msg)) @@ -40,6 +128,23 @@ class Page(QWebPage): _pass_json_value = pyqtProperty(QString, fget=_pass_json_value_getter, fset=_pass_json_value_setter) + def load_js(self): + if self.js is None: + from calibre.utils.resources import compiled_coffeescript + self.js = compiled_coffeescript('ebooks.oeb.display.utils') + self.js += compiled_coffeescript('ebooks.oeb.polish.font_stats') + self.mainFrame().addToJavaScriptWindowObject("py_bridge", self) + self.evaljs(self.js) + self.evaljs(''' + py_bridge.__defineGetter__('value', function() { + return JSON.parse(this._pass_json_value); + }); + py_bridge.__defineSetter__('value', function(val) { + this._pass_json_value = JSON.stringify(val); + }); + ''') +# }}} + class StatsCollector(object): def __init__(self, container): @@ -85,6 +190,7 @@ class StatsCollector(object): self.loop.exit(1) return try: + self.page.load_js() self.collect_font_stats() except: self.log.exception('Failed to collect font stats from: %s'%self.container.relpath(self.current_item)) @@ -94,6 +200,70 @@ class StatsCollector(object): self.render_book() def collect_font_stats(self): - pass + self.page.evaljs('window.font_stats.get_font_face_rules()') + font_face_rules = self.page.bridge_value + if not isinstance(font_face_rules, list): + raise Exception('Unknown error occurred while reading font-face rules') + # Weed out invalid font-face rules + rules = [] + for rule in font_face_rules: + ff = rule.get('font-family', None) + if not ff: continue + style = parseStyle('font-family:%s'%ff, validate=False) + ff = [x.value for x in + style.getProperty('font-family').propertyValue] + if not ff or ff[0] == 'inherit': + continue + rule['font-family'] = frozenset(icu_lower(f) for f in ff) + src = rule.get('src', None) + if not src: continue + style = parseStyle('background-image:%s'%src, validate=False) + src = style.getProperty('background-image').propertyValue[0].uri + if not src.startswith('file://'): + self.log.warn('Unknown URI in @font-face: %r'%src) + continue + src = src[len('file://'):] + if iswindows and src.startswith('/'): + src = src[1:] + src = src.replace('/', os.sep) + src = unquote(src) + name = self.container.abspath_to_name(src) + if not self.container.has_name(name): + self.log.warn('Font %r referenced in @font-face rule not found' + %name) + continue + rule['src'] = name + normalize_font_properties(rule) + rule['width'] = widths[rule['font-stretch']] + rule['weight'] = int(rule['font-weight']) + rules.append(rule) + + if not rules: + return + + for rule in rules: + if rule['src'] not in self.font_stats: + self.font_stats[rule['src']] = set() + + self.page.evaljs('window.font_stats.get_font_usage()') + font_usage = self.page.bridge_value + if not isinstance(font_usage, list): + raise Exception('Unknown error occurred while reading font usage') + exclude = {'\n', '\r', '\t'} + for font in font_usage: + text = set() + for t in font['text']: + text |= frozenset(t) + text.difference_update(exclude) + if not text: continue + for rule in get_matching_rules(rules, font): + self.font_stats[rule['src']] |= text + +if __name__ == '__main__': + from calibre.ebooks.oeb.polish.container import get_container + from calibre.utils.logging import default_log + default_log.filter_level = default_log.DEBUG + ebook = get_container(sys.argv[-1], default_log) + print (StatsCollector(ebook).font_stats)