ebook-polish: Collect font usage stats

This commit is contained in:
Kovid Goyal 2013-02-03 16:45:07 +05:30
parent c0c6a177aa
commit bf96abd838
5 changed files with 269 additions and 11 deletions

Binary file not shown.

View File

@ -26,7 +26,7 @@ def get_opts_from_parser(parser):
class Coffee(Command): # {{{
description = 'Compile coffeescript files into javascript'
COFFEE_DIRS = ('ebooks/oeb/display',)
COFFEE_DIRS = ('ebooks/oeb/display', 'ebooks/oeb/polish')
def add_options(self, parser):
parser.add_option('--watch', '-w', action='store_true', default=False,

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, posixpath, logging, sys, hashlib, uuid
import os, logging, sys, hashlib, uuid
from urllib import unquote as urlunquote
from lxml import etree
@ -56,7 +56,7 @@ class Container(object):
# Special case if we have stumbled onto the opf
if path == opfpath:
self.opf_name = name
self.opf_dir = posixpath.dirname(path)
self.opf_dir = os.path.dirname(path)
self.mime_map[name] = guess_type('a.opf')[0]
# Update mime map with data from the OPF
@ -66,13 +66,25 @@ class Container(object):
href = item.get('href')
self.mime_map[self.href_to_name(href)] = item.get('media-type')
def abspath_to_name(self, fullpath):
    '''
    Convert a filesystem path into a name, i.e. the result of
    self.relpath() on the absolute form of fullpath, with OS path
    separators replaced by '/'.
    '''
    relative = self.relpath(os.path.abspath(fullpath))
    return '/'.join(relative.split(os.sep))
def href_to_name(self, href, base=None):
'''
Convert an href (relative to base) to a name (i.e. a path
relative to self.root with POSIX separators).
base must be an absolute path with OS separators or None, in which case
the href is interpreted relative to the dir containing the OPF.
'''
if base is None:
base = self.opf_dir
href = urlunquote(href.partition('#')[0])
fullpath = posixpath.abspath(posixpath.join(base, href))
return self.relpath(fullpath)
fullpath = os.path.join(base, *href.split('/'))
return self.abspath_to_name(fullpath)
def has_name(self, name):
    ''' Return True if name is a key of self.name_path_map. '''
    known_names = self.name_path_map
    return name in known_names
def relpath(self, path):
    '''
    Delegate to the module-level relpath() helper, using self.root as the
    base for path.
    '''
    base = self.root
    return relpath(path, base)
@ -345,10 +357,14 @@ class AZW3Container(Container):
super(AZW3Container, self).__init__(tdir, opf_path, log)
self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts}
def get_container(path, log=None):
    '''
    Create the container object for the book at path.

    Files whose extension (case-insensitive) is azw3 or mobi are opened with
    AZW3Container, everything else with EpubContainer. If log is None,
    default_log is used.
    '''
    if log is None:
        log = default_log
    extension = path.rpartition('.')[-1].lower()
    if extension in {'azw3', 'mobi'}:
        container_class = AZW3Container
    else:
        container_class = EpubContainer
    return container_class(path, log)
if __name__ == '__main__':
f = sys.argv[-1]
ebook = (AZW3Container if f.rpartition('.')[-1].lower() in {'azw3', 'mobi'}
else EpubContainer)(f, default_log)
ebook = get_container(sys.argv[-1])
for s in ebook.spine_items:
print (ebook.relpath(s))

View File

@ -0,0 +1,72 @@
#!/usr/bin/env coffee
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
###
Copyright 2013, Kovid Goyal <kovid at kovidgoyal.net>
Released under the GPLv3 License
###
# Alias window.calibre_utils.log as log when calibre_utils is present
log = window.calibre_utils.log if window?.calibre_utils
font_dict = (style, computed=false) ->
    # Build a plain object describing the font properties of a style
    # declaration. When computed is true, font-family is read through
    # getPropertyCSSValue() and returned as an array of strings; otherwise
    # it is the raw string from getPropertyValue().
    if not computed
        families = style.getPropertyValue('font-family')
    else
        value = style.getPropertyCSSValue('font-family')
        families = if value.cssValueType == CSSValue.CSS_PRIMITIVE_VALUE
            [value.getStringValue()]
        else
            (entry.getStringValue() for entry in value)

    return {
        'font-family':families,
        'font-weight':style.getPropertyValue('font-weight'),
        'font-style':style.getPropertyValue('font-style'),
        'font-stretch':style.getPropertyValue('font-stretch'),
    }
font_usage = (node) ->
    # Return the computed font properties of node together with the contents
    # of its direct child text nodes (under the 'text' key).
    computed = window.getComputedStyle(node, null)
    usage = font_dict(computed, true)
    usage['text'] = (child.nodeValue for child in node.childNodes when child.nodeType == Node.TEXT_NODE)
    return usage
class FontStats
    # This class is a namespace to expose functions via the
    # window.font_stats object. Each method hands its result to the
    # py_bridge object by assigning it to py_bridge.value.

    constructor: () ->
        # `not a instanceof b` parses as `(not a) instanceof b`, which is
        # always false, so the whole instanceof test has to be negated for
        # this guard to ever fire.
        unless this instanceof arguments.callee
            throw new Error('FontStats constructor called as function')

    get_font_face_rules: () ->
        # Collect the font properties and src of every @font-face rule in
        # the document's style sheets.
        font_faces = []
        for sheet in document.styleSheets
            for rule in sheet.cssRules
                if rule.type == rule.FONT_FACE_RULE
                    fd = font_dict(rule.style)
                    fd['src'] = rule.style.getPropertyValue('src')
                    font_faces.push(fd)
        py_bridge.value = font_faces

    get_font_usage: () ->
        # Collect font usage for the body element and all of its descendant
        # elements.
        ans = []
        busage = font_usage(document.body)
        if busage != null
            ans.push(busage)
        for node in document.body.getElementsByTagName('*')
            usage = font_usage(node)
            if usage != null
                ans.push(usage)
        py_bridge.value = ans
# Expose a FontStats instance on the window object, when there is one
window.font_stats = new FontStats() if window?

View File

@ -7,19 +7,107 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import json
import json, sys, os
from urllib import unquote
from cssutils import parseStyle
from PyQt4.Qt import (QWebPage, pyqtProperty, QString, QEventLoop, QWebView,
Qt, QSize, QTimer)
from calibre.constants import iswindows
from calibre.ebooks.oeb.display.webview import load_html
from calibre.gui2 import must_use_qt
class Page(QWebPage):
def normalize_font_properties(font):
    '''
    Normalize, in place, the font-weight, font-style and font-stretch entries
    of the dictionary font.

    font-weight becomes one of the strings '100' ... '900' ('normal' maps to
    '400', 'bold' to '700', anything unrecognized to '400'). font-style and
    font-stretch are reset to 'normal' when they do not hold one of the
    recognized keywords.
    '''
    weight = font.get('font-weight', None)
    if weight != 0 and not weight:
        weight = 'normal'
    # Text conversion, equivalent to unicode(weight)
    weight = u'%s' % (weight,)
    weight = {'normal':'400', 'bold':'700'}.get(weight, weight)
    numeric_weights = {'100', '200', '300', '400', '500', '600', '700',
                       '800', '900'}
    font['font-weight'] = weight if weight in numeric_weights else '400'

    def normalize_keyword(prop, allowed):
        # Keep the current value only if it is one of the allowed keywords
        current = font.get(prop, None)
        font[prop] = current if current in allowed else 'normal'

    normalize_keyword('font-style', {'normal', 'italic', 'oblique'})
    normalize_keyword('font-stretch', {
        'normal', 'ultra-condensed', 'extra-condensed', 'condensed',
        'semi-condensed', 'semi-expanded', 'expanded', 'extra-expanded',
        'ultra-expanded'})
# Maps each font-stretch keyword to its position on the scale from narrowest
# (0) to widest (8); 'normal' sits in the middle at 4.
widths = {
    'ultra-condensed': 0,
    'extra-condensed': 1,
    'condensed': 2,
    'semi-condensed': 3,
    'normal': 4,
    'semi-expanded': 5,
    'expanded': 6,
    'extra-expanded': 7,
    'ultra-expanded': 8,
}
def _weight_search_order(fw):
    '''
    Return the font weights to try, most preferred first, when a font of
    numeric weight fw (a multiple of 100 between 100 and 900) is requested.
    '''
    if fw == 400:
        return [400, 500, 300, 200, 100, 600, 700, 800, 900]
    if fw == 500:
        return [500, 400, 300, 200, 100, 600, 700, 800, 900]
    lighter = list(range(fw - 100, 0, -100))
    heavier = list(range(fw + 100, 1000, 100))
    if fw < 400:
        # Light request: prefer lighter faces, then fall back to heavier ones
        return [fw] + lighter + heavier
    # Heavy request: prefer heavier faces, then fall back to lighter ones
    return [fw] + heavier + lighter


def get_matching_rules(rules, font):
    '''
    Return the entries of rules (processed @font-face rules) that best match
    the font description font, or an empty list if none match.

    font is normalized in place via normalize_font_properties(). Each rule is
    a dict that must provide 'font-family' (a set of lowercased family
    names), 'width' (an index into the font-stretch scale) and 'weight' (an
    int); 'font-style' defaults to 'normal' when absent.

    Candidates are narrowed by family, then font-stretch, then font-style and
    finally font-weight.
    '''
    normalize_font_properties(font)

    # Filter on family. The requested families do not change between rules,
    # so compute them once. Later rules are considered first.
    families = frozenset(icu_lower(x) for x in font.get('font-family', []))
    matches = [rule for rule in reversed(rules)
               if families.intersection(rule['font-family'])]
    if not matches:
        return []

    # Filter on font stretch: keep the faces whose width is nearest to the
    # requested one, preferring narrower faces for normal/condensed requests
    # and wider faces for expanded requests.
    width = widths[font.get('font-stretch', 'normal')]
    min_dist = min(abs(width - f['width']) for f in matches)
    nearest = [f for f in matches if abs(width - f['width']) == min_dist]
    if width <= 4:
        lmatches = [f for f in nearest if f['width'] <= width]
    else:
        lmatches = [f for f in nearest if f['width'] >= width]
    matches = (lmatches or nearest)

    # Filter on font-style: take the first style in the preference order for
    # which at least one face exists.
    fs = font.get('font-style', 'normal')
    order = {
        'oblique': ['oblique', 'italic', 'normal'],
        'normal': ['normal', 'oblique', 'italic'],
    }.get(fs, ['italic', 'oblique', 'normal'])
    for q in order:
        m = [f for f in matches if f.get('font-style', 'normal') == q]
        if m:
            matches = m
            break

    # Filter on font weight: return the faces with the first available
    # weight in the preference order.
    fw = int(font.get('font-weight', '400'))
    for wt in _weight_search_order(fw):
        m = [f for f in matches if f['weight'] == wt]
        if m:
            return m
    return []
class Page(QWebPage): # {{{
def __init__(self, log):
    '''
    Set up the page.

    log is stored on the instance. js holds the script source loaded by
    load_js() (None until then), evaljs is a shortcut for evaluating
    JavaScript in the main frame and bridge_value starts out as None.
    '''
    self.log = log
    QWebPage.__init__(self)
    self.js = self.bridge_value = None
    self.evaljs = self.mainFrame().evaluateJavaScript
def javaScriptConsoleMessage(self, msg, lineno, msgid):
self.log(u'JS:', unicode(msg))
@ -40,6 +128,23 @@ class Page(QWebPage):
_pass_json_value = pyqtProperty(QString, fget=_pass_json_value_getter,
fset=_pass_json_value_setter)
def load_js(self):
    '''
    Inject the support JavaScript into the main frame.

    The compiled CoffeeScript is loaded once and cached in self.js. On every
    call this object is exposed to JavaScript as py_bridge, the cached script
    is evaluated and a `value` property is defined on py_bridge that
    transfers data as JSON through the _pass_json_value property.
    '''
    if self.js is None:
        from calibre.utils.resources import compiled_coffeescript
        # Build the complete script before caching it, so that a failure
        # while loading the second module does not leave a partial script
        # cached for all later calls.
        js = compiled_coffeescript('ebooks.oeb.display.utils')
        js += compiled_coffeescript('ebooks.oeb.polish.font_stats')
        self.js = js
    self.mainFrame().addToJavaScriptWindowObject("py_bridge", self)
    self.evaljs(self.js)
    self.evaljs('''
py_bridge.__defineGetter__('value', function() {
return JSON.parse(this._pass_json_value);
});
py_bridge.__defineSetter__('value', function(val) {
this._pass_json_value = JSON.stringify(val);
});
''')
# }}}
class StatsCollector(object):
def __init__(self, container):
@ -85,6 +190,7 @@ class StatsCollector(object):
self.loop.exit(1)
return
try:
self.page.load_js()
self.collect_font_stats()
except:
self.log.exception('Failed to collect font stats from: %s'%self.container.relpath(self.current_item))
@ -94,6 +200,70 @@ class StatsCollector(object):
self.render_book()
def collect_font_stats(self):
    '''
    Record which characters the currently loaded page renders with each
    embedded font.

    The @font-face rules of the page are read via JavaScript, validated and
    resolved to container names. Then, for every element's direct text, the
    matching rules are found and the characters are added to
    self.font_stats, which maps a font's name to the set of characters
    rendered with it.

    Raises Exception if the JavaScript side fails to deliver a list.
    '''
    page = self.page

    # Clear any result left over from an earlier evaluation. Otherwise a
    # script that fails before assigning py_bridge.value would leave a stale
    # list behind that passes the type check below.
    page.bridge_value = None
    page.evaljs('window.font_stats.get_font_face_rules()')
    font_face_rules = page.bridge_value
    if not isinstance(font_face_rules, list):
        raise Exception('Unknown error occurred while reading font-face rules')

    # Weed out invalid font-face rules
    rules = []
    for rule in font_face_rules:
        ff = rule.get('font-family', None)
        if not ff:
            continue
        style = parseStyle('font-family:%s'%ff, validate=False)
        ff = [x.value for x in
              style.getProperty('font-family').propertyValue]
        if not ff or ff[0] == 'inherit':
            continue
        rule['font-family'] = frozenset(icu_lower(f) for f in ff)

        src = rule.get('src', None)
        if not src:
            continue
        # Parse src as if it were a background-image, to extract the URI
        style = parseStyle('background-image:%s'%src, validate=False)
        src = style.getProperty('background-image').propertyValue[0].uri
        if not src.startswith('file://'):
            self.log.warn('Unknown URI in @font-face: %r'%src)
            continue
        # Turn the file:// URI into a local path
        src = src[len('file://'):]
        if iswindows and src.startswith('/'):
            src = src[1:]
        src = src.replace('/', os.sep)
        src = unquote(src)
        name = self.container.abspath_to_name(src)
        if not self.container.has_name(name):
            self.log.warn('Font %r referenced in @font-face rule not found'
                    %name)
            continue
        rule['src'] = name

        normalize_font_properties(rule)
        rule['width'] = widths[rule['font-stretch']]
        rule['weight'] = int(rule['font-weight'])
        rules.append(rule)

    if not rules:
        return

    # Make sure every usable font has an entry, even if no text uses it
    for rule in rules:
        self.font_stats.setdefault(rule['src'], set())

    # Same reasoning as above: do not mistake the font-face list for the
    # usage list if the second script fails.
    page.bridge_value = None
    page.evaljs('window.font_stats.get_font_usage()')
    font_usage = page.bridge_value
    if not isinstance(font_usage, list):
        raise Exception('Unknown error occurred while reading font usage')

    exclude = {'\n', '\r', '\t'}
    for font in font_usage:
        text = set()
        for t in font['text']:
            text.update(t)
        text.difference_update(exclude)
        if not text:
            continue
        for rule in get_matching_rules(rules, font):
            self.font_stats[rule['src']] |= text
if __name__ == '__main__':
    # Command line driver: print the font stats collected for the book given
    # as the last command line argument, with debug logging enabled.
    from calibre.ebooks.oeb.polish.container import get_container
    from calibre.utils.logging import default_log
    default_log.filter_level = default_log.DEBUG
    book_path = sys.argv[-1]
    ebook = get_container(book_path, default_log)
    collector = StatsCollector(ebook)
    print (collector.font_stats)