mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
ebook-polish: Collect font usage stats
This commit is contained in:
parent
c0c6a177aa
commit
bf96abd838
Binary file not shown.
@ -26,7 +26,7 @@ def get_opts_from_parser(parser):
|
||||
class Coffee(Command): # {{{
|
||||
|
||||
description = 'Compile coffeescript files into javascript'
|
||||
COFFEE_DIRS = ('ebooks/oeb/display',)
|
||||
COFFEE_DIRS = ('ebooks/oeb/display', 'ebooks/oeb/polish')
|
||||
|
||||
def add_options(self, parser):
|
||||
parser.add_option('--watch', '-w', action='store_true', default=False,
|
||||
|
@ -7,7 +7,7 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, posixpath, logging, sys, hashlib, uuid
|
||||
import os, logging, sys, hashlib, uuid
|
||||
from urllib import unquote as urlunquote
|
||||
|
||||
from lxml import etree
|
||||
@ -56,7 +56,7 @@ class Container(object):
|
||||
# Special case if we have stumbled onto the opf
|
||||
if path == opfpath:
|
||||
self.opf_name = name
|
||||
self.opf_dir = posixpath.dirname(path)
|
||||
self.opf_dir = os.path.dirname(path)
|
||||
self.mime_map[name] = guess_type('a.opf')[0]
|
||||
|
||||
# Update mime map with data from the OPF
|
||||
@ -66,13 +66,25 @@ class Container(object):
|
||||
href = item.get('href')
|
||||
self.mime_map[self.href_to_name(href)] = item.get('media-type')
|
||||
|
||||
def abspath_to_name(self, fullpath):
    '''
    Convert a filesystem path to a name: the path is made absolute, passed
    through self.relpath() and its OS separators are replaced with '/'.
    '''
    absolute = os.path.abspath(fullpath)
    relative = self.relpath(absolute)
    return relative.replace(os.sep, '/')
|
||||
|
||||
def href_to_name(self, href, base=None):
    '''
    Convert an href (relative to base) to a name (i.e. a path
    relative to self.root with POSIX separators).

    base must be an absolute path with OS separators or None, in which case
    the href is interpreted relative to the dir containing the OPF.

    Any fragment (the part after '#') is discarded and percent-escapes are
    decoded before the path is resolved.
    '''
    if base is None:
        base = self.opf_dir
    href = urlunquote(href.partition('#')[0])
    # Join component by component so that the '/' separators of the href
    # become OS separators; abspath_to_name() then normalizes the result
    # and converts it back to a POSIX style name.
    fullpath = os.path.join(base, *href.split('/'))
    return self.abspath_to_name(fullpath)
|
||||
|
||||
def has_name(self, name):
    ''' Return True if name is a key of self.name_path_map. '''
    known_names = self.name_path_map
    return name in known_names
|
||||
|
||||
def relpath(self, path):
|
||||
return relpath(path, self.root)
|
||||
@ -345,10 +357,14 @@ class AZW3Container(Container):
|
||||
super(AZW3Container, self).__init__(tdir, opf_path, log)
|
||||
self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts}
|
||||
|
||||
def get_container(path, log=None):
    '''
    Create a container object for the book at path.

    Files whose extension (compared case-insensitively) is azw3 or mobi are
    opened with AZW3Container, everything else with EpubContainer. When log
    is None, default_log is used.
    '''
    if log is None:
        log = default_log
    extension = path.rpartition('.')[-1].lower()
    if extension in {'azw3', 'mobi'}:
        container_class = AZW3Container
    else:
        container_class = EpubContainer
    return container_class(path, log)
|
||||
|
||||
if __name__ == '__main__':
    # Debugging aid: open the book named by the last command line argument
    # and print the container-relative path of every spine item.
    # The container is created only once, via get_container(); the older
    # hand-rolled AZW3Container/EpubContainer selection duplicated that
    # logic and its result was immediately overwritten.
    ebook = get_container(sys.argv[-1])
    for s in ebook.spine_items:
        print (ebook.relpath(s))
|
||||
|
||||
|
72
src/calibre/ebooks/oeb/polish/font_stats.coffee
Normal file
72
src/calibre/ebooks/oeb/polish/font_stats.coffee
Normal file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env coffee
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
###
|
||||
Copyright 2013, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
Released under the GPLv3 License
|
||||
###
|
||||
|
||||
|
||||
# Bind log to window.calibre_utils.log when window.calibre_utils is available
log = window.calibre_utils.log if window?.calibre_utils
|
||||
|
||||
font_dict = (style, computed=false) ->
    # Collect the font-family, font-weight, font-style and font-stretch
    # properties of a style declaration into a plain object.
    #
    # When computed is true the font-family is read via
    # getPropertyCSSValue() and returned as an array of strings, otherwise
    # the raw getPropertyValue() result is returned unchanged.
    unless computed
        families = style.getPropertyValue('font-family')
    else
        value = style.getPropertyCSSValue('font-family')
        if value.cssValueType == CSSValue.CSS_PRIMITIVE_VALUE
            families = [value.getStringValue()]
        else
            families = (entry.getStringValue() for entry in value)
    result = {'font-family': families}
    for prop in ['font-weight', 'font-style', 'font-stretch']
        result[prop] = style.getPropertyValue(prop)
    return result
|
||||
|
||||
font_usage = (node) ->
    # Return the computed font properties of node (see font_dict) together
    # with a 'text' entry: the values of the text nodes that are direct
    # children of node.
    usage = font_dict(window.getComputedStyle(node, null), true)
    usage['text'] = (kid.nodeValue for kid in node.childNodes when kid.nodeType == Node.TEXT_NODE)
    return usage
|
||||
|
||||
class FontStats
    # This class is a namespace to expose functions via the
    # window.font_stats object. Each method hands its result back by
    # assigning it to py_bridge.value.

    constructor: () ->
        # The parentheses matter: `not this instanceof X` is parsed as
        # `(not this) instanceof X`, which is always false, so the guard
        # could never fire.
        if not (this instanceof arguments.callee)
            throw new Error('FontStats constructor called as function')

    get_font_face_rules: () ->
        # Collect one font_dict (plus its 'src' property) for every
        # @font-face rule of every stylesheet in the document.
        font_faces = []
        for sheet in document.styleSheets
            # NOTE(review): cssRules can be null for sheets the engine does
            # not expose; skip those instead of throwing a TypeError that
            # would lose the rules of all other sheets.
            for rule in (sheet.cssRules or [])
                if rule.type == rule.FONT_FACE_RULE
                    fd = font_dict(rule.style)
                    fd['src'] = rule.style.getPropertyValue('src')
                    font_faces.push(fd)
        py_bridge.value = font_faces

    get_font_usage: () ->
        # Collect the font usage of document.body and of every element
        # inside it.
        ans = []
        busage = font_usage(document.body)
        if busage != null
            ans.push(busage)
        for node in document.body.getElementsByTagName('*')
            usage = font_usage(node)
            if usage != null
                ans.push(usage)
        py_bridge.value = ans
|
||||
|
||||
# Publish a FontStats instance as window.font_stats when a window exists
window.font_stats = new FontStats() if window?
|
||||
|
@ -7,19 +7,107 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import json
|
||||
import json, sys, os
|
||||
from urllib import unquote
|
||||
|
||||
from cssutils import parseStyle
|
||||
from PyQt4.Qt import (QWebPage, pyqtProperty, QString, QEventLoop, QWebView,
|
||||
Qt, QSize, QTimer)
|
||||
|
||||
from calibre.constants import iswindows
|
||||
from calibre.ebooks.oeb.display.webview import load_html
|
||||
from calibre.gui2 import must_use_qt
|
||||
|
||||
class Page(QWebPage):
|
||||
def normalize_font_properties(font):
    '''
    Normalize, in place, the 'font-weight', 'font-style' and 'font-stretch'
    entries of the dict font.

    The weight becomes one of the strings '100' ... '900' ('normal' maps to
    '400', 'bold' to '700', anything unrecognized to '400'). Style and
    stretch are replaced by 'normal' unless they are one of the recognized
    keywords.
    '''
    weight = font.get('font-weight', None)
    if not (weight or weight == 0):
        weight = 'normal'
    weight = unicode(weight)
    if weight == 'normal':
        weight = '400'
    elif weight == 'bold':
        weight = '700'
    valid_weights = {'100', '200', '300', '400', '500', '600', '700',
                     '800', '900'}
    font['font-weight'] = weight if weight in valid_weights else '400'

    keyword_properties = (
        ('font-style', {'normal', 'italic', 'oblique'}),
        ('font-stretch', {'normal', 'ultra-condensed', 'extra-condensed',
                          'condensed', 'semi-condensed', 'semi-expanded',
                          'expanded', 'extra-expanded', 'ultra-expanded'}),
    )
    for prop, allowed in keyword_properties:
        value = font.get(prop, None)
        font[prop] = value if value in allowed else 'normal'
|
||||
|
||||
# Maps each font-stretch keyword to its position in the sequence below
# (0 for 'ultra-condensed' up to 8 for 'ultra-expanded')
widths = dict(
    (keyword, rank) for rank, keyword in enumerate((
        'ultra-condensed', 'extra-condensed', 'condensed', 'semi-condensed',
        'normal',
        'semi-expanded', 'expanded', 'extra-expanded', 'ultra-expanded',
    ))
)
|
||||
|
||||
def get_matching_rules(rules, font):
    '''
    Return the @font-face rules from rules that best match font.

    font is a dict of font properties. It is normalized in place by
    normalize_font_properties() and its 'font-family' entry is expected to
    be an iterable of family names. Every rule must have a 'font-family'
    set of lower-cased family names plus numeric 'width' and 'weight'
    entries; a rule's 'font-style' defaults to 'normal' when absent.

    The candidates are narrowed step by step: by family, then font-stretch,
    then font-style and finally font-weight. Returns a list of rules, which
    is empty when nothing matches.
    '''
    normalize_font_properties(font)

    # Filter on family (the family set does not depend on the rule, so it
    # is built only once)
    ff = frozenset(icu_lower(x) for x in font.get('font-family', []))
    matches = [rule for rule in reversed(rules)
               if ff.intersection(rule['font-family'])]
    if not matches:
        return []

    # Filter on font stretch: keep the rules nearest in width, preferring
    # narrower ones for normal/condensed requests and wider ones otherwise
    width = widths[font.get('font-stretch', 'normal')]

    min_dist = min(abs(width-f['width']) for f in matches)
    nearest = [f for f in matches if abs(width-f['width']) == min_dist]
    if width <= 4:
        lmatches = [f for f in nearest if f['width'] <= width]
    else:
        lmatches = [f for f in nearest if f['width'] >= width]
    matches = (lmatches or nearest)

    # Filter on font-style
    fs = font.get('font-style', 'normal')
    order = {
        'oblique':['oblique', 'italic', 'normal'],
        'normal':['normal', 'oblique', 'italic']
    }.get(fs, ['italic', 'oblique', 'normal'])
    for q in order:
        m = [f for f in matches if f.get('font-style', 'normal') == q]
        if m:
            matches = m
            break

    # Filter on font weight
    fw = int(font.get('font-weight', '400'))
    # range() takes (start, stop, step). The heavier weights were previously
    # generated with stop and step swapped, which always produced an empty
    # sequence, so heavier fallbacks were never considered.
    lighter = list(range(fw-100, 0, -100))    # fw-100 down to 100
    heavier = list(range(fw+100, 1000, 100))  # fw+100 up to 900
    if fw == 400:
        q = [400, 500, 300, 200, 100, 600, 700, 800, 900]
    elif fw == 500:
        q = [500, 400, 300, 200, 100, 600, 700, 800, 900]
    elif fw < 400:
        q = [fw] + lighter + heavier
    else:
        q = [fw] + heavier + lighter
    for wt in q:
        m = [f for f in matches if f['weight'] == wt]
        if m:
            return m
    return []
|
||||
|
||||
class Page(QWebPage): # {{{
|
||||
|
||||
def __init__(self, log):
|
||||
self.log = log
|
||||
QWebPage.__init__(self)
|
||||
self.js = None
|
||||
self.evaljs = self.mainFrame().evaluateJavaScript
|
||||
self.bridge_value = None
|
||||
|
||||
def javaScriptConsoleMessage(self, msg, lineno, msgid):
|
||||
self.log(u'JS:', unicode(msg))
|
||||
@ -40,6 +128,23 @@ class Page(QWebPage):
|
||||
_pass_json_value = pyqtProperty(QString, fget=_pass_json_value_getter,
|
||||
fset=_pass_json_value_setter)
|
||||
|
||||
def load_js(self):
    '''
    Prepare the page for font statistics collection.

    The compiled utils and font_stats scripts are fetched once and cached
    in self.js. This object is then exposed to JavaScript as py_bridge, the
    scripts are evaluated, and a `value` property is defined on py_bridge
    that moves values through _pass_json_value as JSON text.
    '''
    # NOTE(review): the nesting below is inferred, indentation was not
    # visible in the reviewed text; only the script loading is assumed to
    # be guarded by the cache check.
    if self.js is None:
        from calibre.utils.resources import compiled_coffeescript
        self.js = compiled_coffeescript('ebooks.oeb.display.utils')
        self.js += compiled_coffeescript('ebooks.oeb.polish.font_stats')
    frame = self.mainFrame()
    frame.addToJavaScriptWindowObject("py_bridge", self)
    self.evaljs(self.js)
    bridge_accessors = '''
    py_bridge.__defineGetter__('value', function() {
        return JSON.parse(this._pass_json_value);
    });
    py_bridge.__defineSetter__('value', function(val) {
        this._pass_json_value = JSON.stringify(val);
    });
    '''
    self.evaljs(bridge_accessors)
|
||||
# }}}
|
||||
|
||||
class StatsCollector(object):
|
||||
|
||||
def __init__(self, container):
|
||||
@ -85,6 +190,7 @@ class StatsCollector(object):
|
||||
self.loop.exit(1)
|
||||
return
|
||||
try:
|
||||
self.page.load_js()
|
||||
self.collect_font_stats()
|
||||
except:
|
||||
self.log.exception('Failed to collect font stats from: %s'%self.container.relpath(self.current_item))
|
||||
@ -94,6 +200,70 @@ class StatsCollector(object):
|
||||
self.render_book()
|
||||
|
||||
def collect_font_stats(self):
    '''
    Record, in self.font_stats, which characters of the currently loaded
    page are rendered with which embedded font.

    The @font-face rules of the page are read through the JavaScript
    bridge and rules that cannot be used are dropped. Every remaining font
    file gets an entry in self.font_stats (keyed by container name), to
    which the characters of all text matched to that font are added.

    Raises Exception if the bridge does not deliver a list.
    '''
    self.page.evaljs('window.font_stats.get_font_face_rules()')
    font_face_rules = self.page.bridge_value
    if not isinstance(font_face_rules, list):
        raise Exception('Unknown error occurred while reading font-face rules')

    # Weed out invalid font-face rules
    rules = []
    for rule in font_face_rules:
        ff = rule.get('font-family', None)
        if not ff: continue
        style = parseStyle('font-family:%s'%ff, validate=False)
        ff = [x.value for x in
                style.getProperty('font-family').propertyValue]
        if not ff or ff[0] == 'inherit':
            continue
        rule['font-family'] = frozenset(icu_lower(f) for f in ff)
        raw_src = rule.get('src', None)
        if not raw_src: continue
        style = parseStyle('background-image:%s'%raw_src, validate=False)
        # Use the first source that carries a URI. Indexing the first
        # value blindly raised for an empty value or for a source without
        # a uri attribute, which aborted the statistics of the whole page
        # instead of skipping one rule.
        src = next((v.uri for v in
            style.getProperty('background-image').propertyValue
            if getattr(v, 'uri', None)), None)
        if src is None or not src.startswith('file://'):
            self.log.warn('Unknown URI in @font-face: %r'%(src or raw_src))
            continue
        src = src[len('file://'):]
        if iswindows and src.startswith('/'):
            # Drop the slash that precedes the drive letter in file URLs
            src = src[1:]
        src = src.replace('/', os.sep)
        src = unquote(src)
        name = self.container.abspath_to_name(src)
        if not self.container.has_name(name):
            self.log.warn('Font %r referenced in @font-face rule not found'
                    %name)
            continue
        rule['src'] = name
        normalize_font_properties(rule)
        rule['width'] = widths[rule['font-stretch']]
        rule['weight'] = int(rule['font-weight'])
        rules.append(rule)

    if not rules:
        return

    # Make sure every usable font has an entry, even if no text uses it
    for rule in rules:
        if rule['src'] not in self.font_stats:
            self.font_stats[rule['src']] = set()

    self.page.evaljs('window.font_stats.get_font_usage()')
    font_usage = self.page.bridge_value
    if not isinstance(font_usage, list):
        raise Exception('Unknown error occurred while reading font usage')
    exclude = {'\n', '\r', '\t'}
    for font in font_usage:
        # The set of distinct characters in all text chunks of this element
        text = set()
        for t in font['text']:
            text.update(t)
        text.difference_update(exclude)
        if not text: continue
        for rule in get_matching_rules(rules, font):
            self.font_stats[rule['src']] |= text
|
||||
|
||||
if __name__ == '__main__':
    # Debugging aid: collect and print the font statistics of the book
    # named by the last command line argument, logging at DEBUG level
    from calibre.utils.logging import default_log
    from calibre.ebooks.oeb.polish.container import get_container
    default_log.filter_level = default_log.DEBUG
    book_path = sys.argv[-1]
    ebook = get_container(book_path, default_log)
    collector = StatsCollector(ebook)
    print (collector.font_stats)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user