Polish: Add option to embed referenced fonts

Book polishing: Add option to embed all referenced fonts when polishing books using the "Polish Books" tool. Fixes #1196038 [[enhancement] embed font without conversion](https://bugs.launchpad.net/calibre/+bug/1196038)
2025-07-09 03:04:10 -04:00 · 2013-07-01 15:08:54 +05:30 · 2013-07-01 15:08:54 +05:30 · 9952abad4a
commit 9952abad4a
parent 59346348c5
6 changed files with 245 additions and 15 deletions
--- a/resources/compiled_coffeescript.zip
+++ b/resources/compiled_coffeescript.zip
--- a/src/calibre/ebooks/oeb/polish/embed.py
+++ b/src/calibre/ebooks/oeb/polish/embed.py
@ -0,0 +1,158 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import sys
+
+from lxml import etree
+
+from calibre import prints
+from calibre.ebooks.oeb.base import XHTML
+from calibre.ebooks.oeb.polish.stats import normalize_font_properties
+from calibre.utils.filenames import ascii_filename
+
+props = {'font-family':None, 'font-weight':'normal', 'font-style':'normal', 'font-stretch':'normal'}
+
+def matching_rule(font, rules):
+    '''
+    Find the @font-face style rule that describes the same face as ``font``.
+
+    :param font: Mapping with the keys ``font-family``, ``font-weight``,
+        ``font-style`` and ``font-stretch``. ``font-family`` is either a
+        string or an iterable of strings; for an iterable only its first
+        item is used.
+    :param rules: Iterable of mappings with the same four keys. The
+        ``font-family`` of a rule may be a single string or a collection of
+        family names.
+    :return: The first rule whose style, stretch and weight are equal to
+        those of ``font`` and that names the wanted family (compared using
+        ``icu_lower``), or ``None`` if there is no such rule.
+    '''
+    ff = font['font-family']
+    if not isinstance(ff, basestring):
+        ff = tuple(ff)[0]
+    family = icu_lower(ff)
+    wt = font['font-weight']
+    style = font['font-style']
+    stretch = font['font-stretch']
+
+    for rule in rules:
+        if rule['font-style'] == style and rule['font-stretch'] == stretch and rule['font-weight'] == wt:
+            rule_families = rule['font-family']
+            if isinstance(rule_families, basestring):
+                rule_families = (rule_families,)
+            # A rule may carry a set of family names. Sets have no defined
+            # order, so test every member instead of whichever one happens
+            # to be iterated first.
+            for candidate in rule_families:
+                if icu_lower(candidate) == family:
+                    return rule
+    return None
+
+def embed_font(container, font, all_font_rules, report, warned):
+    '''
+    Build the description of an @font-face rule that provides ``font``.
+
+    If one of ``all_font_rules`` already matches ``font``, the file that rule
+    points to is reused. Otherwise the fonts known to calibre's font scanner
+    are searched for a face with the same family, weight, style and stretch,
+    and the first match is written into the book under ``fonts/``.
+
+    :param container: The book container; used to create the font file and
+        to convert between names and hrefs.
+    :param font: Mapping with ``font-family``, ``font-weight``,
+        ``font-style`` and ``font-stretch``.
+    :param all_font_rules: Rules that are searched with ``matching_rule()``.
+    :param report: Callable that is passed human readable messages.
+    :param warned: Set of family names and messages that have already been
+        reported. It is updated in place so nothing is reported twice.
+    :return: A dict with the keys of ``props`` plus ``src`` (a ``url(...)``
+        value) and ``name`` (the name of the font file in the container), or
+        ``None`` if no usable font was found.
+    '''
+    rule = matching_rule(font, all_font_rules)
+    ff = font['font-family']
+    if not isinstance(ff, basestring):
+        # tuple() rather than plain indexing, so that sets and frozensets
+        # work here exactly as they do in matching_rule()
+        ff = tuple(ff)[0]
+
+    if rule is not None:
+        # A matching rule exists already: point at the file it uses
+        name = rule['src']
+        href = container.name_to_href(name)
+        rule = {k:ff if k == 'font-family' else rule.get(k, v) for k, v in props.iteritems()}
+        rule['src'] = 'url(%s)' % href
+        rule['name'] = name
+        return rule
+
+    from calibre.utils.fonts.scanner import font_scanner, NoFonts
+    if ff in warned:
+        return None
+    try:
+        fonts = font_scanner.fonts_for_family(ff)
+    except NoFonts:
+        report(_('Failed to find fonts for family: %s, not embedding') % ff)
+        warned.add(ff)
+        return None
+
+    # These do not change while iterating over the candidate faces
+    wt = int(font.get('font-weight', '400'))
+    style = font.get('font-style', 'normal')
+    stretch = font.get('font-stretch', 'normal')
+    for f in fonts:
+        if f['weight'] == wt and f['font-style'] == style and f['font-stretch'] == stretch:
+            report('Embedding font %s from %s' % (f['full_name'], f['path']))
+            data = font_scanner.get_font_data(f)
+            fname = f['full_name']
+            ext = 'otf' if f['is_otf'] else 'ttf'
+            fname = ascii_filename(fname).replace(' ', '-').replace('(', '').replace(')', '')
+            item = container.generate_item('fonts/%s.%s'%(fname, ext), id_prefix='font')
+            name = container.href_to_name(item.get('href'), container.opf_name)
+            with container.open(name, 'wb') as out:
+                out.write(data)
+            href = container.name_to_href(name)
+            rule = {k:f.get(k, v) for k, v in props.iteritems()}
+            rule['src'] = 'url(%s)' % href
+            rule['name'] = name
+            return rule
+
+    msg = _('Failed to find font matching: family: %s; weight: %s; style: %s; stretch: %s') % (
+        ff, font['font-weight'], font['font-style'], font['font-stretch'])
+    if msg not in warned:
+        warned.add(msg)
+        report(msg)
+    return None
+
+def embed_all_fonts(container, stats, report):
+    '''
+    Embed the fonts that the spine documents use but do not embed themselves.
+
+    For every spine item the font usage recorded in ``stats`` is compared
+    with the @font-face rules recorded for that item. Every used font that
+    is named in the item's styles and has no matching rule is embedded with
+    ``embed_font()``. All resulting rules are written to a new stylesheet
+    (``fonts.css``) that is linked from each affected document.
+
+    :param container: The book container.
+    :param stats: Object providing ``font_usage_map``, ``font_spec_map``,
+        ``font_rule_map``, ``all_font_rules`` and ``font_stats``.
+        ``font_stats`` is updated with the text used from each embedded font.
+    :param report: Callable that is passed human readable messages.
+    '''
+    all_font_rules = tuple(stats.all_font_rules.itervalues())
+    warned = set()
+    rules, nrules = [], []
+    modified = set()
+
+    for path in container.spine_items:
+        name = container.abspath_to_name(path)
+        fu = stats.font_usage_map.get(name, None)
+        fs = stats.font_spec_map.get(name, None)
+        fr = stats.font_rule_map.get(name, None)
+        if None in (fs, fu, fr):
+            continue
+        fs = {icu_lower(x) for x in fs}
+        for font in fu.itervalues():
+            if icu_lower(font['font-family']) not in fs:
+                continue
+            if matching_rule(font, fr) is not None:
+                # This font was already embedded in this HTML file, before
+                # processing started
+                continue
+            rule = matching_rule(font, nrules)
+            if rule is None:
+                rule = embed_font(container, font, all_font_rules, report, warned)
+                if rule is None:
+                    continue
+                rules.append(rule)
+                nrules.append(normalize_font_properties(rule.copy()))
+            # Merge into a set owned by font_stats. Plain assignment would
+            # discard usage already recorded for a font file that is being
+            # reused, and would share the set with the usage map so that
+            # later merges silently modify the usage map as well.
+            stats.font_stats.setdefault(rule['name'], set()).update(font['text'])
+            modified.add(name)
+
+    if not rules:
+        report(_('No embeddable fonts found'))
+        return
+
+    # Write out CSS
+    blocks = []
+    for rule in rules:
+        declarations = []
+        for k, v in rule.iteritems():
+            # src is always needed, the other descriptors only when they
+            # differ from the defaults ('400' is the numeric form of normal)
+            if k == 'src' or (k in props and props[k] != v and v != '400'):
+                declarations.append('%s: %s' % (k, '"%s"' % v if k == 'font-family' else v))
+        blocks.append('@font-face {\n\t%s\n}' % ';\n\t'.join(declarations))
+    css = '\n\n'.join(blocks)
+    item = container.generate_item('fonts.css', id_prefix='font_embed')
+    name = container.href_to_name(item.get('href'), container.opf_name)
+    with container.open(name, 'wb') as out:
+        out.write(css.encode('utf-8'))
+
+    # Add link to CSS in all files that need it
+    for spine_name in modified:
+        root = container.parsed(spine_name)
+        heads = root.xpath('//*[local-name()="head"][1]')
+        if heads:
+            head = heads[0]
+        else:
+            # No <head> at all: create one instead of failing with an
+            # IndexError. Assumes root is the <html> element -- TODO confirm
+            head = root.makeelement(XHTML('head'))
+            root.insert(0, head)
+        href = container.name_to_href(name, spine_name)
+        etree.SubElement(head, XHTML('link'), rel='stylesheet', type='text/css', href=href).tail = '\n'
+        container.dirty(spine_name)
+
+
+if __name__ == '__main__':
+    # Development entry point: embed the referenced fonts of the book given
+    # as the last command line argument and write the result next to it.
+    from calibre.ebooks.oeb.polish.container import get_container
+    from calibre.ebooks.oeb.polish.stats import StatsCollector
+    from calibre.utils.logging import default_log
+    default_log.filter_level = default_log.DEBUG
+    inbook = sys.argv[-1]
+    ebook = get_container(inbook, default_log)
+    report = []
+    stats = StatsCollector(ebook, do_embed=True)
+    embed_all_fonts(ebook, stats, report.append)
+    # [0::2] keeps the parts before and after the last '.', dropping the dot
+    outbook, ext = inbook.rpartition('.')[0::2]
+    # Fonts are embedded here, not subset, so name the output accordingly
+    outbook += '_embed.'+ext
+    ebook.commit(outbook)
+    prints('\nReport:')
+    for msg in report:
+        prints(msg)
+    print()
+    prints('Output written to:', outbook)
+
--- a/src/calibre/ebooks/oeb/polish/font_stats.coffee
+++ b/src/calibre/ebooks/oeb/polish/font_stats.coffee
@ -67,6 +67,18 @@ class FontStats
                ans.push(usage)
        py_bridge.value = ans

+    # Collect the distinct font-family values declared by the CSS rules that
+    # match the elements of the current document. The result is an object
+    # used as a set (family value -> true) and is stored in py_bridge.value.
+    get_font_families: () ->
+        ans = {}
+        for node in document.getElementsByTagName('*')
+            rules = document.defaultView.getMatchedCSSRules(node, '')
+            # Skip nodes for which no rule list is returned
+            if rules
+                for rule in rules
+                    style = rule.style
+                    family = style.getPropertyValue('font-family')
+                    # Rules that do not set font-family contribute nothing
+                    if family
+                        ans[family] = true
+        py_bridge.value = ans
+
 if window?
    window.font_stats = new FontStats()

--- a/src/calibre/ebooks/oeb/polish/main.py
+++ b/src/calibre/ebooks/oeb/polish/main.py
@ -14,6 +14,7 @@ from functools import partial
 from calibre.ebooks.oeb.polish.container import get_container
 from calibre.ebooks.oeb.polish.stats import StatsCollector
 from calibre.ebooks.oeb.polish.subset import subset_all_fonts
+from calibre.ebooks.oeb.polish.embed import embed_all_fonts
 from calibre.ebooks.oeb.polish.cover import set_cover
 from calibre.ebooks.oeb.polish.replace import smarten_punctuation
 from calibre.ebooks.oeb.polish.jacket import (
@ -21,6 +22,7 @@ from calibre.ebooks.oeb.polish.jacket import (
 from calibre.utils.logging import Log

 ALL_OPTS = {
+    'embed': False,
    'subset': False,
    'opf': None,
    'cover': None,
@ -47,6 +49,12 @@ changes needed for the desired effect.</p>
 <p>Note that polishing only works on files in the %s formats.</p>\
 ''')%_(' or ').join('<b>%s</b>'%x for x in SUPPORTED),

+'embed': _('''\
+<p>Embed all fonts that are referenced in the document and are not already embedded.
+This will scan your computer for the fonts, and if they are found, they will be
+embedded into the document.</p>
+'''),
+
 'subset': _('''\
 <p>Subsetting fonts means reducing an embedded font to contain
 only the characters used from that font in the book. This
@ -118,8 +126,8 @@ def polish(file_map, opts, log, report):
        ebook = get_container(inbook, log)
        jacket = None

-        if opts.subset:
-            stats = StatsCollector(ebook)
+        if opts.subset or opts.embed:
+            stats = StatsCollector(ebook, do_embed=opts.embed)

        if opts.opf:
            rt(_('Updating metadata'))
@ -159,6 +167,11 @@ def polish(file_map, opts, log, report):
            smarten_punctuation(ebook, report)
            report('')

+        if opts.embed:
+            rt(_('Embedding referenced fonts'))
+            embed_all_fonts(ebook, stats, report)
+            report('')
+
        if opts.subset:
            rt(_('Subsetting embedded fonts'))
            subset_all_fonts(ebook, stats.font_stats, report)
@ -197,6 +210,7 @@ def option_parser():
    parser = OptionParser(usage=USAGE)
    a = parser.add_option
    o = partial(a, default=False, action='store_true')
+    o('--embed-fonts', '-e', dest='embed', help=CLI_HELP['embed'])
    o('--subset-fonts', '-f', dest='subset', help=CLI_HELP['subset'])
    a('--cover', '-c', help=_(
        'Path to a cover image. Changes the cover specified in the ebook. '
--- a/src/calibre/ebooks/oeb/polish/stats.py
+++ b/src/calibre/ebooks/oeb/polish/stats.py
@ -7,10 +7,11 @@ __license__   = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import json, sys, os
+import json, sys, os, logging
 from urllib import unquote
+from collections import defaultdict

-from cssutils import parseStyle
+from cssutils import CSSParser
 from PyQt4.Qt import (pyqtProperty, QString, QEventLoop, Qt, QSize, QTimer,
                      pyqtSlot)
 from PyQt4.QtWebKit import QWebPage, QWebView
@ -41,14 +42,14 @@ def normalize_font_properties(font):
                   'extra-expanded', 'ultra-expanded'}:
        val = 'normal'
    font['font-stretch'] = val
+    return font

-widths = {x:i for i, x in enumerate(( 'ultra-condensed',
+widths = {x:i for i, x in enumerate(('ultra-condensed',
        'extra-condensed', 'condensed', 'semi-condensed', 'normal',
        'semi-expanded', 'expanded', 'extra-expanded', 'ultra-expanded'
        ))}

 def get_matching_rules(rules, font):
-    normalize_font_properties(font)
    matches = []

    # Filter on family
@ -100,7 +101,7 @@ def get_matching_rules(rules, font):
            return m
    return []

-class Page(QWebPage): # {{{
+class Page(QWebPage):  # {{{

    def __init__(self, log):
        self.log = log
@ -157,10 +158,12 @@ class Page(QWebPage): # {{{

 class StatsCollector(object):

-    def __init__(self, container):
+    def __init__(self, container, do_embed=False):
        self.container = container
        self.log = self.logger = container.log
+        self.do_embed = do_embed
        must_use_qt()
+        self.parser = CSSParser(loglevel=logging.CRITICAL, log=logging.getLogger('calibre.css'))

        self.loop = QEventLoop()
        self.view = QWebView()
@ -173,6 +176,10 @@ class StatsCollector(object):

        self.render_queue = list(container.spine_items)
        self.font_stats = {}
+        self.font_usage_map = {}
+        self.font_spec_map = {}
+        self.font_rule_map = {}
+        self.all_font_rules = {}

        QTimer.singleShot(0, self.render_book)

@ -235,27 +242,35 @@ class StatsCollector(object):
        rules = []
        for rule in font_face_rules:
            ff = rule.get('font-family', None)
-            if not ff: continue
-            style = parseStyle('font-family:%s'%ff, validate=False)
+            if not ff:
+                continue
+            style = self.parser.parseStyle('font-family:%s'%ff, validate=False)
            ff = [x.value for x in
                  style.getProperty('font-family').propertyValue]
            if not ff or ff[0] == 'inherit':
                continue
            rule['font-family'] = frozenset(icu_lower(f) for f in ff)
            src = rule.get('src', None)
-            if not src: continue
-            style = parseStyle('background-image:%s'%src, validate=False)
+            if not src:
+                continue
+            style = self.parser.parseStyle('background-image:%s'%src, validate=False)
            src = style.getProperty('background-image').propertyValue[0].uri
            name = self.href_to_name(src, '@font-face rule')
+            if name is None:
+                continue
            rule['src'] = name
            normalize_font_properties(rule)
            rule['width'] = widths[rule['font-stretch']]
            rule['weight'] = int(rule['font-weight'])
            rules.append(rule)

-        if not rules:
+        if not rules and not self.do_embed:
            return

+        self.font_rule_map[self.container.abspath_to_name(self.current_item)] = rules
+        for rule in rules:
+            self.all_font_rules[rule['src']] = rule
+
        for rule in rules:
            if rule['src'] not in self.font_stats:
                self.font_stats[rule['src']] = set()
@ -265,19 +280,48 @@ class StatsCollector(object):
        if not isinstance(font_usage, list):
            raise Exception('Unknown error occurred while reading font usage')
        exclude = {'\n', '\r', '\t'}
+        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = fu = defaultdict(dict)
+        bad_fonts = {'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'sansserif', 'inherit'}
        for font in font_usage:
            text = set()
            for t in font['text']:
                text |= frozenset(t)
            text.difference_update(exclude)
-            if not text: continue
+            if not text:
+                continue
+            normalize_font_properties(font)
            for rule in get_matching_rules(rules, font):
                self.font_stats[rule['src']] |= text
+            if self.do_embed:
+                ff = [icu_lower(x) for x in font.get('font-family', [])]
+                if ff and ff[0] not in bad_fonts:
+                    keys = {'font-weight', 'font-style', 'font-stretch', 'font-family'}
+                    key = frozenset(((k, ff[0] if k == 'font-family' else v) for k, v in font.iteritems() if k in keys))
+                    val = fu[key]
+                    if not val:
+                        val.update({k:(font[k][0] if k == 'font-family' else font[k]) for k in keys})
+                        val['text'] = set()
+                    val['text'] |= text
+        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = dict(fu)
+
+        if self.do_embed:
+            self.page.evaljs('window.font_stats.get_font_families()')
+            font_families = self.page.bridge_value
+            if not isinstance(font_families, dict):
+                raise Exception('Unknown error occurred while reading font families')
+            self.font_spec_map[self.container.abspath_to_name(self.current_item)] = fs = set()
+            for raw in font_families.iterkeys():
+                style = self.parser.parseStyle('font-family:' + raw, validate=False).getProperty('font-family')
+                for x in style.propertyValue:
+                    x = x.value
+                    if x and x.lower() not in bad_fonts:
+                        fs.add(x)

 if __name__ == '__main__':
    from calibre.ebooks.oeb.polish.container import get_container
    from calibre.utils.logging import default_log
    default_log.filter_level = default_log.DEBUG
    ebook = get_container(sys.argv[-1], default_log)
-    print (StatsCollector(ebook).font_stats)
+    print (StatsCollector(ebook, do_embed=True).font_stats)
+

--- a/src/calibre/gui2/actions/polish.py
+++ b/src/calibre/gui2/actions/polish.py
@ -45,6 +45,7 @@ class Polish(QDialog):  # {{{
                  ORIGINAL_* format before running it.</p>''')
            ),

+            'embed':_('<h3>Embed referenced fonts</h3>%s')%HELP['embed'],
            'subset':_('<h3>Subsetting fonts</h3>%s')%HELP['subset'],

            'smarten_punctuation':
@ -75,6 +76,7 @@ class Polish(QDialog):  # {{{

        count = 0
        self.all_actions = OrderedDict([
+            ('embed', _('&Embed all referenced fonts')),
            ('subset', _('&Subset all embedded fonts')),
            ('smarten_punctuation', _('Smarten &punctuation')),
            ('metadata', _('Update &metadata in the book files')),