#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__   = 'GPL v3'
__copyright__ = '2012, Kovid Goyal '
__docformat__ = 'restructuredtext en'

# Module: src/calibre/ebooks/oeb/transforms/subset.py
#
# How this transform is wired into the conversion pipeline (same change set):
#   * conversion/cli.py lists 'subset_embedded_fonts' among the look & feel
#     pipeline options, next to 'embed_font_family'.
#   * conversion/plumber.py declares
#       OptionRecommendation(name='subset_embedded_fonts',
#           recommended_value=False, level=OptionRecommendation.LOW, ...)
#     whose help text explains that every embedded font is reduced to the
#     glyphs used in the document and that only fonts with TrueType outlines
#     (not PostScript outlines) can be subset.
#   * The plumber runs the transform right after RemoveAdobeMargins:
#       if self.opts.subset_embedded_fonts:
#           from calibre.ebooks.oeb.transforms.subset import SubsetFonts
#           SubsetFonts()(self.oeb, self.log, self.opts)
#   * gui2/convert/look_and_feel.py and look_and_feel.ui expose the option as
#     a checkbox labelled '&Subset all embedded fonts'.

from collections import defaultdict


class SubsetFonts(object):

    '''
    Subset all embedded fonts. Must be run after CSS flattening, as it requires
    CSS normalization and flattening to work.

    Calling an instance with ``(oeb, log, opts)`` finds every @font-face rule,
    works out which characters each embedded font is used for and replaces the
    font data with a subset containing only those characters. Fonts that are
    not used at all are removed from the book.
    '''

    # CSS font-stretch keywords ordered from narrowest to widest. The position
    # in this tuple is used as a numeric width; 'normal' is index 4.
    _STRETCH_ORDER = (
        'ultra-condensed', 'extra-condensed', 'condensed', 'semi-condensed',
        'normal', 'semi-expanded', 'expanded', 'extra-expanded',
        'ultra-expanded')

    def __call__(self, oeb, log, opts):
        '''
        Run the transform.

        :param oeb: The book; ``oeb.manifest`` is scanned for stylesheets
                    and documents.
        :param log: Logger, called directly for info messages and via
                    ``.debug``/``.warn``.
        :param opts: Conversion options (stored, not otherwise read here).
        '''
        # Imported lazily, like the plumber imports this transform, so that
        # merely importing this module does not load the subsetting code.
        from calibre.utils.fonts.subset import (subset, NoGlyphs,
                UnsupportedFont)

        self.oeb, self.log, self.opts = oeb, log, opts

        self.find_embedded_fonts()
        if not self.embedded_fonts:
            self.log.debug('No embedded fonts found')
            return
        self.find_style_rules()
        self.find_font_usage()

        # totals[0]: bytes after subsetting, totals[1]: bytes before
        totals = [0, 0]

        def remove(font):
            # Drop both the font file and the @font-face rule that refers to it
            totals[1] += len(font['item'].data)
            self.oeb.manifest.remove(font['item'])
            font['rule'].parentStyleSheet.deleteRule(font['rule'])

        # NOTE(review): if two @font-face rules point at the same manifest
        # item, that item is processed once per rule -- confirm whether this
        # can happen in practice.
        for font in self.embedded_fonts:
            if not font['chars']:
                self.log('The font %s is unused. Removing it.'%font['src'])
                remove(font)
                continue
            try:
                raw, old_stats, new_stats = subset(font['item'].data,
                        font['chars'])
            except NoGlyphs:
                self.log('The font %s has no used glyphs. Removing it.'%
                        font['src'])
                remove(font)
                continue
            except UnsupportedFont as e:
                self.log.warn('The font %s is unsupported for subsetting. %s'%(
                    font['src'], e))
                # Font is kept unchanged, so it counts fully on both sides
                sz = len(font['item'].data)
                totals[0] += sz
                totals[1] += sz
            else:
                font['item'].data = raw
                nlen = sum(new_stats.values())
                olen = sum(old_stats.values())
                self.log('Decreased the font %s to %.1f%% of its original size'%
                        (font['src'], nlen/olen *100))
                totals[0] += nlen
                totals[1] += olen

            font['item'].unload_data_from_memory()

        if totals[0]:
            self.log('Reduced total font size to %.1f%% of original'%
                    (totals[0]/totals[1] * 100))

    def get_font_properties(self, rule, default=None):
        '''
        Given a CSS rule, extract normalized font properties from
        it. Note that shorthand font property should already have been expanded
        by the CSS flattening code.

        :param rule: A CSS rule object exposing ``rule.style.getProperty()``.
        :param default: Value used for font-weight, font-style and
                        font-stretch when they are missing, invalid or
                        ``inherit``.
        :return: dict with the keys ``font-family`` (list of names or None),
                 ``src`` (URI or None), ``font-weight``, ``font-stretch`` and
                 ``font-style``.
        '''
        props = {}
        s = rule.style
        for q in ('font-family', 'src', 'font-weight', 'font-stretch',
                'font-style'):
            g = 'uri' if q == 'src' else 'value'
            try:
                val = s.getProperty(q).propertyValue[0]
                val = getattr(val, g)
                if q == 'font-family':
                    val = [x.value for x in s.getProperty(q).propertyValue]
                    if val and val[0] == 'inherit':
                        val = None
            except (IndexError, KeyError, AttributeError, TypeError,
                    ValueError):
                val = None if q in {'src', 'font-family'} else default
            if q in {'font-weight', 'font-stretch', 'font-style'}:
                if val:
                    # The parser may hand back a number rather than text
                    # (for example a numeric font-weight), so convert to text
                    # before lower-casing instead of assuming a string.
                    if isinstance(val, float) and val == int(val):
                        val = int(val)
                    val = ('%s' % val).lower()
                if val == 'inherit':
                    val = default
            if q == 'font-weight':
                val = {'normal':'400', 'bold':'700'}.get(val, val)
                if val not in {'100', '200', '300', '400', '500', '600', '700',
                        '800', '900', 'bolder', 'lighter'}:
                    val = default
                # default itself may be the keyword 'normal'
                if val == 'normal':
                    val = '400'
            elif q == 'font-style':
                if val not in {'normal', 'italic', 'oblique'}:
                    val = default
            elif q == 'font-stretch':
                if val not in set(self._STRETCH_ORDER):
                    val = default
            props[q] = val
        return props

    def find_embedded_fonts(self):
        '''
        Find all @font-face rules and extract the relevant info from them.

        Populates ``self.embedded_fonts`` with one dict per usable rule. Besides
        the normalized font properties, each dict holds ``item`` (the manifest
        item of the font file), ``weight`` (font-weight as an int), ``chars``
        (initially empty set of used characters) and ``rule``.
        '''
        from calibre.ebooks.oeb.base import urlnormalize

        self.embedded_fonts = []
        for item in self.oeb.manifest:
            if not hasattr(item.data, 'cssRules'):
                continue
            for rule in item.data.cssRules:
                if rule.type != rule.FONT_FACE_RULE:
                    continue
                props = self.get_font_properties(rule, default='normal')
                if not props['font-family'] or not props['src']:
                    continue

                path = item.abshref(props['src'])
                ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                if not ff:
                    # src does not resolve to a file in the book
                    continue
                props['item'] = ff
                # Relative weights are meaningless in @font-face
                if props['font-weight'] in {'bolder', 'lighter'}:
                    props['font-weight'] = '400'
                props['weight'] = int(props['font-weight'])
                props['chars'] = set()
                props['rule'] = rule
                self.embedded_fonts.append(props)

    def find_style_rules(self):
        '''
        Extract all font related style information from all stylesheets into a
        dict mapping classes to font properties specified by that class. All
        the heavy lifting has already been done by the CSS flattening code.

        The result is stored in ``self.style_rules``. Only selectors that
        start with ``.`` are considered.
        '''
        rules = defaultdict(dict)
        for item in self.oeb.manifest:
            if not hasattr(item.data, 'cssRules'):
                continue
            for rule in item.data.cssRules:
                if rule.type != rule.STYLE_RULE:
                    continue
                props = {k:v for k, v in
                        self.get_font_properties(rule).items() if v}
                if not props:
                    continue
                for sel in rule.selectorList:
                    sel = sel.selectorText
                    if sel and sel.startswith('.'):
                        # We don't care about pseudo-selectors as the worst
                        # that can happen is some extra characters will remain
                        # in the font
                        sel = sel.partition(':')[0]
                        rules[sel[1:]].update(props)

        self.style_rules = dict(rules)

    def find_font_usage(self):
        '''
        Walk the body of every document in the manifest and record, in the
        ``chars`` set of each embedded font, the characters rendered with it.
        '''
        for item in self.oeb.manifest:
            if not hasattr(item.data, 'xpath'):
                continue
            for body in item.data.xpath('//*[local-name()="body"]'):
                base = {'font-family':['serif'], 'font-weight': '400',
                        'font-style':'normal', 'font-stretch':'normal'}
                self.find_usage_in(body, base)

    def elem_style(self, cls, inherited_style):
        '''
        Find the effective style for the given element.

        :param cls: Value of the element's ``class`` attribute (may be empty
                    or None).
        :param inherited_style: Effective style dict of the parent element.
        :return: A new dict; ``bolder``/``lighter`` weights are resolved
                 against the inherited weight.
        '''
        style = inherited_style.copy()
        for name in (cls or '').split():
            style.update(self.style_rules.get(name, {}))
        wt = style.get('font-weight', None)
        pwt = inherited_style.get('font-weight', '400')
        if wt == 'bolder':
            style['font-weight'] = {
                    '100':'400',
                    '200':'400',
                    '300':'400',
                    '400':'700',
                    '500':'700',
                    }.get(pwt, '900')
        elif wt == 'lighter':
            style['font-weight'] = {
                    '600':'400', '700':'400',
                    '800':'700', '900':'700'}.get(pwt, '100')

        return style

    @staticmethod
    def _weight_search_order(fw):
        '''
        Return the font weights to try, in order of preference, when the
        requested weight is ``fw`` (an int, a multiple of 100 in 100..900).
        '''
        if fw == 400:
            return [400, 500, 300, 200, 100, 600, 700, 800, 900]
        if fw == 500:
            return [500, 400, 300, 200, 100, 600, 700, 800, 900]
        lighter = list(range(fw - 100, 0, -100))
        bolder = list(range(fw + 100, 1000, 100))
        if fw < 400:
            # Light request: prefer lighter faces, then fall back to bolder
            return [fw] + lighter + bolder
        # Heavy request: prefer bolder faces, then fall back to lighter
        return [fw] + bolder + lighter

    def used_font(self, style):
        '''
        Given a style find the embedded font that matches it. Returns None if
        no match is found (can happen if no family matches).

        Candidates are narrowed successively by font-family, font-stretch,
        font-style and finally font-weight.
        '''
        ff = style.get('font-family', [])
        lnames = {x.lower() for x in ff}

        # Filter on font-family
        matching_set = [
            ef for ef in self.embedded_fonts
            if lnames.intersection(
                x.lower() for x in ef.get('font-family', []))]
        if not matching_set:
            return None

        # Filter on font-stretch
        widths = {x:i for i, x in enumerate(self._STRETCH_ORDER)}
        normal_width = widths['normal']

        width = widths.get(style.get('font-stretch', 'normal'), normal_width)
        for f in matching_set:
            # The width of a candidate comes from the font's own
            # font-stretch, not from the requested style.
            f['width'] = widths.get(f.get('font-stretch', 'normal'),
                    normal_width)

        min_dist = min(abs(width-f['width']) for f in matching_set)
        nearest = [f for f in matching_set if abs(width-f['width']) ==
            min_dist]
        if width <= normal_width:
            # Normal or condensed request: prefer narrower faces
            lmatches = [f for f in nearest if f['width'] <= width]
        else:
            lmatches = [f for f in nearest if f['width'] >= width]
        matching_set = (lmatches or nearest)

        # Filter on font-style
        fs = style.get('font-style', 'normal')
        order = {
                'oblique':['oblique', 'italic', 'normal'],
                'normal':['normal', 'oblique', 'italic']
            }.get(fs, ['italic', 'oblique', 'normal'])
        for q in order:
            matches = [f for f in matching_set if f.get('font-style', 'normal')
                    == q]
            if matches:
                matching_set = matches
                break

        # Filter on font weight
        fw = int(style.get('font-weight', '400'))
        for wt in self._weight_search_order(fw):
            matches = [f for f in matching_set if f['weight'] == wt]
            if matches:
                return matches[0]
        return None

    def find_chars(self, elem):
        '''
        Return the set of characters in the text directly owned by ``elem``:
        its own text plus the tail text of each of its children.
        '''
        ans = set()
        if elem.text:
            ans |= set(elem.text)
        for child in elem:
            if child.tail:
                ans |= set(child.tail)
        return ans

    def find_usage_in(self, elem, inherited_style):
        '''
        Recursively record which characters under ``elem`` are rendered with
        which embedded font, given the style inherited from its parent.
        '''
        # get() can return None instead of the default for some node types
        # (NOTE(review): believed to be the case for lxml comments and
        # processing instructions), so normalize to a string.
        style = self.elem_style(elem.get('class', '') or '', inherited_style)
        for child in elem:
            self.find_usage_in(child, style)
        font = self.used_font(style)
        if font:
            chars = self.find_chars(elem)
            if chars:
                font['chars'] |= chars