From e430bf331290fb788941e242481eac5906febb8a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 12 Dec 2020 13:41:48 +0530 Subject: [PATCH] PDF Output: Yet another attempt at working around chromium's broken TTF embedding. Fixes #1907849 [conversion to pdf results in bizarre kerning](https://bugs.launchpad.net/calibre/+bug/1907849) This has the advantage of making the merging code a lot simpler as well. --- setup/test.py | 2 - src/calibre/ebooks/pdf/html_writer.py | 261 ++------------------- src/calibre/ebooks/pdf/test_html_writer.py | 58 ----- src/calibre/utils/podofo/fonts.cpp | 85 +++---- 4 files changed, 54 insertions(+), 352 deletions(-) delete mode 100644 src/calibre/ebooks/pdf/test_html_writer.py diff --git a/setup/test.py b/setup/test.py index 3c0a2b27c1..34659e5b52 100644 --- a/setup/test.py +++ b/setup/test.py @@ -116,8 +116,6 @@ def find_tests(which_tests=None, exclude_tests=None): a(find_tests()) from calibre.ebooks.metadata.html import find_tests a(find_tests()) - from calibre.ebooks.pdf.test_html_writer import find_tests - a(find_tests()) from calibre.utils.xml_parse import find_tests a(find_tests()) from calibre.gui2.viewer.annotations import find_tests diff --git a/src/calibre/ebooks/pdf/html_writer.py b/src/calibre/ebooks/pdf/html_writer.py index 71e2a4d3ac..1e545bbfd6 100644 --- a/src/calibre/ebooks/pdf/html_writer.py +++ b/src/calibre/ebooks/pdf/html_writer.py @@ -8,22 +8,19 @@ import copy import json import os -import re import signal import sys from collections import namedtuple +from html5_parser import parse from io import BytesIO from itertools import count, repeat -from operator import attrgetter, itemgetter - -from html5_parser import parse from PyQt5.Qt import ( QApplication, QMarginsF, QObject, QPageLayout, Qt, QTimer, QUrl, pyqtSignal ) from PyQt5.QtWebEngineCore import QWebEngineUrlRequestInterceptor from PyQt5.QtWebEngineWidgets import QWebEnginePage, QWebEngineProfile -from calibre import detect_ncpus, prepare_string_for_xml, 
human_readable +from calibre import detect_ncpus, human_readable, prepare_string_for_xml from calibre.constants import __version__, iswindows from calibre.ebooks.metadata.xmp import metadata_to_xmp_packet from calibre.ebooks.oeb.base import XHTML, XPath @@ -45,9 +42,7 @@ from calibre.utils.podofo import ( dedup_type3_fonts, get_podofo, remove_unused_fonts, set_metadata_implementation ) from calibre.utils.short_uuid import uuid4 -from polyglot.builtins import ( - as_bytes, as_unicode, filter, iteritems, map, range, unicode_type -) +from polyglot.builtins import filter, iteritems, map, range, unicode_type from polyglot.urllib import urlparse OK, KILL_SIGNAL = range(0, 2) @@ -769,182 +764,6 @@ def all_glyph_ids_in_w_arrays(arrays, as_set=False): return ans if as_set else sorted(ans) -def merge_w_arrays(arrays): - ranges = [] - for w in arrays: - i = 0 - while i + 1 < len(w): - elem = w[i] - next_elem = w[i+1] - if isinstance(next_elem, list): - ranges.append(Range(elem, elem + len(next_elem) - 1, next_elem)) - i += 2 - elif i + 2 < len(w): - ranges.append(Range(elem, next_elem, [w[i+2]])) - i += 3 - else: - break - ranges.sort(key=attrgetter('sort_order')) - merged_ranges = ranges[:1] - for r in ranges[1:]: - prev_range = merged_ranges[-1] - left_over = prev_range.merge(r) - if left_over is not None: - merged_ranges.append(left_over) - if not merged_ranges: - return [] - ans = [] - for r in merged_ranges: - ans.extend(r.as_item) - return ans - - -def width_map_from_w_array(w): - ans = {} - i = 0 - while i + 1 < len(w): - elem = w[i] - next_elem = w[i+1] - if isinstance(next_elem, list): - for gid, width in zip(range(elem, elem + len(next_elem)), next_elem): - ans[gid] = width - i += 2 - else: - try: - width = w[i+2] - except IndexError: - width = 0 - for gid in range(elem, next_elem + 1): - ans[gid] = width - i += 3 - return ans - - -def merge_w_arrays_directly(arrays): - width_maps = tuple(map(width_map_from_w_array, arrays)) - - def getter(gid): - return 
max(m.get(gid, 0) for m in width_maps) - - all_gids = set() - for m in width_maps: - all_gids |= set(m) - - widths = [] - for gid in sorted(all_gids): - widths.extend((gid, gid, getter(gid))) - return merge_w_arrays((widths,)) - - -class CMap(object): - - def __init__(self): - self.start_codespace = sys.maxsize - self.end_codespace = 0 - self.ranges = set() - self.chars = set() - self.header = self.footer = None - - def add_codespace(self, start, end): - self.start_codespace = min(self.start_codespace, start) - self.end_codespace = max(self.end_codespace, end) - - def serialize(self): - chars = sorted(self.chars, key=itemgetter(0)) - - def ashex(x): - ans = '{:04X}'.format(x) - leftover = len(ans) % 4 - if leftover: - ans = ('0' * (4 - leftover)) + ans - return ans - - lines = ['1 begincodespacerange', '<{}> <{}>'.format(*map(ashex, (self.start_codespace, self.end_codespace))), 'endcodespacerange'] - while chars: - group, chars = chars[:100], chars[100:] - lines.append('{} beginbfchar'.format(len(group))) - for g in group: - lines.append('<{}> <{}>'.format(*map(ashex, g))) - lines.append('endbfchar') - - ranges = sorted(self.ranges, key=itemgetter(0)) - while ranges: - group, ranges = ranges[:100], ranges[100:] - lines.append('{} beginbfrange'.format(len(group))) - for g in group: - lines.append('<{}> <{}> <{}>'.format(*map(ashex, g))) - lines.append('endbfrange') - return self.header + '\n' + '\n'.join(lines) + '\n' + self.footer - - -def merge_cmaps(cmaps): - header, incmap, incodespace, inchar, inrange, footer = 'header cmap codespace char range footer'.split() - start_pat = re.compile(r'\d+\s+begin(codespacerange|bfrange|bfchar)') - ans = CMap() - for cmap in cmaps: - state = header - headerlines = [] - footerlines = [] - prefix_ended = False - for line in as_unicode(cmap, errors='replace').splitlines(): - line = line.strip() - if state is header: - headerlines.append(line) - if line == 'begincmap': - state = incmap - continue - if state is incmap: - if line == 
'endcmap': - state = footer - footerlines.append(line) - continue - m = start_pat.match(line) - if m is not None: - state = incodespace if m.group(1) == 'codespacerange' else (inchar if m.group(1) == 'bfchar' else inrange) - prefix_ended = True - continue - if not prefix_ended: - headerlines.append(line) - continue - if state is incodespace: - if line == 'endcodespacerange': - state = incmap - else: - s, e = line.split() - s = int(s[1:-1], 16) - e = int(e[1:-1], 16) - ans.add_codespace(s, e) - continue - if state is inchar: - if line == 'endbfchar': - state = incmap - else: - a, b = line.split() - a = int(a[1:-1], 16) - b = int(b[1:-1], 16) - ans.chars.add((a, b)) - continue - if state is inrange: - if line == 'endbfrange': - state = incmap - else: - # technically bfrange can contain arrays for th eunicode - # value but from looking at SkPDFFont.cpp in chromium, it - # does not generate any - a, b, u = line.split() - a = int(a[1:-1], 16) - b = int(b[1:-1], 16) - u = int(u[1:-1], 16) - ans.ranges.add((a, b, u)) - continue - if state is footer: - footerlines.append(line) - if ans.header is None: - ans.header = '\n'.join(headerlines) - ans.footer = '\n'.join(footerlines) - return ans.serialize() - - def fonts_are_identical(fonts): sentinel = object() for key in ('ToUnicode', 'Data', 'W', 'W2'): @@ -957,27 +776,27 @@ def fonts_are_identical(fonts): return True -def merge_font(fonts, log): +def merge_font_files(fonts, log): + # As of Qt 5.15.1 Chromium has switched to harfbuzz and dropped sfntly. It + # now produces font descriptors whose W arrays don't match the glyph width + # information from the hhea table, in contravention of the PDF spec. So + # we can no longer merge font descriptors, all we can do is merge the + # actual sfnt data streams into a single stream and subset it to contain + # only the glyphs from all W arrays. 
# choose the largest font as the base font + fonts.sort(key=lambda f: len(f['Data'] or b''), reverse=True) - base_font = fonts[0] - t0_font = next(f for f in fonts if f['DescendantFont'] == base_font['Reference']) descendant_fonts = [f for f in fonts if f['Subtype'] != 'Type0'] - t0_fonts = [f for f in fonts if f['Subtype'] == 'Type0'] - references_to_drop = tuple(f['Reference'] for f in fonts if f is not base_font and f is not t0_font) - if fonts_are_identical(descendant_fonts): - return t0_font, base_font, references_to_drop - cmaps = list(filter(None, (f['ToUnicode'] for f in t0_fonts))) - if cmaps: - t0_font['ToUnicode'] = as_bytes(merge_cmaps(cmaps)) - base_font['sfnt'] = merge_truetype_fonts_for_pdf(tuple(f['sfnt'] for f in descendant_fonts), log) - arrays = tuple(filter(None, (f['W'] for f in descendant_fonts))) - if arrays: - base_font['W'] = merge_w_arrays_directly(arrays) - arrays = tuple(filter(None, (f['W2'] for f in descendant_fonts))) - if arrays: - base_font['W2'] = merge_w_arrays_directly(arrays) - return t0_font, base_font, references_to_drop + total_size = sum(len(f['Data']) for f in descendant_fonts) + merged_sfnt = merge_truetype_fonts_for_pdf(tuple(f['sfnt'] for f in descendant_fonts), log) + w_arrays = tuple(filter(None, (f['W'] for f in descendant_fonts))) + glyph_ids = all_glyph_ids_in_w_arrays(w_arrays, as_set=True) + h_arrays = tuple(filter(None, (f['W2'] for f in descendant_fonts))) + glyph_ids |= all_glyph_ids_in_w_arrays(h_arrays, as_set=True) + pdf_subset(merged_sfnt, glyph_ids) + font_data = merged_sfnt()[0] + log(f'Merged {len(fonts)} instances of {fonts[0]["BaseFont"]} reducing size from {human_readable(total_size)} to {human_readable(len(font_data))}') + return font_data, tuple(f['Reference'] for f in descendant_fonts) def merge_fonts(pdf_doc, log): @@ -1005,18 +824,10 @@ def merge_fonts(pdf_doc, log): for f in all_fonts: base_font_map.setdefault(f['BaseFont'], []).append(f) - replacements = {} - items = [] for name, fonts in 
iteritems(base_font_map): if mergeable(fonts): - t0_font, base_font, references_to_drop = merge_font(fonts, log) - for ref in references_to_drop: - replacements[ref] = t0_font['Reference'] - data = base_font['sfnt']()[0] - items.append(( - base_font['Reference'], t0_font['Reference'], base_font['W'] or [], base_font['W2'] or [], - data, t0_font['ToUnicode'] or b'')) - pdf_doc.merge_fonts(tuple(items), replacements) + font_data, references = merge_font_files(fonts, log) + pdf_doc.merge_fonts(font_data, references) def test_merge_fonts(): @@ -1024,28 +835,11 @@ def test_merge_fonts(): podofo = get_podofo() pdf_doc = podofo.PDFDoc() pdf_doc.open(path) - merge_fonts(pdf_doc) + from calibre.utils.logging import default_log + merge_fonts(pdf_doc, default_log) out = path.rpartition('.')[0] + '-merged.pdf' pdf_doc.save(out) print('Merged PDF written to', out) - - -def subset_fonts(pdf_doc, log): - all_fonts = pdf_doc.list_fonts(True) - for font in all_fonts: - if font['Subtype'] != 'Type0' and font['Data']: - try: - sfnt = Sfnt(font['Data']) - except UnsupportedFont: - continue - if b'glyf' not in sfnt: - continue - num, gen = font['Reference'] - glyphs = all_glyph_ids_in_w_arrays((font['W'] or (), font['W2'] or ()), as_set=True) - pdf_subset(sfnt, glyphs) - data = sfnt()[0] - log('Subset embedded font from: {} to {}'.format(human_readable(len(font['Data'])), human_readable(len(data)))) - pdf_doc.replace_font_data(data, num, gen) # }}} @@ -1346,11 +1140,6 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co if num_removed: log('Removed', num_removed, 'unused fonts') - # Originally added because of https://bugreports.qt.io/browse/QTBUG-88976 - # however even after that fix, calibre's font subsetting is superior to - # harfbuzz, so continue to use it. 
- subset_fonts(pdf_doc, log) - num_removed = pdf_doc.dedup_images() if num_removed: log('Removed', num_removed, 'duplicate images') diff --git a/src/calibre/ebooks/pdf/test_html_writer.py b/src/calibre/ebooks/pdf/test_html_writer.py deleted file mode 100644 index 251dd7f5d9..0000000000 --- a/src/calibre/ebooks/pdf/test_html_writer.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python -# vim:fileencoding=utf-8 -# License: GPL v3 Copyright: 2019, Kovid Goyal - - -import unittest -from .html_writer import merge_w_arrays, merge_cmaps - - -class TestPDFWriter(unittest.TestCase): - - maxDiff = None - - def test_merge_w_arrays(self): - self.assertEqual(merge_w_arrays(( # merge neighbor arrays - [1, 3, 0.1], [3, [0.1, 0.2]])), [1, 3, 0.1, 4, 4, 0.2]) - self.assertEqual(merge_w_arrays(( # merge neighbor ranges - [1, 5, 0.1], [6, 8, 0.1])), [1, 8, 0.1]) - self.assertEqual(merge_w_arrays(( # merge neighbor ranges - [1, 5, 0.1], [6, 8, 0.2])), [1, 5, 0.1, 6, 8, 0.2]) - - self.assertEqual(merge_w_arrays(( # disjoin overlap - [1, 4, 0.1], [3, [0.1, 0.1, 0.2, 0.3]])), [1, 4, 0.1, 5, [0.2, 0.3]]) - self.assertEqual(merge_w_arrays(( # disjoin overlap - [1, [0.1, 0.2]], [2, 4, 0.2])), [1, [0.1, 0.2], 3, 4, 0.2]) - - self.assertEqual(merge_w_arrays(( # split overlapping arrays - [1, [0.1, 0.2, 0.3]], [3, 5, 0.3])), [1, [0.1, 0.2, 0.3], 4, 5, 0.3]) - self.assertEqual(merge_w_arrays(( # merge overlapping ranges, using first width - [1, 5, 0.1], [2, 4, 0.2])), [1, 5, 0.1]) - self.assertEqual(merge_w_arrays(( # merge overlapping arrays - [1, [0.1, 0.1]], [3, [0.2, 0.2]])), [1, [0.1, 0.1, 0.2, 0.2]]) - - self.assertEqual(merge_w_arrays(( - [1, 10, 99, 20, [1, 2, 3, 4]], - [3, 10, 99, 11, 13, 77, 19, [77, 1]])), - [1, 10, 99, 11, 13, 77, 19, [77, 1, 2, 3, 4]] - ) - - def test_merge_cmaps(self): - roundtrip = '/CIDInit /ProcSet findresource begin\n12 dict begin\nbegincmap\n/CIDSystemInfo\n<< /Registry (Adobe)\n/Ordering (UCS)\n/Supplement 0\n>> def\n/CMapName /Adobe-Identity-UCS 
def\n/CMapType 2 def\n1 begincodespacerange\n<0000> \nendcodespacerange\n12 beginbfchar\n<0003> <0020>\n<000F> <002C>\n<0011> <002E>\n<0013> <0030>\n<001A> <0037>\n<002C> <0049>\n<002E> <004B>\n<0030> <004D>\n<003D> <005A>\n<0070> <201C>\n<007B> <00A0>\n<01AC> \nendbfchar\n9 beginbfrange\n<000B> <000C> <0028>\n<0015> <0016> <0032>\n<0024> <0028> <0041>\n<0032> <0033> <004F>\n<0036> <0038> <0053>\n<003A> <003B> <0057>\n<0044> <004C> <0061>\n<004E> <0053> <006B>\n<0055> <005C> <0072>\nendbfrange\nendcmap\nCMapName currentdict /CMap defineresource pop\nend\nend' # noqa - self.assertEqual(roundtrip, merge_cmaps((roundtrip,))) - self.assertEqual(roundtrip, merge_cmaps((roundtrip, roundtrip))) - res = merge_cmaps(( - 'a\nbegincmap\nb\n1 begincodespacerange\n<0010> <00FF>\nendcodespacerange\n' - '1 beginbfchar\n<0001> <0020>\nendbfchar\n1 beginbfrange\n<0002> <000a> <00021>\nendbfrange\nendcmap\nc', - 'x\nbegincmap\ny\n1 begincodespacerange\n<0001> <0100>\nendcodespacerange\n' - '1 beginbfchar\n<0011> <0040>\nendbfchar\n1 beginbfrange\n<0012> <001a> <00051>\nendbfrange\nendcmap\nz' - )) - self.assertEqual( - 'a\nbegincmap\nb\n1 begincodespacerange\n<0001> <0100>\nendcodespacerange\n' - '2 beginbfchar\n<0001> <0020>\n<0011> <0040>\nendbfchar\n' - '2 beginbfrange\n<0002> <000A> <0021>\n<0012> <001A> <0051>\nendbfrange\nendcmap\nc', - res) - - -def find_tests(): - return unittest.defaultTestLoader.loadTestsFromTestCase(TestPDFWriter) diff --git a/src/calibre/utils/podofo/fonts.cpp b/src/calibre/utils/podofo/fonts.cpp index 6ff706d55f..b170013738 100644 --- a/src/calibre/utils/podofo/fonts.cpp +++ b/src/calibre/utils/podofo/fonts.cpp @@ -325,63 +325,36 @@ replace_font_data(PDFDoc *self, PyObject *args) { PyObject* merge_fonts(PDFDoc *self, PyObject *args) { - PyObject *items, *replacements; - if (!PyArg_ParseTuple(args, "O!O!", &PyTuple_Type, &items, &PyDict_Type, &replacements)) return NULL; - std::unordered_map ref_map; + const char *data; Py_ssize_t sz; + PyObject 
*references; + if (!PyArg_ParseTuple(args, "y#O!", &data, &sz, &PyTuple_Type, &references)) return NULL; PdfVecObjects &objects = self->doc->GetObjects(); - PyObject *key, *value; - Py_ssize_t pos = 0; - size_t c = 0; - while (PyDict_Next(replacements, &pos, &key, &value)) { - c++; - unsigned long num, gen; - if (!PyArg_ParseTuple(key, "kk", &num, &gen)) return NULL; - uint64_t k = ref_as_integer(static_cast(num), static_cast(gen)); - PdfReference ref(num, static_cast(gen)); - PdfObject *font = objects.GetObject(ref); - if (font) remove_font(objects, font); - if (!PyArg_ParseTuple(value, "kk", &num, &gen)) return NULL; - uint64_t v = ref_as_integer(num, static_cast(gen)); - ref_map[k] = v; - } - if (c > 0) replace_font_references(self, ref_map); - - for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(items); i++) { - long num, gen, t0num, t0gen; - PyObject *W, *W2; - const char *data, *tounicode_data; - Py_ssize_t sz, tounicode_sz; - if (!PyArg_ParseTuple(PyTuple_GET_ITEM(items, i), "(ll)(ll)O!O!s#s#", &num, &gen, &t0num, &t0gen, &PyList_Type, &W, &PyList_Type, &W2, &data, &sz, &tounicode_data, &tounicode_sz)) return NULL; - PdfReference ref(num, static_cast(gen)); - PdfObject *font = objects.GetObject(ref); - if (font) { - if (PyObject_IsTrue(W)) { - PdfArray w; - convert_w_array(W, w); - font->GetDictionary().AddKey("W", w); - } - if (PyObject_IsTrue(W2)) { - PdfArray w; - convert_w_array(W2, w); - font->GetDictionary().AddKey("W2", w); - } - const PdfObject *descriptor = font->GetIndirectKey("FontDescriptor"); - if (descriptor) { - PdfObject *ff = get_font_file(descriptor); - PdfStream *stream = ff->GetStream(); - stream->Set(data, sz); - } - } - if (tounicode_sz) { - PdfObject *t0font = objects.GetObject(PdfReference(t0num, static_cast(t0gen))); - if (t0font) { - PdfObject *s = t0font->GetIndirectKey("ToUnicode"); - if (!s) { PyErr_SetString(PyExc_ValueError, "Type0 font has no ToUnicode stream"); return NULL; } - s->GetStream()->Set(tounicode_data, tounicode_sz); - } 
- } - } - Py_RETURN_NONE; + PdfObject *font_file = NULL; + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(references); i++) { + unsigned long num, gen; + if (!PyArg_ParseTuple(PyTuple_GET_ITEM(references, i), "kk", &num, &gen)) return NULL; + PdfObject *font = objects.GetObject(PdfReference(num, static_cast(gen))); + if (!font) { PyErr_SetString(PyExc_KeyError, "No font with the specified reference found"); return NULL; } + PdfObject *dobj = font->GetIndirectKey("FontDescriptor"); + if (!dobj) { PyErr_SetString(PyExc_ValueError, "Font does not have a descriptor"); return NULL; } + if (!dobj->IsDictionary()) { PyErr_SetString(PyExc_ValueError, "Font does not have a dictionary descriptor"); return NULL; } + PdfDictionary &descriptor = dobj->GetDictionary(); + const char *font_file_key = NULL; + if (descriptor.HasKey("FontFile")) font_file_key = "FontFile"; + else if (descriptor.HasKey("FontFile2")) font_file_key = "FontFile2"; + else if (descriptor.HasKey("FontFile3")) font_file_key = "FontFile3"; + else { PyErr_SetString(PyExc_ValueError, "Font descriptor does not have file data"); return NULL; } + PdfObject *ff = dobj->GetIndirectKey(font_file_key); + if (i == 0) { + font_file = ff; + PdfStream *stream = ff->GetStream(); + stream->Set(data, sz); + } else { + delete objects.RemoveObject(ff->Reference()); + descriptor.AddKey(font_file_key, font_file->Reference()); + } + } + Py_RETURN_NONE; } class CharProc {