PDF Output: Yet another attempt at working around chromium's broken TTF embedding. Fixes #1907849 [conversion to pdf results in bizarre kerning](https://bugs.launchpad.net/calibre/+bug/1907849)

This has the advantage of making the merging code a lot simpler as well.
This commit is contained in:
Kovid Goyal 2020-12-12 13:41:48 +05:30
parent 22f200fd10
commit e430bf3312
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 54 additions and 352 deletions

View File

@ -116,8 +116,6 @@ def find_tests(which_tests=None, exclude_tests=None):
a(find_tests()) a(find_tests())
from calibre.ebooks.metadata.html import find_tests from calibre.ebooks.metadata.html import find_tests
a(find_tests()) a(find_tests())
from calibre.ebooks.pdf.test_html_writer import find_tests
a(find_tests())
from calibre.utils.xml_parse import find_tests from calibre.utils.xml_parse import find_tests
a(find_tests()) a(find_tests())
from calibre.gui2.viewer.annotations import find_tests from calibre.gui2.viewer.annotations import find_tests

View File

@ -8,22 +8,19 @@
import copy import copy
import json import json
import os import os
import re
import signal import signal
import sys import sys
from collections import namedtuple from collections import namedtuple
from html5_parser import parse
from io import BytesIO from io import BytesIO
from itertools import count, repeat from itertools import count, repeat
from operator import attrgetter, itemgetter
from html5_parser import parse
from PyQt5.Qt import ( from PyQt5.Qt import (
QApplication, QMarginsF, QObject, QPageLayout, Qt, QTimer, QUrl, pyqtSignal QApplication, QMarginsF, QObject, QPageLayout, Qt, QTimer, QUrl, pyqtSignal
) )
from PyQt5.QtWebEngineCore import QWebEngineUrlRequestInterceptor from PyQt5.QtWebEngineCore import QWebEngineUrlRequestInterceptor
from PyQt5.QtWebEngineWidgets import QWebEnginePage, QWebEngineProfile from PyQt5.QtWebEngineWidgets import QWebEnginePage, QWebEngineProfile
from calibre import detect_ncpus, prepare_string_for_xml, human_readable from calibre import detect_ncpus, human_readable, prepare_string_for_xml
from calibre.constants import __version__, iswindows from calibre.constants import __version__, iswindows
from calibre.ebooks.metadata.xmp import metadata_to_xmp_packet from calibre.ebooks.metadata.xmp import metadata_to_xmp_packet
from calibre.ebooks.oeb.base import XHTML, XPath from calibre.ebooks.oeb.base import XHTML, XPath
@ -45,9 +42,7 @@ from calibre.utils.podofo import (
dedup_type3_fonts, get_podofo, remove_unused_fonts, set_metadata_implementation dedup_type3_fonts, get_podofo, remove_unused_fonts, set_metadata_implementation
) )
from calibre.utils.short_uuid import uuid4 from calibre.utils.short_uuid import uuid4
from polyglot.builtins import ( from polyglot.builtins import filter, iteritems, map, range, unicode_type
as_bytes, as_unicode, filter, iteritems, map, range, unicode_type
)
from polyglot.urllib import urlparse from polyglot.urllib import urlparse
OK, KILL_SIGNAL = range(0, 2) OK, KILL_SIGNAL = range(0, 2)
@ -769,182 +764,6 @@ def all_glyph_ids_in_w_arrays(arrays, as_set=False):
return ans if as_set else sorted(ans) return ans if as_set else sorted(ans)
def merge_w_arrays(arrays):
    """Merge multiple PDF /W (glyph width) arrays into a single W array.

    Every input array is first decomposed into Range objects (one per
    glyph run), the ranges are sorted and coalesced pairwise via
    Range.merge(), and the survivors are re-serialized into W-array form.
    Returns a plain list suitable for use as a /W value.
    """
    parsed = []
    for w in arrays:
        pos = 0
        while pos + 1 < len(w):
            first, second = w[pos], w[pos + 1]
            if isinstance(second, list):
                # "start [w1 w2 ...]" form: one explicit width per glyph
                parsed.append(Range(first, first + len(second) - 1, second))
                pos += 2
            else:
                if pos + 2 >= len(w):
                    # truncated trailing entry: a range with no width value
                    break
                # "start end width" form: a uniform-width run
                parsed.append(Range(first, second, [w[pos + 2]]))
                pos += 3
    parsed.sort(key=attrgetter('sort_order'))
    if not parsed:
        return []
    merged = [parsed[0]]
    for rng in parsed[1:]:
        # merge() folds rng into the previous range where possible and
        # returns any non-overlapping remainder
        spill = merged[-1].merge(rng)
        if spill is not None:
            merged.append(spill)
    result = []
    for rng in merged:
        result.extend(rng.as_item)
    return result
def width_map_from_w_array(w):
    """Expand a PDF /W array into a {glyph_id: width} mapping.

    Handles both encodings a W array can contain: "start [w1 w2 ...]"
    (an explicit width per glyph) and "start end width" (a uniform run).
    A final range whose width value is missing (truncated array) maps
    its glyphs to width 0.
    """
    widths = {}
    pos = 0
    while pos + 1 < len(w):
        first, second = w[pos], w[pos + 1]
        if isinstance(second, list):
            # per-glyph list form
            for offset, width in enumerate(second):
                widths[first + offset] = width
            pos += 2
        else:
            # uniform range form; tolerate a truncated final entry
            width = w[pos + 2] if pos + 2 < len(w) else 0
            for gid in range(first, second + 1):
                widths[gid] = width
            pos += 3
    return widths
def merge_w_arrays_directly(arrays):
    """Merge W arrays by flattening them to per-glyph width maps.

    Unlike merge_w_arrays() on the raw arrays, this first expands each
    array into a {glyph_id: width} dict; where the same glyph occurs in
    several arrays the largest width wins. The resulting per-glyph
    (gid, gid, width) triples are then recompressed via merge_w_arrays().
    """
    maps = tuple(width_map_from_w_array(a) for a in arrays)
    seen = set()
    for m in maps:
        seen.update(m)
    flat = []
    for gid in sorted(seen):
        # the widest claim across all source fonts wins
        flat.extend((gid, gid, max(m.get(gid, 0) for m in maps)))
    return merge_w_arrays((flat,))
class CMap(object):
    """Accumulator used when merging the ToUnicode CMaps of several fonts.

    Codespace bounds, bfchar pairs and bfrange triples are collected
    across all source CMaps, then serialize() renders them back into
    CMap syntax between a stored header and footer.
    """

    def __init__(self):
        # widest codespace seen so far, widened by add_codespace()
        self.start_codespace = sys.maxsize
        self.end_codespace = 0
        self.ranges = set()  # (start, end, unicode_start) bfrange triples
        self.chars = set()   # (code, unicode) bfchar pairs
        self.header = self.footer = None

    def add_codespace(self, start, end):
        """Widen the merged codespace to cover [start, end]."""
        if start < self.start_codespace:
            self.start_codespace = start
        if end > self.end_codespace:
            self.end_codespace = end

    def serialize(self):
        """Render the accumulated data as CMap text between header and footer."""

        def ashex(x):
            # uppercase hex, zero-padded to a multiple of 4 digits
            h = '{:04X}'.format(x)
            rem = len(h) % 4
            return ('0' * (4 - rem)) + h if rem else h

        out = [
            '1 begincodespacerange',
            '<{}> <{}>'.format(ashex(self.start_codespace), ashex(self.end_codespace)),
            'endcodespacerange',
        ]
        # CMap begin…/end… sections are limited to 100 entries each
        pending = sorted(self.chars, key=itemgetter(0))
        while pending:
            batch, pending = pending[:100], pending[100:]
            out.append('{} beginbfchar'.format(len(batch)))
            out.extend('<{}> <{}>'.format(ashex(a), ashex(b)) for a, b in batch)
            out.append('endbfchar')
        pending = sorted(self.ranges, key=itemgetter(0))
        while pending:
            batch, pending = pending[:100], pending[100:]
            out.append('{} beginbfrange'.format(len(batch)))
            out.extend('<{}> <{}> <{}>'.format(ashex(a), ashex(b), ashex(u)) for a, b, u in batch)
            out.append('endbfrange')
        return self.header + '\n' + '\n'.join(out) + '\n' + self.footer
def merge_cmaps(cmaps):
    # Merge several ToUnicode CMap streams (bytes) into one serialized
    # CMap string. Each stream is parsed with a line-oriented state
    # machine; codespace/bfchar/bfrange data is accumulated into a CMap
    # object, while the header and footer of the FIRST stream are reused
    # verbatim for the merged output.
    header, incmap, incodespace, inchar, inrange, footer = 'header cmap codespace char range footer'.split()
    # matches section openers such as "12 beginbfchar"
    start_pat = re.compile(r'\d+\s+begin(codespacerange|bfrange|bfchar)')
    ans = CMap()
    for cmap in cmaps:
        state = header
        headerlines = []
        footerlines = []
        # becomes True once the first data section starts; lines seen in
        # the cmap body before that still belong to the header
        prefix_ended = False
        for line in as_unicode(cmap, errors='replace').splitlines():
            line = line.strip()
            if state is header:
                headerlines.append(line)
                if line == 'begincmap':
                    state = incmap
                continue
            if state is incmap:
                if line == 'endcmap':
                    state = footer
                    footerlines.append(line)
                    continue
                m = start_pat.match(line)
                if m is not None:
                    state = incodespace if m.group(1) == 'codespacerange' else (inchar if m.group(1) == 'bfchar' else inrange)
                    prefix_ended = True
                    continue
                if not prefix_ended:
                    headerlines.append(line)
                continue
            if state is incodespace:
                if line == 'endcodespacerange':
                    state = incmap
                else:
                    # "<XXXX> <YYYY>" — widen the merged codespace
                    s, e = line.split()
                    s = int(s[1:-1], 16)
                    e = int(e[1:-1], 16)
                    ans.add_codespace(s, e)
                continue
            if state is inchar:
                if line == 'endbfchar':
                    state = incmap
                else:
                    # "<code> <unicode>" single-character mapping
                    a, b = line.split()
                    a = int(a[1:-1], 16)
                    b = int(b[1:-1], 16)
                    ans.chars.add((a, b))
                continue
            if state is inrange:
                if line == 'endbfrange':
                    state = incmap
                else:
                    # technically bfrange can contain arrays for the unicode
                    # value but from looking at SkPDFFont.cpp in chromium, it
                    # does not generate any
                    a, b, u = line.split()
                    a = int(a[1:-1], 16)
                    b = int(b[1:-1], 16)
                    u = int(u[1:-1], 16)
                    ans.ranges.add((a, b, u))
                continue
            if state is footer:
                footerlines.append(line)
        # keep the header/footer of the first cmap only
        if ans.header is None:
            ans.header = '\n'.join(headerlines)
            ans.footer = '\n'.join(footerlines)
    return ans.serialize()
def fonts_are_identical(fonts): def fonts_are_identical(fonts):
sentinel = object() sentinel = object()
for key in ('ToUnicode', 'Data', 'W', 'W2'): for key in ('ToUnicode', 'Data', 'W', 'W2'):
@ -957,27 +776,27 @@ def fonts_are_identical(fonts):
return True return True
def merge_font(fonts, log): def merge_font_files(fonts, log):
# As of Qt 5.15.1 Chromium has switched to harfbuzz and dropped sfntly. It
# now produces font descriptors whose W arrays don't match the glyph width
# information from the hhea table, in contravention of the PDF spec. So
# we can no longer merge font descriptors, all we can do is merge the
# actual sfnt data streams into a single stream and subset it to contain
# only the glyphs from all W arrays.
# choose the largest font as the base font # choose the largest font as the base font
fonts.sort(key=lambda f: len(f['Data'] or b''), reverse=True) fonts.sort(key=lambda f: len(f['Data'] or b''), reverse=True)
base_font = fonts[0]
t0_font = next(f for f in fonts if f['DescendantFont'] == base_font['Reference'])
descendant_fonts = [f for f in fonts if f['Subtype'] != 'Type0'] descendant_fonts = [f for f in fonts if f['Subtype'] != 'Type0']
t0_fonts = [f for f in fonts if f['Subtype'] == 'Type0'] total_size = sum(len(f['Data']) for f in descendant_fonts)
references_to_drop = tuple(f['Reference'] for f in fonts if f is not base_font and f is not t0_font) merged_sfnt = merge_truetype_fonts_for_pdf(tuple(f['sfnt'] for f in descendant_fonts), log)
if fonts_are_identical(descendant_fonts): w_arrays = tuple(filter(None, (f['W'] for f in descendant_fonts)))
return t0_font, base_font, references_to_drop glyph_ids = all_glyph_ids_in_w_arrays(w_arrays, as_set=True)
cmaps = list(filter(None, (f['ToUnicode'] for f in t0_fonts))) h_arrays = tuple(filter(None, (f['W2'] for f in descendant_fonts)))
if cmaps: glyph_ids |= all_glyph_ids_in_w_arrays(h_arrays, as_set=True)
t0_font['ToUnicode'] = as_bytes(merge_cmaps(cmaps)) pdf_subset(merged_sfnt, glyph_ids)
base_font['sfnt'] = merge_truetype_fonts_for_pdf(tuple(f['sfnt'] for f in descendant_fonts), log) font_data = merged_sfnt()[0]
arrays = tuple(filter(None, (f['W'] for f in descendant_fonts))) log(f'Merged {len(fonts)} instances of {fonts[0]["BaseFont"]} reducing size from {human_readable(total_size)} to {human_readable(len(font_data))}')
if arrays: return font_data, tuple(f['Reference'] for f in descendant_fonts)
base_font['W'] = merge_w_arrays_directly(arrays)
arrays = tuple(filter(None, (f['W2'] for f in descendant_fonts)))
if arrays:
base_font['W2'] = merge_w_arrays_directly(arrays)
return t0_font, base_font, references_to_drop
def merge_fonts(pdf_doc, log): def merge_fonts(pdf_doc, log):
@ -1005,18 +824,10 @@ def merge_fonts(pdf_doc, log):
for f in all_fonts: for f in all_fonts:
base_font_map.setdefault(f['BaseFont'], []).append(f) base_font_map.setdefault(f['BaseFont'], []).append(f)
replacements = {}
items = []
for name, fonts in iteritems(base_font_map): for name, fonts in iteritems(base_font_map):
if mergeable(fonts): if mergeable(fonts):
t0_font, base_font, references_to_drop = merge_font(fonts, log) font_data, references = merge_font_files(fonts, log)
for ref in references_to_drop: pdf_doc.merge_fonts(font_data, references)
replacements[ref] = t0_font['Reference']
data = base_font['sfnt']()[0]
items.append((
base_font['Reference'], t0_font['Reference'], base_font['W'] or [], base_font['W2'] or [],
data, t0_font['ToUnicode'] or b''))
pdf_doc.merge_fonts(tuple(items), replacements)
def test_merge_fonts(): def test_merge_fonts():
@ -1024,28 +835,11 @@ def test_merge_fonts():
podofo = get_podofo() podofo = get_podofo()
pdf_doc = podofo.PDFDoc() pdf_doc = podofo.PDFDoc()
pdf_doc.open(path) pdf_doc.open(path)
merge_fonts(pdf_doc) from calibre.utils.logging import default_log
merge_fonts(pdf_doc, default_log)
out = path.rpartition('.')[0] + '-merged.pdf' out = path.rpartition('.')[0] + '-merged.pdf'
pdf_doc.save(out) pdf_doc.save(out)
print('Merged PDF written to', out) print('Merged PDF written to', out)
def subset_fonts(pdf_doc, log):
    """Subset every embedded glyf-based, non-Type0 font in pdf_doc.

    For each font that carries raw sfnt data and a glyf table, only the
    glyphs referenced by its W/W2 arrays are kept; the shrunken font is
    written back into the PDF and the size reduction is logged. Fonts
    that cannot be parsed are left untouched.
    """
    for font in pdf_doc.list_fonts(True):
        if font['Subtype'] == 'Type0' or not font['Data']:
            continue
        try:
            sfnt = Sfnt(font['Data'])
        except UnsupportedFont:
            continue  # best-effort: skip fonts we cannot parse
        if b'glyf' not in sfnt:
            # CFF-flavoured fonts are not handled by this subsetter
            continue
        glyphs = all_glyph_ids_in_w_arrays((font['W'] or (), font['W2'] or ()), as_set=True)
        pdf_subset(sfnt, glyphs)
        data = sfnt()[0]
        log('Subset embedded font from: {} to {}'.format(
            human_readable(len(font['Data'])), human_readable(len(data))))
        num, gen = font['Reference']
        pdf_doc.replace_font_data(data, num, gen)
# }}} # }}}
@ -1346,11 +1140,6 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co
if num_removed: if num_removed:
log('Removed', num_removed, 'unused fonts') log('Removed', num_removed, 'unused fonts')
# Originally added because of https://bugreports.qt.io/browse/QTBUG-88976
# however even after that fix, calibre's font subsetting is superior to
# harfbuzz, so continue to use it.
subset_fonts(pdf_doc, log)
num_removed = pdf_doc.dedup_images() num_removed = pdf_doc.dedup_images()
if num_removed: if num_removed:
log('Removed', num_removed, 'duplicate images') log('Removed', num_removed, 'duplicate images')

View File

@ -1,58 +0,0 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
import unittest
from .html_writer import merge_w_arrays, merge_cmaps
class TestPDFWriter(unittest.TestCase):
    # Unit tests for the W-array and ToUnicode CMap merging helpers.
    maxDiff = None  # show full diffs on failure; the fixtures are long

    def test_merge_w_arrays(self):
        # Exercises merge_w_arrays() over the two W-array entry forms:
        # "start end width" ranges and "start [w1 w2 ...]" lists.
        self.assertEqual(merge_w_arrays((  # merge neighbor arrays
            [1, 3, 0.1], [3, [0.1, 0.2]])), [1, 3, 0.1, 4, 4, 0.2])
        self.assertEqual(merge_w_arrays((  # merge neighbor ranges
            [1, 5, 0.1], [6, 8, 0.1])), [1, 8, 0.1])
        self.assertEqual(merge_w_arrays((  # merge neighbor ranges
            [1, 5, 0.1], [6, 8, 0.2])), [1, 5, 0.1, 6, 8, 0.2])
        self.assertEqual(merge_w_arrays((  # disjoin overlap
            [1, 4, 0.1], [3, [0.1, 0.1, 0.2, 0.3]])), [1, 4, 0.1, 5, [0.2, 0.3]])
        self.assertEqual(merge_w_arrays((  # disjoin overlap
            [1, [0.1, 0.2]], [2, 4, 0.2])), [1, [0.1, 0.2], 3, 4, 0.2])
        self.assertEqual(merge_w_arrays((  # split overlapping arrays
            [1, [0.1, 0.2, 0.3]], [3, 5, 0.3])), [1, [0.1, 0.2, 0.3], 4, 5, 0.3])
        self.assertEqual(merge_w_arrays((  # merge overlapping ranges, using first width
            [1, 5, 0.1], [2, 4, 0.2])), [1, 5, 0.1])
        self.assertEqual(merge_w_arrays((  # merge overlapping arrays
            [1, [0.1, 0.1]], [3, [0.2, 0.2]])), [1, [0.1, 0.1, 0.2, 0.2]])
        self.assertEqual(merge_w_arrays((
            [1, 10, 99, 20, [1, 2, 3, 4]],
            [3, 10, 99, 11, 13, 77, 19, [77, 1]])),
            [1, 10, 99, 11, 13, 77, 19, [77, 1, 2, 3, 4]]
        )

    def test_merge_cmaps(self):
        # A full Adobe-Identity-UCS ToUnicode CMap must round-trip
        # unchanged, whether merged with itself or alone.
        roundtrip = '/CIDInit /ProcSet findresource begin\n12 dict begin\nbegincmap\n/CIDSystemInfo\n<< /Registry (Adobe)\n/Ordering (UCS)\n/Supplement 0\n>> def\n/CMapName /Adobe-Identity-UCS def\n/CMapType 2 def\n1 begincodespacerange\n<0000> <FFFF>\nendcodespacerange\n12 beginbfchar\n<0003> <0020>\n<000F> <002C>\n<0011> <002E>\n<0013> <0030>\n<001A> <0037>\n<002C> <0049>\n<002E> <004B>\n<0030> <004D>\n<003D> <005A>\n<0070> <201C>\n<007B> <00A0>\n<01AC> <FB01>\nendbfchar\n9 beginbfrange\n<000B> <000C> <0028>\n<0015> <0016> <0032>\n<0024> <0028> <0041>\n<0032> <0033> <004F>\n<0036> <0038> <0053>\n<003A> <003B> <0057>\n<0044> <004C> <0061>\n<004E> <0053> <006B>\n<0055> <005C> <0072>\nendbfrange\nendcmap\nCMapName currentdict /CMap defineresource pop\nend\nend'  # noqa
        self.assertEqual(roundtrip, merge_cmaps((roundtrip,)))
        self.assertEqual(roundtrip, merge_cmaps((roundtrip, roundtrip)))
        # Merging two distinct cmaps: header/footer come from the first,
        # codespace is widened, char/range sections are combined.
        res = merge_cmaps((
            'a\nbegincmap\nb\n1 begincodespacerange\n<0010> <00FF>\nendcodespacerange\n'
            '1 beginbfchar\n<0001> <0020>\nendbfchar\n1 beginbfrange\n<0002> <000a> <00021>\nendbfrange\nendcmap\nc',
            'x\nbegincmap\ny\n1 begincodespacerange\n<0001> <0100>\nendcodespacerange\n'
            '1 beginbfchar\n<0011> <0040>\nendbfchar\n1 beginbfrange\n<0012> <001a> <00051>\nendbfrange\nendcmap\nz'
        ))
        self.assertEqual(
            'a\nbegincmap\nb\n1 begincodespacerange\n<0001> <0100>\nendcodespacerange\n'
            '2 beginbfchar\n<0001> <0020>\n<0011> <0040>\nendbfchar\n'
            '2 beginbfrange\n<0002> <000A> <0021>\n<0012> <001A> <0051>\nendbfrange\nendcmap\nc',
            res)
def find_tests():
    # Entry point used by calibre's test runner to discover this module's tests.
    return unittest.defaultTestLoader.loadTestsFromTestCase(TestPDFWriter)

View File

@ -325,60 +325,33 @@ replace_font_data(PDFDoc *self, PyObject *args) {
PyObject* PyObject*
merge_fonts(PDFDoc *self, PyObject *args) { merge_fonts(PDFDoc *self, PyObject *args) {
PyObject *items, *replacements; const char *data; Py_ssize_t sz;
if (!PyArg_ParseTuple(args, "O!O!", &PyTuple_Type, &items, &PyDict_Type, &replacements)) return NULL; PyObject *references;
std::unordered_map<uint64_t, uint64_t> ref_map; if (!PyArg_ParseTuple(args, "y#O!", &data, &sz, &PyTuple_Type, &references)) return NULL;
PdfVecObjects &objects = self->doc->GetObjects(); PdfVecObjects &objects = self->doc->GetObjects();
PyObject *key, *value; PdfObject *font_file = NULL;
Py_ssize_t pos = 0; for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(references); i++) {
size_t c = 0;
while (PyDict_Next(replacements, &pos, &key, &value)) {
c++;
unsigned long num, gen; unsigned long num, gen;
if (!PyArg_ParseTuple(key, "kk", &num, &gen)) return NULL; if (!PyArg_ParseTuple(PyTuple_GET_ITEM(references, i), "kk", &num, &gen)) return NULL;
uint64_t k = ref_as_integer(static_cast<pdf_objnum>(num), static_cast<pdf_gennum>(gen)); PdfObject *font = objects.GetObject(PdfReference(num, static_cast<pdf_gennum>(gen)));
PdfReference ref(num, static_cast<pdf_gennum>(gen)); if (!font) { PyErr_SetString(PyExc_KeyError, "No font with the specified reference found"); return NULL; }
PdfObject *font = objects.GetObject(ref); PdfObject *dobj = font->GetIndirectKey("FontDescriptor");
if (font) remove_font(objects, font); if (!dobj) { PyErr_SetString(PyExc_ValueError, "Font does not have a descriptor"); return NULL; }
if (!PyArg_ParseTuple(value, "kk", &num, &gen)) return NULL; if (!dobj->IsDictionary()) { PyErr_SetString(PyExc_ValueError, "Font does not have a dictionary descriptor"); return NULL; }
uint64_t v = ref_as_integer(num, static_cast<pdf_gennum>(gen)); PdfDictionary &descriptor = dobj->GetDictionary();
ref_map[k] = v; const char *font_file_key = NULL;
} if (descriptor.HasKey("FontFile")) font_file_key = "FontFile";
if (c > 0) replace_font_references(self, ref_map); else if (descriptor.HasKey("FontFile2")) font_file_key = "FontFile2";
else if (descriptor.HasKey("FontFile3")) font_file_key = "FontFile3";
for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(items); i++) { else { PyErr_SetString(PyExc_ValueError, "Font descriptor does not have file data"); return NULL; }
long num, gen, t0num, t0gen; PdfObject *ff = dobj->GetIndirectKey(font_file_key);
PyObject *W, *W2; if (i == 0) {
const char *data, *tounicode_data; font_file = ff;
Py_ssize_t sz, tounicode_sz;
if (!PyArg_ParseTuple(PyTuple_GET_ITEM(items, i), "(ll)(ll)O!O!s#s#", &num, &gen, &t0num, &t0gen, &PyList_Type, &W, &PyList_Type, &W2, &data, &sz, &tounicode_data, &tounicode_sz)) return NULL;
PdfReference ref(num, static_cast<pdf_gennum>(gen));
PdfObject *font = objects.GetObject(ref);
if (font) {
if (PyObject_IsTrue(W)) {
PdfArray w;
convert_w_array(W, w);
font->GetDictionary().AddKey("W", w);
}
if (PyObject_IsTrue(W2)) {
PdfArray w;
convert_w_array(W2, w);
font->GetDictionary().AddKey("W2", w);
}
const PdfObject *descriptor = font->GetIndirectKey("FontDescriptor");
if (descriptor) {
PdfObject *ff = get_font_file(descriptor);
PdfStream *stream = ff->GetStream(); PdfStream *stream = ff->GetStream();
stream->Set(data, sz); stream->Set(data, sz);
} } else {
} delete objects.RemoveObject(ff->Reference());
if (tounicode_sz) { descriptor.AddKey(font_file_key, font_file->Reference());
PdfObject *t0font = objects.GetObject(PdfReference(t0num, static_cast<pdf_gennum>(t0gen)));
if (t0font) {
PdfObject *s = t0font->GetIndirectKey("ToUnicode");
if (!s) { PyErr_SetString(PyExc_ValueError, "Type0 font has no ToUnicode stream"); return NULL; }
s->GetStream()->Set(tounicode_data, tounicode_sz);
}
} }
} }
Py_RETURN_NONE; Py_RETURN_NONE;