From 0d1b99a4a4c845269af808fd61fc9cf9ebd7f125 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Jul 2019 15:36:17 +0530 Subject: [PATCH] Implement merging of cmaps --- src/calibre/ebooks/pdf/html_writer.py | 124 ++++++++++++++++++++- src/calibre/ebooks/pdf/test_html_writer.py | 20 +++- src/calibre/utils/podofo/fonts.cpp | 29 ++++- 3 files changed, 161 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/pdf/html_writer.py b/src/calibre/ebooks/pdf/html_writer.py index 20fc490735..b848ba4b40 100644 --- a/src/calibre/ebooks/pdf/html_writer.py +++ b/src/calibre/ebooks/pdf/html_writer.py @@ -8,11 +8,12 @@ from __future__ import absolute_import, division, print_function, unicode_litera import copy import json import os +import re import signal import sys from collections import namedtuple from io import BytesIO -from operator import attrgetter +from operator import attrgetter, itemgetter from PyQt5.Qt import ( QApplication, QMarginsF, QObject, QPageLayout, QTimer, QUrl, pyqtSignal @@ -38,7 +39,7 @@ from calibre.utils.podofo import ( get_podofo, remove_unused_fonts, set_metadata_implementation ) from calibre.utils.short_uuid import uuid4 -from polyglot.builtins import filter, iteritems, map, range, unicode_type +from polyglot.builtins import as_bytes, filter, iteritems, map, range, unicode_type from polyglot.urllib import urlparse OK, KILL_SIGNAL = range(0, 2) @@ -600,14 +601,125 @@ def merge_w_arrays(arrays): return ans -def merge_font(fonts): - # TODO: Check if the ToUnicode entry in the Type0 dict needs to be merged +class CMap(object): + def __init__(self): + self.start_codespace = sys.maxsize + self.end_codespace = 0 + self.ranges = set() + self.chars = set() + self.header = self.footer = None + + def add_codespace(self, start, end): + self.start_codespace = min(self.start_codespace, start) + self.end_codespace = max(self.end_codespace, end) + + def serialize(self): + chars = sorted(self.chars, key=itemgetter(0)) + + def ashex(x): + ans = 
'{:04X}'.format(x) + leftover = len(ans) % 4 + if leftover: + ans = ('0' * (4 - leftover)) + ans + return ans + + lines = ['1 begincodespacerange', '<{}> <{}>'.format(*map(ashex, (self.start_codespace, self.end_codespace))), 'endcodespacerange'] + while chars: + group, chars = chars[:100], chars[100:] + lines.append('{} beginbfchar'.format(len(group))) + for g in group: + lines.append('<{}> <{}>'.format(*map(ashex, g))) + lines.append('endbfchar') + + ranges = sorted(self.ranges, key=itemgetter(0)) + while ranges: + group, ranges = ranges[:100], ranges[100:] + lines.append('{} beginbfrange'.format(len(group))) + for g in group: + lines.append('<{}> <{}> <{}>'.format(*map(ashex, g))) + lines.append('endbfrange') + return self.header + '\n' + '\n'.join(lines) + '\n' + self.footer + + +def merge_cmaps(cmaps): + header, incmap, incodespace, inchar, inrange, footer = 'header cmap codespace char range footer'.split() + start_pat = re.compile(r'\d+\s+begin(codespacerange|bfrange|bfchar)') + ans = CMap() + for cmap in cmaps: + state = header + headerlines = [] + footerlines = [] + prefix_ended = False + for line in cmap.decode('utf-8', 'replace').splitlines(): + line = line.strip() + if state is header: + headerlines.append(line) + if line == 'begincmap': + state = incmap + continue + if state is incmap: + if line == 'endcmap': + state = footer + footerlines.append(line) + continue + m = start_pat.match(line) + if m is not None: + state = incodespace if m.group(1) == 'codespacerange' else (inchar if m.group(1) == 'bfchar' else inrange) + prefix_ended = True + continue + if not prefix_ended: + headerlines.append(line) + continue + if state is incodespace: + if line == 'endcodespacerange': + state = incmap + else: + s, e = line.split() + s = int(s[1:-1], 16) + e = int(e[1:-1], 16) + ans.add_codespace(s, e) + continue + if state is inchar: + if line == 'endbfchar': + state = incmap + else: + a, b = line.split() + a = int(a[1:-1], 16) + b = int(b[1:-1], 16) + 
ans.chars.add((a, b)) + continue + if state is inrange: + if line == 'endbfrange': + state = incmap + else: + # technically bfrange can contain arrays for the unicode + # value but from looking at SkPDFFont.cpp in chromium, it + # does not generate any + a, b, u = line.split() + a = int(a[1:-1], 16) + b = int(b[1:-1], 16) + u = int(u[1:-1], 16) + ans.ranges.add((a, b, u)) + continue + if state is footer: + footerlines.append(line) + if ans.header is None: + ans.header = '\n'.join(headerlines) + ans.footer = '\n'.join(footerlines) + return ans.serialize() + + +def merge_font(fonts): # choose the largest font as the base font fonts.sort(key=lambda f: len(f['Data'] or b''), reverse=True) base_font = fonts[0] t0_font = next(f for f in fonts if f['DescendantFont'] == base_font['Reference']) descendant_fonts = [f for f in fonts if f['Subtype'] != 'Type0'] + t0_fonts = [f for f in fonts if f['Subtype'] == 'Type0'] + cmaps = list(filter(None, (f['ToUnicode'] for f in t0_fonts))) + if cmaps: + t0_font['ToUnicode'] = as_bytes(merge_cmaps(cmaps)) for key in ('W', 'W2'): arrays = tuple(filter(None, (f[key] for f in descendant_fonts))) base_font[key] = merge_w_arrays(arrays) @@ -650,7 +762,9 @@ def merge_fonts(pdf_doc): for ref in references_to_drop: replacements[ref] = t0_font['Reference'] data = base_font['sfnt']()[0] - items.append((base_font['Reference'], base_font['W'] or [], base_font['W2'] or [], data)) + items.append(( + base_font['Reference'], t0_font['Reference'], base_font['W'] or [], base_font['W2'] or [], + data, t0_font['ToUnicode'] or b'')) pdf_doc.merge_fonts(tuple(items), replacements) diff --git a/src/calibre/ebooks/pdf/test_html_writer.py b/src/calibre/ebooks/pdf/test_html_writer.py index ec8adf3f2f..0cebe5a9ee 100644 --- a/src/calibre/ebooks/pdf/test_html_writer.py +++ b/src/calibre/ebooks/pdf/test_html_writer.py @@ -5,11 +5,13 @@ from __future__ import absolute_import, division, print_function, unicode_literals import unittest -from .html_writer import 
merge_w_arrays +from .html_writer import merge_w_arrays, merge_cmaps class TestPDFWriter(unittest.TestCase): + maxDiff = None + def test_merge_w_arrays(self): self.assertEqual(merge_w_arrays(( # merge neighbor arrays [1, 3, 0.1], [3, [0.1, 0.2]])), [1, 3, 0.1, 4, 4, 0.2]) @@ -36,6 +38,22 @@ class TestPDFWriter(unittest.TestCase): [1, 10, 99, 11, 13, 77, 19, [77, 1, 2, 3, 4]] ) + def test_merge_cmaps(self): + roundtrip = '/CIDInit /ProcSet findresource begin\n12 dict begin\nbegincmap\n/CIDSystemInfo\n<< /Registry (Adobe)\n/Ordering (UCS)\n/Supplement 0\n>> def\n/CMapName /Adobe-Identity-UCS def\n/CMapType 2 def\n1 begincodespacerange\n<0000> \nendcodespacerange\n12 beginbfchar\n<0003> <0020>\n<000F> <002C>\n<0011> <002E>\n<0013> <0030>\n<001A> <0037>\n<002C> <0049>\n<002E> <004B>\n<0030> <004D>\n<003D> <005A>\n<0070> <201C>\n<007B> <00A0>\n<01AC> \nendbfchar\n9 beginbfrange\n<000B> <000C> <0028>\n<0015> <0016> <0032>\n<0024> <0028> <0041>\n<0032> <0033> <004F>\n<0036> <0038> <0053>\n<003A> <003B> <0057>\n<0044> <004C> <0061>\n<004E> <0053> <006B>\n<0055> <005C> <0072>\nendbfrange\nendcmap\nCMapName currentdict /CMap defineresource pop\nend\nend' # noqa + self.assertEqual(roundtrip, merge_cmaps((roundtrip,))) + self.assertEqual(roundtrip, merge_cmaps((roundtrip, roundtrip))) + res = merge_cmaps(( + 'a\nbegincmap\nb\n1 begincodespacerange\n<0010> <00FF>\nendcodespacerange\n' + '1 beginbfchar\n<0001> <0020>\nendbfchar\n1 beginbfrange\n<0002> <000a> <00021>\nendbfrange\nendcmap\nc', + 'x\nbegincmap\ny\n1 begincodespacerange\n<0001> <0100>\nendcodespacerange\n' + '1 beginbfchar\n<0011> <0040>\nendbfchar\n1 beginbfrange\n<0012> <001a> <00051>\nendbfrange\nendcmap\nz' + )) + self.assertEqual( + 'a\nbegincmap\nb\n1 begincodespacerange\n<0001> <0100>\nendcodespacerange\n' + '2 beginbfchar\n<0001> <0020>\n<0011> <0040>\nendbfchar\n' + '2 beginbfrange\n<0002> <000A> <0021>\n<0012> <001A> <0051>\nendbfrange\nendcmap\nc', + res) + def find_tests(): return 
unittest.defaultTestLoader.loadTestsFromTestCase(TestPDFWriter) diff --git a/src/calibre/utils/podofo/fonts.cpp b/src/calibre/utils/podofo/fonts.cpp index fb629b10f9..886f93931c 100644 --- a/src/calibre/utils/podofo/fonts.cpp +++ b/src/calibre/utils/podofo/fonts.cpp @@ -173,7 +173,7 @@ list_fonts(PDFDoc *self, PyObject *args) { unsigned long num = ref.ObjectNumber(), generation = ref.GenerationNumber(); const PdfObject *descriptor = (*it)->GetIndirectKey("FontDescriptor"); pyunique_ptr descendant_font, stream_ref, encoding, w, w2; - PyBytesOutputStream stream_data; + PyBytesOutputStream stream_data, to_unicode; if (dict.HasKey("W")) { w.reset(convert_w_array(dict.GetKey("W")->GetArray())); if (!w) return NULL; @@ -200,10 +200,18 @@ list_fonts(PDFDoc *self, PyObject *args) { const PdfArray &df = dict.GetKey("DescendantFonts")->GetArray(); descendant_font.reset(ref_as_tuple(df[0].GetReference())); if (!descendant_font) return NULL; + if (get_font_data && dict.HasKey("ToUnicode")) { + const PdfReference &uref = dict.GetKey("ToUnicode")->GetReference(); + PdfObject *t = objects.GetObject(uref); + if (t) { + PdfStream *stream = t->GetStream(); + if (stream) stream->GetFilteredCopy(&to_unicode); + } + } } #define V(x) (x ? 
x.get() : Py_None) pyunique_ptr d(Py_BuildValue( - "{ss ss s(kk) sO sO sO sO sO sO}", + "{ss ss s(kk) sO sO sO sO sO sO sO}", "BaseFont", name.c_str(), "Subtype", subtype.c_str(), "Reference", num, generation, @@ -211,6 +219,7 @@ list_fonts(PDFDoc *self, PyObject *args) { "DescendantFont", V(descendant_font), "StreamRef", V(stream_ref), "Encoding", V(encoding), + "ToUnicode", V(to_unicode), "W", V(w), "W2", V(w2) )); #undef V @@ -282,11 +291,11 @@ merge_fonts(PDFDoc *self, PyObject *args) { if (c > 0) replace_font_references(self, ref_map); for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(items); i++) { - long num, gen; + long num, gen, t0num, t0gen; PyObject *W, *W2; - const char *data; - Py_ssize_t sz; - if (!PyArg_ParseTuple(PyTuple_GET_ITEM(items, i), "(ll)O!O!s#", &num, &gen, &PyList_Type, &W, &PyList_Type, &W2, &data, &sz)) return NULL; + const char *data, *tounicode_data; + Py_ssize_t sz, tounicode_sz; + if (!PyArg_ParseTuple(PyTuple_GET_ITEM(items, i), "(ll)(ll)O!O!s#s#", &num, &gen, &t0num, &t0gen, &PyList_Type, &W, &PyList_Type, &W2, &data, &sz, &tounicode_data, &tounicode_sz)) return NULL; PdfReference ref(num, gen); PdfObject *font = objects.GetObject(ref); if (font) { @@ -307,6 +316,14 @@ merge_fonts(PDFDoc *self, PyObject *args) { stream->Set(data, sz); } } + if (tounicode_sz) { + PdfObject *t0font = objects.GetObject(PdfReference(t0num, t0gen)); + if (t0font) { + PdfObject *s = t0font->GetIndirectKey("ToUnicode"); + if (!s) { PyErr_SetString(PyExc_ValueError, "Type0 font has no ToUnicode stream"); return NULL; } + s->GetStream()->Set(tounicode_data, tounicode_sz); + } + } } Py_RETURN_NONE; }