From 0d1b99a4a4c845269af808fd61fc9cf9ebd7f125 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 23 Jul 2019 15:36:17 +0530
Subject: [PATCH] Implement merging of cmaps

---
 src/calibre/ebooks/pdf/html_writer.py      | 125 ++++++++++++++++++++-
 src/calibre/ebooks/pdf/test_html_writer.py |  20 +++-
 src/calibre/utils/podofo/fonts.cpp         |  29 ++++-
 3 files changed, 162 insertions(+), 12 deletions(-)

diff --git a/src/calibre/ebooks/pdf/html_writer.py b/src/calibre/ebooks/pdf/html_writer.py
index 20fc490735..b848ba4b40 100644
--- a/src/calibre/ebooks/pdf/html_writer.py
+++ b/src/calibre/ebooks/pdf/html_writer.py
@@ -8,11 +8,12 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import copy
 import json
 import os
+import re
 import signal
 import sys
 from collections import namedtuple
 from io import BytesIO
-from operator import attrgetter
+from operator import attrgetter, itemgetter
 
 from PyQt5.Qt import (
     QApplication, QMarginsF, QObject, QPageLayout, QTimer, QUrl, pyqtSignal
@@ -38,7 +39,7 @@ from calibre.utils.podofo import (
     get_podofo, remove_unused_fonts, set_metadata_implementation
 )
 from calibre.utils.short_uuid import uuid4
-from polyglot.builtins import filter, iteritems, map, range, unicode_type
+from polyglot.builtins import as_bytes, filter, iteritems, map, range, unicode_type
 from polyglot.urllib import urlparse
 
 OK, KILL_SIGNAL = range(0, 2)
@@ -600,14 +601,126 @@ def merge_w_arrays(arrays):
     return ans
 
 
-def merge_font(fonts):
-    # TODO: Check if the ToUnicode entry in the Type0 dict needs to be merged
+class CMap(object):  # accumulates the codespace, bfchar and bfrange entries of one or more cmaps
 
+    def __init__(self):
+        self.start_codespace = sys.maxsize  # lowest codespace start seen so far
+        self.end_codespace = 0  # highest codespace end seen so far
+        self.ranges = set()  # (first, last, unicode) int tuples taken from bfrange sections
+        self.chars = set()  # (code, unicode) int tuples taken from bfchar sections
+        self.header = self.footer = None  # text placed before/after the generated sections
+
+    def add_codespace(self, start, end):  # widen the stored codespace to also cover start..end
+        self.start_codespace = min(self.start_codespace, start)
+        self.end_codespace = max(self.end_codespace, end)
+
+    def serialize(self):  # returns header + codespace/bfchar/bfrange sections + footer as text
+        chars = sorted(self.chars, key=itemgetter(0))
+
+        def ashex(x):  # uppercase hex, zero padded to a multiple of four digits
+            ans = '{:04X}'.format(x)
+            leftover = len(ans) % 4
+            if leftover:
+                ans = ('0' * (4 - leftover)) + ans
+            return ans
+
+        lines = ['1 begincodespacerange', '<{}> <{}>'.format(*map(ashex, (self.start_codespace, self.end_codespace))), 'endcodespacerange']
+        while chars:
+            # sections hold at most 100 entries; the slice below already consumes them, so no extra del
+            group, chars = chars[:100], chars[100:]
+            lines.append('{} beginbfchar'.format(len(group)))
+            for g in group:
+                lines.append('<{}> <{}>'.format(*map(ashex, g)))
+            lines.append('endbfchar')
+
+        ranges = sorted(self.ranges, key=itemgetter(0))
+        while ranges:
+            group, ranges = ranges[:100], ranges[100:]
+            lines.append('{} beginbfrange'.format(len(group)))
+            for g in group:
+                lines.append('<{}> <{}> <{}>'.format(*map(ashex, g)))
+            lines.append('endbfrange')
+        return self.header + '\n' + '\n'.join(lines) + '\n' + self.footer
+
+
+def merge_cmaps(cmaps):  # merge several encoded cmaps (each is .decode()d) into one serialized cmap
+    header, incmap, incodespace, inchar, inrange, footer = 'header cmap codespace char range footer'.split()
+    start_pat = re.compile(r'\d+\s+begin(codespacerange|bfrange|bfchar)')
+    ans = CMap()
+    for cmap in cmaps:
+        state = header  # parser states are the strings unpacked above, compared by identity
+        headerlines = []
+        footerlines = []
+        prefix_ended = False  # becomes True once the first begin* section has been seen
+        for line in cmap.decode('utf-8', 'replace').splitlines():
+            line = line.strip()
+            if state is header:
+                headerlines.append(line)
+                if line == 'begincmap':
+                    state = incmap
+                continue
+            if state is incmap:
+                if line == 'endcmap':
+                    state = footer
+                    footerlines.append(line)
+                    continue
+                m = start_pat.match(line)
+                if m is not None:
+                    state = incodespace if m.group(1) == 'codespacerange' else (inchar if m.group(1) == 'bfchar' else inrange)
+                    prefix_ended = True
+                    continue
+                if not prefix_ended:  # lines between begincmap and the first section stay in the header
+                    headerlines.append(line)
+                continue
+            if state is incodespace:
+                if line == 'endcodespacerange':
+                    state = incmap
+                else:
+                    s, e = line.split()
+                    s = int(s[1:-1], 16)  # [1:-1] drops the enclosing < and >
+                    e = int(e[1:-1], 16)
+                    ans.add_codespace(s, e)
+                continue
+            if state is inchar:
+                if line == 'endbfchar':
+                    state = incmap
+                else:
+                    a, b = line.split()
+                    a = int(a[1:-1], 16)
+                    b = int(b[1:-1], 16)
+                    ans.chars.add((a, b))  # set membership removes duplicates across cmaps
+                continue
+            if state is inrange:
+                if line == 'endbfrange':
+                    state = incmap
+                else:
+                    # technically bfrange can contain arrays for the unicode
+                    # value but from looking at SkPDFFont.cpp in chromium, it
+                    # does not generate any
+                    a, b, u = line.split()
+                    a = int(a[1:-1], 16)
+                    b = int(b[1:-1], 16)
+                    u = int(u[1:-1], 16)
+                    ans.ranges.add((a, b, u))
+                continue
+            if state is footer:
+                footerlines.append(line)
+        if ans.header is None:  # only the first cmap supplies the header and footer text
+            ans.header = '\n'.join(headerlines)
+            ans.footer = '\n'.join(footerlines)
+    return ans.serialize()
+
+
+def merge_font(fonts):
     # choose the largest font as the base font
     fonts.sort(key=lambda f: len(f['Data'] or b''), reverse=True)
     base_font = fonts[0]
     t0_font = next(f for f in fonts if f['DescendantFont'] == base_font['Reference'])
     descendant_fonts = [f for f in fonts if f['Subtype'] != 'Type0']
+    t0_fonts = [f for f in fonts if f['Subtype'] == 'Type0']
+    cmaps = list(filter(None, (f['ToUnicode'] for f in t0_fonts)))
+    if cmaps:
+        t0_font['ToUnicode'] = as_bytes(merge_cmaps(cmaps))
     for key in ('W', 'W2'):
         arrays = tuple(filter(None, (f[key] for f in descendant_fonts)))
         base_font[key] = merge_w_arrays(arrays)
@@ -650,7 +763,9 @@ def merge_fonts(pdf_doc):
             for ref in references_to_drop:
                 replacements[ref] = t0_font['Reference']
             data = base_font['sfnt']()[0]
-            items.append((base_font['Reference'], base_font['W'] or [], base_font['W2'] or [], data))
+            items.append((
+                base_font['Reference'], t0_font['Reference'], base_font['W'] or [], base_font['W2'] or [],
+                data, t0_font['ToUnicode'] or b''))
     pdf_doc.merge_fonts(tuple(items), replacements)
 
 
diff --git a/src/calibre/ebooks/pdf/test_html_writer.py b/src/calibre/ebooks/pdf/test_html_writer.py
index ec8adf3f2f..0cebe5a9ee 100644
--- a/src/calibre/ebooks/pdf/test_html_writer.py
+++ b/src/calibre/ebooks/pdf/test_html_writer.py
@@ -5,11 +5,13 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import unittest
-from .html_writer import merge_w_arrays
+from .html_writer import merge_w_arrays, merge_cmaps
 
 
 class TestPDFWriter(unittest.TestCase):
 
+    maxDiff = None
+
     def test_merge_w_arrays(self):
         self.assertEqual(merge_w_arrays((  # merge neighbor arrays
             [1, 3, 0.1], [3, [0.1, 0.2]])), [1, 3, 0.1, 4, 4, 0.2])
@@ -36,6 +38,22 @@ class TestPDFWriter(unittest.TestCase):
             [1, 10, 99, 11, 13, 77, 19, [77, 1, 2, 3, 4]]
         )
 
+    def test_merge_cmaps(self):  # merge_cmaps() calls .decode() on its inputs, so feed it bytes; it returns text
+        roundtrip = '/CIDInit /ProcSet findresource begin\n12 dict begin\nbegincmap\n/CIDSystemInfo\n<<  /Registry (Adobe)\n/Ordering (UCS)\n/Supplement 0\n>> def\n/CMapName /Adobe-Identity-UCS def\n/CMapType 2 def\n1 begincodespacerange\n<0000> <FFFF>\nendcodespacerange\n12 beginbfchar\n<0003> <0020>\n<000F> <002C>\n<0011> <002E>\n<0013> <0030>\n<001A> <0037>\n<002C> <0049>\n<002E> <004B>\n<0030> <004D>\n<003D> <005A>\n<0070> <201C>\n<007B> <00A0>\n<01AC> <FB01>\nendbfchar\n9 beginbfrange\n<000B> <000C> <0028>\n<0015> <0016> <0032>\n<0024> <0028> <0041>\n<0032> <0033> <004F>\n<0036> <0038> <0053>\n<003A> <003B> <0057>\n<0044> <004C> <0061>\n<004E> <0053> <006B>\n<0055> <005C> <0072>\nendbfrange\nendcmap\nCMapName currentdict /CMap defineresource pop\nend\nend'  # noqa
+        self.assertEqual(roundtrip, merge_cmaps((roundtrip.encode('utf-8'),)))
+        self.assertEqual(roundtrip, merge_cmaps((roundtrip.encode('utf-8'), roundtrip.encode('utf-8'))))
+        res = merge_cmaps((
+            b'a\nbegincmap\nb\n1 begincodespacerange\n<0010> <00FF>\nendcodespacerange\n'
+            b'1 beginbfchar\n<0001> <0020>\nendbfchar\n1 beginbfrange\n<0002> <000a> <00021>\nendbfrange\nendcmap\nc',
+            b'x\nbegincmap\ny\n1 begincodespacerange\n<0001> <0100>\nendcodespacerange\n'
+            b'1 beginbfchar\n<0011> <0040>\nendbfchar\n1 beginbfrange\n<0012> <001a> <00051>\nendbfrange\nendcmap\nz'
+        ))
+        self.assertEqual(
+            'a\nbegincmap\nb\n1 begincodespacerange\n<0001> <0100>\nendcodespacerange\n'
+            '2 beginbfchar\n<0001> <0020>\n<0011> <0040>\nendbfchar\n'
+            '2 beginbfrange\n<0002> <000A> <0021>\n<0012> <001A> <0051>\nendbfrange\nendcmap\nc',
+            res)
+
 
 def find_tests():
     return unittest.defaultTestLoader.loadTestsFromTestCase(TestPDFWriter)
diff --git a/src/calibre/utils/podofo/fonts.cpp b/src/calibre/utils/podofo/fonts.cpp
index fb629b10f9..886f93931c 100644
--- a/src/calibre/utils/podofo/fonts.cpp
+++ b/src/calibre/utils/podofo/fonts.cpp
@@ -173,7 +173,7 @@ list_fonts(PDFDoc *self, PyObject *args) {
                     unsigned long num = ref.ObjectNumber(), generation = ref.GenerationNumber();
                     const PdfObject *descriptor = (*it)->GetIndirectKey("FontDescriptor");
                     pyunique_ptr descendant_font, stream_ref, encoding, w, w2;
-                    PyBytesOutputStream stream_data;
+                    PyBytesOutputStream stream_data, to_unicode;
                     if (dict.HasKey("W")) {
                         w.reset(convert_w_array(dict.GetKey("W")->GetArray()));
                         if (!w) return NULL;
@@ -200,10 +200,18 @@ list_fonts(PDFDoc *self, PyObject *args) {
                         const PdfArray &df = dict.GetKey("DescendantFonts")->GetArray();
                         descendant_font.reset(ref_as_tuple(df[0].GetReference()));
                         if (!descendant_font) return NULL;
+                        if (get_font_data && dict.HasKey("ToUnicode")) {
+                            const PdfReference &uref = dict.GetKey("ToUnicode")->GetReference();
+                            PdfObject *t = objects.GetObject(uref);
+                            if (t) {
+                                PdfStream *stream = t->GetStream();
+                                if (stream) stream->GetFilteredCopy(&to_unicode);
+                            }
+                        }
                     }
 #define V(x) (x ? x.get() : Py_None)
                     pyunique_ptr d(Py_BuildValue(
-                            "{ss ss s(kk) sO sO sO sO sO sO}",
+                            "{ss ss s(kk) sO sO sO sO sO sO sO}",
                             "BaseFont", name.c_str(),
                             "Subtype", subtype.c_str(),
                             "Reference", num, generation,
@@ -211,6 +219,7 @@ list_fonts(PDFDoc *self, PyObject *args) {
                             "DescendantFont", V(descendant_font),
                             "StreamRef", V(stream_ref),
                             "Encoding", V(encoding),
+                            "ToUnicode", V(to_unicode),
                             "W", V(w), "W2", V(w2)
                     ));
 #undef V
@@ -282,11 +291,11 @@ merge_fonts(PDFDoc *self, PyObject *args) {
     if (c > 0) replace_font_references(self, ref_map);
 
     for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(items); i++) {
-        long num, gen;
+        long num, gen, t0num, t0gen;
         PyObject *W, *W2;
-        const char *data;
-        Py_ssize_t sz;
-        if (!PyArg_ParseTuple(PyTuple_GET_ITEM(items, i), "(ll)O!O!s#", &num, &gen, &PyList_Type, &W, &PyList_Type, &W2, &data, &sz)) return NULL;
+        const char *data, *tounicode_data;
+        Py_ssize_t sz, tounicode_sz;
+        if (!PyArg_ParseTuple(PyTuple_GET_ITEM(items, i), "(ll)(ll)O!O!s#s#", &num, &gen, &t0num, &t0gen, &PyList_Type, &W, &PyList_Type, &W2, &data, &sz, &tounicode_data, &tounicode_sz)) return NULL;
         PdfReference ref(num, gen);
         PdfObject *font = objects.GetObject(ref);
         if (font) {
@@ -307,6 +316,14 @@ merge_fonts(PDFDoc *self, PyObject *args) {
                 stream->Set(data, sz);
             }
         }
+        if (tounicode_sz) {
+            PdfObject *t0font = objects.GetObject(PdfReference(t0num, t0gen));
+            if (t0font) {
+                PdfObject *s = t0font->GetIndirectKey("ToUnicode");
+                if (!s) { PyErr_SetString(PyExc_ValueError, "Type0 font has no ToUnicode stream"); return NULL; }
+                s->GetStream()->Set(tounicode_data, tounicode_sz);
+            }
+        }
     }
     Py_RETURN_NONE;
 }