Implement merging of cmaps

This commit is contained in:
Kovid Goyal 2019-07-23 15:36:17 +05:30
parent b573c33d1c
commit 0d1b99a4a4
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 162 additions and 12 deletions

View File

@ -8,11 +8,12 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import copy
import json
import os
import re
import signal
import sys
from collections import namedtuple
from io import BytesIO
from operator import attrgetter
from operator import attrgetter, itemgetter
from PyQt5.Qt import (
QApplication, QMarginsF, QObject, QPageLayout, QTimer, QUrl, pyqtSignal
@ -38,7 +39,7 @@ from calibre.utils.podofo import (
get_podofo, remove_unused_fonts, set_metadata_implementation
)
from calibre.utils.short_uuid import uuid4
from polyglot.builtins import filter, iteritems, map, range, unicode_type
from polyglot.builtins import as_bytes, filter, iteritems, map, range, unicode_type
from polyglot.urllib import urlparse
OK, KILL_SIGNAL = range(0, 2)
@ -600,14 +601,126 @@ def merge_w_arrays(arrays):
return ans
def merge_font(fonts):
# TODO: Check if the ToUnicode entry in the Type0 dict needs to be merged
class CMap(object):
    """Accumulates the contents of one or more ToUnicode CMaps for re-serialization.

    Attributes (all plain data, filled in by the caller):
      start_codespace / end_codespace: smallest start and largest end seen
          via add_codespace().
      chars: set of (source, destination) integer pairs, written out as
          bfchar entries.
      ranges: set of (first, last, destination) integer triples, written out
          as bfrange entries.
      header / footer: text placed before and after the generated sections.
          Both must be set to strings before serialize() is called, since
          serialize() concatenates them with the generated text.
    """

    def __init__(self):
        # Start with an inverted span so the first add_codespace() call
        # always replaces both ends
        self.start_codespace = sys.maxsize
        self.end_codespace = 0
        self.ranges = set()
        self.chars = set()
        self.header = self.footer = None

    def add_codespace(self, start, end):
        """Widen the stored codespace so that it also covers start..end."""
        self.start_codespace = min(self.start_codespace, start)
        self.end_codespace = max(self.end_codespace, end)

    def serialize(self):
        """Return header, codespace, bfchar and bfrange sections, and footer as one string.

        Entries are sorted by their source code and written in sections of
        at most 100 entries each. Every entry in self.chars and self.ranges
        is emitted exactly once.
        """

        def ashex(x):
            # Uppercase hex, zero padded to a multiple of four digits
            ans = '{:04X}'.format(x)
            leftover = len(ans) % 4
            if leftover:
                ans = ('0' * (4 - leftover)) + ans
            return ans

        lines = [
            '1 begincodespacerange',
            '<{}> <{}>'.format(*map(ashex, (self.start_codespace, self.end_codespace))),
            'endcodespacerange',
        ]

        def add_sections(entries, kind):
            # Walk the sorted entries in steps of 100; stepping by index
            # (rather than slicing and deleting) guarantees no entry is
            # dropped when there are more than 100 of them
            for offset in range(0, len(entries), 100):
                group = entries[offset:offset + 100]
                lines.append('{} begin{}'.format(len(group), kind))
                for entry in group:
                    lines.append(' '.join('<{}>'.format(ashex(x)) for x in entry))
                lines.append('end' + kind)

        add_sections(sorted(self.chars, key=itemgetter(0)), 'bfchar')
        add_sections(sorted(self.ranges, key=itemgetter(0)), 'bfrange')
        return self.header + '\n' + '\n'.join(lines) + '\n' + self.footer
def merge_cmaps(cmaps):
    """Merge several ToUnicode CMap programs into a single one.

    :param cmaps: iterable of CMap sources, each either bytes (decoded as
        UTF-8 with replacement of invalid sequences) or already decoded text.
        At least one entry is expected, since the header and footer of the
        result are taken from the first one.
    :return: the merged CMap as text. Codespace ranges are combined into one
        range covering all of them; bfchar and bfrange entries are the union
        of the entries found in all inputs.
    :raises ValueError: if a data line does not consist of the expected number
        of ``<hex>`` tokens (for example a bfrange using an array destination).
    """
    header, incmap, incodespace, inchar, inrange, footer = 'header cmap codespace char range footer'.split()
    start_pat = re.compile(r'\d+\s+begin(codespacerange|bfrange|bfchar)')
    section_for = {'codespacerange': incodespace, 'bfchar': inchar, 'bfrange': inrange}
    terminator_for = {incodespace: 'endcodespacerange', inchar: 'endbfchar', inrange: 'endbfrange'}

    def parse_entry(line, count):
        # Each token looks like <XXXX>; strip the angle brackets and parse as hex
        tokens = line.split()
        if len(tokens) != count:
            raise ValueError('Expected {} values in CMap line: {!r}'.format(count, line))
        return tuple(int(tok[1:-1], 16) for tok in tokens)

    ans = CMap()
    for cmap in cmaps:
        # Only bytes need decoding; text input is used as is
        if isinstance(cmap, bytes):
            cmap = cmap.decode('utf-8', 'replace')
        state = header
        headerlines = []
        footerlines = []
        prefix_ended = False
        for line in cmap.splitlines():
            line = line.strip()
            if state == header:
                headerlines.append(line)
                if line == 'begincmap':
                    state = incmap
                continue
            if state == footer:
                footerlines.append(line)
                continue
            if state == incmap:
                if line == 'endcmap':
                    state = footer
                    footerlines.append(line)
                    continue
                m = start_pat.match(line)
                if m is not None:
                    state = section_for[m.group(1)]
                    prefix_ended = True
                    continue
                # Lines between begincmap and the first data section belong
                # to the header; anything after the first section is dropped
                if not prefix_ended:
                    headerlines.append(line)
                continue
            # Remaining states are the three data sections
            if line == terminator_for[state]:
                state = incmap
                continue
            if not line:
                # blank lines inside a section carry no data
                continue
            if state == incodespace:
                ans.add_codespace(*parse_entry(line, 2))
            elif state == inchar:
                ans.chars.add(parse_entry(line, 2))
            else:
                # technically bfrange can contain arrays for the unicode
                # value but from looking at SkPDFFont.cpp in chromium, it
                # does not generate any
                ans.ranges.add(parse_entry(line, 3))
        # Header and footer are taken from the first cmap only
        if ans.header is None:
            ans.header = '\n'.join(headerlines)
            ans.footer = '\n'.join(footerlines)
    return ans.serialize()
def merge_font(fonts):
# choose the largest font as the base font
fonts.sort(key=lambda f: len(f['Data'] or b''), reverse=True)
base_font = fonts[0]
t0_font = next(f for f in fonts if f['DescendantFont'] == base_font['Reference'])
descendant_fonts = [f for f in fonts if f['Subtype'] != 'Type0']
t0_fonts = [f for f in fonts if f['Subtype'] == 'Type0']
cmaps = list(filter(None, (f['ToUnicode'] for f in t0_fonts)))
if cmaps:
t0_font['ToUnicode'] = as_bytes(merge_cmaps(cmaps))
for key in ('W', 'W2'):
arrays = tuple(filter(None, (f[key] for f in descendant_fonts)))
base_font[key] = merge_w_arrays(arrays)
@ -650,7 +763,9 @@ def merge_fonts(pdf_doc):
for ref in references_to_drop:
replacements[ref] = t0_font['Reference']
data = base_font['sfnt']()[0]
items.append((base_font['Reference'], base_font['W'] or [], base_font['W2'] or [], data))
items.append((
base_font['Reference'], t0_font['Reference'], base_font['W'] or [], base_font['W2'] or [],
data, t0_font['ToUnicode'] or b''))
pdf_doc.merge_fonts(tuple(items), replacements)

View File

@ -5,11 +5,13 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import unittest
from .html_writer import merge_w_arrays
from .html_writer import merge_w_arrays, merge_cmaps
class TestPDFWriter(unittest.TestCase):
maxDiff = None
def test_merge_w_arrays(self):
self.assertEqual(merge_w_arrays(( # merge neighbor arrays
[1, 3, 0.1], [3, [0.1, 0.2]])), [1, 3, 0.1, 4, 4, 0.2])
@ -36,6 +38,22 @@ class TestPDFWriter(unittest.TestCase):
[1, 10, 99, 11, 13, 77, 19, [77, 1, 2, 3, 4]]
)
def test_merge_cmaps(self):
    # A cmap already in the serializer's output format must come back
    # unchanged, whether merged alone or with a copy of itself
    roundtrip = '/CIDInit /ProcSet findresource begin\n12 dict begin\nbegincmap\n/CIDSystemInfo\n<< /Registry (Adobe)\n/Ordering (UCS)\n/Supplement 0\n>> def\n/CMapName /Adobe-Identity-UCS def\n/CMapType 2 def\n1 begincodespacerange\n<0000> <FFFF>\nendcodespacerange\n12 beginbfchar\n<0003> <0020>\n<000F> <002C>\n<0011> <002E>\n<0013> <0030>\n<001A> <0037>\n<002C> <0049>\n<002E> <004B>\n<0030> <004D>\n<003D> <005A>\n<0070> <201C>\n<007B> <00A0>\n<01AC> <FB01>\nendbfchar\n9 beginbfrange\n<000B> <000C> <0028>\n<0015> <0016> <0032>\n<0024> <0028> <0041>\n<0032> <0033> <004F>\n<0036> <0038> <0053>\n<003A> <003B> <0057>\n<0044> <004C> <0061>\n<004E> <0053> <006B>\n<0055> <005C> <0072>\nendbfrange\nendcmap\nCMapName currentdict /CMap defineresource pop\nend\nend' # noqa
    for sources in ((roundtrip,), (roundtrip, roundtrip)):
        self.assertEqual(roundtrip, merge_cmaps(sources))

    # Two different cmaps: the result keeps the header/footer of the first,
    # widens the codespace to cover both and contains the entries of both
    first = (
        'a\nbegincmap\nb\n1 begincodespacerange\n<0010> <00FF>\nendcodespacerange\n'
        '1 beginbfchar\n<0001> <0020>\nendbfchar\n1 beginbfrange\n<0002> <000a> <00021>\nendbfrange\nendcmap\nc'
    )
    second = (
        'x\nbegincmap\ny\n1 begincodespacerange\n<0001> <0100>\nendcodespacerange\n'
        '1 beginbfchar\n<0011> <0040>\nendbfchar\n1 beginbfrange\n<0012> <001a> <00051>\nendbfrange\nendcmap\nz'
    )
    expected = (
        'a\nbegincmap\nb\n1 begincodespacerange\n<0001> <0100>\nendcodespacerange\n'
        '2 beginbfchar\n<0001> <0020>\n<0011> <0040>\nendbfchar\n'
        '2 beginbfrange\n<0002> <000A> <0021>\n<0012> <001A> <0051>\nendbfrange\nendcmap\nc'
    )
    self.assertEqual(expected, merge_cmaps((first, second)))
def find_tests():
return unittest.defaultTestLoader.loadTestsFromTestCase(TestPDFWriter)

View File

@ -173,7 +173,7 @@ list_fonts(PDFDoc *self, PyObject *args) {
unsigned long num = ref.ObjectNumber(), generation = ref.GenerationNumber();
const PdfObject *descriptor = (*it)->GetIndirectKey("FontDescriptor");
pyunique_ptr descendant_font, stream_ref, encoding, w, w2;
PyBytesOutputStream stream_data;
PyBytesOutputStream stream_data, to_unicode;
if (dict.HasKey("W")) {
w.reset(convert_w_array(dict.GetKey("W")->GetArray()));
if (!w) return NULL;
@ -200,10 +200,18 @@ list_fonts(PDFDoc *self, PyObject *args) {
const PdfArray &df = dict.GetKey("DescendantFonts")->GetArray();
descendant_font.reset(ref_as_tuple(df[0].GetReference()));
if (!descendant_font) return NULL;
if (get_font_data && dict.HasKey("ToUnicode")) {
const PdfReference &uref = dict.GetKey("ToUnicode")->GetReference();
PdfObject *t = objects.GetObject(uref);
if (t) {
PdfStream *stream = t->GetStream();
if (stream) stream->GetFilteredCopy(&to_unicode);
}
}
}
#define V(x) (x ? x.get() : Py_None)
pyunique_ptr d(Py_BuildValue(
"{ss ss s(kk) sO sO sO sO sO sO}",
"{ss ss s(kk) sO sO sO sO sO sO sO}",
"BaseFont", name.c_str(),
"Subtype", subtype.c_str(),
"Reference", num, generation,
@ -211,6 +219,7 @@ list_fonts(PDFDoc *self, PyObject *args) {
"DescendantFont", V(descendant_font),
"StreamRef", V(stream_ref),
"Encoding", V(encoding),
"ToUnicode", V(to_unicode),
"W", V(w), "W2", V(w2)
));
#undef V
@ -282,11 +291,11 @@ merge_fonts(PDFDoc *self, PyObject *args) {
if (c > 0) replace_font_references(self, ref_map);
for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(items); i++) {
long num, gen;
long num, gen, t0num, t0gen;
PyObject *W, *W2;
const char *data;
Py_ssize_t sz;
if (!PyArg_ParseTuple(PyTuple_GET_ITEM(items, i), "(ll)O!O!s#", &num, &gen, &PyList_Type, &W, &PyList_Type, &W2, &data, &sz)) return NULL;
const char *data, *tounicode_data;
Py_ssize_t sz, tounicode_sz;
if (!PyArg_ParseTuple(PyTuple_GET_ITEM(items, i), "(ll)(ll)O!O!s#s#", &num, &gen, &t0num, &t0gen, &PyList_Type, &W, &PyList_Type, &W2, &data, &sz, &tounicode_data, &tounicode_sz)) return NULL;
PdfReference ref(num, gen);
PdfObject *font = objects.GetObject(ref);
if (font) {
@ -307,6 +316,14 @@ merge_fonts(PDFDoc *self, PyObject *args) {
stream->Set(data, sz);
}
}
if (tounicode_sz) {
PdfObject *t0font = objects.GetObject(PdfReference(t0num, t0gen));
if (t0font) {
PdfObject *s = t0font->GetIndirectKey("ToUnicode");
if (!s) { PyErr_SetString(PyExc_ValueError, "Type0 font has no ToUnicode stream"); return NULL; }
s->GetStream()->Set(tounicode_data, tounicode_sz);
}
}
}
Py_RETURN_NONE;
}