Work on merging fonts

This commit is contained in:
Kovid Goyal 2019-07-18 06:18:46 +05:30
parent 929f65ecf2
commit d994cf7895
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 138 additions and 8 deletions

View File

@ -9,8 +9,10 @@ import copy
import json import json
import os import os
import signal import signal
import sys
from collections import namedtuple from collections import namedtuple
from io import BytesIO from io import BytesIO
from operator import attrgetter
from PyQt5.Qt import ( from PyQt5.Qt import (
QApplication, QMarginsF, QObject, QPageLayout, QTimer, QUrl, pyqtSignal QApplication, QMarginsF, QObject, QPageLayout, QTimer, QUrl, pyqtSignal
@ -29,12 +31,13 @@ from calibre.ebooks.pdf.image_writer import (
from calibre.ebooks.pdf.render.serialize import PDFStream from calibre.ebooks.pdf.render.serialize import PDFStream
from calibre.gui2 import setup_unix_signals from calibre.gui2 import setup_unix_signals
from calibre.gui2.webengine import secure_webengine from calibre.gui2.webengine import secure_webengine
from calibre.utils.fonts.sfnt.container import Sfnt, UnsupportedFont
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
from calibre.utils.podofo import ( from calibre.utils.podofo import (
get_podofo, remove_unused_fonts, set_metadata_implementation get_podofo, remove_unused_fonts, set_metadata_implementation
) )
from calibre.utils.short_uuid import uuid4 from calibre.utils.short_uuid import uuid4
from polyglot.builtins import iteritems, map, range, unicode_type from polyglot.builtins import filter, iteritems, map, range, unicode_type
from polyglot.urllib import urlparse from polyglot.urllib import urlparse
OK, KILL_SIGNAL = range(0, 2) OK, KILL_SIGNAL = range(0, 2)
@ -512,6 +515,104 @@ def add_pagenum_toc(root, toc, opts, page_number_display_map):
# }}} # }}}
# Fonts {{{
class Range(object):
    """A contiguous range ``first``..``last`` together with an associated width.

    Instances carry a precomputed ``sort_order`` key so that a list of ranges
    can be sorted by starting point, with the longer range first when two
    ranges start at the same place.

    :param first: Start of the range.
    :param last: End of the range.
    :param width: Value associated with the range.
    """

    # sort_order must be listed here: the class has no per-instance __dict__,
    # so assigning any attribute missing from __slots__ raises AttributeError.
    __slots__ = ('first', 'last', 'width', 'sort_order')

    def __init__(self, first, last, width):
        self.first, self.last, self.width = first, last, width
        # Sort by first with larger ranges coming before smaller ones
        self.sort_order = self.first, -self.last
def merge_w_arrays(arrays):
    """Merge several PDF CIDFont /W width arrays into a single /W array.

    Each input array is a flat list using the two entry forms of a /W array:

    * ``c [w1 w2 ... wn]`` -- widths for the consecutive CIDs ``c``..``c+n-1``
    * ``c_first c_last w`` -- one width shared by every CID in the range

    When the same CID is defined more than once, the first definition seen
    (in the order of ``arrays``, then in order within an array) is kept.
    A truncated trailing entry in an input array is ignored.

    NOTE(review): /W2 arrays use a different layout (several numbers per
    CID); this parser assumes the /W layout -- confirm before relying on it
    for W2 data.

    NOTE(review): ranges are expanded one CID at a time, so this assumes CIDs
    are small (two byte) values -- confirm for the inputs used.

    :param arrays: Iterable of /W arrays (lists of numbers and nested lists).
    :return: A new /W array covering the union of all inputs, ordered by CID.
             ``[]`` when no widths are defined.
    """
    widths = {}
    for w_array in arrays:
        i = 0
        while i + 1 < len(w_array):
            first, second = w_array[i], w_array[i + 1]
            if isinstance(second, (list, tuple)):
                # c [w1 ... wn]
                for offset, width in enumerate(second):
                    widths.setdefault(first + offset, width)
                i += 2
            elif i + 2 < len(w_array):
                # c_first c_last w
                width = w_array[i + 2]
                for cid in range(first, second + 1):
                    widths.setdefault(cid, width)
                i += 3
            else:
                # Incomplete trailing entry, nothing more can be parsed
                break
    if not widths:
        return []

    cids = sorted(widths)
    merged = []
    start = 0
    while start < len(cids):
        # Extend the run while the CIDs stay consecutive
        end = start
        while end + 1 < len(cids) and cids[end + 1] == cids[end] + 1:
            end += 1
        run = [widths[cid] for cid in cids[start:end + 1]]
        if len(run) > 1 and all(width == run[0] for width in run):
            # A uniform run is most compact in the range form
            merged.extend((cids[start], cids[end], run[0]))
        else:
            merged.extend((cids[start], run))
        start = end + 1
    return merged
def merge_font(fonts):
    """Merge the width tables of a group of font entries into one base font.

    ``fonts`` is sorted in place, largest embedded data first, and the first
    entry becomes the base font. Its ``'W'`` and ``'W2'`` entries are replaced
    by the merge of its own array and those of every other non-Type0 entry.

    :param fonts: List of font dictionaries with at least the keys ``'Data'``,
                  ``'Subtype'``, ``'Reference'``, ``'DescendantFont'``,
                  ``'W'`` and ``'W2'``. Must not be empty.
    :raises StopIteration: If no entry's ``'DescendantFont'`` equals the base
                           font's ``'Reference'``.
    """
    # TODO: Check if the ToUnicode entry in the Type0 dict needs to be merged
    # choose the largest font as the base font
    fonts.sort(key=lambda f: len(f['Data'] or b''), reverse=True)
    base_font = fonts[0]
    # TODO: the Type0 font that refers to the base font is located here but
    # not used yet; the lookup is kept because it fails loudly when there is
    # no such font.
    t0_font = next(f for f in fonts if f['DescendantFont'] == base_font['Reference'])  # noqa
    descendant_fonts = [f for f in fonts if f['Subtype'] != 'Type0' and f is not base_font]
    for key in ('W', 'W2'):
        # The base font's own array goes first so that its widths are part of
        # the result. filter(None, ...) drops missing (None) or empty arrays.
        candidates = [base_font[key]] + [f[key] for f in descendant_fonts]
        arrays = tuple(filter(None, candidates))
        base_font[key] = merge_w_arrays(arrays)
def merge_fonts(pdf_doc):
    """Group the fonts of ``pdf_doc`` by ``'BaseFont'`` and merge each eligible group.

    The font list comes from ``pdf_doc.list_fonts(True)`` (the argument
    presumably asks for the embedded font data to be included -- TODO
    confirm). Every group that passes the ``mergeable`` check below is handed
    to ``merge_font``.

    :param pdf_doc: Document object providing ``list_fonts``.
    """

    def mergeable(group):
        # A group is mergeable when it has at least one Type0 font, every
        # Type0 font has an 'Identity-*' encoding, and every other font has
        # data that parses as an sfnt containing a glyf table.
        saw_type0 = False
        for entry in group:
            if entry['Subtype'] != 'Type0':
                data = entry['Data']
                if not data:
                    return False
                try:
                    parsed = Sfnt(data)
                except UnsupportedFont:
                    return False
                # Keep the parsed font on the entry
                entry['sfnt'] = parsed
                if b'glyf' not in parsed:
                    # TODO: Add support for merging CFF tables
                    return False
                continue
            saw_type0 = True
            encoding = entry['Encoding']
            if not (encoding and encoding.startswith('Identity-')):
                return False
        return saw_type0

    groups = {}
    for font in pdf_doc.list_fonts(True):
        groups.setdefault(font['BaseFont'], []).append(font)

    for _name, group in iteritems(groups):
        if mergeable(group):
            merge_font(group)
def test_merge_fonts():
    """Manual test helper: merge the fonts of a PDF and save the result.

    The input path is taken from the last command line argument. The output
    is written next to it, with ``-merged.pdf`` replacing the file extension.
    """
    path = sys.argv[-1]
    podofo = get_podofo()
    pdf_doc = podofo.PDFDoc()
    pdf_doc.open(path)
    merge_fonts(pdf_doc)
    # splitext only strips an extension from the final path component, so a
    # file name without a dot, or a directory name containing one, still
    # yields the right output path.
    out = os.path.splitext(path)[0] + '-merged.pdf'
    pdf_doc.save(out)
    print('Merged PDF written to', out)
# }}}
def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None, report_progress=lambda x, y: None): def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None, report_progress=lambda x, y: None):
container = Container(opf_path, log) container = Container(opf_path, log)
report_progress(0.05, _('Parsed all content for markup transformation')) report_progress(0.05, _('Parsed all content for markup transformation'))
@ -567,8 +668,8 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co
add_toc(PDFOutlineRoot(pdf_doc), toc) add_toc(PDFOutlineRoot(pdf_doc), toc)
report_progress(0.75, _('Added links to PDF content')) report_progress(0.75, _('Added links to PDF content'))
# TODO: Remove duplicate fonts merge_fonts(pdf_doc)
# TODO: Subset and embed fonts before rendering PDF
# TODO: Support for mathematics # TODO: Support for mathematics
num_removed = remove_unused_fonts(pdf_doc) num_removed = remove_unused_fonts(pdf_doc)

View File

@ -729,7 +729,7 @@ static PyMethodDef PDFDoc_methods[] = {
{"alter_links", (PyCFunction)PDFDoc_alter_links, METH_VARARGS, {"alter_links", (PyCFunction)PDFDoc_alter_links, METH_VARARGS,
"alter_links() -> Change links in the document." "alter_links() -> Change links in the document."
}, },
{"list_fonts", (PyCFunction)list_fonts, METH_NOARGS, {"list_fonts", (PyCFunction)list_fonts, METH_VARARGS,
"list_fonts() -> Get list of fonts in document" "list_fonts() -> Get list of fonts in document"
}, },
{"used_fonts_in_page_range", (PyCFunction)used_fonts_in_page_range, METH_VARARGS, {"used_fonts_in_page_range", (PyCFunction)used_fonts_in_page_range, METH_VARARGS,

View File

@ -72,6 +72,25 @@ used_fonts_in_page(PdfPage *page, int page_num, PyObject *ans) {
return true; return true;
} }
// Convert a PDF width array into a new Python list.
// Nested arrays become nested lists (converted recursively), entries for
// which IsNumber() is true become Python ints and entries for which IsReal()
// is true become Python floats. Any other entry type sets a ValueError.
// Returns a new reference, or NULL with a Python exception set on failure.
static PyObject*
convert_w_array(const PdfArray &w) {
    pyunique_ptr result(PyList_New(0));
    if (!result) return NULL;
    const PdfArray::const_iterator end = w.end();
    for (PdfArray::const_iterator entry = w.begin(); entry != end; ++entry) {
        pyunique_ptr value;
        if (entry->IsArray()) {
            value.reset(convert_w_array(entry->GetArray()));
        } else if (entry->IsNumber()) {
            value.reset(PyLong_FromLongLong((long long)entry->GetNumber()));
        } else if (entry->IsReal()) {
            value.reset(PyFloat_FromDouble(entry->GetReal()));
        } else {
            PyErr_SetString(PyExc_ValueError, "Unknown datatype in w array");
            return NULL;
        }
        // A failed conversion above has already set the Python error
        if (!value) return NULL;
        // PyList_Append takes its own reference; value releases ours on scope exit
        if (PyList_Append(result.get(), value.get()) != 0) return NULL;
    }
    return result.release();
}
extern "C" { extern "C" {
PyObject* PyObject*
list_fonts(PDFDoc *self, PyObject *args) { list_fonts(PDFDoc *self, PyObject *args) {
@ -90,10 +109,19 @@ list_fonts(PDFDoc *self, PyObject *args) {
const PdfReference &ref = (*it)->Reference(); const PdfReference &ref = (*it)->Reference();
unsigned long num = ref.ObjectNumber(), generation = ref.GenerationNumber(); unsigned long num = ref.ObjectNumber(), generation = ref.GenerationNumber();
const PdfObject *descriptor = (*it)->GetIndirectKey("FontDescriptor"); const PdfObject *descriptor = (*it)->GetIndirectKey("FontDescriptor");
pyunique_ptr descendant_font, stream_ref, encoding; pyunique_ptr descendant_font, stream_ref, encoding, w, w2;
PyBytesOutputStream stream_data; PyBytesOutputStream stream_data;
if (dict.HasKey("W")) {
w.reset(convert_w_array(dict.GetKey("W")->GetArray()));
if (!w) return NULL;
}
if (dict.HasKey("W2")) {
w2.reset(convert_w_array(dict.GetKey("W2")->GetArray()));
if (!w2) return NULL;
}
if (dict.HasKey("Encoding") && dict.GetKey("Encoding")->IsName()) { if (dict.HasKey("Encoding") && dict.GetKey("Encoding")->IsName()) {
encoding.reset(PyUnicode_FromString(dict.GetKey("Encoding")->GetName().GetName().c_str())); encoding.reset(PyUnicode_FromString(dict.GetKey("Encoding")->GetName().GetName().c_str()));
if (!encoding) return NULL;
} }
if (descriptor) { if (descriptor) {
const PdfObject *ff = get_font_file(descriptor); const PdfObject *ff = get_font_file(descriptor);
@ -102,7 +130,7 @@ list_fonts(PDFDoc *self, PyObject *args) {
if (!stream_ref) return NULL; if (!stream_ref) return NULL;
const PdfStream *stream = ff->GetStream(); const PdfStream *stream = ff->GetStream();
if (stream && get_font_data) { if (stream && get_font_data) {
stream->GetCopy(&stream_data); stream->GetFilteredCopy(&stream_data);
} }
} }
} else if (dict.HasKey("DescendantFonts")) { } else if (dict.HasKey("DescendantFonts")) {
@ -112,14 +140,15 @@ list_fonts(PDFDoc *self, PyObject *args) {
} }
#define V(x) (x ? x.get() : Py_None) #define V(x) (x ? x.get() : Py_None)
pyunique_ptr d(Py_BuildValue( pyunique_ptr d(Py_BuildValue(
"{ss ss s(kk) sO sO sO sO}", "{ss ss s(kk) sO sO sO sO sO sO}",
"BaseFont", name.c_str(), "BaseFont", name.c_str(),
"Subtype", subtype.c_str(), "Subtype", subtype.c_str(),
"Reference", num, generation, "Reference", num, generation,
"Data", V(stream_data), "Data", V(stream_data),
"DescendantFont", V(descendant_font), "DescendantFont", V(descendant_font),
"StreamRef", V(stream_ref), "StreamRef", V(stream_ref),
"Encoding", V(encoding) "Encoding", V(encoding),
"W", V(w), "W2", V(w2)
)); ));
#undef V #undef V
if (!d) { return NULL; } if (!d) { return NULL; }