Add a ToUnicode map when embedding fonts

This commit is contained in:
Kovid Goyal 2012-12-20 20:46:45 +05:30
parent 992bf4b423
commit 5504db04a9
5 changed files with 106 additions and 13 deletions

View File

@ -493,7 +493,7 @@ if __name__ == '__main__':
# p.scale(2, 2) # p.scale(2, 2)
# p.rotate(45) # p.rotate(45)
# p.setPen(QColor(0, 0, 255)) # p.setPen(QColor(0, 0, 255))
p.drawText(QPoint(100, 300), 'Some text ū --- Д AV') p.drawText(QPoint(100, 300), 'Some text ū --- Д AV ff ff')
finally: finally:
p.end() p.end()
if dev.engine.errors: if dev.engine.errors:

View File

@ -7,10 +7,10 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re import re, unicodedata
from itertools import izip, groupby from itertools import izip, groupby
from operator import itemgetter from operator import itemgetter
from collections import Counter from collections import Counter, OrderedDict
from future_builtins import map from future_builtins import map
from calibre.ebooks.pdf.render.common import (Array, String, Stream, from calibre.ebooks.pdf.render.common import (Array, String, Stream,
@ -43,10 +43,12 @@ first. Each number gets mapped to a glyph id equal to itself by the
''' '''
import textwrap
class FontStream(Stream): class FontStream(Stream):
def __init__(self, is_otf): def __init__(self, is_otf, compress=False):
Stream.__init__(self) Stream.__init__(self, compress=compress)
self.is_otf = is_otf self.is_otf = is_otf
def add_extra_keys(self, d): def add_extra_keys(self, d):
@ -54,13 +56,62 @@ class FontStream(Stream):
if self.is_otf: if self.is_otf:
d['Subtype'] = Name('OpenType') d['Subtype'] = Name('OpenType')
def to_hex_string(c):
    '''
    Return the integer ``c`` as a lowercase hexadecimal unicode string,
    left-padded with zeros to at least four digits (0x41 -> u'0041').

    Values that need more than four hex digits are returned unpadded and
    untruncated.
    '''
    # Format the number directly rather than slicing the output of hex():
    # hex() appends an 'L' suffix to Python 2 longs, which would otherwise
    # end up inside the hex token.
    return u'%04x' % c
class CMap(Stream):
    '''
    A stream holding a ToUnicode CMap: a table that maps glyph ids to the
    unicode text they represent, written as ``beginbfchar`` blocks.

    :param name: Used to build the CMapName entry (``/<name>-cmap``).
    :param glyph_map: Mapping of integer glyph id -> unicode string.
    :param compress: Passed through to ``Stream``.
    '''

    # bfchar blocks are limited to 100 entries each, longer mappings are
    # split over several blocks.
    MAX_BLOCK_ENTRIES = 100

    # CMapName must be a name literal (leading slash), otherwise the token
    # is an executable name instead of a value for ``def``.
    skeleton = textwrap.dedent('''\
        /CIDInit /ProcSet findresource begin
        12 dict begin
        begincmap
        /CMapName /{name}-cmap def
        /CMapType 2 def
        /CIDSystemInfo <<
        /Registry (Adobe)
        /Ordering (UCS)
        /Supplement 0
        >> def
        1 begincodespacerange
        <0000> <FFFF>
        endcodespacerange
        {mapping}
        endcmap
        CMapName currentdict /CMap defineresource pop
        end
        end
        ''')

    @staticmethod
    def _utf16_code_units(code_point):
        ''' Return the UTF-16 code unit(s) for code_point as a tuple. '''
        if code_point > 0xFFFF:
            # Destination strings are UTF-16BE, so characters outside the
            # BMP have to be written as a surrogate pair.
            code_point -= 0x10000
            return (0xD800 | (code_point >> 10), 0xDC00 | (code_point & 0x3FF))
        return (code_point,)

    def __init__(self, name, glyph_map, compress=False):
        # Keyword form, consistent with FontStream's call to Stream.__init__
        Stream.__init__(self, compress=compress)

        # One "<glyph id> <utf-16 text>" line per glyph, in glyph id order
        entries = []
        for glyph_id in sorted(glyph_map):
            text = ''.join(
                to_hex_string(unit)
                for char in glyph_map[glyph_id]
                for unit in self._utf16_code_units(ord(char)))
            entries.append('<%s> <%s>' % (to_hex_string(glyph_id), text))

        blocks = []
        for start in range(0, len(entries), self.MAX_BLOCK_ENTRIES):
            chunk = entries[start:start + self.MAX_BLOCK_ENTRIES]
            blocks.append('%d beginbfchar\n%s\nendbfchar' % (
                len(chunk), '\n'.join(chunk)))

        self.write(self.skeleton.format(name=name, mapping='\n'.join(blocks)))
class Font(object): class Font(object):
def __init__(self, metrics, num, objects): def __init__(self, metrics, num, objects, compress):
self.metrics = metrics self.metrics, self.compress = metrics, compress
self.subset_tag = bytes(re.sub('.', lambda m: chr(int(m.group())+ord('A')), self.subset_tag = bytes(re.sub('.', lambda m: chr(int(m.group())+ord('A')),
oct(num))).rjust(6, b'A').decode('ascii') oct(num))).rjust(6, b'A').decode('ascii')
self.font_stream = FontStream(metrics.is_otf) self.font_stream = FontStream(metrics.is_otf, compress=compress)
self.font_descriptor = Dictionary({ self.font_descriptor = Dictionary({
'Type': Name('FontDescriptor'), 'Type': Name('FontDescriptor'),
'FontName': Name(metrics.postscript_name), 'FontName': Name(metrics.postscript_name),
@ -101,9 +152,17 @@ class Font(object):
# TODO: Subsetting and OpenType # TODO: Subsetting and OpenType
self.font_descriptor['FontFile2'] = objects.add(self.font_stream) self.font_descriptor['FontFile2'] = objects.add(self.font_stream)
self.write_widths(objects) self.write_widths(objects)
self.write_to_unicode(objects)
self.metrics.os2.zero_fstype() self.metrics.os2.zero_fstype()
self.metrics.sfnt(self.font_stream) self.metrics.sfnt(self.font_stream)
def write_to_unicode(self, objects):
    '''
    Build a ToUnicode CMap for the glyphs used from this font and register
    it in the font dictionary.

    The character code reported by the font's cmap table for every used
    glyph is converted to a character and NFKC normalized (which, for
    example, turns a ligature character into its constituent letters).
    The CMap is added to ``objects`` and the value returned by
    ``objects.add()`` is stored under the ToUnicode key of ``font_dict``.
    '''
    char_codes = self.metrics.sfnt['cmap'].get_char_codes(self.used_glyphs)
    glyph_text = {}
    for glyph_id, code in char_codes.iteritems():
        glyph_text[glyph_id] = unicodedata.normalize('NFKC', unichr(code))
    to_unicode = CMap(self.metrics.postscript_name, glyph_text,
                      compress=self.compress)
    self.font_dict['ToUnicode'] = objects.add(to_unicode)
def write_widths(self, objects): def write_widths(self, objects):
glyphs = sorted(self.used_glyphs|{0}) glyphs = sorted(self.used_glyphs|{0})
widths = {g:self.metrics.pdf_scale(w) for g, w in izip(glyphs, widths = {g:self.metrics.pdf_scale(w) for g, w in izip(glyphs,
@ -129,8 +188,9 @@ class Font(object):
class FontManager(object): class FontManager(object):
def __init__(self, objects): def __init__(self, objects, compress):
self.objects = objects self.objects = objects
self.compress = compress
self.std_map = {} self.std_map = {}
self.font_map = {} self.font_map = {}
self.fonts = [] self.fonts = []
@ -138,7 +198,7 @@ class FontManager(object):
def add_font(self, font_metrics, glyph_ids): def add_font(self, font_metrics, glyph_ids):
if font_metrics not in self.font_map: if font_metrics not in self.font_map:
self.fonts.append(Font(font_metrics, len(self.fonts), self.fonts.append(Font(font_metrics, len(self.fonts),
self.objects)) self.objects, self.compress))
d = self.objects.add(self.fonts[-1].font_dict) d = self.objects.add(self.fonts[-1].font_dict)
self.font_map[font_metrics] = (d, self.fonts[-1]) self.font_map[font_metrics] = (d, self.fonts[-1])

View File

@ -252,7 +252,7 @@ class PDFStream(object):
self.info = Dictionary({'Creator':String(creator), self.info = Dictionary({'Creator':String(creator),
'Producer':String(creator)}) 'Producer':String(creator)})
self.stroke_opacities, self.fill_opacities = {}, {} self.stroke_opacities, self.fill_opacities = {}, {}
self.font_manager = FontManager(self.objects) self.font_manager = FontManager(self.objects, self.compress)
@property @property
def page_tree(self): def page_tree(self):

View File

@ -13,7 +13,7 @@ __docformat__ = 'restructuredtext en'
from struct import unpack_from, calcsize, pack from struct import unpack_from, calcsize, pack
from collections import OrderedDict from collections import OrderedDict
from calibre.utils.fonts.utils import get_bmp_glyph_ids from calibre.utils.fonts.utils import get_bmp_glyph_ids, read_bmp_prefix
from calibre.utils.fonts.sfnt import UnknownTable, max_power_of_two from calibre.utils.fonts.sfnt import UnknownTable, max_power_of_two
from calibre.utils.fonts.sfnt.errors import UnsupportedFont from calibre.utils.fonts.sfnt.errors import UnsupportedFont
@ -165,6 +165,33 @@ class CmapTable(UnknownTable):
ans[chars[i]] = glyph_id ans[chars[i]] = glyph_id
return ans return ans
def get_char_codes(self, glyph_ids):
    '''
    Return a dictionary mapping glyph id -> character code for the
    specified glyph ids, by walking every character code covered by the
    segments of the BMP cmap subtable.

    Glyph ids that no character code maps to are absent from the result.
    If several character codes map to the same glyph id, the last one
    encountered wins.

    :param glyph_ids: Iterable of integer glyph ids to look up.
    :raises UnsupportedFont: If this table has no BMP subtable.
    '''
    if self.bmp_table is None:
        raise UnsupportedFont('This font has no Windows BMP cmap subtable.'
                ' Most likely a special purpose font.')
    ans = {}
    (start_count, end_count, range_offset, id_delta, glyph_id_len,
     glyph_id_map, array_len) = read_bmp_prefix(self.bmp_table, 0)
    # Membership is tested once per character code, so use a set
    glyph_ids = frozenset(glyph_ids)

    for i, ec in enumerate(end_count):
        sc = start_count[i]
        ro = range_offset[i]
        for code in xrange(sc, ec+1):
            if ro == 0:
                # No range offset: the glyph id is the code plus the delta
                glyph_id = id_delta[i] + code
            else:
                # Index into the glyph id array that follows the segment
                # arrays; an entry of 0 means missing glyph and must not
                # have the delta applied
                idx = ro//2 + (code - sc) + i - array_len
                glyph_id = glyph_id_map[idx]
                if glyph_id != 0:
                    glyph_id += id_delta[i]
            # Delta arithmetic is modulo 65536 (glyph ids are 16 bit).
            # A smaller modulus corrupts every glyph id >= that modulus.
            glyph_id %= 0x10000
            if glyph_id in glyph_ids:
                ans[glyph_id] = code
    return ans
def set_character_map(self, cmap): def set_character_map(self, cmap):
self.version, self.num_tables = 0, 1 self.version, self.num_tables = 0, 1
fmt = b'>7H' fmt = b'>7H'

View File

@ -306,7 +306,7 @@ def remove_embed_restriction(raw):
verify_checksums(raw) verify_checksums(raw)
return raw return raw
def get_bmp_glyph_ids(table, bmp, codes): def read_bmp_prefix(table, bmp):
length, language, segcount = struct.unpack_from(b'>3H', table, bmp+2) length, language, segcount = struct.unpack_from(b'>3H', table, bmp+2)
array_len = segcount //2 array_len = segcount //2
offset = bmp + 7*2 offset = bmp + 7*2
@ -324,6 +324,12 @@ def get_bmp_glyph_ids(table, bmp, codes):
glyph_id_len = (length + bmp - (offset + array_sz))//2 glyph_id_len = (length + bmp - (offset + array_sz))//2
glyph_id_map = struct.unpack_from(b'>%dH'%glyph_id_len, table, offset + glyph_id_map = struct.unpack_from(b'>%dH'%glyph_id_len, table, offset +
array_sz) array_sz)
return (start_count, end_count, range_offset, id_delta, glyph_id_len,
glyph_id_map, array_len)
def get_bmp_glyph_ids(table, bmp, codes):
(start_count, end_count, range_offset, id_delta, glyph_id_len,
glyph_id_map, array_len) = read_bmp_prefix(table, bmp)
for code in codes: for code in codes:
found = False found = False