More robust implementation of glyph to unicode mapping

This commit is contained in:
Kovid Goyal 2012-12-21 12:14:31 +05:30
parent 302ce66f76
commit 8352098513
4 changed files with 77 additions and 45 deletions

View File

@ -91,7 +91,7 @@ class GlyphIndex(object):
else: else:
byts = bytearray(pack(b'>H', self.code)) byts = bytearray(pack(b'>H', self.code))
stream.write('<%s>'%''.join(map( stream.write('<%s>'%''.join(map(
lambda x: bytes(hex(x)[2:]).rjust(2, b'0'), byts))) lambda x: bytes(hex(int(x))[2:]).rjust(2, b'0'), byts)))
class Dictionary(dict): class Dictionary(dict):

View File

@ -205,6 +205,12 @@ class GraphicsState(object): # {{{
# }}} # }}}
class Font(FontMetrics):
    '''
    Font metrics for a single font together with a per-font mapping from
    glyph ids to the unicode text those glyphs represent.
    '''

    def __init__(self, sfnt):
        FontMetrics.__init__(self, sfnt)
        # glyph id -> unicode string; starts empty and is handed to
        # PdfEngine.update_glyph_map() which fills it in as text is drawn.
        self.glyph_map = {}
class PdfEngine(QPaintEngine): class PdfEngine(QPaintEngine):
def __init__(self, file_object, page_width, page_height, left_margin, def __init__(self, file_object, page_width, page_height, left_margin,
@ -235,7 +241,7 @@ class PdfEngine(QPaintEngine):
self.scale = sqrt(sy**2 + sx**2) self.scale = sqrt(sy**2 + sx**2)
self.xscale, self.yscale = sx, sy self.xscale, self.yscale = sx, sy
self.graphics_state = GraphicsState() self.graphics_state = GraphicsState()
self.errors = [] self.errors, self.debug = [], []
self.text_option = QTextOption() self.text_option = QTextOption()
self.text_option.setWrapMode(QTextOption.NoWrap) self.text_option.setWrapMode(QTextOption.NoWrap)
self.fonts = {} self.fonts = {}
@ -354,15 +360,67 @@ class PdfEngine(QPaintEngine):
self.pdf.draw_rect(bl.x(), bl.y(), rect.width(), rect.height(), self.pdf.draw_rect(bl.x(), bl.y(), rect.width(), rect.height(),
stroke=self.do_stroke, fill=self.do_fill) stroke=self.do_stroke, fill=self.do_fill)
def get_text_layout(self, text_item, text):
    '''
    Create a QTextLayout for `text` using the font of `text_item` and this
    engine's paint device.

    The engine's shared text option has its direction set from the render
    flags of `text_item` (right-to-left if the RightToLeft flag is set,
    otherwise left-to-right) and is then applied to the new layout.
    '''
    layout = QTextLayout(text, text_item.font(), self.paintDevice())
    if text_item.renderFlags() & text_item.RightToLeft:
        direction = Qt.RightToLeft
    else:
        direction = Qt.LeftToRight
    self.text_option.setTextDirection(direction)
    layout.setTextOption(self.text_option)
    return layout
def update_glyph_map(self, text, indices, text_item, glyph_map):
    '''
    Map glyphs back to the unicode text they represent.

    Walks `indices` (the glyph ids of one glyph run) in step with `text`.
    For every glyph, the text it stands for is either taken from
    `glyph_map` (when the cached string matches the text at the current
    position) or discovered by laying out progressively longer prefixes of
    the remaining text until the first glyph produced equals the glyph
    being looked up. Newly discovered mappings are stored in `glyph_map`.

    :param text: The unicode text not yet consumed by previous runs
    :param indices: Iterable of glyph ids for the current run
    :param text_item: The text item being drawn (supplies font/direction)
    :param glyph_map: dict of glyph id -> unicode string, updated in place
    :return: The part of `text` that was not consumed by `indices`

    If no mapping can be found for a glyph, a message is appended to
    self.debug and processing of the run stops at that glyph.
    '''
    # Longest run of characters that is tried as the source of one glyph
    max_probe = 9
    # The layout is only needed when a glyph is missing from glyph_map, so
    # it is created on first use. A one element list is used as the cell
    # because this code has to run on python 2 (no nonlocal).
    layout_cell = [None]

    def get_glyphs(string):
        tl = layout_cell[0]
        if tl is None:
            tl = layout_cell[0] = self.get_text_layout(text_item, '')
        tl.setText(string)
        tl.beginLayout()
        line = tl.createLine()
        if not line.isValid():
            tl.endLayout()
            return []
        # Effectively unlimited width so that the string stays on one line
        line.setLineWidth(int(1e12))
        tl.endLayout()
        ans = []
        for run in tl.glyphRuns():
            ans.extend(run.glyphIndexes())
        return ans

    pos = 0
    for glyph in indices:
        known = glyph_map.get(glyph)
        if known is not None and known == text[pos:pos+len(known)]:
            pos += len(known)
            continue

        # Never probe past the end of the text: the slices would only
        # repeat strings that have already been tried (or be empty).
        limit = min(max_probe, len(text) - pos)
        for length in range(1, limit + 1):
            candidate = text[pos:pos+length]
            glyphs = get_glyphs(candidate)
            if glyphs and glyphs[0] == glyph:
                glyph_map[glyphs[0]] = candidate
                pos += length
                break
        else:
            self.debug.append(
                'Failed to find glyph->unicode mapping for text: %s'%text)
            break

    return text[pos:]
@store_error @store_error
def drawTextItem(self, point, text_item): def drawTextItem(self, point, text_item):
# super(PdfEngine, self).drawTextItem(point+QPoint(0, 0), text_item) # super(PdfEngine, self).drawTextItem(point+QPoint(0, 0), text_item)
text = type(u'')(text_item.text()).replace('\n', ' ') text = type(u'')(text_item.text()).replace('\n', ' ')
text = unicodedata.normalize('NFKC', text) text = unicodedata.normalize('NFKC', text)
tl = QTextLayout(text, text_item.font(), self.paintDevice()) tl = self.get_text_layout(text_item, text)
self.text_option.setTextDirection(Qt.RightToLeft if
text_item.renderFlags() & text_item.RightToLeft else Qt.LeftToRight)
tl.setTextOption(self.text_option)
tl.setPosition(point) tl.setPosition(point)
tl.beginLayout() tl.beginLayout()
line = tl.createLine() line = tl.createLine()
@ -375,9 +433,10 @@ class PdfEngine(QPaintEngine):
rf = run.rawFont() rf = run.rawFont()
name = hash(bytes(rf.fontTable('name'))) name = hash(bytes(rf.fontTable('name')))
if name not in self.fonts: if name not in self.fonts:
self.fonts[name] = FontMetrics(Sfnt(rf)) self.fonts[name] = Font(Sfnt(rf))
metrics = self.fonts[name] metrics = self.fonts[name]
indices = run.glyphIndexes() indices = run.glyphIndexes()
text = self.update_glyph_map(text, indices, text_item, metrics.glyph_map)
glyphs = [] glyphs = []
pdf_pos = point pdf_pos = point
first_baseline = None first_baseline = None
@ -489,7 +548,7 @@ if __name__ == '__main__':
# f.setUnderline(True) # f.setUnderline(True)
# f.setOverline(True) # f.setOverline(True)
# f.setStrikeOut(True) # f.setStrikeOut(True)
f.setFamily('OpenDyslexic') f.setFamily('Calibri')
p.setFont(f) p.setFont(f)
# p.scale(2, 2) # p.scale(2, 2)
# p.rotate(45) # p.rotate(45)
@ -497,6 +556,8 @@ if __name__ == '__main__':
p.drawText(QPoint(100, 300), 'Some text ū --- Д AV ff ff') p.drawText(QPoint(100, 300), 'Some text ū --- Д AV ff ff')
finally: finally:
p.end() p.end()
for line in dev.engine.debug:
print (line)
if dev.engine.errors: if dev.engine.errors:
for err in dev.engine.errors: print (err) for err in dev.engine.errors: print (err)
raise SystemExit(1) raise SystemExit(1)

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re, unicodedata import re
from itertools import izip, groupby from itertools import izip, groupby
from operator import itemgetter from operator import itemgetter
from collections import Counter, OrderedDict from collections import Counter, OrderedDict
@ -58,7 +58,7 @@ class FontStream(Stream):
d['Subtype'] = Name('CIDFontType0C') d['Subtype'] = Name('CIDFontType0C')
def to_hex_string(c): def to_hex_string(c):
return bytes(hex(c)[2:]).rjust(4, b'0').decode('ascii') return bytes(hex(int(c))[2:]).rjust(4, b'0').decode('ascii')
class CMap(Stream): class CMap(Stream):
@ -154,19 +154,17 @@ class Font(object):
self.font_descriptor['FontFile'+('3' if self.is_otf else '2') self.font_descriptor['FontFile'+('3' if self.is_otf else '2')
] = objects.add(self.font_stream) ] = objects.add(self.font_stream)
self.write_widths(objects) self.write_widths(objects)
glyph_map = self.metrics.sfnt['cmap'].get_char_codes(self.used_glyphs) self.write_to_unicode(objects)
self.write_to_unicode(objects, glyph_map) pdf_subset(self.metrics.sfnt, self.used_glyphs)
pdf_subset(self.metrics.sfnt, set(glyph_map))
if self.is_otf: if self.is_otf:
self.font_stream.write(self.metrics.sfnt['CFF '].raw) self.font_stream.write(self.metrics.sfnt['CFF '].raw)
else: else:
self.metrics.os2.zero_fstype() self.metrics.os2.zero_fstype()
self.metrics.sfnt(self.font_stream) self.metrics.sfnt(self.font_stream)
def write_to_unicode(self, objects, glyph_map): def write_to_unicode(self, objects):
glyph_map = {k:unicodedata.normalize('NFKC', unichr(v)) for k, v in cmap = CMap(self.metrics.postscript_name, self.metrics.glyph_map,
glyph_map.iteritems()} compress=self.compress)
cmap = CMap(self.metrics.postscript_name, glyph_map, compress=self.compress)
self.font_dict['ToUnicode'] = objects.add(cmap) self.font_dict['ToUnicode'] = objects.add(cmap)
def write_widths(self, objects): def write_widths(self, objects):

View File

@ -13,7 +13,7 @@ __docformat__ = 'restructuredtext en'
from struct import unpack_from, calcsize, pack from struct import unpack_from, calcsize, pack
from collections import OrderedDict from collections import OrderedDict
from calibre.utils.fonts.utils import get_bmp_glyph_ids, read_bmp_prefix from calibre.utils.fonts.utils import get_bmp_glyph_ids
from calibre.utils.fonts.sfnt import UnknownTable, max_power_of_two from calibre.utils.fonts.sfnt import UnknownTable, max_power_of_two
from calibre.utils.fonts.sfnt.errors import UnsupportedFont from calibre.utils.fonts.sfnt.errors import UnsupportedFont
@ -165,33 +165,6 @@ class CmapTable(UnknownTable):
ans[chars[i]] = glyph_id ans[chars[i]] = glyph_id
return ans return ans
def get_char_codes(self, glyph_ids):
    '''
    Build a reverse character map for the specified glyphs.

    :param glyph_ids: Iterable of glyph ids to look up
    :return: dict mapping glyph id -> character code, for every glyph in
        `glyph_ids` that is reachable from the BMP cmap subtable. If
        several character codes map to the same glyph, the highest code
        visited last wins.
    :raises UnsupportedFont: If the font has no BMP cmap subtable
    '''
    if self.bmp_table is None:
        raise UnsupportedFont('This font has no Windows BMP cmap subtable.'
                ' Most likely a special purpose font.')
    (start_count, end_count, range_offset, id_delta, glyph_id_len,
        glyph_id_map, array_len) = read_bmp_prefix(self.bmp_table, 0)
    wanted = frozenset(glyph_ids)
    ans = {}

    for i, ec in enumerate(end_count):
        sc, ro, delta = start_count[i], range_offset[i], id_delta[i]
        for code in xrange(sc, ec+1):
            if ro == 0:
                glyph_id = delta + code
            else:
                idx = ro//2 + (code - sc) + i - array_len
                glyph_id = glyph_id_map[idx]
                if glyph_id != 0:
                    glyph_id += delta
            # Glyph id arithmetic in a format 4 subtable is modulo 65536.
            # The previous modulus of 0x1000 wrapped every glyph id >= 4096
            # onto the wrong glyph.
            glyph_id %= 0x10000
            if glyph_id in wanted:
                ans[glyph_id] = code
    return ans
def set_character_map(self, cmap): def set_character_map(self, cmap):
self.version, self.num_tables = 0, 1 self.version, self.num_tables = 0, 1
fmt = b'>7H' fmt = b'>7H'