More robust implementation of glyph to unicode mapping

This commit is contained in:
Kovid Goyal 2012-12-21 12:14:31 +05:30
parent 302ce66f76
commit 8352098513
4 changed files with 77 additions and 45 deletions

View File

@ -91,7 +91,7 @@ class GlyphIndex(object):
else:
byts = bytearray(pack(b'>H', self.code))
stream.write('<%s>'%''.join(map(
lambda x: bytes(hex(x)[2:]).rjust(2, b'0'), byts)))
lambda x: bytes(hex(int(x))[2:]).rjust(2, b'0'), byts)))
class Dictionary(dict):

View File

@ -205,6 +205,12 @@ class GraphicsState(object): # {{{
# }}}
class Font(FontMetrics):
    '''
    Font metrics extended with a per-font ``glyph_map`` dictionary.

    The dictionary starts out empty. Judging by its name and how it is handed
    to the glyph mapping code, it is meant to associate glyph indexes with the
    text they represent.
    '''

    def __init__(self, sfnt):
        # Let the base class process the sfnt container first
        FontMetrics.__init__(self, sfnt)
        # glyph -> text mapping, populated lazily by users of this object
        self.glyph_map = {}
class PdfEngine(QPaintEngine):
def __init__(self, file_object, page_width, page_height, left_margin,
@ -235,7 +241,7 @@ class PdfEngine(QPaintEngine):
self.scale = sqrt(sy**2 + sx**2)
self.xscale, self.yscale = sx, sy
self.graphics_state = GraphicsState()
self.errors = []
self.errors, self.debug = [], []
self.text_option = QTextOption()
self.text_option.setWrapMode(QTextOption.NoWrap)
self.fonts = {}
@ -354,15 +360,67 @@ class PdfEngine(QPaintEngine):
self.pdf.draw_rect(bl.x(), bl.y(), rect.width(), rect.height(),
stroke=self.do_stroke, fill=self.do_fill)
def get_text_layout(self, text_item, text):
    '''
    Create a QTextLayout for ``text`` that uses the font of ``text_item``.

    The shared ``self.text_option`` object is updated with the text
    direction taken from the render flags of ``text_item`` and is then
    installed on the new layout, which is returned.
    '''
    layout = QTextLayout(text, text_item.font(), self.paintDevice())
    # Follow the direction requested by the text item
    if text_item.renderFlags() & text_item.RightToLeft:
        direction = Qt.RightToLeft
    else:
        direction = Qt.LeftToRight
    option = self.text_option
    option.setTextDirection(direction)
    layout.setTextOption(option)
    return layout
def update_glyph_map(self, text, indices, text_item, glyph_map):
    '''
    Map glyphs back to the unicode text they represent.

    Walks ``indices`` in order, working out for every glyph which piece of
    ``text`` (starting at the current position) produces it. Mappings that
    are discovered are stored in ``glyph_map``, which is modified in place.

    :param text: The text that was laid out to produce ``indices``
    :param indices: Iterable of glyph indexes, in the order they were
        produced for ``text``
    :param text_item: Text item whose font and render flags are used (via
        :meth:`get_text_layout`) for the probe layouts
    :param glyph_map: dict of glyph index -> text, used as a cache and
        updated with any newly found mappings

    :return: The part of ``text`` that was not consumed by ``indices``. If
        a glyph cannot be mapped, a message is appended to ``self.debug``
        and processing stops at that glyph.
    '''
    pos = 0
    tl = self.get_text_layout(text_item, '')
    indices = list(indices)
    text_len = len(text)
    # Longest piece of text that is tried for a single glyph
    max_probe = 9

    def get_glyphs(string):
        # Lay out string on a single line and collect the glyph indexes
        # from all the resulting glyph runs
        tl.setText(string)
        tl.beginLayout()
        line = tl.createLine()
        if not line.isValid():
            tl.endLayout()
            return []
        # Huge width, presumably so that the text is never wrapped
        line.setLineWidth(int(1e12))
        tl.endLayout()
        ans = []
        for run in tl.glyphRuns():
            ans.extend(run.glyphIndexes())
        return ans

    ipos = 0
    while ipos < len(indices):
        glyph = indices[ipos]
        if glyph in glyph_map:
            cached = glyph_map[glyph]
            # Only trust the cached mapping if it matches the text at the
            # current position, otherwise fall through and probe again
            if cached == text[pos:pos+len(cached)]:
                pos += len(cached)
                ipos += 1
                continue

        # Never probe past the end of the text: slices beyond it are all
        # identical (or empty) and would only repeat the same layout
        limit = min(max_probe, text_len - pos)
        for length in xrange(1, limit + 1):
            string = text[pos:pos+length]
            g = get_glyphs(string)
            if g and g[0] == glyph:
                glyph_map[g[0]] = string
                break
        else:
            self.debug.append(
                'Failed to find glyph->unicode mapping for text: %s'%text)
            break
        ipos += 1
        pos += length

    return text[pos:]
@store_error
def drawTextItem(self, point, text_item):
# super(PdfEngine, self).drawTextItem(point+QPoint(0, 0), text_item)
text = type(u'')(text_item.text()).replace('\n', ' ')
text = unicodedata.normalize('NFKC', text)
tl = QTextLayout(text, text_item.font(), self.paintDevice())
self.text_option.setTextDirection(Qt.RightToLeft if
text_item.renderFlags() & text_item.RightToLeft else Qt.LeftToRight)
tl.setTextOption(self.text_option)
tl = self.get_text_layout(text_item, text)
tl.setPosition(point)
tl.beginLayout()
line = tl.createLine()
@ -375,9 +433,10 @@ class PdfEngine(QPaintEngine):
rf = run.rawFont()
name = hash(bytes(rf.fontTable('name')))
if name not in self.fonts:
self.fonts[name] = FontMetrics(Sfnt(rf))
self.fonts[name] = Font(Sfnt(rf))
metrics = self.fonts[name]
indices = run.glyphIndexes()
text = self.update_glyph_map(text, indices, text_item, metrics.glyph_map)
glyphs = []
pdf_pos = point
first_baseline = None
@ -489,7 +548,7 @@ if __name__ == '__main__':
# f.setUnderline(True)
# f.setOverline(True)
# f.setStrikeOut(True)
f.setFamily('OpenDyslexic')
f.setFamily('Calibri')
p.setFont(f)
# p.scale(2, 2)
# p.rotate(45)
@ -497,6 +556,8 @@ if __name__ == '__main__':
p.drawText(QPoint(100, 300), 'Some text ū --- Д AV ff ff')
finally:
p.end()
for line in dev.engine.debug:
print (line)
if dev.engine.errors:
for err in dev.engine.errors: print (err)
raise SystemExit(1)

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, unicodedata
import re
from itertools import izip, groupby
from operator import itemgetter
from collections import Counter, OrderedDict
@ -58,7 +58,7 @@ class FontStream(Stream):
d['Subtype'] = Name('CIDFontType0C')
def to_hex_string(c):
    '''
    Return ``c`` as a lowercase hexadecimal string, zero padded to at least
    four digits.

    ``c`` is coerced with int() first, so integer-like values are accepted.
    The result is decoded from ASCII, so it is a text string.
    '''
    # A single formatting operation replaces slicing the output of hex(),
    # and there is only one return statement left
    return (b'%04x' % int(c)).decode('ascii')
class CMap(Stream):
@ -154,19 +154,17 @@ class Font(object):
self.font_descriptor['FontFile'+('3' if self.is_otf else '2')
] = objects.add(self.font_stream)
self.write_widths(objects)
glyph_map = self.metrics.sfnt['cmap'].get_char_codes(self.used_glyphs)
self.write_to_unicode(objects, glyph_map)
pdf_subset(self.metrics.sfnt, set(glyph_map))
self.write_to_unicode(objects)
pdf_subset(self.metrics.sfnt, self.used_glyphs)
if self.is_otf:
self.font_stream.write(self.metrics.sfnt['CFF '].raw)
else:
self.metrics.os2.zero_fstype()
self.metrics.sfnt(self.font_stream)
def write_to_unicode(self, objects, glyph_map):
glyph_map = {k:unicodedata.normalize('NFKC', unichr(v)) for k, v in
glyph_map.iteritems()}
cmap = CMap(self.metrics.postscript_name, glyph_map, compress=self.compress)
def write_to_unicode(self, objects):
cmap = CMap(self.metrics.postscript_name, self.metrics.glyph_map,
compress=self.compress)
self.font_dict['ToUnicode'] = objects.add(cmap)
def write_widths(self, objects):

View File

@ -13,7 +13,7 @@ __docformat__ = 'restructuredtext en'
from struct import unpack_from, calcsize, pack
from collections import OrderedDict
from calibre.utils.fonts.utils import get_bmp_glyph_ids, read_bmp_prefix
from calibre.utils.fonts.utils import get_bmp_glyph_ids
from calibre.utils.fonts.sfnt import UnknownTable, max_power_of_two
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
@ -165,33 +165,6 @@ class CmapTable(UnknownTable):
ans[chars[i]] = glyph_id
return ans
def get_char_codes(self, glyph_ids):
    '''
    Build a reverse character map for the specified glyphs.

    Every character code covered by the segments read from the BMP cmap
    subtable is resolved to a glyph id; codes whose glyph id is in
    ``glyph_ids`` are recorded.

    :param glyph_ids: Iterable of the glyph ids of interest

    :return: dict mapping glyph id -> character code. If several character
        codes resolve to the same glyph, the last one encountered wins.

    :raises UnsupportedFont: If there is no BMP cmap subtable
    '''
    if self.bmp_table is None:
        raise UnsupportedFont('This font has no Windows BMP cmap subtable.'
                ' Most likely a special purpose font.')
    ans = {}
    (start_count, end_count, range_offset, id_delta, glyph_id_len,
        glyph_id_map, array_len) = read_bmp_prefix(self.bmp_table, 0)
    glyph_ids = frozenset(glyph_ids)

    for i, ec in enumerate(end_count):
        sc = start_count[i]
        ro = range_offset[i]
        delta = id_delta[i]
        for code in xrange(sc, ec+1):
            if ro == 0:
                # Glyph id is computed directly from the character code
                glyph_id = delta + code
            else:
                # Glyph id is looked up in the glyph id array
                idx = ro//2 + (code - sc) + i - array_len
                glyph_id = glyph_id_map[idx]
                if glyph_id != 0:
                    glyph_id += delta
            # Glyph id arithmetic is modulo 65536. This used to be 0x1000,
            # which mangled every glyph id >= 4096
            glyph_id %= 0x10000
            if glyph_id in glyph_ids:
                ans[glyph_id] = code
    return ans
def set_character_map(self, cmap):
self.version, self.num_tables = 0, 1
fmt = b'>7H'