From 835209851381f3a4e30d2aa34b9a69c1522335ac Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 21 Dec 2012 12:14:31 +0530 Subject: [PATCH] More robust implementation of glyph to unicode mapping --- src/calibre/ebooks/pdf/render/common.py | 2 +- src/calibre/ebooks/pdf/render/engine.py | 75 ++++++++++++++++++++++--- src/calibre/ebooks/pdf/render/fonts.py | 16 +++--- src/calibre/utils/fonts/sfnt/cmap.py | 29 +--------- 4 files changed, 77 insertions(+), 45 deletions(-) diff --git a/src/calibre/ebooks/pdf/render/common.py b/src/calibre/ebooks/pdf/render/common.py index 5a63fd8b99..66add0b64e 100644 --- a/src/calibre/ebooks/pdf/render/common.py +++ b/src/calibre/ebooks/pdf/render/common.py @@ -91,7 +91,7 @@ class GlyphIndex(object): else: byts = bytearray(pack(b'>H', self.code)) stream.write('<%s>'%''.join(map( - lambda x: bytes(hex(x)[2:]).rjust(2, b'0'), byts))) + lambda x: bytes(hex(int(x))[2:]).rjust(2, b'0'), byts))) class Dictionary(dict): diff --git a/src/calibre/ebooks/pdf/render/engine.py b/src/calibre/ebooks/pdf/render/engine.py index a1f0361473..41737402ea 100644 --- a/src/calibre/ebooks/pdf/render/engine.py +++ b/src/calibre/ebooks/pdf/render/engine.py @@ -205,6 +205,12 @@ class GraphicsState(object): # {{{ # }}} +class Font(FontMetrics): + + def __init__(self, sfnt): + FontMetrics.__init__(self, sfnt) + self.glyph_map = {} + class PdfEngine(QPaintEngine): def __init__(self, file_object, page_width, page_height, left_margin, @@ -235,7 +241,7 @@ class PdfEngine(QPaintEngine): self.scale = sqrt(sy**2 + sx**2) self.xscale, self.yscale = sx, sy self.graphics_state = GraphicsState() - self.errors = [] + self.errors, self.debug = [], [] self.text_option = QTextOption() self.text_option.setWrapMode(QTextOption.NoWrap) self.fonts = {} @@ -354,15 +360,67 @@ class PdfEngine(QPaintEngine): self.pdf.draw_rect(bl.x(), bl.y(), rect.width(), rect.height(), stroke=self.do_stroke, fill=self.do_fill) + def get_text_layout(self, text_item, text): + tl = QTextLayout(text, text_item.font(), self.paintDevice()) + self.text_option.setTextDirection(Qt.RightToLeft if + text_item.renderFlags() & text_item.RightToLeft else Qt.LeftToRight) + tl.setTextOption(self.text_option) + return tl + + def update_glyph_map(self, text, indices, text_item, glyph_map): + ''' + Map glyphs back to the unicode text they represent. + ''' + pos = 0 + tl = self.get_text_layout(text_item, '') + indices = list(indices) + + def get_glyphs(string): + tl.setText(string) + tl.beginLayout() + line = tl.createLine() + if not line.isValid(): + tl.endLayout() + return [] + line.setLineWidth(int(1e12)) + tl.endLayout() + ans = [] + for run in tl.glyphRuns(): + ans.extend(run.glyphIndexes()) + return ans + + ipos = 0 + while ipos < len(indices): + if indices[ipos] in glyph_map: + t = glyph_map[indices[ipos]] + if t == text[pos:pos+len(t)]: + pos += len(t) + ipos += 1 + continue + + found = False + for l in xrange(1, 10): + string = text[pos:pos+l] + g = get_glyphs(string) + if g and g[0] == indices[ipos]: + found = True + glyph_map[g[0]] = string + break + if not found: + self.debug.append( + 'Failed to find glyph->unicode mapping for text: %s'%text) + break + ipos += 1 + pos += l + + return text[pos:] + @store_error def drawTextItem(self, point, text_item): # super(PdfEngine, self).drawTextItem(point+QPoint(0, 0), text_item) text = type(u'')(text_item.text()).replace('\n', ' ') text = unicodedata.normalize('NFKC', text) - tl = QTextLayout(text, text_item.font(), self.paintDevice()) - self.text_option.setTextDirection(Qt.RightToLeft if - text_item.renderFlags() & text_item.RightToLeft else Qt.LeftToRight) - tl.setTextOption(self.text_option) + tl = self.get_text_layout(text_item, text) tl.setPosition(point) tl.beginLayout() line = tl.createLine() @@ -375,9 +433,10 @@ class PdfEngine(QPaintEngine): rf = run.rawFont() name = hash(bytes(rf.fontTable('name'))) if name not in self.fonts: - self.fonts[name] = FontMetrics(Sfnt(rf)) + self.fonts[name] = Font(Sfnt(rf)) metrics = self.fonts[name] indices = run.glyphIndexes() + text = self.update_glyph_map(text, indices, text_item, metrics.glyph_map) glyphs = [] pdf_pos = point first_baseline = None @@ -489,7 +548,7 @@ if __name__ == '__main__': # f.setUnderline(True) # f.setOverline(True) # f.setStrikeOut(True) - f.setFamily('OpenDyslexic') + f.setFamily('Calibri') p.setFont(f) # p.scale(2, 2) # p.rotate(45) @@ -497,6 +556,8 @@ if __name__ == '__main__': p.drawText(QPoint(100, 300), 'Some text ū --- Д AV ff ff') finally: p.end() + for line in dev.engine.debug: + print (line) if dev.engine.errors: for err in dev.engine.errors: print (err) raise SystemExit(1) diff --git a/src/calibre/ebooks/pdf/render/fonts.py b/src/calibre/ebooks/pdf/render/fonts.py index b0012c3dd7..e99cc7c218 100644 --- a/src/calibre/ebooks/pdf/render/fonts.py +++ b/src/calibre/ebooks/pdf/render/fonts.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import re, unicodedata +import re from itertools import izip, groupby from operator import itemgetter from collections import Counter, OrderedDict @@ -58,7 +58,7 @@ class FontStream(Stream): d['Subtype'] = Name('CIDFontType0C') def to_hex_string(c): - return bytes(hex(c)[2:]).rjust(4, b'0').decode('ascii') + return bytes(hex(int(c))[2:]).rjust(4, b'0').decode('ascii') class CMap(Stream): @@ -154,19 +154,17 @@ class Font(object): self.font_descriptor['FontFile'+('3' if self.is_otf else '2') ] = objects.add(self.font_stream) self.write_widths(objects) - glyph_map = self.metrics.sfnt['cmap'].get_char_codes(self.used_glyphs) - self.write_to_unicode(objects, glyph_map) - pdf_subset(self.metrics.sfnt, set(glyph_map)) + self.write_to_unicode(objects) + pdf_subset(self.metrics.sfnt, self.used_glyphs) if self.is_otf: self.font_stream.write(self.metrics.sfnt['CFF '].raw) else: self.metrics.os2.zero_fstype() self.metrics.sfnt(self.font_stream) - def write_to_unicode(self, objects, glyph_map): - glyph_map = {k:unicodedata.normalize('NFKC', unichr(v)) for k, v in - glyph_map.iteritems()} - cmap = CMap(self.metrics.postscript_name, glyph_map, compress=self.compress) + def write_to_unicode(self, objects): + cmap = CMap(self.metrics.postscript_name, self.metrics.glyph_map, + compress=self.compress) self.font_dict['ToUnicode'] = objects.add(cmap) def write_widths(self, objects): diff --git a/src/calibre/utils/fonts/sfnt/cmap.py b/src/calibre/utils/fonts/sfnt/cmap.py index 13835f173d..a00eb56d6f 100644 --- a/src/calibre/utils/fonts/sfnt/cmap.py +++ b/src/calibre/utils/fonts/sfnt/cmap.py @@ -13,7 +13,7 @@ __docformat__ = 'restructuredtext en' from struct import unpack_from, calcsize, pack from collections import OrderedDict -from calibre.utils.fonts.utils import get_bmp_glyph_ids, read_bmp_prefix +from calibre.utils.fonts.utils import get_bmp_glyph_ids from calibre.utils.fonts.sfnt import UnknownTable, max_power_of_two from calibre.utils.fonts.sfnt.errors import UnsupportedFont @@ -165,33 +165,6 @@ class CmapTable(UnknownTable): ans[chars[i]] = glyph_id return ans - def get_char_codes(self, glyph_ids): - if self.bmp_table is None: - raise UnsupportedFont('This font has no Windows BMP cmap subtable.' - ' Most likely a special purpose font.') - ans = {} - (start_count, end_count, range_offset, id_delta, glyph_id_len, - glyph_id_map, array_len) = read_bmp_prefix(self.bmp_table, 0) - - glyph_ids = frozenset(glyph_ids) - - for i, ec in enumerate(end_count): - sc = start_count[i] - ro = range_offset[i] - for code in xrange(sc, ec+1): - if ro == 0: - glyph_id = id_delta[i] + code - else: - idx = ro//2 + (code - sc) + i - array_len - glyph_id = glyph_id_map[idx] - if glyph_id != 0: - glyph_id += id_delta[i] - glyph_id %= 0x1000 - if glyph_id in glyph_ids: - ans[glyph_id] = code - - return ans - def set_character_map(self, cmap): self.version, self.num_tables = 0, 1 fmt = b'>7H'