From 7f2be1f6ae28300128769b1aa035f36757b8d43c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 19 Mar 2015 09:48:28 +0530 Subject: [PATCH] PDF Output: Fix incorrect encoding of ASCII control characters and some Chines characters in metadata and Table of Contents strings. Fixes #1433848 [Incorrect Chinese Characters in PDF TOC converted from EPUB](https://bugs.launchpad.net/calibre/+bug/1433848) --- src/calibre/ebooks/pdf/render/common.py | 34 +++++++++++++++---------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/pdf/render/common.py b/src/calibre/ebooks/pdf/render/common.py index b351ce5c08..bd5e8e2fe1 100644 --- a/src/calibre/ebooks/pdf/render/common.py +++ b/src/calibre/ebooks/pdf/render/common.py @@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en' import codecs, zlib from io import BytesIO from datetime import datetime +from binascii import hexlify from calibre.constants import plugins, ispy3 @@ -93,43 +94,50 @@ class Name(unicode): in raw] stream.write(b'/'+b''.join(buf)) -def escape_unbalanced_parantheses(bytestring): +def escape_pdf_string(bytestring): indices = [] bad = [] ba = bytearray(bytestring) + bad_map = {10:ord('n'), 13:ord('r'), 12:ord('f'), 8:ord('b'), 9:ord('\t'), 92:ord('\\')} for i, num in enumerate(ba): if num == 40: # ( - indices.append(i) + indices.append((i, 40)) elif num == 41: # ) if indices: indices.pop() else: - bad.append(i) - bad = sorted(list(indices) + bad, reverse=True) + bad.append((i, 41)) + elif num in bad_map: # '\n\r\f\b\t\\' see Table 3.2 in PDF 1.7 spec + bad.append((i, bad_map[num])) + bad = sorted(indices + bad, reverse=True) if not bad: return bytestring - for i in bad: - ba.insert(i, 92) # \ + for i, repl in bad: + ba[i:i+1] = (92, repl) # 92 = ord('\') return bytes(ba) class String(unicode): def pdf_serialize(self, stream): - s = self.replace('\\', '\\\\') try: - raw = s.encode('latin1') + raw = self.encode('latin1') if raw.startswith(codecs.BOM_UTF16_BE): - raw = codecs.BOM_UTF16_BE + s.encode('utf-16-be') + raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be') except UnicodeEncodeError: - raw = codecs.BOM_UTF16_BE + s.encode('utf-16-be') - stream.write(b'('+escape_unbalanced_parantheses(raw)+b')') + raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be') + stream.write(b'('+escape_pdf_string(raw)+b')') class UTF16String(unicode): def pdf_serialize(self, stream): - raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be').replace(b'\\', b'\\\\') - stream.write(b'('+escape_unbalanced_parantheses(raw)+b')') + raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be') + if False: + # Disabled as the parentheses based strings give easier to debug + # PDF files + stream.write(b'<' + hexlify(raw) + b'>') + else: + stream.write(b'('+escape_pdf_string(raw)+b')') class Dictionary(dict):