PDF Output: Fix incorrect encoding of ASCII control characters and some Chines characters in metadata and Table of Contents strings. Fixes #1433848 [Incorrect Chinese Characters in PDF TOC converted from EPUB](https://bugs.launchpad.net/calibre/+bug/1433848)

This commit is contained in:
Kovid Goyal 2015-03-19 09:48:28 +05:30
parent 647fdaad9a
commit 7f2be1f6ae

View File

@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en'
import codecs, zlib import codecs, zlib
from io import BytesIO from io import BytesIO
from datetime import datetime from datetime import datetime
from binascii import hexlify
from calibre.constants import plugins, ispy3 from calibre.constants import plugins, ispy3
@ -93,43 +94,50 @@ class Name(unicode):
in raw] in raw]
stream.write(b'/'+b''.join(buf)) stream.write(b'/'+b''.join(buf))
def escape_unbalanced_parantheses(bytestring): def escape_pdf_string(bytestring):
indices = [] indices = []
bad = [] bad = []
ba = bytearray(bytestring) ba = bytearray(bytestring)
bad_map = {10:ord('n'), 13:ord('r'), 12:ord('f'), 8:ord('b'), 9:ord('\t'), 92:ord('\\')}
for i, num in enumerate(ba): for i, num in enumerate(ba):
if num == 40: # ( if num == 40: # (
indices.append(i) indices.append((i, 40))
elif num == 41: # ) elif num == 41: # )
if indices: if indices:
indices.pop() indices.pop()
else: else:
bad.append(i) bad.append((i, 41))
bad = sorted(list(indices) + bad, reverse=True) elif num in bad_map: # '\n\r\f\b\t\\' see Table 3.2 in PDF 1.7 spec
bad.append((i, bad_map[num]))
bad = sorted(indices + bad, reverse=True)
if not bad: if not bad:
return bytestring return bytestring
for i in bad: for i, repl in bad:
ba.insert(i, 92) # \ ba[i:i+1] = (92, repl) # 92 = ord('\')
return bytes(ba) return bytes(ba)
class String(unicode): class String(unicode):
def pdf_serialize(self, stream): def pdf_serialize(self, stream):
s = self.replace('\\', '\\\\')
try: try:
raw = s.encode('latin1') raw = self.encode('latin1')
if raw.startswith(codecs.BOM_UTF16_BE): if raw.startswith(codecs.BOM_UTF16_BE):
raw = codecs.BOM_UTF16_BE + s.encode('utf-16-be') raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
except UnicodeEncodeError: except UnicodeEncodeError:
raw = codecs.BOM_UTF16_BE + s.encode('utf-16-be') raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
stream.write(b'('+escape_unbalanced_parantheses(raw)+b')') stream.write(b'('+escape_pdf_string(raw)+b')')
class UTF16String(unicode): class UTF16String(unicode):
def pdf_serialize(self, stream): def pdf_serialize(self, stream):
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be').replace(b'\\', b'\\\\') raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
stream.write(b'('+escape_unbalanced_parantheses(raw)+b')') if False:
# Disabled as the parentheses based strings give easier to debug
# PDF files
stream.write(b'<' + hexlify(raw) + b'>')
else:
stream.write(b'('+escape_pdf_string(raw)+b')')
class Dictionary(dict): class Dictionary(dict):