PDF Output: Fix incorrect encoding of ASCII control characters and some Chines characters in metadata and Table of Contents strings. Fixes #1433848 [Incorrect Chinese Characters in PDF TOC converted from EPUB](https://bugs.launchpad.net/calibre/+bug/1433848)

This commit is contained in:
Kovid Goyal 2015-03-19 09:48:28 +05:30
parent 647fdaad9a
commit 7f2be1f6ae

View File

@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en'
import codecs, zlib
from io import BytesIO
from datetime import datetime
from binascii import hexlify
from calibre.constants import plugins, ispy3
@ -93,43 +94,50 @@ class Name(unicode):
in raw]
stream.write(b'/'+b''.join(buf))
def escape_unbalanced_parantheses(bytestring):
def escape_pdf_string(bytestring):
indices = []
bad = []
ba = bytearray(bytestring)
bad_map = {10:ord('n'), 13:ord('r'), 12:ord('f'), 8:ord('b'), 9:ord('\t'), 92:ord('\\')}
for i, num in enumerate(ba):
if num == 40: # (
indices.append(i)
indices.append((i, 40))
elif num == 41: # )
if indices:
indices.pop()
else:
bad.append(i)
bad = sorted(list(indices) + bad, reverse=True)
bad.append((i, 41))
elif num in bad_map: # '\n\r\f\b\t\\' see Table 3.2 in PDF 1.7 spec
bad.append((i, bad_map[num]))
bad = sorted(indices + bad, reverse=True)
if not bad:
return bytestring
for i in bad:
ba.insert(i, 92) # \
for i, repl in bad:
ba[i:i+1] = (92, repl) # 92 = ord('\')
return bytes(ba)
class String(unicode):
def pdf_serialize(self, stream):
s = self.replace('\\', '\\\\')
try:
raw = s.encode('latin1')
raw = self.encode('latin1')
if raw.startswith(codecs.BOM_UTF16_BE):
raw = codecs.BOM_UTF16_BE + s.encode('utf-16-be')
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
except UnicodeEncodeError:
raw = codecs.BOM_UTF16_BE + s.encode('utf-16-be')
stream.write(b'('+escape_unbalanced_parantheses(raw)+b')')
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
stream.write(b'('+escape_pdf_string(raw)+b')')
class UTF16String(unicode):
def pdf_serialize(self, stream):
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be').replace(b'\\', b'\\\\')
stream.write(b'('+escape_unbalanced_parantheses(raw)+b')')
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
if False:
# Disabled as the parentheses based strings give easier to debug
# PDF files
stream.write(b'<' + hexlify(raw) + b'>')
else:
stream.write(b'('+escape_pdf_string(raw)+b')')
class Dictionary(dict):