mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
PDF Output: Fix incorrect encoding of ASCII control characters and some Chines characters in metadata and Table of Contents strings. Fixes #1433848 [Incorrect Chinese Characters in PDF TOC converted from EPUB](https://bugs.launchpad.net/calibre/+bug/1433848)
This commit is contained in:
parent
647fdaad9a
commit
7f2be1f6ae
@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en'
|
||||
import codecs, zlib
|
||||
from io import BytesIO
|
||||
from datetime import datetime
|
||||
from binascii import hexlify
|
||||
|
||||
from calibre.constants import plugins, ispy3
|
||||
|
||||
@ -93,43 +94,50 @@ class Name(unicode):
|
||||
in raw]
|
||||
stream.write(b'/'+b''.join(buf))
|
||||
|
||||
def escape_unbalanced_parantheses(bytestring):
|
||||
def escape_pdf_string(bytestring):
|
||||
indices = []
|
||||
bad = []
|
||||
ba = bytearray(bytestring)
|
||||
bad_map = {10:ord('n'), 13:ord('r'), 12:ord('f'), 8:ord('b'), 9:ord('\t'), 92:ord('\\')}
|
||||
for i, num in enumerate(ba):
|
||||
if num == 40: # (
|
||||
indices.append(i)
|
||||
indices.append((i, 40))
|
||||
elif num == 41: # )
|
||||
if indices:
|
||||
indices.pop()
|
||||
else:
|
||||
bad.append(i)
|
||||
bad = sorted(list(indices) + bad, reverse=True)
|
||||
bad.append((i, 41))
|
||||
elif num in bad_map: # '\n\r\f\b\t\\' see Table 3.2 in PDF 1.7 spec
|
||||
bad.append((i, bad_map[num]))
|
||||
bad = sorted(indices + bad, reverse=True)
|
||||
if not bad:
|
||||
return bytestring
|
||||
for i in bad:
|
||||
ba.insert(i, 92) # \
|
||||
for i, repl in bad:
|
||||
ba[i:i+1] = (92, repl) # 92 = ord('\')
|
||||
return bytes(ba)
|
||||
|
||||
|
||||
class String(unicode):
|
||||
|
||||
def pdf_serialize(self, stream):
|
||||
s = self.replace('\\', '\\\\')
|
||||
try:
|
||||
raw = s.encode('latin1')
|
||||
raw = self.encode('latin1')
|
||||
if raw.startswith(codecs.BOM_UTF16_BE):
|
||||
raw = codecs.BOM_UTF16_BE + s.encode('utf-16-be')
|
||||
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
|
||||
except UnicodeEncodeError:
|
||||
raw = codecs.BOM_UTF16_BE + s.encode('utf-16-be')
|
||||
stream.write(b'('+escape_unbalanced_parantheses(raw)+b')')
|
||||
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
|
||||
stream.write(b'('+escape_pdf_string(raw)+b')')
|
||||
|
||||
class UTF16String(unicode):
|
||||
|
||||
def pdf_serialize(self, stream):
|
||||
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be').replace(b'\\', b'\\\\')
|
||||
stream.write(b'('+escape_unbalanced_parantheses(raw)+b')')
|
||||
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
|
||||
if False:
|
||||
# Disabled as the parentheses based strings give easier to debug
|
||||
# PDF files
|
||||
stream.write(b'<' + hexlify(raw) + b'>')
|
||||
else:
|
||||
stream.write(b'('+escape_pdf_string(raw)+b')')
|
||||
|
||||
class Dictionary(dict):
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user