mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
PDF Output: Fix incorrect encoding of ASCII control characters and some Chinese characters in metadata and Table of Contents strings. Fixes #1433848 [Incorrect Chinese Characters in PDF TOC converted from EPUB](https://bugs.launchpad.net/calibre/+bug/1433848)
This commit is contained in:
parent
647fdaad9a
commit
7f2be1f6ae
@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
import codecs, zlib
|
import codecs, zlib
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from binascii import hexlify
|
||||||
|
|
||||||
from calibre.constants import plugins, ispy3
|
from calibre.constants import plugins, ispy3
|
||||||
|
|
||||||
@ -93,43 +94,50 @@ class Name(unicode):
|
|||||||
in raw]
|
in raw]
|
||||||
stream.write(b'/'+b''.join(buf))
|
stream.write(b'/'+b''.join(buf))
|
||||||
|
|
||||||
def escape_unbalanced_parantheses(bytestring):
|
def escape_pdf_string(bytestring):
|
||||||
indices = []
|
indices = []
|
||||||
bad = []
|
bad = []
|
||||||
ba = bytearray(bytestring)
|
ba = bytearray(bytestring)
|
||||||
|
bad_map = {10:ord('n'), 13:ord('r'), 12:ord('f'), 8:ord('b'), 9:ord('\t'), 92:ord('\\')}
|
||||||
for i, num in enumerate(ba):
|
for i, num in enumerate(ba):
|
||||||
if num == 40: # (
|
if num == 40: # (
|
||||||
indices.append(i)
|
indices.append((i, 40))
|
||||||
elif num == 41: # )
|
elif num == 41: # )
|
||||||
if indices:
|
if indices:
|
||||||
indices.pop()
|
indices.pop()
|
||||||
else:
|
else:
|
||||||
bad.append(i)
|
bad.append((i, 41))
|
||||||
bad = sorted(list(indices) + bad, reverse=True)
|
elif num in bad_map: # '\n\r\f\b\t\\' see Table 3.2 in PDF 1.7 spec
|
||||||
|
bad.append((i, bad_map[num]))
|
||||||
|
bad = sorted(indices + bad, reverse=True)
|
||||||
if not bad:
|
if not bad:
|
||||||
return bytestring
|
return bytestring
|
||||||
for i in bad:
|
for i, repl in bad:
|
||||||
ba.insert(i, 92) # \
|
ba[i:i+1] = (92, repl) # 92 = ord('\')
|
||||||
return bytes(ba)
|
return bytes(ba)
|
||||||
|
|
||||||
|
|
||||||
class String(unicode):
|
class String(unicode):
|
||||||
|
|
||||||
def pdf_serialize(self, stream):
|
def pdf_serialize(self, stream):
|
||||||
s = self.replace('\\', '\\\\')
|
|
||||||
try:
|
try:
|
||||||
raw = s.encode('latin1')
|
raw = self.encode('latin1')
|
||||||
if raw.startswith(codecs.BOM_UTF16_BE):
|
if raw.startswith(codecs.BOM_UTF16_BE):
|
||||||
raw = codecs.BOM_UTF16_BE + s.encode('utf-16-be')
|
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
|
||||||
except UnicodeEncodeError:
|
except UnicodeEncodeError:
|
||||||
raw = codecs.BOM_UTF16_BE + s.encode('utf-16-be')
|
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
|
||||||
stream.write(b'('+escape_unbalanced_parantheses(raw)+b')')
|
stream.write(b'('+escape_pdf_string(raw)+b')')
|
||||||
|
|
||||||
class UTF16String(unicode):
|
class UTF16String(unicode):
|
||||||
|
|
||||||
def pdf_serialize(self, stream):
|
def pdf_serialize(self, stream):
|
||||||
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be').replace(b'\\', b'\\\\')
|
raw = codecs.BOM_UTF16_BE + self.encode('utf-16-be')
|
||||||
stream.write(b'('+escape_unbalanced_parantheses(raw)+b')')
|
if False:
|
||||||
|
# Disabled as the parentheses based strings give easier to debug
|
||||||
|
# PDF files
|
||||||
|
stream.write(b'<' + hexlify(raw) + b'>')
|
||||||
|
else:
|
||||||
|
stream.write(b'('+escape_pdf_string(raw)+b')')
|
||||||
|
|
||||||
class Dictionary(dict):
|
class Dictionary(dict):
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user