mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Newer pyPdf that is hopefully bug free.
This commit is contained in:
parent
40f5d79b61
commit
75953a47d2
@ -32,9 +32,8 @@
|
|||||||
Implementation of stream filters for PDF.
|
Implementation of stream filters for PDF.
|
||||||
"""
|
"""
|
||||||
__author__ = "Mathieu Fenniak"
|
__author__ = "Mathieu Fenniak"
|
||||||
__author_email__ = "mfenniak@pobox.com"
|
__author_email__ = "biziqe@mathieu.fenniak.net"
|
||||||
|
|
||||||
from generic import NameObject
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import zlib
|
import zlib
|
||||||
@ -208,6 +207,7 @@ class ASCII85Decode(object):
|
|||||||
decode = staticmethod(decode)
|
decode = staticmethod(decode)
|
||||||
|
|
||||||
def decodeStreamData(stream):
|
def decodeStreamData(stream):
|
||||||
|
from generic import NameObject
|
||||||
filters = stream.get("/Filter", ())
|
filters = stream.get("/Filter", ())
|
||||||
if len(filters) and not isinstance(filters[0], NameObject):
|
if len(filters) and not isinstance(filters[0], NameObject):
|
||||||
# we have a single filter instance
|
# we have a single filter instance
|
||||||
|
@ -32,11 +32,14 @@
|
|||||||
Implementation of generic PDF objects (dictionary, number, string, and so on)
|
Implementation of generic PDF objects (dictionary, number, string, and so on)
|
||||||
"""
|
"""
|
||||||
__author__ = "Mathieu Fenniak"
|
__author__ = "Mathieu Fenniak"
|
||||||
__author_email__ = "mfenniak@pobox.com"
|
__author_email__ = "biziqe@mathieu.fenniak.net"
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from utils import readNonWhitespace, RC4_encrypt
|
from utils import readNonWhitespace, RC4_encrypt
|
||||||
import filters
|
import filters
|
||||||
|
import utils
|
||||||
|
import decimal
|
||||||
|
import codecs
|
||||||
|
|
||||||
def readObject(stream, pdf):
|
def readObject(stream, pdf):
|
||||||
tok = stream.read(1)
|
tok = stream.read(1)
|
||||||
@ -46,7 +49,7 @@ def readObject(stream, pdf):
|
|||||||
return BooleanObject.readFromStream(stream)
|
return BooleanObject.readFromStream(stream)
|
||||||
elif tok == '(':
|
elif tok == '(':
|
||||||
# string object
|
# string object
|
||||||
return StringObject.readFromStream(stream)
|
return readStringFromStream(stream)
|
||||||
elif tok == '/':
|
elif tok == '/':
|
||||||
# name object
|
# name object
|
||||||
return NameObject.readFromStream(stream)
|
return NameObject.readFromStream(stream)
|
||||||
@ -63,7 +66,7 @@ def readObject(stream, pdf):
|
|||||||
if peek == '<<':
|
if peek == '<<':
|
||||||
return DictionaryObject.readFromStream(stream, pdf)
|
return DictionaryObject.readFromStream(stream, pdf)
|
||||||
else:
|
else:
|
||||||
return StringObject.readHexStringFromStream(stream)
|
return readHexStringFromStream(stream)
|
||||||
elif tok == '%':
|
elif tok == '%':
|
||||||
# comment
|
# comment
|
||||||
while tok not in ('\r', '\n'):
|
while tok not in ('\r', '\n'):
|
||||||
@ -94,7 +97,9 @@ class NullObject(PdfObject):
|
|||||||
stream.write("null")
|
stream.write("null")
|
||||||
|
|
||||||
def readFromStream(stream):
|
def readFromStream(stream):
|
||||||
assert stream.read(4) == "null"
|
nulltxt = stream.read(4)
|
||||||
|
if nulltxt != "null":
|
||||||
|
raise utils.PdfReadError, "error reading null object"
|
||||||
return NullObject()
|
return NullObject()
|
||||||
readFromStream = staticmethod(readFromStream)
|
readFromStream = staticmethod(readFromStream)
|
||||||
|
|
||||||
@ -130,7 +135,9 @@ class ArrayObject(list, PdfObject):
|
|||||||
|
|
||||||
def readFromStream(stream, pdf):
|
def readFromStream(stream, pdf):
|
||||||
arr = ArrayObject()
|
arr = ArrayObject()
|
||||||
assert stream.read(1) == "["
|
tmp = stream.read(1)
|
||||||
|
if tmp != "[":
|
||||||
|
raise utils.PdfReadError, "error reading array"
|
||||||
while True:
|
while True:
|
||||||
# skip leading whitespace
|
# skip leading whitespace
|
||||||
tok = stream.read(1)
|
tok = stream.read(1)
|
||||||
@ -189,18 +196,15 @@ class IndirectObject(PdfObject):
|
|||||||
break
|
break
|
||||||
generation += tok
|
generation += tok
|
||||||
r = stream.read(1)
|
r = stream.read(1)
|
||||||
#if r != "R":
|
if r != "R":
|
||||||
# stream.seek(-20, 1)
|
raise utils.PdfReadError("error reading indirect object reference")
|
||||||
# print idnum, generation
|
|
||||||
# print repr(stream.read(40))
|
|
||||||
assert r == "R"
|
|
||||||
return IndirectObject(int(idnum), int(generation), pdf)
|
return IndirectObject(int(idnum), int(generation), pdf)
|
||||||
readFromStream = staticmethod(readFromStream)
|
readFromStream = staticmethod(readFromStream)
|
||||||
|
|
||||||
|
|
||||||
class FloatObject(float, PdfObject):
|
class FloatObject(decimal.Decimal, PdfObject):
|
||||||
def writeToStream(self, stream, encryption_key):
|
def writeToStream(self, stream, encryption_key):
|
||||||
stream.write(repr(self))
|
stream.write(str(self))
|
||||||
|
|
||||||
|
|
||||||
class NumberObject(int, PdfObject):
|
class NumberObject(int, PdfObject):
|
||||||
@ -225,20 +229,33 @@ class NumberObject(int, PdfObject):
|
|||||||
readFromStream = staticmethod(readFromStream)
|
readFromStream = staticmethod(readFromStream)
|
||||||
|
|
||||||
|
|
||||||
class StringObject(str, PdfObject):
|
##
|
||||||
def writeToStream(self, stream, encryption_key):
|
# Given a string (either a "str" or "unicode"), create a ByteStringObject or a
|
||||||
string = self
|
# TextStringObject to represent the string.
|
||||||
if encryption_key:
|
def createStringObject(string):
|
||||||
string = RC4_encrypt(encryption_key, string)
|
if isinstance(string, unicode):
|
||||||
stream.write("(")
|
return TextStringObject(string)
|
||||||
for c in string:
|
elif isinstance(string, str):
|
||||||
if not c.isalnum() and not c.isspace():
|
if string.startswith(codecs.BOM_UTF16_BE):
|
||||||
stream.write("\\%03o" % ord(c))
|
retval = TextStringObject(string.decode("utf-16"))
|
||||||
|
retval.autodetect_utf16 = True
|
||||||
|
return retval
|
||||||
else:
|
else:
|
||||||
stream.write(c)
|
# This is probably a big performance hit here, but we need to
|
||||||
stream.write(")")
|
# convert string objects into the text/unicode-aware version if
|
||||||
|
# possible... and the only way to check if that's possible is
|
||||||
|
# to try. Some strings are strings, some are just byte arrays.
|
||||||
|
try:
|
||||||
|
retval = TextStringObject(decode_pdfdocencoding(string))
|
||||||
|
retval.autodetect_pdfdocencoding = True
|
||||||
|
return retval
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
return ByteStringObject(string)
|
||||||
|
else:
|
||||||
|
raise TypeError("createStringObject should have str or unicode arg")
|
||||||
|
|
||||||
def readHexStringFromStream(stream):
|
|
||||||
|
def readHexStringFromStream(stream):
|
||||||
stream.read(1)
|
stream.read(1)
|
||||||
txt = ""
|
txt = ""
|
||||||
x = ""
|
x = ""
|
||||||
@ -254,10 +271,10 @@ class StringObject(str, PdfObject):
|
|||||||
x += "0"
|
x += "0"
|
||||||
if len(x) == 2:
|
if len(x) == 2:
|
||||||
txt += chr(int(x, base=16))
|
txt += chr(int(x, base=16))
|
||||||
return StringObject(txt)
|
return createStringObject(txt)
|
||||||
readHexStringFromStream = staticmethod(readHexStringFromStream)
|
|
||||||
|
|
||||||
def readFromStream(stream):
|
|
||||||
|
def readStringFromStream(stream):
|
||||||
tok = stream.read(1)
|
tok = stream.read(1)
|
||||||
parens = 1
|
parens = 1
|
||||||
txt = ""
|
txt = ""
|
||||||
@ -290,9 +307,92 @@ class StringObject(str, PdfObject):
|
|||||||
elif tok.isdigit():
|
elif tok.isdigit():
|
||||||
tok += stream.read(2)
|
tok += stream.read(2)
|
||||||
tok = chr(int(tok, base=8))
|
tok = chr(int(tok, base=8))
|
||||||
|
elif tok in "\n\r":
|
||||||
|
# This case is hit when a backslash followed by a line
|
||||||
|
# break occurs. If it's a multi-char EOL, consume the
|
||||||
|
# second character:
|
||||||
|
tok = stream.read(1)
|
||||||
|
if not tok in "\n\r":
|
||||||
|
stream.seek(-1, 1)
|
||||||
|
# Then don't add anything to the actual string, since this
|
||||||
|
# line break was escaped:
|
||||||
|
tok = ''
|
||||||
|
else:
|
||||||
|
raise utils.PdfReadError("Unexpected escaped string")
|
||||||
txt += tok
|
txt += tok
|
||||||
return StringObject(txt)
|
return createStringObject(txt)
|
||||||
readFromStream = staticmethod(readFromStream)
|
|
||||||
|
|
||||||
|
##
|
||||||
|
# Represents a string object where the text encoding could not be determined.
|
||||||
|
# This occurs quite often, as the PDF spec doesn't provide an alternate way to
|
||||||
|
# represent strings -- for example, the encryption data stored in files (like
|
||||||
|
# /O) is clearly not text, but is still stored in a "String" object.
|
||||||
|
class ByteStringObject(str, PdfObject):
|
||||||
|
|
||||||
|
##
|
||||||
|
# For compatibility with TextStringObject.original_bytes. This method
|
||||||
|
# returns self.
|
||||||
|
original_bytes = property(lambda self: self)
|
||||||
|
|
||||||
|
def writeToStream(self, stream, encryption_key):
|
||||||
|
bytearr = self
|
||||||
|
if encryption_key:
|
||||||
|
bytearr = RC4_encrypt(encryption_key, bytearr)
|
||||||
|
stream.write("<")
|
||||||
|
stream.write(bytearr.encode("hex"))
|
||||||
|
stream.write(">")
|
||||||
|
|
||||||
|
|
||||||
|
##
|
||||||
|
# Represents a string object that has been decoded into a real unicode string.
|
||||||
|
# If read from a PDF document, this string appeared to match the
|
||||||
|
# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to
|
||||||
|
# occur.
|
||||||
|
class TextStringObject(unicode, PdfObject):
|
||||||
|
autodetect_pdfdocencoding = False
|
||||||
|
autodetect_utf16 = False
|
||||||
|
|
||||||
|
##
|
||||||
|
# It is occasionally possible that a text string object gets created where
|
||||||
|
# a byte string object was expected due to the autodetection mechanism --
|
||||||
|
# if that occurs, this "original_bytes" property can be used to
|
||||||
|
# back-calculate what the original encoded bytes were.
|
||||||
|
original_bytes = property(lambda self: self.get_original_bytes())
|
||||||
|
|
||||||
|
def get_original_bytes(self):
|
||||||
|
# We're a text string object, but the library is trying to get our raw
|
||||||
|
# bytes. This can happen if we auto-detected this string as text, but
|
||||||
|
# we were wrong. It's pretty common. Return the original bytes that
|
||||||
|
# would have been used to create this object, based upon the autodetect
|
||||||
|
# method.
|
||||||
|
if self.autodetect_utf16:
|
||||||
|
return codecs.BOM_UTF16_BE + self.encode("utf-16be")
|
||||||
|
elif self.autodetect_pdfdocencoding:
|
||||||
|
return encode_pdfdocencoding(self)
|
||||||
|
else:
|
||||||
|
raise Exception("no information about original bytes")
|
||||||
|
|
||||||
|
def writeToStream(self, stream, encryption_key):
|
||||||
|
# Try to write the string out as a PDFDocEncoding encoded string. It's
|
||||||
|
# nicer to look at in the PDF file. Sadly, we take a performance hit
|
||||||
|
# here for trying...
|
||||||
|
try:
|
||||||
|
bytearr = encode_pdfdocencoding(self)
|
||||||
|
except UnicodeEncodeError:
|
||||||
|
bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
|
||||||
|
if encryption_key:
|
||||||
|
bytearr = RC4_encrypt(encryption_key, bytearr)
|
||||||
|
obj = ByteStringObject(bytearr)
|
||||||
|
obj.writeToStream(stream, None)
|
||||||
|
else:
|
||||||
|
stream.write("(")
|
||||||
|
for c in bytearr:
|
||||||
|
if not c.isalnum() and c != ' ':
|
||||||
|
stream.write("\\%03o" % ord(c))
|
||||||
|
else:
|
||||||
|
stream.write(c)
|
||||||
|
stream.write(")")
|
||||||
|
|
||||||
|
|
||||||
class NameObject(str, PdfObject):
|
class NameObject(str, PdfObject):
|
||||||
@ -306,7 +406,8 @@ class NameObject(str, PdfObject):
|
|||||||
|
|
||||||
def readFromStream(stream):
|
def readFromStream(stream):
|
||||||
name = stream.read(1)
|
name = stream.read(1)
|
||||||
assert name == "/"
|
if name != "/":
|
||||||
|
raise utils.PdfReadError, "name read error"
|
||||||
while True:
|
while True:
|
||||||
tok = stream.read(1)
|
tok = stream.read(1)
|
||||||
if tok.isspace() or tok in NameObject.delimiterCharacters:
|
if tok.isspace() or tok in NameObject.delimiterCharacters:
|
||||||
@ -331,7 +432,9 @@ class DictionaryObject(dict, PdfObject):
|
|||||||
stream.write(">>")
|
stream.write(">>")
|
||||||
|
|
||||||
def readFromStream(stream, pdf):
|
def readFromStream(stream, pdf):
|
||||||
assert stream.read(2) == "<<"
|
tmp = stream.read(2)
|
||||||
|
if tmp != "<<":
|
||||||
|
raise utils.PdfReadError, "dictionary read error"
|
||||||
data = {}
|
data = {}
|
||||||
while True:
|
while True:
|
||||||
tok = readNonWhitespace(stream)
|
tok = readNonWhitespace(stream)
|
||||||
@ -345,7 +448,7 @@ class DictionaryObject(dict, PdfObject):
|
|||||||
value = readObject(stream, pdf)
|
value = readObject(stream, pdf)
|
||||||
if data.has_key(key):
|
if data.has_key(key):
|
||||||
# multiple definitions of key not permitted
|
# multiple definitions of key not permitted
|
||||||
assert False
|
raise utils.PdfReadError, "multiple definitions in dictionary"
|
||||||
data[key] = value
|
data[key] = value
|
||||||
pos = stream.tell()
|
pos = stream.tell()
|
||||||
s = readNonWhitespace(stream)
|
s = readNonWhitespace(stream)
|
||||||
@ -384,7 +487,7 @@ class DictionaryObject(dict, PdfObject):
|
|||||||
data["__streamdata__"] = data["__streamdata__"][:-1]
|
data["__streamdata__"] = data["__streamdata__"][:-1]
|
||||||
else:
|
else:
|
||||||
stream.seek(pos, 0)
|
stream.seek(pos, 0)
|
||||||
raise "Unable to find 'endstream' marker after stream."
|
raise utils.PdfReadError, "Unable to find 'endstream' marker after stream."
|
||||||
else:
|
else:
|
||||||
stream.seek(pos, 0)
|
stream.seek(pos, 0)
|
||||||
if data.has_key("__streamdata__"):
|
if data.has_key("__streamdata__"):
|
||||||
@ -469,7 +572,7 @@ class EncodedStreamObject(StreamObject):
|
|||||||
return decoded._data
|
return decoded._data
|
||||||
|
|
||||||
def setData(self, data):
|
def setData(self, data):
|
||||||
raise "Creating EncodedStreamObject is not currently supported"
|
raise utils.PdfReadError, "Creating EncodedStreamObject is not currently supported"
|
||||||
|
|
||||||
|
|
||||||
class RectangleObject(ArrayObject):
|
class RectangleObject(ArrayObject):
|
||||||
@ -540,3 +643,69 @@ class RectangleObject(ArrayObject):
|
|||||||
upperLeft = property(getUpperLeft, setUpperLeft, None, None)
|
upperLeft = property(getUpperLeft, setUpperLeft, None, None)
|
||||||
upperRight = property(getUpperRight, setUpperRight, None, None)
|
upperRight = property(getUpperRight, setUpperRight, None, None)
|
||||||
|
|
||||||
|
|
||||||
|
def encode_pdfdocencoding(unicode_string):
|
||||||
|
retval = ''
|
||||||
|
for c in unicode_string:
|
||||||
|
try:
|
||||||
|
retval += chr(_pdfDocEncoding_rev[c])
|
||||||
|
except KeyError:
|
||||||
|
raise UnicodeEncodeError("pdfdocencoding", c, -1, -1,
|
||||||
|
"does not exist in translation table")
|
||||||
|
return retval
|
||||||
|
|
||||||
|
def decode_pdfdocencoding(byte_array):
|
||||||
|
retval = u''
|
||||||
|
for b in byte_array:
|
||||||
|
c = _pdfDocEncoding[ord(b)]
|
||||||
|
if c == u'\u0000':
|
||||||
|
raise UnicodeDecodeError("pdfdocencoding", b, -1, -1,
|
||||||
|
"does not exist in translation table")
|
||||||
|
retval += c
|
||||||
|
return retval
|
||||||
|
|
||||||
|
_pdfDocEncoding = (
|
||||||
|
u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000',
|
||||||
|
u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000',
|
||||||
|
u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000',
|
||||||
|
u'\u02d8', u'\u02c7', u'\u02c6', u'\u02d9', u'\u02dd', u'\u02db', u'\u02da', u'\u02dc',
|
||||||
|
u'\u0020', u'\u0021', u'\u0022', u'\u0023', u'\u0024', u'\u0025', u'\u0026', u'\u0027',
|
||||||
|
u'\u0028', u'\u0029', u'\u002a', u'\u002b', u'\u002c', u'\u002d', u'\u002e', u'\u002f',
|
||||||
|
u'\u0030', u'\u0031', u'\u0032', u'\u0033', u'\u0034', u'\u0035', u'\u0036', u'\u0037',
|
||||||
|
u'\u0038', u'\u0039', u'\u003a', u'\u003b', u'\u003c', u'\u003d', u'\u003e', u'\u003f',
|
||||||
|
u'\u0040', u'\u0041', u'\u0042', u'\u0043', u'\u0044', u'\u0045', u'\u0046', u'\u0047',
|
||||||
|
u'\u0048', u'\u0049', u'\u004a', u'\u004b', u'\u004c', u'\u004d', u'\u004e', u'\u004f',
|
||||||
|
u'\u0050', u'\u0051', u'\u0052', u'\u0053', u'\u0054', u'\u0055', u'\u0056', u'\u0057',
|
||||||
|
u'\u0058', u'\u0059', u'\u005a', u'\u005b', u'\u005c', u'\u005d', u'\u005e', u'\u005f',
|
||||||
|
u'\u0060', u'\u0061', u'\u0062', u'\u0063', u'\u0064', u'\u0065', u'\u0066', u'\u0067',
|
||||||
|
u'\u0068', u'\u0069', u'\u006a', u'\u006b', u'\u006c', u'\u006d', u'\u006e', u'\u006f',
|
||||||
|
u'\u0070', u'\u0071', u'\u0072', u'\u0073', u'\u0074', u'\u0075', u'\u0076', u'\u0077',
|
||||||
|
u'\u0078', u'\u0079', u'\u007a', u'\u007b', u'\u007c', u'\u007d', u'\u007e', u'\u0000',
|
||||||
|
u'\u2022', u'\u2020', u'\u2021', u'\u2026', u'\u2014', u'\u2013', u'\u0192', u'\u2044',
|
||||||
|
u'\u2039', u'\u203a', u'\u2212', u'\u2030', u'\u201e', u'\u201c', u'\u201d', u'\u2018',
|
||||||
|
u'\u2019', u'\u201a', u'\u2122', u'\ufb01', u'\ufb02', u'\u0141', u'\u0152', u'\u0160',
|
||||||
|
u'\u0178', u'\u017d', u'\u0131', u'\u0142', u'\u0153', u'\u0161', u'\u017e', u'\u0000',
|
||||||
|
u'\u20ac', u'\u00a1', u'\u00a2', u'\u00a3', u'\u00a4', u'\u00a5', u'\u00a6', u'\u00a7',
|
||||||
|
u'\u00a8', u'\u00a9', u'\u00aa', u'\u00ab', u'\u00ac', u'\u0000', u'\u00ae', u'\u00af',
|
||||||
|
u'\u00b0', u'\u00b1', u'\u00b2', u'\u00b3', u'\u00b4', u'\u00b5', u'\u00b6', u'\u00b7',
|
||||||
|
u'\u00b8', u'\u00b9', u'\u00ba', u'\u00bb', u'\u00bc', u'\u00bd', u'\u00be', u'\u00bf',
|
||||||
|
u'\u00c0', u'\u00c1', u'\u00c2', u'\u00c3', u'\u00c4', u'\u00c5', u'\u00c6', u'\u00c7',
|
||||||
|
u'\u00c8', u'\u00c9', u'\u00ca', u'\u00cb', u'\u00cc', u'\u00cd', u'\u00ce', u'\u00cf',
|
||||||
|
u'\u00d0', u'\u00d1', u'\u00d2', u'\u00d3', u'\u00d4', u'\u00d5', u'\u00d6', u'\u00d7',
|
||||||
|
u'\u00d8', u'\u00d9', u'\u00da', u'\u00db', u'\u00dc', u'\u00dd', u'\u00de', u'\u00df',
|
||||||
|
u'\u00e0', u'\u00e1', u'\u00e2', u'\u00e3', u'\u00e4', u'\u00e5', u'\u00e6', u'\u00e7',
|
||||||
|
u'\u00e8', u'\u00e9', u'\u00ea', u'\u00eb', u'\u00ec', u'\u00ed', u'\u00ee', u'\u00ef',
|
||||||
|
u'\u00f0', u'\u00f1', u'\u00f2', u'\u00f3', u'\u00f4', u'\u00f5', u'\u00f6', u'\u00f7',
|
||||||
|
u'\u00f8', u'\u00f9', u'\u00fa', u'\u00fb', u'\u00fc', u'\u00fd', u'\u00fe', u'\u00ff'
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(_pdfDocEncoding) == 256
|
||||||
|
|
||||||
|
_pdfDocEncoding_rev = {}
|
||||||
|
for i in xrange(256):
|
||||||
|
char = _pdfDocEncoding[i]
|
||||||
|
if char == u"\u0000":
|
||||||
|
continue
|
||||||
|
assert char not in _pdfDocEncoding_rev
|
||||||
|
_pdfDocEncoding_rev[char] = i
|
||||||
|
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
# vim: sw=4:expandtab:foldmethod=marker
|
# vim: sw=4:expandtab:foldmethod=marker
|
||||||
#
|
#
|
||||||
# Copyright (c) 2006, Mathieu Fenniak
|
# Copyright (c) 2006, Mathieu Fenniak
|
||||||
|
# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
|
||||||
|
#
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
# Redistribution and use in source and binary forms, with or without
|
# Redistribution and use in source and binary forms, with or without
|
||||||
@ -34,7 +36,7 @@ be able to split and merge PDF files by page, and that's about all it can do.
|
|||||||
It may be a solid base for future PDF file work in Python.
|
It may be a solid base for future PDF file work in Python.
|
||||||
"""
|
"""
|
||||||
__author__ = "Mathieu Fenniak"
|
__author__ = "Mathieu Fenniak"
|
||||||
__author_email__ = "mfenniak@pobox.com"
|
__author_email__ = "biziqe@mathieu.fenniak.net"
|
||||||
|
|
||||||
import struct
|
import struct
|
||||||
try:
|
try:
|
||||||
@ -44,6 +46,7 @@ except ImportError:
|
|||||||
|
|
||||||
import filters
|
import filters
|
||||||
import utils
|
import utils
|
||||||
|
import warnings
|
||||||
from generic import *
|
from generic import *
|
||||||
from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
|
from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
|
||||||
from sets import ImmutableSet
|
from sets import ImmutableSet
|
||||||
@ -68,7 +71,7 @@ class PdfFileWriter(object):
|
|||||||
# info object
|
# info object
|
||||||
info = DictionaryObject()
|
info = DictionaryObject()
|
||||||
info.update({
|
info.update({
|
||||||
NameObject("/Producer"): StringObject("Python PDF Library - http://pybrary.net/pyPdf/")
|
NameObject("/Producer"): createStringObject(u"Python PDF Library - http://pybrary.net/pyPdf/")
|
||||||
})
|
})
|
||||||
self._info = self._addObject(info)
|
self._info = self._addObject(info)
|
||||||
|
|
||||||
@ -128,10 +131,10 @@ class PdfFileWriter(object):
|
|||||||
keylen = 40 / 8
|
keylen = 40 / 8
|
||||||
# permit everything:
|
# permit everything:
|
||||||
P = -1
|
P = -1
|
||||||
O = StringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
|
O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
|
||||||
ID_1 = md5.new(repr(time.time())).digest()
|
ID_1 = md5.new(repr(time.time())).digest()
|
||||||
ID_2 = md5.new(repr(random.random())).digest()
|
ID_2 = md5.new(repr(random.random())).digest()
|
||||||
self._ID = ArrayObject((StringObject(ID_1), StringObject(ID_2)))
|
self._ID = ArrayObject((ByteStringObject(ID_1), ByteStringObject(ID_2)))
|
||||||
if rev == 2:
|
if rev == 2:
|
||||||
U, key = _alg34(user_pwd, O, P, ID_1)
|
U, key = _alg34(user_pwd, O, P, ID_1)
|
||||||
else:
|
else:
|
||||||
@ -143,8 +146,8 @@ class PdfFileWriter(object):
|
|||||||
if V == 2:
|
if V == 2:
|
||||||
encrypt[NameObject("/Length")] = NumberObject(keylen * 8)
|
encrypt[NameObject("/Length")] = NumberObject(keylen * 8)
|
||||||
encrypt[NameObject("/R")] = NumberObject(rev)
|
encrypt[NameObject("/R")] = NumberObject(rev)
|
||||||
encrypt[NameObject("/O")] = StringObject(O)
|
encrypt[NameObject("/O")] = ByteStringObject(O)
|
||||||
encrypt[NameObject("/U")] = StringObject(U)
|
encrypt[NameObject("/U")] = ByteStringObject(U)
|
||||||
encrypt[NameObject("/P")] = NumberObject(P)
|
encrypt[NameObject("/P")] = NumberObject(P)
|
||||||
self._encrypt = self._addObject(encrypt)
|
self._encrypt = self._addObject(encrypt)
|
||||||
self._encrypt_key = key
|
self._encrypt_key = key
|
||||||
@ -212,8 +215,6 @@ class PdfFileWriter(object):
|
|||||||
for key, value in data.items():
|
for key, value in data.items():
|
||||||
origvalue = value
|
origvalue = value
|
||||||
value = self._sweepIndirectReferences(externMap, value)
|
value = self._sweepIndirectReferences(externMap, value)
|
||||||
if value == None:
|
|
||||||
print objects, value, origvalue
|
|
||||||
if isinstance(value, StreamObject):
|
if isinstance(value, StreamObject):
|
||||||
# a dictionary value is a stream. streams must be indirect
|
# a dictionary value is a stream. streams must be indirect
|
||||||
# objects, so we need to change this value.
|
# objects, so we need to change this value.
|
||||||
@ -271,6 +272,7 @@ class PdfFileWriter(object):
|
|||||||
class PdfFileReader(object):
|
class PdfFileReader(object):
|
||||||
def __init__(self, stream):
|
def __init__(self, stream):
|
||||||
self.flattenedPages = None
|
self.flattenedPages = None
|
||||||
|
self.pageNumbers = {}
|
||||||
self.resolvedObjects = {}
|
self.resolvedObjects = {}
|
||||||
self.read(stream)
|
self.read(stream)
|
||||||
self.stream = stream
|
self.stream = stream
|
||||||
@ -329,6 +331,144 @@ class PdfFileReader(object):
|
|||||||
self._flatten()
|
self._flatten()
|
||||||
return self.flattenedPages[pageNumber]
|
return self.flattenedPages[pageNumber]
|
||||||
|
|
||||||
|
##
|
||||||
|
# Read-only property that accesses the
|
||||||
|
# {@link #PdfFileReader.getNamedDestinations
|
||||||
|
# getNamedDestinations} function.
|
||||||
|
# <p>
|
||||||
|
# Stability: Added in v1.10, will exist for all future v1.x releases.
|
||||||
|
namedDestinations = property(lambda self:
|
||||||
|
self.getNamedDestinations(), None, None)
|
||||||
|
|
||||||
|
##
|
||||||
|
# Retrieves the named destinations present in the document.
|
||||||
|
# <p>
|
||||||
|
# Stability: Added in v1.10, will exist for all future v1.x releases.
|
||||||
|
# @return Returns a dict which maps names to {@link #Destination
|
||||||
|
# destinations}.
|
||||||
|
def getNamedDestinations(self, tree = None, map = None):
|
||||||
|
if self.flattenedPages == None:
|
||||||
|
self._flatten()
|
||||||
|
|
||||||
|
get = self.safeGetObject
|
||||||
|
if map == None:
|
||||||
|
map = {}
|
||||||
|
catalog = get(self.trailer["/Root"])
|
||||||
|
|
||||||
|
# get the name tree
|
||||||
|
if catalog.has_key("/Dests"):
|
||||||
|
tree = get(catalog["/Dests"])
|
||||||
|
elif catalog.has_key("/Names"):
|
||||||
|
names = get(catalog['/Names'])
|
||||||
|
if names.has_key("/Dests"):
|
||||||
|
tree = get(names['/Dests'])
|
||||||
|
|
||||||
|
if tree == None:
|
||||||
|
return map
|
||||||
|
|
||||||
|
if tree.has_key("/Kids"):
|
||||||
|
# recurse down the tree
|
||||||
|
for kid in get(tree["/Kids"]):
|
||||||
|
self.getNamedDestinations(get(kid), map)
|
||||||
|
|
||||||
|
if tree.has_key("/Names"):
|
||||||
|
names = get(tree["/Names"])
|
||||||
|
for i in range(0, len(names), 2):
|
||||||
|
key = get(names[i])
|
||||||
|
val = get(names[i+1])
|
||||||
|
if isinstance(val, DictionaryObject) and val.has_key('/D'):
|
||||||
|
val = get(val['/D'])
|
||||||
|
dest = self._buildDestination(val, key)
|
||||||
|
if dest != None:
|
||||||
|
map[key] = dest
|
||||||
|
|
||||||
|
return map
|
||||||
|
|
||||||
|
##
|
||||||
|
# Read-only property that accesses the {@link #PdfFileReader.getOutlines
|
||||||
|
# getOutlines} function.
|
||||||
|
# <p>
|
||||||
|
# Stability: Added in v1.10, will exist for all future v1.x releases.
|
||||||
|
outlines = property(lambda self: self.getOutlines(), None, None)
|
||||||
|
|
||||||
|
##
|
||||||
|
# Retrieves the document outline present in the document.
|
||||||
|
# <p>
|
||||||
|
# Stability: Added in v1.10, will exist for all future v1.x releases.
|
||||||
|
# @return Returns a nested list of {@link #Destination destinations}.
|
||||||
|
def getOutlines(self, node = None, outlines = None):
|
||||||
|
if self.flattenedPages == None:
|
||||||
|
self._flatten()
|
||||||
|
|
||||||
|
get = self.safeGetObject
|
||||||
|
if outlines == None:
|
||||||
|
outlines = []
|
||||||
|
catalog = get(self.trailer["/Root"])
|
||||||
|
|
||||||
|
# get the outline dictionary and named destinations
|
||||||
|
if catalog.has_key("/Outlines"):
|
||||||
|
lines = get(catalog["/Outlines"])
|
||||||
|
if lines.has_key("/First"):
|
||||||
|
node = get(lines["/First"])
|
||||||
|
self._namedDests = self.getNamedDestinations()
|
||||||
|
|
||||||
|
if node == None:
|
||||||
|
return outlines
|
||||||
|
|
||||||
|
# see if there are any more outlines
|
||||||
|
while 1:
|
||||||
|
outline = self._buildOutline(node)
|
||||||
|
if outline:
|
||||||
|
outlines.append(outline)
|
||||||
|
|
||||||
|
# check for sub-outlines
|
||||||
|
if node.has_key("/First"):
|
||||||
|
subOutlines = []
|
||||||
|
self.getOutlines(get(node["/First"]), subOutlines)
|
||||||
|
if subOutlines:
|
||||||
|
outlines.append(subOutlines)
|
||||||
|
|
||||||
|
if not node.has_key("/Next"):
|
||||||
|
break
|
||||||
|
node = get(node["/Next"])
|
||||||
|
|
||||||
|
return outlines
|
||||||
|
|
||||||
|
def _buildDestination(self, array, title):
|
||||||
|
if not (isinstance(array, ArrayObject) and len(array) >= 2 and \
|
||||||
|
isinstance(array[0], IndirectObject)):
|
||||||
|
return None
|
||||||
|
|
||||||
|
pageKey = (array[0].generation, array[0].idnum)
|
||||||
|
if not self.pageNumbers.has_key(pageKey):
|
||||||
|
return None
|
||||||
|
|
||||||
|
pageNum = self.pageNumbers[pageKey]
|
||||||
|
return Destination(*([title, pageNum]+array[1:]))
|
||||||
|
|
||||||
|
def _buildOutline(self, node):
|
||||||
|
dest, title, outline = None, None, None
|
||||||
|
|
||||||
|
if node.has_key("/A") and node.has_key("/Title"):
|
||||||
|
# Action, section 8.5 (only type GoTo supported)
|
||||||
|
title = self.safeGetObject(node["/Title"])
|
||||||
|
action = self.safeGetObject(node["/A"])
|
||||||
|
if action["/S"] == "/GoTo":
|
||||||
|
dest = self.safeGetObject(action["/D"])
|
||||||
|
elif node.has_key("/Dest") and node.has_key("/Title"):
|
||||||
|
# Destination, section 8.2.1
|
||||||
|
title = self.safeGetObject(node["/Title"])
|
||||||
|
dest = self.safeGetObject(node["/Dest"])
|
||||||
|
|
||||||
|
# if destination found, then create outline
|
||||||
|
if dest:
|
||||||
|
if isinstance(dest, ArrayObject):
|
||||||
|
outline = self._buildDestination(dest, title)
|
||||||
|
elif isinstance(dest, str) and self._namedDests.has_key(dest):
|
||||||
|
outline = self._namedDests[dest]
|
||||||
|
outline.title = title
|
||||||
|
return outline
|
||||||
|
|
||||||
##
|
##
|
||||||
# Read-only property that emulates a list based upon the {@link
|
# Read-only property that emulates a list based upon the {@link
|
||||||
# #PdfFileReader.getNumPages getNumPages} and {@link #PdfFileReader.getPage
|
# #PdfFileReader.getNumPages getNumPages} and {@link #PdfFileReader.getPage
|
||||||
@ -349,14 +489,16 @@ class PdfFileReader(object):
|
|||||||
self.flattenedPages = []
|
self.flattenedPages = []
|
||||||
catalog = self.getObject(self.trailer["/Root"])
|
catalog = self.getObject(self.trailer["/Root"])
|
||||||
pages = self.getObject(catalog["/Pages"])
|
pages = self.getObject(catalog["/Pages"])
|
||||||
|
indirectReference = None
|
||||||
if isinstance(pages, IndirectObject):
|
if isinstance(pages, IndirectObject):
|
||||||
|
indirectReference = pages
|
||||||
pages = self.getObject(pages)
|
pages = self.getObject(pages)
|
||||||
t = pages["/Type"]
|
t = pages["/Type"]
|
||||||
if t == "/Pages":
|
if t == "/Pages":
|
||||||
for attr in inheritablePageAttributes:
|
for attr in inheritablePageAttributes:
|
||||||
if pages.has_key(attr):
|
if pages.has_key(attr):
|
||||||
inherit[attr] = pages[attr]
|
inherit[attr] = pages[attr]
|
||||||
for page in pages["/Kids"]:
|
for page in self.safeGetObject(pages["/Kids"]):
|
||||||
self._flatten(page, inherit)
|
self._flatten(page, inherit)
|
||||||
elif t == "/Page":
|
elif t == "/Page":
|
||||||
for attr,value in inherit.items():
|
for attr,value in inherit.items():
|
||||||
@ -364,8 +506,11 @@ class PdfFileReader(object):
|
|||||||
# parent's value:
|
# parent's value:
|
||||||
if not pages.has_key(attr):
|
if not pages.has_key(attr):
|
||||||
pages[attr] = value
|
pages[attr] = value
|
||||||
pageObj = PageObject(self)
|
pageObj = PageObject(self, indirectReference)
|
||||||
pageObj.update(pages)
|
pageObj.update(pages)
|
||||||
|
if indirectReference:
|
||||||
|
key = (indirectReference.generation, indirectReference.idnum)
|
||||||
|
self.pageNumbers[key] = len(self.flattenedPages)
|
||||||
self.flattenedPages.append(pageObj)
|
self.flattenedPages.append(pageObj)
|
||||||
|
|
||||||
def safeGetObject(self, obj):
|
def safeGetObject(self, obj):
|
||||||
@ -425,8 +570,8 @@ class PdfFileReader(object):
|
|||||||
return retval
|
return retval
|
||||||
|
|
||||||
def _decryptObject(self, obj, key):
|
def _decryptObject(self, obj, key):
|
||||||
if isinstance(obj, StringObject):
|
if isinstance(obj, ByteStringObject) or isinstance(obj, TextStringObject):
|
||||||
obj = StringObject(utils.RC4_encrypt(key, obj))
|
obj = createStringObject(utils.RC4_encrypt(key, obj.original_bytes))
|
||||||
elif isinstance(obj, StreamObject):
|
elif isinstance(obj, StreamObject):
|
||||||
obj._data = utils.RC4_encrypt(key, obj._data)
|
obj._data = utils.RC4_encrypt(key, obj._data)
|
||||||
elif isinstance(obj, DictionaryObject):
|
elif isinstance(obj, DictionaryObject):
|
||||||
@ -438,6 +583,11 @@ class PdfFileReader(object):
|
|||||||
return obj
|
return obj
|
||||||
|
|
||||||
def readObjectHeader(self, stream):
|
def readObjectHeader(self, stream):
|
||||||
|
# Should never be necessary to read out whitespace, since the
|
||||||
|
# cross-reference table should put us in the right spot to read the
|
||||||
|
# object header. In reality... some files have stupid cross reference
|
||||||
|
# tables that are off by whitespace bytes.
|
||||||
|
readNonWhitespace(stream); stream.seek(-1, 1)
|
||||||
idnum = readUntilWhitespace(stream)
|
idnum = readUntilWhitespace(stream)
|
||||||
generation = readUntilWhitespace(stream)
|
generation = readUntilWhitespace(stream)
|
||||||
obj = stream.read(3)
|
obj = stream.read(3)
|
||||||
@ -456,13 +606,15 @@ class PdfFileReader(object):
|
|||||||
line = ''
|
line = ''
|
||||||
while not line:
|
while not line:
|
||||||
line = self.readNextEndLine(stream)
|
line = self.readNextEndLine(stream)
|
||||||
assert line[:5] == "%%EOF"
|
if line[:5] != "%%EOF":
|
||||||
|
raise utils.PdfReadError, "EOF marker not found"
|
||||||
|
|
||||||
# find startxref entry - the location of the xref table
|
# find startxref entry - the location of the xref table
|
||||||
line = self.readNextEndLine(stream)
|
line = self.readNextEndLine(stream)
|
||||||
startxref = int(line)
|
startxref = int(line)
|
||||||
line = self.readNextEndLine(stream)
|
line = self.readNextEndLine(stream)
|
||||||
assert line[:9] == "startxref"
|
if line[:9] != "startxref":
|
||||||
|
raise utils.PdfReadError, "startxref not found"
|
||||||
|
|
||||||
# read all cross reference tables and their trailers
|
# read all cross reference tables and their trailers
|
||||||
self.xref = {}
|
self.xref = {}
|
||||||
@ -475,7 +627,8 @@ class PdfFileReader(object):
|
|||||||
if x == "x":
|
if x == "x":
|
||||||
# standard cross-reference table
|
# standard cross-reference table
|
||||||
ref = stream.read(4)
|
ref = stream.read(4)
|
||||||
assert ref[:3] == "ref"
|
if ref[:3] != "ref":
|
||||||
|
raise utils.PdfReadError, "xref table read error"
|
||||||
readNonWhitespace(stream)
|
readNonWhitespace(stream)
|
||||||
stream.seek(-1, 1)
|
stream.seek(-1, 1)
|
||||||
while 1:
|
while 1:
|
||||||
@ -661,7 +814,7 @@ class PdfFileReader(object):
|
|||||||
def _authenticateUserPassword(self, password):
|
def _authenticateUserPassword(self, password):
|
||||||
encrypt = self.safeGetObject(self.trailer['/Encrypt'])
|
encrypt = self.safeGetObject(self.trailer['/Encrypt'])
|
||||||
rev = self.safeGetObject(encrypt['/R'])
|
rev = self.safeGetObject(encrypt['/R'])
|
||||||
owner_entry = self.safeGetObject(encrypt['/O'])
|
owner_entry = self.safeGetObject(encrypt['/O']).original_bytes
|
||||||
p_entry = self.safeGetObject(encrypt['/P'])
|
p_entry = self.safeGetObject(encrypt['/P'])
|
||||||
id_entry = self.safeGetObject(self.trailer['/ID'])
|
id_entry = self.safeGetObject(self.trailer['/ID'])
|
||||||
id1_entry = self.safeGetObject(id_entry[0])
|
id1_entry = self.safeGetObject(id_entry[0])
|
||||||
@ -672,7 +825,7 @@ class PdfFileReader(object):
|
|||||||
self.safeGetObject(encrypt["/Length"]) / 8, owner_entry,
|
self.safeGetObject(encrypt["/Length"]) / 8, owner_entry,
|
||||||
p_entry, id1_entry,
|
p_entry, id1_entry,
|
||||||
self.safeGetObject(encrypt.get("/EncryptMetadata", False)))
|
self.safeGetObject(encrypt.get("/EncryptMetadata", False)))
|
||||||
real_U = self.safeGetObject(encrypt['/U'])
|
real_U = self.safeGetObject(encrypt['/U']).original_bytes
|
||||||
return U == real_U, key
|
return U == real_U, key
|
||||||
|
|
||||||
def getIsEncrypted(self):
|
def getIsEncrypted(self):
|
||||||
@ -721,9 +874,10 @@ def createRectangleAccessor(name, fallback):
|
|||||||
# will be created by accessing the {@link #PdfFileReader.getPage getPage}
|
# will be created by accessing the {@link #PdfFileReader.getPage getPage}
|
||||||
# function of the {@link #PdfFileReader PdfFileReader} class.
|
# function of the {@link #PdfFileReader PdfFileReader} class.
|
||||||
class PageObject(DictionaryObject):
|
class PageObject(DictionaryObject):
|
||||||
def __init__(self, pdf):
|
def __init__(self, pdf, indirectReference = None):
|
||||||
DictionaryObject.__init__(self)
|
DictionaryObject.__init__(self)
|
||||||
self.pdf = pdf
|
self.pdf = pdf
|
||||||
|
self.indirectReference = indirectReference
|
||||||
|
|
||||||
##
|
##
|
||||||
# Rotates a page clockwise by increments of 90 degrees.
|
# Rotates a page clockwise by increments of 90 degrees.
|
||||||
@ -856,26 +1010,35 @@ class PageObject(DictionaryObject):
|
|||||||
# <p>
|
# <p>
|
||||||
# Stability: Added in v1.7, will exist for all future v1.x releases. May
|
# Stability: Added in v1.7, will exist for all future v1.x releases. May
|
||||||
# be overhauled to provide more ordered text in the future.
|
# be overhauled to provide more ordered text in the future.
|
||||||
# @return a string object
|
# @return a unicode string object
|
||||||
def extractText(self):
|
def extractText(self):
|
||||||
text = ""
|
text = u""
|
||||||
content = self["/Contents"].getObject()
|
content = self["/Contents"].getObject()
|
||||||
if not isinstance(content, ContentStream):
|
if not isinstance(content, ContentStream):
|
||||||
content = ContentStream(content, self.pdf)
|
content = ContentStream(content, self.pdf)
|
||||||
|
# Note: we check all strings are TextStringObjects. ByteStringObjects
|
||||||
|
# are strings where the byte->string encoding was unknown, so adding
|
||||||
|
# them to the text here would be gibberish.
|
||||||
for operands,operator in content.operations:
|
for operands,operator in content.operations:
|
||||||
if operator == "Tj":
|
if operator == "Tj":
|
||||||
text += operands[0]
|
_text = operands[0]
|
||||||
|
if isinstance(_text, TextStringObject):
|
||||||
|
text += _text
|
||||||
elif operator == "T*":
|
elif operator == "T*":
|
||||||
text += "\n"
|
text += "\n"
|
||||||
elif operator == "'":
|
elif operator == "'":
|
||||||
text += "\n"
|
text += "\n"
|
||||||
|
_text = operands[0]
|
||||||
|
if isinstance(_text, TextStringObject):
|
||||||
text += operands[0]
|
text += operands[0]
|
||||||
elif operator == "\"":
|
elif operator == '"':
|
||||||
|
_text = operands[2]
|
||||||
|
if isinstance(_text, TextStringObject):
|
||||||
text += "\n"
|
text += "\n"
|
||||||
text += operands[2]
|
text += _text
|
||||||
elif operator == "TJ":
|
elif operator == "TJ":
|
||||||
for i in operands[0]:
|
for i in operands[0]:
|
||||||
if isinstance(i, StringObject):
|
if isinstance(i, TextStringObject):
|
||||||
text += i
|
text += i
|
||||||
return text
|
return text
|
||||||
|
|
||||||
@ -946,7 +1109,7 @@ class ContentStream(DecodedStreamObject):
|
|||||||
if peek == '':
|
if peek == '':
|
||||||
break
|
break
|
||||||
stream.seek(-1, 1)
|
stream.seek(-1, 1)
|
||||||
if peek.isalpha() or peek == "'" or peek == "\"":
|
if peek.isalpha() or peek == "'" or peek == '"':
|
||||||
operator = readUntilWhitespace(stream, maxchars=2)
|
operator = readUntilWhitespace(stream, maxchars=2)
|
||||||
if operator == "BI":
|
if operator == "BI":
|
||||||
# begin inline image - a completely different parsing
|
# begin inline image - a completely different parsing
|
||||||
@ -1021,43 +1184,139 @@ class ContentStream(DecodedStreamObject):
|
|||||||
|
|
||||||
##
|
##
|
||||||
# A class representing the basic document metadata provided in a PDF File.
|
# A class representing the basic document metadata provided in a PDF File.
|
||||||
|
# <p>
|
||||||
|
# As of pyPdf v1.10, all text properties of the document metadata have two
|
||||||
|
# properties, eg. author and author_raw. The non-raw property will always
|
||||||
|
# return a TextStringObject, making it ideal for a case where the metadata is
|
||||||
|
# being displayed. The raw property can sometimes return a ByteStringObject,
|
||||||
|
# if pyPdf was unable to decode the string's text encoding; this requires
|
||||||
|
# additional safety in the caller and therefore is not as commonly accessed.
|
||||||
class DocumentInformation(DictionaryObject):
|
class DocumentInformation(DictionaryObject):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
DictionaryObject.__init__(self)
|
DictionaryObject.__init__(self)
|
||||||
|
|
||||||
|
def getText(self, key):
|
||||||
|
retval = self.get(key, None)
|
||||||
|
if isinstance(retval, TextStringObject):
|
||||||
|
return retval
|
||||||
|
return None
|
||||||
|
|
||||||
##
|
##
|
||||||
# Read-only property accessing the document's title. Added in v1.6, will
|
# Read-only property accessing the document's title. Added in v1.6, will
|
||||||
# exist for all future v1.x releases.
|
# exist for all future v1.x releases. Modified in v1.10 to always return a
|
||||||
# @return A string, or None if the title is not provided.
|
# unicode string (TextStringObject).
|
||||||
title = property(lambda self: self.get("/Title", None), None, None)
|
# @return A unicode string, or None if the title is not provided.
|
||||||
|
title = property(lambda self: self.getText("/Title"))
|
||||||
|
title_raw = property(lambda self: self.get("/Title"))
|
||||||
|
|
||||||
##
|
##
|
||||||
# Read-only property accessing the document's author. Added in v1.6, will
|
# Read-only property accessing the document's author. Added in v1.6, will
|
||||||
# exist for all future v1.x releases.
|
# exist for all future v1.x releases. Modified in v1.10 to always return a
|
||||||
# @return A string, or None if the author is not provided.
|
# unicode string (TextStringObject).
|
||||||
author = property(lambda self: self.get("/Author", None), None, None)
|
# @return A unicode string, or None if the author is not provided.
|
||||||
|
author = property(lambda self: self.getText("/Author"))
|
||||||
|
author_raw = property(lambda self: self.get("/Author"))
|
||||||
|
|
||||||
##
|
##
|
||||||
# Read-only property accessing the subject of the document. Added in v1.6,
|
# Read-only property accessing the subject of the document. Added in v1.6,
|
||||||
# will exist for all future v1.x releases.
|
# will exist for all future v1.x releases. Modified in v1.10 to always
|
||||||
# @return A string, or None if the subject is not provided.
|
# return a unicode string (TextStringObject).
|
||||||
subject = property(lambda self: self.get("/Subject", None), None, None)
|
# @return A unicode string, or None if the subject is not provided.
|
||||||
|
subject = property(lambda self: self.getText("/Subject"))
|
||||||
|
subject_raw = property(lambda self: self.get("/Subject"))
|
||||||
|
|
||||||
##
|
##
|
||||||
# Read-only property accessing the document's creator. If the document was
|
# Read-only property accessing the document's creator. If the document was
|
||||||
# converted to PDF from another format, the name of the application (for
|
# converted to PDF from another format, the name of the application (for
|
||||||
# example, OpenOffice) that created the original document from which it was
|
# example, OpenOffice) that created the original document from which it was
|
||||||
# converted. Added in v1.6, will exist for all future v1.x releases.
|
# converted. Added in v1.6, will exist for all future v1.x releases.
|
||||||
# @return A string, or None if the creator is not provided.
|
# Modified in v1.10 to always return a unicode string (TextStringObject).
|
||||||
creator = property(lambda self: self.get("/Creator", None), None, None)
|
# @return A unicode string, or None if the creator is not provided.
|
||||||
|
creator = property(lambda self: self.getText("/Creator"))
|
||||||
|
creator_raw = property(lambda self: self.get("/Creator"))
|
||||||
|
|
||||||
##
|
##
|
||||||
# Read-only property accessing the document's producer. If the document
|
# Read-only property accessing the document's producer. If the document
|
||||||
# was converted to PDF from another format, the name of the application
|
# was converted to PDF from another format, the name of the application
|
||||||
# (for example, OSX Quartz) that converted it to PDF. Added in v1.6, will
|
# (for example, OSX Quartz) that converted it to PDF. Added in v1.6, will
|
||||||
# exist for all future v1.x releases.
|
# exist for all future v1.x releases. Modified in v1.10 to always return a
|
||||||
# @return A string, or None if the producer is not provided.
|
# unicode string (TextStringObject).
|
||||||
producer = property(lambda self: self.get("/Producer", None), None, None)
|
# @return A unicode string, or None if the producer is not provided.
|
||||||
|
producer = property(lambda self: self.getText("/Producer"))
|
||||||
|
producer_raw = property(lambda self: self.get("/Producer"))
|
||||||
|
|
||||||
|
|
||||||
|
##
|
||||||
|
# A class representing a destination within a PDF file.
|
||||||
|
# See section 8.2.1 of the PDF 1.6 reference.
|
||||||
|
# Stability: Added in v1.10, will exist for all v1.x releases.
|
||||||
|
class Destination(DictionaryObject):
    ##
    # Builds a destination from positional arguments.
    # <p>
    # args[0] is the title, args[1] the page, args[2] the destination type
    # name (for example "/XYZ" or "/Fit").  Any remaining arguments are the
    # type-specific parameters, in the order given by table 8.2 of the PDF
    # 1.6 reference; NullObject parameters are stored as None.
    # <p>
    # Raises utils.PdfReadError if the destination type is not recognised.
    # A parameter count that does not match the type raises ValueError from
    # the tuple unpacking.
    def __init__(self, *args):
        DictionaryObject.__init__(self)
        # Assigning via the title property goes through setTitle, which
        # strips surrounding whitespace before storing "/Title".
        self.title = args[0]
        self["/Page"], self["/Type"] = args[1], args[2]

        # from table 8.2 of the PDF 1.6 reference.
        params = []
        for arg in args[3:]:
            if isinstance(arg, NullObject):
                params.append(None)
            else:
                params.append(arg)
        dest_type = self["/Type"]

        if dest_type == "/XYZ":
            self["/Left"], self["/Top"], self["/Zoom"] = params
        elif dest_type == "/FitR":
            self["/Left"], self["/Bottom"], \
                self["/Right"], self["/Top"] = params
        # The "B" (bounding box) variants are names too and therefore carry
        # the leading slash, exactly like their non-"B" counterparts.
        elif dest_type in ("/FitH", "/FitBH"):
            self["/Top"], = params
        elif dest_type in ("/FitV", "/FitBV"):
            self["/Left"], = params
        elif dest_type in ("/Fit", "/FitB"):
            pass
        else:
            raise utils.PdfReadError(
                "Unknown Destination Type: %s" % (dest_type,))

    ##
    # Stores the given title, stripped of leading and trailing whitespace,
    # under the "/Title" key.
    def setTitle(self, title):
        self["/Title"] = title.strip()

    ##
    # Read-write property accessing the destination title.
    # @return The value stored under "/Title", or None if it is not set.
    title = property(lambda self: self.get("/Title"), setTitle, None)

    ##
    # Read-only property accessing the destination page.
    # @return The value stored under "/Page", or None if it is not set.
    page = property(lambda self: self.get("/Page"), None, None)

    ##
    # Read-only property accessing the destination type.
    # @return The value stored under "/Type", or None if it is not set.
    type = property(lambda self: self.get("/Type"), None, None)

    ##
    # Read-only property accessing the zoom factor.
    # @return A number, or None if not available.
    zoom = property(lambda self: self.get("/Zoom", None), None, None)

    ##
    # Read-only property accessing the left horizontal coordinate.
    # @return A number, or None if not available.
    left = property(lambda self: self.get("/Left", None), None, None)

    ##
    # Read-only property accessing the right horizontal coordinate.
    # @return A number, or None if not available.
    right = property(lambda self: self.get("/Right", None), None, None)

    ##
    # Read-only property accessing the top vertical coordinate.
    # @return A number, or None if not available.
    top = property(lambda self: self.get("/Top", None), None, None)

    ##
    # Read-only property accessing the bottom vertical coordinate.
    # @return A number, or None if not available.
    bottom = property(lambda self: self.get("/Bottom", None), None, None)
|
||||||
|
|
||||||
|
|
||||||
def convertToInt(d, size):
|
def convertToInt(d, size):
|
||||||
@ -1078,65 +1337,150 @@ _encryption_padding = '\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56' + \
|
|||||||
'\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c' + \
|
'\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c' + \
|
||||||
'\xa9\xfe\x64\x53\x69\x7a'
|
'\xa9\xfe\x64\x53\x69\x7a'
|
||||||
|
|
||||||
|
# Implementation of algorithm 3.2 of the PDF standard security handler,
|
||||||
|
# section 3.5.2 of the PDF 1.6 reference.
|
||||||
def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True):
|
def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True):
|
||||||
import md5, struct
|
# 1. Pad or truncate the password string to exactly 32 bytes. If the
|
||||||
m = md5.new()
|
# password string is more than 32 bytes long, use only its first 32 bytes;
|
||||||
|
# if it is less than 32 bytes long, pad it by appending the required number
|
||||||
|
# of additional bytes from the beginning of the padding string
|
||||||
|
# (_encryption_padding).
|
||||||
password = (password + _encryption_padding)[:32]
|
password = (password + _encryption_padding)[:32]
|
||||||
m.update(password)
|
# 2. Initialize the MD5 hash function and pass the result of step 1 as
|
||||||
|
# input to this function.
|
||||||
|
import md5, struct
|
||||||
|
m = md5.new(password)
|
||||||
|
# 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash
|
||||||
|
# function.
|
||||||
m.update(owner_entry)
|
m.update(owner_entry)
|
||||||
|
# 4. Treat the value of the /P entry as an unsigned 4-byte integer and pass
|
||||||
|
# these bytes to the MD5 hash function, low-order byte first.
|
||||||
p_entry = struct.pack('<i', p_entry)
|
p_entry = struct.pack('<i', p_entry)
|
||||||
m.update(p_entry)
|
m.update(p_entry)
|
||||||
|
# 5. Pass the first element of the file's file identifier array to the MD5
|
||||||
|
# hash function.
|
||||||
m.update(id1_entry)
|
m.update(id1_entry)
|
||||||
|
# 6. (Revision 3 or greater) If document metadata is not being encrypted,
|
||||||
|
# pass 4 bytes with the value 0xFFFFFFFF to the MD5 hash function.
|
||||||
if rev >= 3 and not metadata_encrypt:
|
if rev >= 3 and not metadata_encrypt:
|
||||||
m.update("\xff\xff\xff\xff")
|
m.update("\xff\xff\xff\xff")
|
||||||
|
# 7. Finish the hash.
|
||||||
md5_hash = m.digest()
|
md5_hash = m.digest()
|
||||||
|
# 8. (Revision 3 or greater) Do the following 50 times: Take the output
|
||||||
|
# from the previous MD5 hash and pass the first n bytes of the output as
|
||||||
|
# input into a new MD5 hash, where n is the number of bytes of the
|
||||||
|
# encryption key as defined by the value of the encryption dictionary's
|
||||||
|
# /Length entry.
|
||||||
if rev >= 3:
|
if rev >= 3:
|
||||||
for i in range(50):
|
for i in range(50):
|
||||||
md5_hash = md5.new(md5_hash[:keylen]).digest()
|
md5_hash = md5.new(md5_hash[:keylen]).digest()
|
||||||
|
# 9. Set the encryption key to the first n bytes of the output from the
|
||||||
|
# final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or
|
||||||
|
# greater, depends on the value of the encryption dictionary's /Length
|
||||||
|
# entry.
|
||||||
return md5_hash[:keylen]
|
return md5_hash[:keylen]
|
||||||
|
|
||||||
|
# Implementation of algorithm 3.3 of the PDF standard security handler,
|
||||||
|
# section 3.5.2 of the PDF 1.6 reference.
|
||||||
def _alg33(owner_pwd, user_pwd, rev, keylen):
|
def _alg33(owner_pwd, user_pwd, rev, keylen):
|
||||||
|
# steps 1 - 4
|
||||||
key = _alg33_1(owner_pwd, rev, keylen)
|
key = _alg33_1(owner_pwd, rev, keylen)
|
||||||
|
# 5. Pad or truncate the user password string as described in step 1 of
|
||||||
|
# algorithm 3.2.
|
||||||
user_pwd = (user_pwd + _encryption_padding)[:32]
|
user_pwd = (user_pwd + _encryption_padding)[:32]
|
||||||
|
# 6. Encrypt the result of step 5, using an RC4 encryption function with
|
||||||
|
# the encryption key obtained in step 4.
|
||||||
val = utils.RC4_encrypt(key, user_pwd)
|
val = utils.RC4_encrypt(key, user_pwd)
|
||||||
|
# 7. (Revision 3 or greater) Do the following 19 times: Take the output
|
||||||
|
# from the previous invocation of the RC4 function and pass it as input to
|
||||||
|
# a new invocation of the function; use an encryption key generated by
|
||||||
|
# taking each byte of the encryption key obtained in step 4 and performing
|
||||||
|
# an XOR operation between that byte and the single-byte value of the
|
||||||
|
# iteration counter (from 1 to 19).
|
||||||
if rev >= 3:
|
if rev >= 3:
|
||||||
for i in range(1, 20):
|
for i in range(1, 20):
|
||||||
new_key = ''
|
new_key = ''
|
||||||
for l in range(len(key)):
|
for l in range(len(key)):
|
||||||
new_key += chr(ord(key[l]) ^ i)
|
new_key += chr(ord(key[l]) ^ i)
|
||||||
val = utils.RC4_encrypt(new_key, val)
|
val = utils.RC4_encrypt(new_key, val)
|
||||||
|
# 8. Store the output from the final invocation of the RC4 as the value of
|
||||||
|
# the /O entry in the encryption dictionary.
|
||||||
return val
|
return val
|
||||||
|
|
||||||
|
# Steps 1-4 of algorithm 3.3
|
||||||
def _alg33_1(password, rev, keylen):
|
def _alg33_1(password, rev, keylen):
|
||||||
import md5
|
# 1. Pad or truncate the owner password string as described in step 1 of
|
||||||
m = md5.new()
|
# algorithm 3.2. If there is no owner password, use the user password
|
||||||
|
# instead.
|
||||||
password = (password + _encryption_padding)[:32]
|
password = (password + _encryption_padding)[:32]
|
||||||
m.update(password)
|
# 2. Initialize the MD5 hash function and pass the result of step 1 as
|
||||||
|
# input to this function.
|
||||||
|
import md5
|
||||||
|
m = md5.new(password)
|
||||||
|
# 3. (Revision 3 or greater) Do the following 50 times: Take the output
|
||||||
|
# from the previous MD5 hash and pass it as input into a new MD5 hash.
|
||||||
md5_hash = m.digest()
|
md5_hash = m.digest()
|
||||||
if rev >= 3:
|
if rev >= 3:
|
||||||
for i in range(50):
|
for i in range(50):
|
||||||
md5_hash = md5.new(md5_hash).digest()
|
md5_hash = md5.new(md5_hash).digest()
|
||||||
|
# 4. Create an RC4 encryption key using the first n bytes of the output
|
||||||
|
# from the final MD5 hash, where n is always 5 for revision 2 but, for
|
||||||
|
# revision 3 or greater, depends on the value of the encryption
|
||||||
|
# dictionary's /Length entry.
|
||||||
key = md5_hash[:keylen]
|
key = md5_hash[:keylen]
|
||||||
return key
|
return key
|
||||||
|
|
||||||
|
# Implementation of algorithm 3.4 of the PDF standard security handler,
|
||||||
|
# section 3.5.2 of the PDF 1.6 reference.
|
||||||
def _alg34(password, owner_entry, p_entry, id1_entry):
|
def _alg34(password, owner_entry, p_entry, id1_entry):
|
||||||
|
# 1. Create an encryption key based on the user password string, as
|
||||||
|
# described in algorithm 3.2.
|
||||||
key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry)
|
key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry)
|
||||||
|
# 2. Encrypt the 32-byte padding string shown in step 1 of algorithm 3.2,
|
||||||
|
# using an RC4 encryption function with the encryption key from the
|
||||||
|
# preceding step.
|
||||||
U = utils.RC4_encrypt(key, _encryption_padding)
|
U = utils.RC4_encrypt(key, _encryption_padding)
|
||||||
|
# 3. Store the result of step 2 as the value of the /U entry in the
|
||||||
|
# encryption dictionary.
|
||||||
return U, key
|
return U, key
|
||||||
|
|
||||||
|
# Implementation of algorithm 3.5 of the PDF standard security handler,
|
||||||
|
# section 3.5.2 of the PDF 1.6 reference.
|
||||||
def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):
|
def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):
|
||||||
|
# 1. Create an encryption key based on the user password string, as
|
||||||
|
# described in Algorithm 3.2.
|
||||||
|
key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
|
||||||
|
# 2. Initialize the MD5 hash function and pass the 32-byte padding string
|
||||||
|
# shown in step 1 of Algorithm 3.2 as input to this function.
|
||||||
import md5
|
import md5
|
||||||
m = md5.new()
|
m = md5.new()
|
||||||
m.update(_encryption_padding)
|
m.update(_encryption_padding)
|
||||||
|
# 3. Pass the first element of the file's file identifier array (the value
|
||||||
|
# of the ID entry in the document's trailer dictionary; see Table 3.13 on
|
||||||
|
# page 73) to the hash function and finish the hash. (See implementation
|
||||||
|
# note 25 in Appendix H.)
|
||||||
m.update(id1_entry)
|
m.update(id1_entry)
|
||||||
md5_hash = m.digest()
|
md5_hash = m.digest()
|
||||||
key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
|
# 4. Encrypt the 16-byte result of the hash, using an RC4 encryption
|
||||||
|
# function with the encryption key from step 1.
|
||||||
val = utils.RC4_encrypt(key, md5_hash)
|
val = utils.RC4_encrypt(key, md5_hash)
|
||||||
|
# 5. Do the following 19 times: Take the output from the previous
|
||||||
|
# invocation of the RC4 function and pass it as input to a new invocation
|
||||||
|
# of the function; use an encryption key generated by taking each byte of
|
||||||
|
# the original encryption key (obtained in step 1) and performing an XOR
|
||||||
|
# operation between that byte and the single-byte value of the iteration
|
||||||
|
# counter (from 1 to 19).
|
||||||
for i in range(1, 20):
|
for i in range(1, 20):
|
||||||
new_key = ''
|
new_key = ''
|
||||||
for l in range(len(key)):
|
for l in range(len(key)):
|
||||||
new_key += chr(ord(key[l]) ^ i)
|
new_key += chr(ord(key[l]) ^ i)
|
||||||
val = utils.RC4_encrypt(new_key, val)
|
val = utils.RC4_encrypt(new_key, val)
|
||||||
|
# 6. Append 16 bytes of arbitrary padding to the output from the final
|
||||||
|
# invocation of the RC4 function and store the 32-byte result as the value
|
||||||
|
# of the U entry in the encryption dictionary.
|
||||||
|
# (implementer's note: I don't know what "arbitrary padding" is supposed to
|
||||||
|
# mean, so I have used null bytes. This seems to match a few other
|
||||||
|
# people's implementations)
|
||||||
return val + ('\x00' * 16), key
|
return val + ('\x00' * 16), key
|
||||||
|
|
||||||
#if __name__ == "__main__":
|
#if __name__ == "__main__":
|
||||||
|
@ -32,7 +32,7 @@
|
|||||||
Utility functions for PDF library.
|
Utility functions for PDF library.
|
||||||
"""
|
"""
|
||||||
__author__ = "Mathieu Fenniak"
|
__author__ = "Mathieu Fenniak"
|
||||||
__author_email__ = "mfenniak@pobox.com"
|
__author_email__ = "biziqe@mathieu.fenniak.net"
|
||||||
|
|
||||||
def readUntilWhitespace(stream, maxchars=None):
|
def readUntilWhitespace(stream, maxchars=None):
|
||||||
txt = ""
|
txt = ""
|
||||||
@ -86,6 +86,9 @@ def RC4_encrypt(key, plaintext):
|
|||||||
retval += chr(ord(plaintext[x]) ^ t)
|
retval += chr(ord(plaintext[x]) ^ t)
|
||||||
return retval
|
return retval
|
||||||
|
|
||||||
|
class PdfReadError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# test RC4
|
# test RC4
|
||||||
out = RC4_encrypt("Key", "Plaintext")
|
out = RC4_encrypt("Key", "Plaintext")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user