From 75953a47d239bcdda2b15f725610a4860c8b2c3e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 20 Sep 2007 01:54:53 +0000 Subject: [PATCH] Newer pyPdf that is hopefully bug free. --- src/libprs500/ebooks/pyPdf/filters.py | 4 +- src/libprs500/ebooks/pyPdf/generic.py | 1253 ++++++++++++++----------- src/libprs500/ebooks/pyPdf/pdf.py | 440 ++++++++- src/libprs500/ebooks/pyPdf/utils.py | 5 +- 4 files changed, 1109 insertions(+), 593 deletions(-) diff --git a/src/libprs500/ebooks/pyPdf/filters.py b/src/libprs500/ebooks/pyPdf/filters.py index 17a325f76f..581cd52111 100644 --- a/src/libprs500/ebooks/pyPdf/filters.py +++ b/src/libprs500/ebooks/pyPdf/filters.py @@ -32,9 +32,8 @@ Implementation of stream filters for PDF. """ __author__ = "Mathieu Fenniak" -__author_email__ = "mfenniak@pobox.com" +__author_email__ = "biziqe@mathieu.fenniak.net" -from generic import NameObject try: import zlib @@ -208,6 +207,7 @@ class ASCII85Decode(object): decode = staticmethod(decode) def decodeStreamData(stream): + from generic import NameObject filters = stream.get("/Filter", ()) if len(filters) and not isinstance(filters[0], NameObject): # we have a single filter instance diff --git a/src/libprs500/ebooks/pyPdf/generic.py b/src/libprs500/ebooks/pyPdf/generic.py index 4fea8fa640..69a9ad7b5e 100644 --- a/src/libprs500/ebooks/pyPdf/generic.py +++ b/src/libprs500/ebooks/pyPdf/generic.py @@ -1,542 +1,711 @@ -# vim: sw=4:expandtab:foldmethod=marker -# -# Copyright (c) 2006, Mathieu Fenniak -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# * The name of the author may not be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. 
- - -""" -Implementation of generic PDF objects (dictionary, number, string, and so on) -""" -__author__ = "Mathieu Fenniak" -__author_email__ = "mfenniak@pobox.com" - -import re -from utils import readNonWhitespace, RC4_encrypt -import filters - -def readObject(stream, pdf): - tok = stream.read(1) - stream.seek(-1, 1) # reset to start - if tok == 't' or tok == 'f': - # boolean object - return BooleanObject.readFromStream(stream) - elif tok == '(': - # string object - return StringObject.readFromStream(stream) - elif tok == '/': - # name object - return NameObject.readFromStream(stream) - elif tok == '[': - # array object - return ArrayObject.readFromStream(stream, pdf) - elif tok == 'n': - # null object - return NullObject.readFromStream(stream) - elif tok == '<': - # hexadecimal string OR dictionary - peek = stream.read(2) - stream.seek(-2, 1) # reset to start - if peek == '<<': - return DictionaryObject.readFromStream(stream, pdf) - else: - return StringObject.readHexStringFromStream(stream) - elif tok == '%': - # comment - while tok not in ('\r', '\n'): - tok = stream.read(1) - tok = readNonWhitespace(stream) - stream.seek(-1, 1) - return readObject(stream, pdf) - else: - # number object OR indirect reference - if tok == '+' or tok == '-': - # number - return NumberObject.readFromStream(stream) - peek = stream.read(20) - stream.seek(-len(peek), 1) # reset to start - if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) != None: - return IndirectObject.readFromStream(stream, pdf) - else: - return NumberObject.readFromStream(stream) - -class PdfObject(object): - def getObject(self): - """Resolves indirect references.""" - return self - - -class NullObject(PdfObject): - def writeToStream(self, stream, encryption_key): - stream.write("null") - - def readFromStream(stream): - assert stream.read(4) == "null" - return NullObject() - readFromStream = staticmethod(readFromStream) - - -class BooleanObject(PdfObject): - def __init__(self, value): - self.value = value - - def 
writeToStream(self, stream, encryption_key): - if self.value: - stream.write("true") - else: - stream.write("false") - - def readFromStream(stream): - word = stream.read(4) - if word == "true": - return BooleanObject(True) - elif word == "fals": - stream.read(1) - return BooleanObject(False) - assert False - readFromStream = staticmethod(readFromStream) - - -class ArrayObject(list, PdfObject): - def writeToStream(self, stream, encryption_key): - stream.write("[") - for data in self: - stream.write(" ") - data.writeToStream(stream, encryption_key) - stream.write(" ]") - - def readFromStream(stream, pdf): - arr = ArrayObject() - assert stream.read(1) == "[" - while True: - # skip leading whitespace - tok = stream.read(1) - while tok.isspace(): - tok = stream.read(1) - stream.seek(-1, 1) - # check for array ending - peekahead = stream.read(1) - if peekahead == "]": - break - stream.seek(-1, 1) - # read and append obj - arr.append(readObject(stream, pdf)) - return arr - readFromStream = staticmethod(readFromStream) - - -class IndirectObject(PdfObject): - def __init__(self, idnum, generation, pdf): - self.idnum = idnum - self.generation = generation - self.pdf = pdf - - def getObject(self): - return self.pdf.getObject(self).getObject() - - def __repr__(self): - return "IndirectObject(%r, %r)" % (self.idnum, self.generation) - - def __eq__(self, other): - return ( - other != None and - isinstance(other, IndirectObject) and - self.idnum == other.idnum and - self.generation == other.generation and - self.pdf is other.pdf - ) - - def __ne__(self, other): - return not self.__eq__(other) - - def writeToStream(self, stream, encryption_key): - stream.write("%s %s R" % (self.idnum, self.generation)) - - def readFromStream(stream, pdf): - idnum = "" - while True: - tok = stream.read(1) - if tok.isspace(): - break - idnum += tok - generation = "" - while True: - tok = stream.read(1) - if tok.isspace(): - break - generation += tok - r = stream.read(1) - #if r != "R": - # 
stream.seek(-20, 1) - # print idnum, generation - # print repr(stream.read(40)) - assert r == "R" - return IndirectObject(int(idnum), int(generation), pdf) - readFromStream = staticmethod(readFromStream) - - -class FloatObject(float, PdfObject): - def writeToStream(self, stream, encryption_key): - stream.write(repr(self)) - - -class NumberObject(int, PdfObject): - def __init__(self, value): - int.__init__(self, value) - - def writeToStream(self, stream, encryption_key): - stream.write(repr(self)) - - def readFromStream(stream): - name = "" - while True: - tok = stream.read(1) - if tok != '+' and tok != '-' and tok != '.' and not tok.isdigit(): - stream.seek(-1, 1) - break - name += tok - if name.find(".") != -1: - return FloatObject(name) - else: - return NumberObject(name) - readFromStream = staticmethod(readFromStream) - - -class StringObject(str, PdfObject): - def writeToStream(self, stream, encryption_key): - string = self - if encryption_key: - string = RC4_encrypt(encryption_key, string) - stream.write("(") - for c in string: - if not c.isalnum() and not c.isspace(): - stream.write("\\%03o" % ord(c)) - else: - stream.write(c) - stream.write(")") - - def readHexStringFromStream(stream): - stream.read(1) - txt = "" - x = "" - while True: - tok = readNonWhitespace(stream) - if tok == ">": - break - x += tok - if len(x) == 2: - txt += chr(int(x, base=16)) - x = "" - if len(x) == 1: - x += "0" - if len(x) == 2: - txt += chr(int(x, base=16)) - return StringObject(txt) - readHexStringFromStream = staticmethod(readHexStringFromStream) - - def readFromStream(stream): - tok = stream.read(1) - parens = 1 - txt = "" - while True: - tok = stream.read(1) - if tok == "(": - parens += 1 - elif tok == ")": - parens -= 1 - if parens == 0: - break - elif tok == "\\": - tok = stream.read(1) - if tok == "n": - tok = "\n" - elif tok == "r": - tok = "\r" - elif tok == "t": - tok = "\t" - elif tok == "b": - tok == "\b" - elif tok == "f": - tok = "\f" - elif tok == "(": - tok = "(" - 
elif tok == ")": - tok = ")" - elif tok == "\\": - tok = "\\" - elif tok.isdigit(): - tok += stream.read(2) - tok = chr(int(tok, base=8)) - txt += tok - return StringObject(txt) - readFromStream = staticmethod(readFromStream) - - -class NameObject(str, PdfObject): - delimiterCharacters = "(", ")", "<", ">", "[", "]", "{", "}", "/", "%" - - def __init__(self, data): - str.__init__(self, data) - - def writeToStream(self, stream, encryption_key): - stream.write(self) - - def readFromStream(stream): - name = stream.read(1) - assert name == "/" - while True: - tok = stream.read(1) - if tok.isspace() or tok in NameObject.delimiterCharacters: - stream.seek(-1, 1) - break - name += tok - return NameObject(name) - readFromStream = staticmethod(readFromStream) - - -class DictionaryObject(dict, PdfObject): - def __init__(self): - pass - - def writeToStream(self, stream, encryption_key): - stream.write("<<\n") - for key, value in self.items(): - key.writeToStream(stream, encryption_key) - stream.write(" ") - value.writeToStream(stream, encryption_key) - stream.write("\n") - stream.write(">>") - - def readFromStream(stream, pdf): - assert stream.read(2) == "<<" - data = {} - while True: - tok = readNonWhitespace(stream) - if tok == ">": - stream.read(1) - break - stream.seek(-1, 1) - key = readObject(stream, pdf) - tok = readNonWhitespace(stream) - stream.seek(-1, 1) - value = readObject(stream, pdf) - if data.has_key(key): - # multiple definitions of key not permitted - assert False - data[key] = value - pos = stream.tell() - s = readNonWhitespace(stream) - if s == 's' and stream.read(5) == 'tream': - eol = stream.read(1) - # odd PDF file output has spaces after 'stream' keyword but before EOL. 
- # patch provided by Danial Sandler - while eol == ' ': - eol = stream.read(1) - assert eol in ("\n", "\r") - if eol == "\r": - # read \n after - stream.read(1) - # this is a stream object, not a dictionary - assert data.has_key("/Length") - length = data["/Length"] - if isinstance(length, IndirectObject): - t = stream.tell() - length = pdf.getObject(length) - stream.seek(t, 0) - data["__streamdata__"] = stream.read(length) - e = readNonWhitespace(stream) - ndstream = stream.read(8) - if (e + ndstream) != "endstream": - # (sigh) - the odd PDF file has a length that is too long, so - # we need to read backwards to find the "endstream" ending. - # ReportLab (unknown version) generates files with this bug, - # and Python users into PDF files tend to be our audience. - # we need to do this to correct the streamdata and chop off - # an extra character. - pos = stream.tell() - stream.seek(-10, 1) - end = stream.read(9) - if end == "endstream": - # we found it by looking back one character further. - data["__streamdata__"] = data["__streamdata__"][:-1] - else: - stream.seek(pos, 0) - raise "Unable to find 'endstream' marker after stream." 
- else: - stream.seek(pos, 0) - if data.has_key("__streamdata__"): - return StreamObject.initializeFromDictionary(data) - else: - retval = DictionaryObject() - retval.update(data) - return retval - readFromStream = staticmethod(readFromStream) - - -class StreamObject(DictionaryObject): - def __init__(self): - self._data = None - self.decodedSelf = None - - def writeToStream(self, stream, encryption_key): - self[NameObject("/Length")] = NumberObject(len(self._data)) - DictionaryObject.writeToStream(self, stream, encryption_key) - del self["/Length"] - stream.write("\nstream\n") - data = self._data - if encryption_key: - data = RC4_encrypt(encryption_key, data) - stream.write(data) - stream.write("\nendstream") - - def initializeFromDictionary(data): - if data.has_key("/Filter"): - retval = EncodedStreamObject() - else: - retval = DecodedStreamObject() - retval._data = data["__streamdata__"] - del data["__streamdata__"] - del data["/Length"] - retval.update(data) - return retval - initializeFromDictionary = staticmethod(initializeFromDictionary) - - def flateEncode(self): - if self.has_key("/Filter"): - f = self["/Filter"] - if isinstance(f, ArrayObject): - f.insert(0, NameObject("/FlateDecode")) - else: - newf = ArrayObject() - newf.append(NameObject("/FlateDecode")) - newf.append(f) - f = newf - else: - f = NameObject("/FlateDecode") - retval = EncodedStreamObject() - retval[NameObject("/Filter")] = f - retval._data = filters.FlateDecode.encode(self._data) - return retval - - -class DecodedStreamObject(StreamObject): - def getData(self): - return self._data - - def setData(self, data): - self._data = data - - -class EncodedStreamObject(StreamObject): - def __init__(self): - self.decodedSelf = None - - def getData(self): - if self.decodedSelf: - # cached version of decoded object - return self.decodedSelf.getData() - else: - # create decoded object - decoded = StreamObject() - decoded._data = filters.decodeStreamData(self) - for key, value in self.items(): - if not 
key in ("/Length", "/Filter", "/DecodeParms"): - decoded[key] = value - self.decodedSelf = decoded - return decoded._data - - def setData(self, data): - raise "Creating EncodedStreamObject is not currently supported" - - -class RectangleObject(ArrayObject): - def __init__(self, arr): - # must have four points - assert len(arr) == 4 - # automatically convert arr[x] into NumberObject(arr[x]) if necessary - ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr]) - - def ensureIsNumber(self, value): - if not isinstance(value, NumberObject): - value = NumberObject(value) - return value - - def __repr__(self): - return "RectangleObject(%s)" % repr(list(self)) - - def getLowerLeft_x(self): - return self[0] - - def getLowerLeft_y(self): - return self[1] - - def getUpperRight_x(self): - return self[2] - - def getUpperRight_y(self): - return self[3] - - def getUpperLeft_x(self): - return self.getLowerLeft_x() - - def getUpperLeft_y(self): - return self.getUpperRight_y() - - def getLowerRight_x(self): - return self.getUpperRight_x() - - def getLowerRight_y(self): - return self.getLowerLeft_y() - - def getLowerLeft(self): - return self.getLowerLeft_x(), self.getLowerLeft_y() - - def getLowerRight(self): - return self.getLowerRight_x(), self.getLowerRight_y() - - def getUpperLeft(self): - return self.getUpperLeft_x(), self.getUpperLeft_y() - - def getUpperRight(self): - return self.getUpperRight_x(), self.getUpperRight_y() - - def setLowerLeft(self, value): - self[0], self[1] = [self.ensureIsNumber(x) for x in value] - - def setLowerRight(self, value): - self[2], self[1] = [self.ensureIsNumber(x) for x in value] - - def setUpperLeft(self, value): - self[0], self[3] = [self.ensureIsNumber(x) for x in value] - - def setUpperRight(self, value): - self[2], self[3] = [self.ensureIsNumber(x) for x in value] - - lowerLeft = property(getLowerLeft, setLowerLeft, None, None) - lowerRight = property(getLowerRight, setLowerRight, None, None) - upperLeft = property(getUpperLeft, 
setUpperLeft, None, None) - upperRight = property(getUpperRight, setUpperRight, None, None) - +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ + +""" +Implementation of generic PDF objects (dictionary, number, string, and so on) +""" +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +import re +from utils import readNonWhitespace, RC4_encrypt +import filters +import utils +import decimal +import codecs + +def readObject(stream, pdf): + tok = stream.read(1) + stream.seek(-1, 1) # reset to start + if tok == 't' or tok == 'f': + # boolean object + return BooleanObject.readFromStream(stream) + elif tok == '(': + # string object + return readStringFromStream(stream) + elif tok == '/': + # name object + return NameObject.readFromStream(stream) + elif tok == '[': + # array object + return ArrayObject.readFromStream(stream, pdf) + elif tok == 'n': + # null object + return NullObject.readFromStream(stream) + elif tok == '<': + # hexadecimal string OR dictionary + peek = stream.read(2) + stream.seek(-2, 1) # reset to start + if peek == '<<': + return DictionaryObject.readFromStream(stream, pdf) + else: + return readHexStringFromStream(stream) + elif tok == '%': + # comment + while tok not in ('\r', '\n'): + tok = stream.read(1) + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + return readObject(stream, pdf) + else: + # number object OR indirect reference + if tok == '+' or tok == '-': + # number + return NumberObject.readFromStream(stream) + peek = stream.read(20) + stream.seek(-len(peek), 1) # reset to start + if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) != None: + return IndirectObject.readFromStream(stream, pdf) + else: + return NumberObject.readFromStream(stream) + +class PdfObject(object): + def getObject(self): + """Resolves indirect references.""" + return self + + +class NullObject(PdfObject): + def writeToStream(self, stream, encryption_key): + stream.write("null") + + def readFromStream(stream): + nulltxt = stream.read(4) + if nulltxt != "null": + raise utils.PdfReadError, "error reading null object" + return NullObject() + readFromStream = 
staticmethod(readFromStream) + + +class BooleanObject(PdfObject): + def __init__(self, value): + self.value = value + + def writeToStream(self, stream, encryption_key): + if self.value: + stream.write("true") + else: + stream.write("false") + + def readFromStream(stream): + word = stream.read(4) + if word == "true": + return BooleanObject(True) + elif word == "fals": + stream.read(1) + return BooleanObject(False) + assert False + readFromStream = staticmethod(readFromStream) + + +class ArrayObject(list, PdfObject): + def writeToStream(self, stream, encryption_key): + stream.write("[") + for data in self: + stream.write(" ") + data.writeToStream(stream, encryption_key) + stream.write(" ]") + + def readFromStream(stream, pdf): + arr = ArrayObject() + tmp = stream.read(1) + if tmp != "[": + raise utils.PdfReadError, "error reading array" + while True: + # skip leading whitespace + tok = stream.read(1) + while tok.isspace(): + tok = stream.read(1) + stream.seek(-1, 1) + # check for array ending + peekahead = stream.read(1) + if peekahead == "]": + break + stream.seek(-1, 1) + # read and append obj + arr.append(readObject(stream, pdf)) + return arr + readFromStream = staticmethod(readFromStream) + + +class IndirectObject(PdfObject): + def __init__(self, idnum, generation, pdf): + self.idnum = idnum + self.generation = generation + self.pdf = pdf + + def getObject(self): + return self.pdf.getObject(self).getObject() + + def __repr__(self): + return "IndirectObject(%r, %r)" % (self.idnum, self.generation) + + def __eq__(self, other): + return ( + other != None and + isinstance(other, IndirectObject) and + self.idnum == other.idnum and + self.generation == other.generation and + self.pdf is other.pdf + ) + + def __ne__(self, other): + return not self.__eq__(other) + + def writeToStream(self, stream, encryption_key): + stream.write("%s %s R" % (self.idnum, self.generation)) + + def readFromStream(stream, pdf): + idnum = "" + while True: + tok = stream.read(1) + if 
tok.isspace(): + break + idnum += tok + generation = "" + while True: + tok = stream.read(1) + if tok.isspace(): + break + generation += tok + r = stream.read(1) + if r != "R": + raise utils.PdfReadError("error reading indirect object reference") + return IndirectObject(int(idnum), int(generation), pdf) + readFromStream = staticmethod(readFromStream) + + +class FloatObject(decimal.Decimal, PdfObject): + def writeToStream(self, stream, encryption_key): + stream.write(str(self)) + + +class NumberObject(int, PdfObject): + def __init__(self, value): + int.__init__(self, value) + + def writeToStream(self, stream, encryption_key): + stream.write(repr(self)) + + def readFromStream(stream): + name = "" + while True: + tok = stream.read(1) + if tok != '+' and tok != '-' and tok != '.' and not tok.isdigit(): + stream.seek(-1, 1) + break + name += tok + if name.find(".") != -1: + return FloatObject(name) + else: + return NumberObject(name) + readFromStream = staticmethod(readFromStream) + + +## +# Given a string (either a "str" or "unicode"), create a ByteStringObject or a +# TextStringObject to represent the string. +def createStringObject(string): + if isinstance(string, unicode): + return TextStringObject(string) + elif isinstance(string, str): + if string.startswith(codecs.BOM_UTF16_BE): + retval = TextStringObject(string.decode("utf-16")) + retval.autodetect_utf16 = True + return retval + else: + # This is probably a big performance hit here, but we need to + # convert string objects into the text/unicode-aware version if + # possible... and the only way to check if that's possible is + # to try. Some strings are strings, some are just byte arrays. 
+ try: + retval = TextStringObject(decode_pdfdocencoding(string)) + retval.autodetect_pdfdocencoding = True + return retval + except UnicodeDecodeError: + return ByteStringObject(string) + else: + raise TypeError("createStringObject should have str or unicode arg") + + +def readHexStringFromStream(stream): + stream.read(1) + txt = "" + x = "" + while True: + tok = readNonWhitespace(stream) + if tok == ">": + break + x += tok + if len(x) == 2: + txt += chr(int(x, base=16)) + x = "" + if len(x) == 1: + x += "0" + if len(x) == 2: + txt += chr(int(x, base=16)) + return createStringObject(txt) + + +def readStringFromStream(stream): + tok = stream.read(1) + parens = 1 + txt = "" + while True: + tok = stream.read(1) + if tok == "(": + parens += 1 + elif tok == ")": + parens -= 1 + if parens == 0: + break + elif tok == "\\": + tok = stream.read(1) + if tok == "n": + tok = "\n" + elif tok == "r": + tok = "\r" + elif tok == "t": + tok = "\t" + elif tok == "b": + tok == "\b" + elif tok == "f": + tok = "\f" + elif tok == "(": + tok = "(" + elif tok == ")": + tok = ")" + elif tok == "\\": + tok = "\\" + elif tok.isdigit(): + tok += stream.read(2) + tok = chr(int(tok, base=8)) + elif tok in "\n\r": + # This case is hit when a backslash followed by a line + # break occurs. If it's a multi-char EOL, consume the + # second character: + tok = stream.read(1) + if not tok in "\n\r": + stream.seek(-1, 1) + # Then don't add anything to the actual string, since this + # line break was escaped: + tok = '' + else: + raise utils.PdfReadError("Unexpected escaped string") + txt += tok + return createStringObject(txt) + + +## +# Represents a string object where the text encoding could not be determined. +# This occurs quite often, as the PDF spec doesn't provide an alternate way to +# represent strings -- for example, the encryption data stored in files (like +# /O) is clearly not text, but is still stored in a "String" object. 
+class ByteStringObject(str, PdfObject): + + ## + # For compatibility with TextStringObject.original_bytes. This method + # returns self. + original_bytes = property(lambda self: self) + + def writeToStream(self, stream, encryption_key): + bytearr = self + if encryption_key: + bytearr = RC4_encrypt(encryption_key, bytearr) + stream.write("<") + stream.write(bytearr.encode("hex")) + stream.write(">") + + +## +# Represents a string object that has been decoded into a real unicode string. +# If read from a PDF document, this string appeared to match the +# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to +# occur. +class TextStringObject(unicode, PdfObject): + autodetect_pdfdocencoding = False + autodetect_utf16 = False + + ## + # It is occasionally possible that a text string object gets created where + # a byte string object was expected due to the autodetection mechanism -- + # if that occurs, this "original_bytes" property can be used to + # back-calculate what the original encoded bytes were. + original_bytes = property(lambda self: self.get_original_bytes()) + + def get_original_bytes(self): + # We're a text string object, but the library is trying to get our raw + # bytes. This can happen if we auto-detected this string as text, but + # we were wrong. It's pretty common. Return the original bytes that + # would have been used to create this object, based upon the autodetect + # method. + if self.autodetect_utf16: + return codecs.BOM_UTF16_BE + self.encode("utf-16be") + elif self.autodetect_pdfdocencoding: + return encode_pdfdocencoding(self) + else: + raise Exception("no information about original bytes") + + def writeToStream(self, stream, encryption_key): + # Try to write the string out as a PDFDocEncoding encoded string. It's + # nicer to look at in the PDF file. Sadly, we take a performance hit + # here for trying... 
+ try: + bytearr = encode_pdfdocencoding(self) + except UnicodeEncodeError: + bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be") + if encryption_key: + bytearr = RC4_encrypt(encryption_key, bytearr) + obj = ByteStringObject(bytearr) + obj.writeToStream(stream, None) + else: + stream.write("(") + for c in bytearr: + if not c.isalnum() and c != ' ': + stream.write("\\%03o" % ord(c)) + else: + stream.write(c) + stream.write(")") + + +class NameObject(str, PdfObject): + delimiterCharacters = "(", ")", "<", ">", "[", "]", "{", "}", "/", "%" + + def __init__(self, data): + str.__init__(self, data) + + def writeToStream(self, stream, encryption_key): + stream.write(self) + + def readFromStream(stream): + name = stream.read(1) + if name != "/": + raise utils.PdfReadError, "name read error" + while True: + tok = stream.read(1) + if tok.isspace() or tok in NameObject.delimiterCharacters: + stream.seek(-1, 1) + break + name += tok + return NameObject(name) + readFromStream = staticmethod(readFromStream) + + +class DictionaryObject(dict, PdfObject): + def __init__(self): + pass + + def writeToStream(self, stream, encryption_key): + stream.write("<<\n") + for key, value in self.items(): + key.writeToStream(stream, encryption_key) + stream.write(" ") + value.writeToStream(stream, encryption_key) + stream.write("\n") + stream.write(">>") + + def readFromStream(stream, pdf): + tmp = stream.read(2) + if tmp != "<<": + raise utils.PdfReadError, "dictionary read error" + data = {} + while True: + tok = readNonWhitespace(stream) + if tok == ">": + stream.read(1) + break + stream.seek(-1, 1) + key = readObject(stream, pdf) + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + value = readObject(stream, pdf) + if data.has_key(key): + # multiple definitions of key not permitted + raise utils.PdfReadError, "multiple definitions in dictionary" + data[key] = value + pos = stream.tell() + s = readNonWhitespace(stream) + if s == 's' and stream.read(5) == 'tream': + eol = 
stream.read(1) + # odd PDF file output has spaces after 'stream' keyword but before EOL. + # patch provided by Danial Sandler + while eol == ' ': + eol = stream.read(1) + assert eol in ("\n", "\r") + if eol == "\r": + # read \n after + stream.read(1) + # this is a stream object, not a dictionary + assert data.has_key("/Length") + length = data["/Length"] + if isinstance(length, IndirectObject): + t = stream.tell() + length = pdf.getObject(length) + stream.seek(t, 0) + data["__streamdata__"] = stream.read(length) + e = readNonWhitespace(stream) + ndstream = stream.read(8) + if (e + ndstream) != "endstream": + # (sigh) - the odd PDF file has a length that is too long, so + # we need to read backwards to find the "endstream" ending. + # ReportLab (unknown version) generates files with this bug, + # and Python users into PDF files tend to be our audience. + # we need to do this to correct the streamdata and chop off + # an extra character. + pos = stream.tell() + stream.seek(-10, 1) + end = stream.read(9) + if end == "endstream": + # we found it by looking back one character further. + data["__streamdata__"] = data["__streamdata__"][:-1] + else: + stream.seek(pos, 0) + raise utils.PdfReadError, "Unable to find 'endstream' marker after stream." 
class StreamObject(DictionaryObject):
    """A PDF stream: a dictionary of attributes plus a raw payload.

    ``_data`` is the payload that is written between the ``stream`` and
    ``endstream`` keywords.  ``/Length`` is not kept in the dictionary; it is
    added only while the dictionary part is being serialised.
    """

    def __init__(self):
        self._data = None
        self.decodedSelf = None

    def writeToStream(self, stream, encryption_key):
        """Write the dictionary followed by the payload to ``stream``.

        When ``encryption_key`` is truthy the payload is passed through
        ``RC4_encrypt`` with that key before being written.
        """
        self[NameObject("/Length")] = NumberObject(len(self._data))
        try:
            DictionaryObject.writeToStream(self, stream, encryption_key)
        finally:
            # /Length is derived data: never leave it behind, even when the
            # dictionary could not be written.
            del self["/Length"]
        stream.write("\nstream\n")
        data = self._data
        if encryption_key:
            data = RC4_encrypt(encryption_key, data)
        stream.write(data)
        stream.write("\nendstream")

    def initializeFromDictionary(data):
        """Build a stream object from a parsed dictionary.

        ``data`` must hold the payload under ``"__streamdata__"`` and a
        ``"/Length"`` entry; both are removed from ``data`` (KeyError if
        either is missing) and the remaining entries are copied into the
        result.  Returns an EncodedStreamObject when ``/Filter`` is present,
        otherwise a DecodedStreamObject.
        """
        if "/Filter" in data:
            retval = EncodedStreamObject()
        else:
            retval = DecodedStreamObject()
        retval._data = data["__streamdata__"]
        del data["__streamdata__"]
        del data["/Length"]
        retval.update(data)
        return retval
    initializeFromDictionary = staticmethod(initializeFromDictionary)

    def flateEncode(self):
        """Return a new EncodedStreamObject holding this payload deflated.

        The result's ``/Filter`` is ``/FlateDecode`` followed by whatever
        filters this stream already declares.  Only ``/Filter`` and the
        payload are set on the result; this stream is left untouched.
        """
        flate = NameObject("/FlateDecode")
        if "/Filter" in self:
            existing = self["/Filter"]
            # Always build a fresh array.  Inserting into the existing array
            # would make *this* stream claim a FlateDecode layer that its own
            # payload does not have.
            f = ArrayObject()
            f.append(flate)
            if isinstance(existing, ArrayObject):
                f.extend(existing)
            else:
                f.append(existing)
        else:
            f = flate
        retval = EncodedStreamObject()
        retval[NameObject("/Filter")] = f
        retval._data = filters.FlateDecode.encode(self._data)
        return retval


class DecodedStreamObject(StreamObject):
    """Stream whose ``_data`` is already the decoded payload."""

    def getData(self):
        return self._data

    def setData(self, data):
        self._data = data


class EncodedStreamObject(StreamObject):
    """Stream whose ``_data`` is still encoded by the ``/Filter`` chain."""

    def __init__(self):
        StreamObject.__init__(self)

    def getData(self):
        """Return the decoded payload, decoding on first use and caching.

        The decoded copy is kept in ``decodedSelf`` together with every
        dictionary entry except ``/Length``, ``/Filter`` and
        ``/DecodeParms``.
        """
        # Compare with None: the cached object is a dictionary and is falsy
        # when the stream carries no other attributes.
        if self.decodedSelf is None:
            # Must be a DecodedStreamObject: the base StreamObject has no
            # getData(), so the cached branch below would otherwise fail.
            decoded = DecodedStreamObject()
            decoded._data = filters.decodeStreamData(self)
            for key, value in self.items():
                if key not in ("/Length", "/Filter", "/DecodeParms"):
                    decoded[key] = value
            self.decodedSelf = decoded
        return self.decodedSelf.getData()

    def setData(self, data):
        raise utils.PdfReadError(
            "Creating EncodedStreamObject is not currently supported")


class RectangleObject(ArrayObject):
    """Four-number array ``[llx, lly, urx, ury]`` with corner accessors."""

    def __init__(self, arr):
        # must have four points
        assert len(arr) == 4
        # automatically convert arr[x] into NumberObject(arr[x]) if necessary
        ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr])

    def ensureIsNumber(self, value):
        """Return ``value`` unchanged if it is a NumberObject, else wrap it."""
        if not isinstance(value, NumberObject):
            value = NumberObject(value)
        return value

    def __repr__(self):
        return "RectangleObject(%s)" % repr(list(self))

    # Single-coordinate accessors.  Indices 0 and 1 are the lower-left x/y,
    # indices 2 and 3 the upper-right x/y; the other two corners are derived.
    def getLowerLeft_x(self):
        return self[0]

    def getLowerLeft_y(self):
        return self[1]

    def getUpperRight_x(self):
        return self[2]

    def getUpperRight_y(self):
        return self[3]

    def getUpperLeft_x(self):
        return self.getLowerLeft_x()

    def getUpperLeft_y(self):
        return self.getUpperRight_y()

    def getLowerRight_x(self):
        return self.getUpperRight_x()

    def getLowerRight_y(self):
        return self.getLowerLeft_y()

    # Corner accessors returning (x, y) tuples.
    def getLowerLeft(self):
        return self.getLowerLeft_x(), self.getLowerLeft_y()

    def getLowerRight(self):
        return self.getLowerRight_x(), self.getLowerRight_y()

    def getUpperLeft(self):
        return self.getUpperLeft_x(), self.getUpperLeft_y()

    def getUpperRight(self):
        return self.getUpperRight_x(), self.getUpperRight_y()

    # Corner setters: ``value`` is an (x, y) pair; each coordinate is
    # converted with ensureIsNumber before it is stored.
    def setLowerLeft(self, value):
        self[0], self[1] = [self.ensureIsNumber(x) for x in value]

    def setLowerRight(self, value):
        self[2], self[1] = [self.ensureIsNumber(x) for x in value]

    def setUpperLeft(self, value):
        self[0], self[3] = [self.ensureIsNumber(x) for x in value]

    def setUpperRight(self, value):
        self[2], self[3] = [self.ensureIsNumber(x) for x in value]

    lowerLeft = property(getLowerLeft, setLowerLeft, None, None)
    lowerRight = property(getLowerRight, setLowerRight, None, None)
    upperLeft = property(getUpperLeft, setUpperLeft, None, None)
    upperRight = property(getUpperRight, setUpperRight, None, None)


def encode_pdfdocencoding(unicode_string):
    """Encode ``unicode_string`` to a PDFDocEncoding byte string.

    Raises UnicodeEncodeError for a character that has no entry in the
    reverse translation table.
    """
    encoded = []
    for c in unicode_string:
        try:
            encoded.append(chr(_pdfDocEncoding_rev[c]))
        except KeyError:
            raise UnicodeEncodeError("pdfdocencoding", c, -1, -1,
                    "does not exist in translation table")
    # join once instead of growing a string character by character
    return ''.join(encoded)


def decode_pdfdocencoding(byte_array):
    """Decode a PDFDocEncoding byte string to unicode.

    Raises UnicodeDecodeError for a byte whose table entry is the
    u'\\u0000' placeholder (no character assigned).
    """
    decoded = []
    for b in byte_array:
        c = _pdfDocEncoding[ord(b)]
        if c == u'\u0000':
            raise UnicodeDecodeError("pdfdocencoding", b, -1, -1,
                    "does not exist in translation table")
        decoded.append(c)
    return u''.join(decoded)
u'\u0076', u'\u0077', + u'\u0078', u'\u0079', u'\u007a', u'\u007b', u'\u007c', u'\u007d', u'\u007e', u'\u0000', + u'\u2022', u'\u2020', u'\u2021', u'\u2026', u'\u2014', u'\u2013', u'\u0192', u'\u2044', + u'\u2039', u'\u203a', u'\u2212', u'\u2030', u'\u201e', u'\u201c', u'\u201d', u'\u2018', + u'\u2019', u'\u201a', u'\u2122', u'\ufb01', u'\ufb02', u'\u0141', u'\u0152', u'\u0160', + u'\u0178', u'\u017d', u'\u0131', u'\u0142', u'\u0153', u'\u0161', u'\u017e', u'\u0000', + u'\u20ac', u'\u00a1', u'\u00a2', u'\u00a3', u'\u00a4', u'\u00a5', u'\u00a6', u'\u00a7', + u'\u00a8', u'\u00a9', u'\u00aa', u'\u00ab', u'\u00ac', u'\u0000', u'\u00ae', u'\u00af', + u'\u00b0', u'\u00b1', u'\u00b2', u'\u00b3', u'\u00b4', u'\u00b5', u'\u00b6', u'\u00b7', + u'\u00b8', u'\u00b9', u'\u00ba', u'\u00bb', u'\u00bc', u'\u00bd', u'\u00be', u'\u00bf', + u'\u00c0', u'\u00c1', u'\u00c2', u'\u00c3', u'\u00c4', u'\u00c5', u'\u00c6', u'\u00c7', + u'\u00c8', u'\u00c9', u'\u00ca', u'\u00cb', u'\u00cc', u'\u00cd', u'\u00ce', u'\u00cf', + u'\u00d0', u'\u00d1', u'\u00d2', u'\u00d3', u'\u00d4', u'\u00d5', u'\u00d6', u'\u00d7', + u'\u00d8', u'\u00d9', u'\u00da', u'\u00db', u'\u00dc', u'\u00dd', u'\u00de', u'\u00df', + u'\u00e0', u'\u00e1', u'\u00e2', u'\u00e3', u'\u00e4', u'\u00e5', u'\u00e6', u'\u00e7', + u'\u00e8', u'\u00e9', u'\u00ea', u'\u00eb', u'\u00ec', u'\u00ed', u'\u00ee', u'\u00ef', + u'\u00f0', u'\u00f1', u'\u00f2', u'\u00f3', u'\u00f4', u'\u00f5', u'\u00f6', u'\u00f7', + u'\u00f8', u'\u00f9', u'\u00fa', u'\u00fb', u'\u00fc', u'\u00fd', u'\u00fe', u'\u00ff' +) + +assert len(_pdfDocEncoding) == 256 + +_pdfDocEncoding_rev = {} +for i in xrange(256): + char = _pdfDocEncoding[i] + if char == u"\u0000": + continue + assert char not in _pdfDocEncoding_rev + _pdfDocEncoding_rev[char] = i + diff --git a/src/libprs500/ebooks/pyPdf/pdf.py b/src/libprs500/ebooks/pyPdf/pdf.py index fdaacaf574..f64c1a6c22 100644 --- a/src/libprs500/ebooks/pyPdf/pdf.py +++ b/src/libprs500/ebooks/pyPdf/pdf.py @@ -1,6 +1,8 @@ # 
vim: sw=4:expandtab:foldmethod=marker # # Copyright (c) 2006, Mathieu Fenniak +# Copyright (c) 2007, Ashish Kulkarni +# # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,7 +36,7 @@ be able to split and merge PDF files by page, and that's about all it can do. It may be a solid base for future PDF file work in Python. """ __author__ = "Mathieu Fenniak" -__author_email__ = "mfenniak@pobox.com" +__author_email__ = "biziqe@mathieu.fenniak.net" import struct try: @@ -44,6 +46,7 @@ except ImportError: import filters import utils +import warnings from generic import * from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList from sets import ImmutableSet @@ -68,7 +71,7 @@ class PdfFileWriter(object): # info object info = DictionaryObject() info.update({ - NameObject("/Producer"): StringObject("Python PDF Library - http://pybrary.net/pyPdf/") + NameObject("/Producer"): createStringObject(u"Python PDF Library - http://pybrary.net/pyPdf/") }) self._info = self._addObject(info) @@ -128,10 +131,10 @@ class PdfFileWriter(object): keylen = 40 / 8 # permit everything: P = -1 - O = StringObject(_alg33(owner_pwd, user_pwd, rev, keylen)) + O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen)) ID_1 = md5.new(repr(time.time())).digest() ID_2 = md5.new(repr(random.random())).digest() - self._ID = ArrayObject((StringObject(ID_1), StringObject(ID_2))) + self._ID = ArrayObject((ByteStringObject(ID_1), ByteStringObject(ID_2))) if rev == 2: U, key = _alg34(user_pwd, O, P, ID_1) else: @@ -143,8 +146,8 @@ class PdfFileWriter(object): if V == 2: encrypt[NameObject("/Length")] = NumberObject(keylen * 8) encrypt[NameObject("/R")] = NumberObject(rev) - encrypt[NameObject("/O")] = StringObject(O) - encrypt[NameObject("/U")] = StringObject(U) + encrypt[NameObject("/O")] = ByteStringObject(O) + encrypt[NameObject("/U")] = ByteStringObject(U) encrypt[NameObject("/P")] = NumberObject(P) self._encrypt = 
self._addObject(encrypt) self._encrypt_key = key @@ -212,8 +215,6 @@ class PdfFileWriter(object): for key, value in data.items(): origvalue = value value = self._sweepIndirectReferences(externMap, value) - if value == None: - print objects, value, origvalue if isinstance(value, StreamObject): # a dictionary value is a stream. streams must be indirect # objects, so we need to change this value. @@ -271,6 +272,7 @@ class PdfFileWriter(object): class PdfFileReader(object): def __init__(self, stream): self.flattenedPages = None + self.pageNumbers = {} self.resolvedObjects = {} self.read(stream) self.stream = stream @@ -329,6 +331,144 @@ class PdfFileReader(object): self._flatten() return self.flattenedPages[pageNumber] + ## + # Read-only property that accesses the + # {@link #PdfFileReader.getNamedDestinations + # getNamedDestinations} function. + #

+ # Stability: Added in v1.10, will exist for all future v1.x releases. + namedDestinations = property(lambda self: + self.getNamedDestinations(), None, None) + + ## + # Retrieves the named destinations present in the document. + #

##
# Retrieves the named destinations present in the document.
#
# Stability: Added in v1.10, will exist for all future v1.x releases.
# @param tree Name-tree node to walk; used for the recursive calls.  It is
#             replaced by the catalog's tree whenever map is None.
# @param map  Dict that collects the results; used for the recursive calls.
#             (The name shadows the builtin but is part of the signature.)
# @return Returns a dict which maps names to {@link #Destination
# destinations}.
def getNamedDestinations(self, tree=None, map=None):
    if self.flattenedPages is None:
        self._flatten()

    get = self.safeGetObject
    if map is None:
        # top-level call: start a fresh result and locate the name tree
        map = {}
        catalog = get(self.trailer["/Root"])

        if "/Dests" in catalog:
            tree = get(catalog["/Dests"])
        elif "/Names" in catalog:
            names = get(catalog["/Names"])
            if "/Dests" in names:
                tree = get(names["/Dests"])

    if tree is None:
        return map

    if "/Kids" in tree:
        # recurse down the tree; every level fills the same dict
        for kid in get(tree["/Kids"]):
            self.getNamedDestinations(get(kid), map)

    if "/Names" in tree:
        names = get(tree["/Names"])
        # /Names is a flat [key1, value1, key2, value2, ...] array.  Stop
        # one short of the end so that a dangling key without a value in a
        # malformed file is ignored instead of raising IndexError.
        for i in range(0, len(names) - 1, 2):
            key = get(names[i])
            val = get(names[i + 1])
            if isinstance(val, DictionaryObject) and "/D" in val:
                val = get(val["/D"])
            dest = self._buildDestination(val, key)
            if dest is not None:
                map[key] = dest

    return map

+ # Stability: Added in v1.10, will exist for all future v1.x releases. + outlines = property(lambda self: self.getOutlines(), None, None) + + ## + # Retrieves the document outline present in the document. + #

+ # Stability: Added in v1.10, will exist for all future v1.x releases. + # @return Returns a nested list of {@link #Destination destinations}. + def getOutlines(self, node = None, outlines = None): + if self.flattenedPages == None: + self._flatten() + + get = self.safeGetObject + if outlines == None: + outlines = [] + catalog = get(self.trailer["/Root"]) + + # get the outline dictionary and named destinations + if catalog.has_key("/Outlines"): + lines = get(catalog["/Outlines"]) + if lines.has_key("/First"): + node = get(lines["/First"]) + self._namedDests = self.getNamedDestinations() + + if node == None: + return outlines + + # see if there are any more outlines + while 1: + outline = self._buildOutline(node) + if outline: + outlines.append(outline) + + # check for sub-outlines + if node.has_key("/First"): + subOutlines = [] + self.getOutlines(get(node["/First"]), subOutlines) + if subOutlines: + outlines.append(subOutlines) + + if not node.has_key("/Next"): + break + node = get(node["/Next"]) + + return outlines + + def _buildDestination(self, array, title): + if not (isinstance(array, ArrayObject) and len(array) >= 2 and \ + isinstance(array[0], IndirectObject)): + return None + + pageKey = (array[0].generation, array[0].idnum) + if not self.pageNumbers.has_key(pageKey): + return None + + pageNum = self.pageNumbers[pageKey] + return Destination(*([title, pageNum]+array[1:])) + + def _buildOutline(self, node): + dest, title, outline = None, None, None + + if node.has_key("/A") and node.has_key("/Title"): + # Action, section 8.5 (only type GoTo supported) + title = self.safeGetObject(node["/Title"]) + action = self.safeGetObject(node["/A"]) + if action["/S"] == "/GoTo": + dest = self.safeGetObject(action["/D"]) + elif node.has_key("/Dest") and node.has_key("/Title"): + # Destination, section 8.2.1 + title = self.safeGetObject(node["/Title"]) + dest = self.safeGetObject(node["/Dest"]) + + # if destination found, then create outline + if dest: + if 
isinstance(dest, ArrayObject): + outline = self._buildDestination(dest, title) + elif isinstance(dest, str) and self._namedDests.has_key(dest): + outline = self._namedDests[dest] + outline.title = title + return outline + ## # Read-only property that emulates a list based upon the {@link # #PdfFileReader.getNumPages getNumPages} and {@link #PdfFileReader.getPage @@ -349,14 +489,16 @@ class PdfFileReader(object): self.flattenedPages = [] catalog = self.getObject(self.trailer["/Root"]) pages = self.getObject(catalog["/Pages"]) + indirectReference = None if isinstance(pages, IndirectObject): + indirectReference = pages pages = self.getObject(pages) t = pages["/Type"] if t == "/Pages": for attr in inheritablePageAttributes: if pages.has_key(attr): inherit[attr] = pages[attr] - for page in pages["/Kids"]: + for page in self.safeGetObject(pages["/Kids"]): self._flatten(page, inherit) elif t == "/Page": for attr,value in inherit.items(): @@ -364,8 +506,11 @@ class PdfFileReader(object): # parent's value: if not pages.has_key(attr): pages[attr] = value - pageObj = PageObject(self) + pageObj = PageObject(self, indirectReference) pageObj.update(pages) + if indirectReference: + key = (indirectReference.generation, indirectReference.idnum) + self.pageNumbers[key] = len(self.flattenedPages) self.flattenedPages.append(pageObj) def safeGetObject(self, obj): @@ -425,8 +570,8 @@ class PdfFileReader(object): return retval def _decryptObject(self, obj, key): - if isinstance(obj, StringObject): - obj = StringObject(utils.RC4_encrypt(key, obj)) + if isinstance(obj, ByteStringObject) or isinstance(obj, TextStringObject): + obj = createStringObject(utils.RC4_encrypt(key, obj.original_bytes)) elif isinstance(obj, StreamObject): obj._data = utils.RC4_encrypt(key, obj._data) elif isinstance(obj, DictionaryObject): @@ -438,6 +583,11 @@ class PdfFileReader(object): return obj def readObjectHeader(self, stream): + # Should never be necessary to read out whitespace, since the + # 
cross-reference table should put us in the right spot to read the + # object header. In reality... some files have stupid cross reference + # tables that are off by whitespace bytes. + readNonWhitespace(stream); stream.seek(-1, 1) idnum = readUntilWhitespace(stream) generation = readUntilWhitespace(stream) obj = stream.read(3) @@ -456,13 +606,15 @@ class PdfFileReader(object): line = '' while not line: line = self.readNextEndLine(stream) - assert line[:5] == "%%EOF" + if line[:5] != "%%EOF": + raise utils.PdfReadError, "EOF marker not found" # find startxref entry - the location of the xref table line = self.readNextEndLine(stream) startxref = int(line) line = self.readNextEndLine(stream) - assert line[:9] == "startxref" + if line[:9] != "startxref": + raise utils.PdfReadError, "startxref not found" # read all cross reference tables and their trailers self.xref = {} @@ -475,7 +627,8 @@ class PdfFileReader(object): if x == "x": # standard cross-reference table ref = stream.read(4) - assert ref[:3] == "ref" + if ref[:3] != "ref": + raise utils.PdfReadError, "xref table read error" readNonWhitespace(stream) stream.seek(-1, 1) while 1: @@ -661,7 +814,7 @@ class PdfFileReader(object): def _authenticateUserPassword(self, password): encrypt = self.safeGetObject(self.trailer['/Encrypt']) rev = self.safeGetObject(encrypt['/R']) - owner_entry = self.safeGetObject(encrypt['/O']) + owner_entry = self.safeGetObject(encrypt['/O']).original_bytes p_entry = self.safeGetObject(encrypt['/P']) id_entry = self.safeGetObject(self.trailer['/ID']) id1_entry = self.safeGetObject(id_entry[0]) @@ -672,7 +825,7 @@ class PdfFileReader(object): self.safeGetObject(encrypt["/Length"]) / 8, owner_entry, p_entry, id1_entry, self.safeGetObject(encrypt.get("/EncryptMetadata", False))) - real_U = self.safeGetObject(encrypt['/U']) + real_U = self.safeGetObject(encrypt['/U']).original_bytes return U == real_U, key def getIsEncrypted(self): @@ -721,9 +874,10 @@ def createRectangleAccessor(name, 
fallback): # will be created by accessing the {@link #PdfFileReader.getPage getPage} # function of the {@link #PdfFileReader PdfFileReader} class. class PageObject(DictionaryObject): - def __init__(self, pdf): + def __init__(self, pdf, indirectReference = None): DictionaryObject.__init__(self) self.pdf = pdf + self.indirectReference = indirectReference ## # Rotates a page clockwise by increments of 90 degrees. @@ -856,26 +1010,35 @@ class PageObject(DictionaryObject): #

# Stability: Added in v1.7, will exist for all future v1.x releases.  May
# be overhauled to provide more ordered text in the future.
# @return a unicode string object
def extractText(self):
    content = self["/Contents"].getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)

    pieces = []

    def show(candidate):
        # Only TextStringObjects are kept.  ByteStringObjects are strings
        # where the byte->string encoding was unknown, so adding them to
        # the text would be gibberish.
        if isinstance(candidate, TextStringObject):
            pieces.append(candidate)

    for operands, operator in content.operations:
        if operator == "Tj":
            show(operands[0])
        elif operator == "T*":
            pieces.append(u"\n")
        elif operator == "'":
            # move to the next line, then show the string
            pieces.append(u"\n")
            show(operands[0])
        elif operator == '"':
            # Same as ' but with two spacing operands in front of the
            # string.  The line break belongs to the operator, so it is
            # emitted whether or not the string itself is usable, exactly
            # as for the ' operator above.
            pieces.append(u"\n")
            show(operands[2])
        elif operator == "TJ":
            # array mixing strings and positioning numbers
            for item in operands[0]:
                show(item)

    # join once instead of growing the result with repeated +=
    return u"".join(pieces)

# As of pyPdf v1.10, all text properties of the document metadata have two
# properties, eg. author and author_raw.  The non-raw property will always
# return a TextStringObject, making it ideal for a case where the metadata is
# being displayed.  The raw property can sometimes return a ByteStringObject,
# if pyPdf was unable to decode the string's text encoding; this requires
# additional safety in the caller and therefore is not as commonly accessed.
class DocumentInformation(DictionaryObject):
    def __init__(self):
        DictionaryObject.__init__(self)

    ##
    # Looks up an entry and returns it only when it is a decoded text string.
    # @return The TextStringObject stored under key, or None when the entry
    # is absent or is any other kind of object.
    def getText(self, key):
        value = self.get(key, None)
        if not isinstance(value, TextStringObject):
            return None
        return value

    ##
    # Read-only property accessing the document's title.  Added in v1.6, will
    # exist for all future v1.x releases.  Modified in v1.10 to always return
    # a unicode string (TextStringObject).
    # @return A unicode string, or None if the title is not provided.
    title = property(lambda info: info.getText("/Title"))
    title_raw = property(lambda info: info.get("/Title"))

    ##
    # Read-only property accessing the document's author.  Added in v1.6,
    # will exist for all future v1.x releases.  Modified in v1.10 to always
    # return a unicode string (TextStringObject).
    # @return A unicode string, or None if the author is not provided.
    author = property(lambda info: info.getText("/Author"))
    author_raw = property(lambda info: info.get("/Author"))

    ##
    # Read-only property accessing the subject of the document.  Added in
    # v1.6, will exist for all future v1.x releases.  Modified in v1.10 to
    # always return a unicode string (TextStringObject).
    # @return A unicode string, or None if the subject is not provided.
    subject = property(lambda info: info.getText("/Subject"))
    subject_raw = property(lambda info: info.get("/Subject"))

    ##
    # Read-only property accessing the document's creator.  If the document
    # was converted to PDF from another format, the name of the application
    # (for example, OpenOffice) that created the original document from which
    # it was converted.  Added in v1.6, will exist for all future v1.x
    # releases.  Modified in v1.10 to always return a unicode string
    # (TextStringObject).
    # @return A unicode string, or None if the creator is not provided.
    creator = property(lambda info: info.getText("/Creator"))
    creator_raw = property(lambda info: info.get("/Creator"))

    ##
    # Read-only property accessing the document's producer.  If the document
    # was converted to PDF from another format, the name of the application
    # (for example, OSX Quartz) that converted it to PDF.  Added in v1.6,
    # will exist for all future v1.x releases.  Modified in v1.10 to always
    # return a unicode string (TextStringObject).
    # @return A unicode string, or None if the producer is not provided.
    producer = property(lambda info: info.getText("/Producer"))
    producer_raw = property(lambda info: info.get("/Producer"))
class Destination(DictionaryObject):
    ##
    # @param args title, page number, destination type, followed by the
    #             type-specific parameters from table 8.2 of the PDF 1.6
    #             reference.  NullObject parameters are stored as None.
    # Raises utils.PdfReadError for an unknown destination type and
    # ValueError when the number of parameters does not fit the type.
    def __init__(self, *args):
        DictionaryObject.__init__(self)
        self.title = args[0]
        self["/Page"], self["/Type"] = args[1], args[2]

        params = []
        for value in args[3:]:
            if isinstance(value, NullObject):
                value = None
            params.append(value)
        dest_type = self["/Type"]

        # from table 8.2 of the PDF 1.6 reference.  Every type name carries
        # its leading slash: the /FitB* names were previously spelled
        # without it and could therefore never match.
        if dest_type == "/XYZ":
            self["/Left"], self["/Top"], self["/Zoom"] = params
        elif dest_type == "/FitR":
            self["/Left"], self["/Bottom"], \
                self["/Right"], self["/Top"] = params
        elif dest_type in ("/FitH", "/FitBH"):
            self["/Top"], = params
        elif dest_type in ("/FitV", "/FitBV"):
            self["/Left"], = params
        elif dest_type in ("/Fit", "/FitB"):
            pass
        else:
            raise utils.PdfReadError(
                "Unknown Destination Type: %s" % (dest_type,))

    def setTitle(self, title):
        # stored without surrounding whitespace
        self["/Title"] = title.strip()

    ##
    # Read-write property accessing the destination title.
    # @return A string.
    title = property(lambda self: self.get("/Title"), setTitle, None)

    ##
    # Read-only property accessing the destination page.
    # @return An integer.
    page = property(lambda self: self.get("/Page"), None, None)

    ##
    # Read-only property accessing the destination type.
    # @return A string.
    type = property(lambda self: self.get("/Type"), None, None)

    ##
    # Read-only property accessing the zoom factor.
    # @return A number, or None if not available.
    zoom = property(lambda self: self.get("/Zoom", None), None, None)

    ##
    # Read-only property accessing the left horizontal coordinate.
    # @return A number, or None if not available.
    left = property(lambda self: self.get("/Left", None), None, None)

    ##
    # Read-only property accessing the right horizontal coordinate.
    # @return A number, or None if not available.
    right = property(lambda self: self.get("/Right", None), None, None)

    ##
    # Read-only property accessing the top vertical coordinate.
    # @return A number, or None if not available.
    top = property(lambda self: self.get("/Top", None), None, None)

    ##
    # Read-only property accessing the bottom vertical coordinate.
    # @return A number, or None if not available.
    bottom = property(lambda self: self.get("/Bottom", None), None, None)
# Returns a copy of ``key`` with every byte XORed with ``counter``; used for
# the 19 extra RC4 rounds of algorithms 3.3 and 3.5.
def _alg_xor_key(key, counter):
    return ''.join([chr(ord(byte) ^ counter) for byte in key])


# Implementation of algorithm 3.3 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.  Returns the /O entry value.
def _alg33(owner_pwd, user_pwd, rev, keylen):
    # steps 1 - 4: RC4 key derived from the owner password
    rc4_key = _alg33_1(owner_pwd, rev, keylen)
    # 5. Pad or truncate the user password string to exactly 32 bytes, as
    # described in step 1 of algorithm 3.2.
    padded_user_pwd = (user_pwd + _encryption_padding)[:32]
    # 6. Encrypt the result of step 5 with the key obtained in step 4.
    result = utils.RC4_encrypt(rc4_key, padded_user_pwd)
    # 7. (Revision 3 or greater) Re-encrypt the previous output 19 times,
    # each time with the step 4 key XORed byte-wise with the iteration
    # counter (from 1 to 19).
    if rev >= 3:
        for counter in range(1, 20):
            result = utils.RC4_encrypt(_alg_xor_key(rc4_key, counter), result)
    # 8. The output of the final RC4 invocation is the value of the /O
    # entry in the encryption dictionary.
    return result


# Steps 1-4 of algorithm 3.3
def _alg33_1(password, rev, keylen):
    import md5
    # 1. Pad or truncate the owner password string as described in step 1
    # of algorithm 3.2.  (The specification says to use the user password
    # when there is no owner password; that substitution is not done here.)
    padded = (password + _encryption_padding)[:32]
    # 2. Hash the result of step 1 with MD5.
    digest = md5.new(padded).digest()
    # 3. (Revision 3 or greater) Hash the previous digest again, 50 times.
    if rev >= 3:
        for _round in range(50):
            digest = md5.new(digest).digest()
    # 4. The RC4 key is the first n bytes of the final digest, where n is
    # always 5 for revision 2 but, for revision 3 or greater, depends on the
    # value of the encryption dictionary's /Length entry.
    return digest[:keylen]


# Implementation of algorithm 3.4 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.  Returns (U, key).
def _alg34(password, owner_entry, p_entry, id1_entry):
    # 1. Create an encryption key based on the user password string, as
    # described in algorithm 3.2 (revision 2, 5-byte key).
    key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry)
    # 2. Encrypt the 32-byte padding string with that key.
    # 3. The result is the value of the /U entry.
    return utils.RC4_encrypt(key, _encryption_padding), key


# Implementation of algorithm 3.5 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.  Returns (U, key).
# NOTE(review): metadata_encrypt is accepted but is not forwarded to _alg32,
# which therefore uses its own default -- confirm against the callers before
# changing this, since it affects every derived key.
def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry,
        metadata_encrypt):
    import md5
    # 1. Create an encryption key based on the user password string, as
    # described in algorithm 3.2.
    key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
    # 2. Start an MD5 hash with the 32-byte padding string.
    hasher = md5.new(_encryption_padding)
    # 3. Add the first element of the file identifier array (the /ID entry
    # of the trailer dictionary) and finish the hash.
    hasher.update(id1_entry)
    digest = hasher.digest()
    # 4. Encrypt the 16-byte digest with the key from step 1.
    result = utils.RC4_encrypt(key, digest)
    # 5. Re-encrypt the previous output 19 times, each time with the
    # original key (obtained in step 1) XORed byte-wise with the iteration
    # counter (from 1 to 19).
    for counter in range(1, 20):
        result = utils.RC4_encrypt(_alg_xor_key(key, counter), result)
    # 6. Append 16 bytes of arbitrary padding and use the 32-byte result as
    # the /U entry.  Null bytes are used for the padding, which appears to
    # match other implementations.
    return result + ('\x00' * 16), key