From f7332494ae54e29c5928b5443a2cd7d5ce3954c3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 7 Sep 2007 15:43:39 +0000 Subject: [PATCH] Implement pure python solution for reading PDF metadata --- src/libprs500/ebooks/metadata/pdf.py | 82 +- src/libprs500/ebooks/pyPdf/__init__.py | 2 + src/libprs500/ebooks/pyPdf/filters.py | 239 +++++ src/libprs500/ebooks/pyPdf/generic.py | 542 +++++++++++ src/libprs500/ebooks/pyPdf/pdf.py | 1162 ++++++++++++++++++++++++ src/libprs500/ebooks/pyPdf/utils.py | 94 ++ 6 files changed, 2059 insertions(+), 62 deletions(-) create mode 100644 src/libprs500/ebooks/pyPdf/__init__.py create mode 100644 src/libprs500/ebooks/pyPdf/filters.py create mode 100644 src/libprs500/ebooks/pyPdf/generic.py create mode 100644 src/libprs500/ebooks/pyPdf/pdf.py create mode 100644 src/libprs500/ebooks/pyPdf/utils.py diff --git a/src/libprs500/ebooks/metadata/pdf.py b/src/libprs500/ebooks/metadata/pdf.py index c0596e930e..1b7880ce39 100644 --- a/src/libprs500/ebooks/metadata/pdf.py +++ b/src/libprs500/ebooks/metadata/pdf.py @@ -14,83 +14,41 @@ ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
def get_metadata(stream):
    """Return metadata as a L{MetaInfo} object.

    The title defaults to the stream's file name (without directory or
    extension) when the stream has a ``name`` attribute, and to 'Unknown'
    otherwise.  Title, authors and category are then overridden by the
    corresponding entries of the PDF document information dictionary, when
    the file has one.

    @param stream: A seekable file-like object containing the PDF data.
    @return: A MetaInformation instance.
    """
    if hasattr(stream, 'name'):
        title = os.path.splitext(os.path.basename(stream.name))[0]
    else:
        title = 'Unknown'
    mi = MetaInformation(title, 'Unknown')

    info = PdfFileReader(stream).getDocumentInfo()
    if info is None:
        # getDocumentInfo() returns None for files without an /Info entry
        return mi

    if info.title:
        # use the title stored in the PDF, not the filename-based default
        mi.title = info.title
    if info.author:
        # both '&' and ',' separate individual author names
        authors = []
        for name in info.author.replace('&', ',').split(','):
            name = name.strip()
            if name:
                authors.append(name)
        if authors:
            mi.authors = authors
        mi.author = info.author
    if info.subject:
        mi.category = info.subject
    return mi


def main(args=sys.argv):
    """Print the metadata of the PDF file named by ``args[1]``.

    @return: 0 on success, 1 when no filename was supplied.
    """
    if len(args) != 2:
        sys.stderr.write('No filename specified.\n')
        return 1

    path = os.path.abspath(os.path.expanduser(args[1]))
    stream = open(path, 'rb')
    try:
        print(get_metadata(stream))
    finally:
        # do not leak the file handle, even when parsing fails
        stream.close()
    return 0
try:
    import zlib

    def decompress(data):
        """Inflate zlib-compressed ``data``."""
        return zlib.decompress(data)

    def compress(data):
        """Deflate ``data`` with zlib."""
        return zlib.compress(data)

except ImportError:
    # Unable to import zlib.  Attempt to use the System.IO.Compression
    # library from the .NET framework. (IronPython only)
    import System
    from System import IO, Collections, Array

    def _string_to_bytearr(buf):
        """Copy a Python string into a .NET byte array."""
        retval = Array.CreateInstance(System.Byte, len(buf))
        for i in range(len(buf)):
            retval[i] = ord(buf[i])
        return retval

    def _bytearr_to_string(bytes):
        """Copy a .NET byte array into a Python string."""
        return "".join([chr(bytes[i]) for i in range(bytes.Length)])

    def _read_bytes(stream):
        """Read a .NET stream to its end and return the bytes read."""
        ms = IO.MemoryStream()
        buf = Array.CreateInstance(System.Byte, 2048)
        while True:
            bytes = stream.Read(buf, 0, buf.Length)
            if bytes == 0:
                break
            else:
                ms.Write(buf, 0, bytes)
        retval = ms.ToArray()
        ms.Close()
        return retval

    def decompress(data):
        """Inflate ``data`` with System.IO.Compression.DeflateStream."""
        bytes = _string_to_bytearr(data)
        ms = IO.MemoryStream()
        ms.Write(bytes, 0, bytes.Length)
        ms.Position = 0  # fseek 0
        gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Decompress)
        bytes = _read_bytes(gz)
        retval = _bytearr_to_string(bytes)
        gz.Close()
        return retval

    def compress(data):
        """Deflate ``data`` with System.IO.Compression.DeflateStream."""
        bytes = _string_to_bytearr(data)
        ms = IO.MemoryStream()
        gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Compress, True)
        gz.Write(bytes, 0, bytes.Length)
        gz.Close()
        ms.Position = 0  # fseek 0
        bytes = ms.ToArray()
        retval = _bytearr_to_string(bytes)
        ms.Close()
        return retval


class FlateDecode(object):
    """The /FlateDecode stream filter (zlib), with PNG row predictors."""

    def decode(data, decodeParms):
        """Inflate ``data`` and undo the predictor named in ``decodeParms``.

        Only "no predictor" (1) and the PNG predictors (>= 10) with row
        filter types 0 (None), 1 (Sub) and 2 (Up) are supported; anything
        else fails an assertion.

        @param data: The compressed stream data.
        @param decodeParms: The /DecodeParms dictionary, or None.
        @return: The decoded data.
        """
        data = decompress(data)
        predictor = 1
        if decodeParms:
            predictor = decodeParms.get("/Predictor", 1)
        # predictor 1 == no predictor
        if predictor != 1:
            columns = decodeParms["/Columns"]
            if predictor >= 10:
                rows = []
                # PNG prediction can vary from row to row; every row starts
                # with one byte naming the filter applied to that row
                rowlength = columns + 1
                assert len(data) % rowlength == 0
                prev_rowdata = "\x00" * rowlength
                # floor division: the row count must stay an integer
                for row in range(len(data) // rowlength):
                    rowdata = list(data[(row * rowlength):((row + 1) * rowlength)])
                    filterByte = ord(rowdata[0])
                    if filterByte == 0:
                        pass
                    elif filterByte == 1:
                        # Sub: add the value to the left (index 1 has none)
                        for i in range(2, rowlength):
                            rowdata[i] = chr((ord(rowdata[i]) + ord(rowdata[i - 1])) % 256)
                    elif filterByte == 2:
                        # Up: add the value from the previous decoded row
                        for i in range(1, rowlength):
                            rowdata[i] = chr((ord(rowdata[i]) + ord(prev_rowdata[i])) % 256)
                    else:
                        # unsupported PNG filter
                        assert False
                    prev_rowdata = rowdata
                    rows.append(''.join(rowdata[1:]))
                data = ''.join(rows)
            else:
                # unsupported predictor
                assert False
        return data
    decode = staticmethod(decode)

    def encode(data):
        """Deflate ``data``; no predictor is applied."""
        return compress(data)
    encode = staticmethod(encode)


class ASCIIHexDecode(object):
    """The /ASCIIHexDecode stream filter."""

    def decode(data, decodeParms=None):
        """Decode hexadecimal digit pairs up to the ``>`` end-of-data marker.

        Whitespace is ignored.  A missing marker is treated as end of data,
        and an odd final digit is decoded as if it were followed by ``0``.

        @param data: The hex-encoded stream data.
        @param decodeParms: Unused; accepted for interface uniformity.
        @return: The decoded string.
        """
        chunks = []
        pair = ""
        for c in data:
            if c == ">":
                break
            if c.isspace():
                continue
            pair += c
            if len(pair) == 2:
                chunks.append(chr(int(pair, 16)))
                pair = ""
        if pair:
            # odd number of digits: behave as if a 0 followed the last one
            chunks.append(chr(int(pair + "0", 16)))
        return "".join(chunks)
    decode = staticmethod(decode)


def _ascii85_group_to_string(group):
    """Convert five base-85 digits into the four characters they encode."""
    b = group[0] * (85**4) + \
        group[1] * (85**3) + \
        group[2] * (85**2) + \
        group[3] * 85 + \
        group[4]
    # 2**32 - 1 (the group "s8W-!") is the largest value four bytes can hold
    assert b <= (2**32 - 1)
    return chr(b >> 24) + chr((b >> 16) % 256) + chr((b >> 8) % 256) + chr(b % 256)


class ASCII85Decode(object):
    """The /ASCII85Decode stream filter."""

    def decode(data, decodeParms=None):
        """Decode ASCII base-85 data.

        An optional leading ``<~`` is skipped, ``z`` expands to four zero
        bytes, and decoding stops at the ``~>`` end-of-data marker (or at the
        end of the input when the marker is missing).  A final partial group
        of n characters yields n - 1 bytes.

        @param data: The ASCII85-encoded stream data.
        @param decodeParms: Unused; accepted for interface uniformity.
        @return: The decoded string.
        """
        # remove all whitespace from data
        data = [y for y in data if not (y in ' \n\r\t')]
        x = 0
        if data[:2] == ["<", "~"]:
            x = 2
        chunks = []
        group = []
        while x < len(data):
            c = data[x]
            if c == 'z':
                # shorthand for a group of four zero bytes; only legal
                # between groups
                assert len(group) == 0
                chunks.append('\x00\x00\x00\x00')
            elif c == "~" and data[x + 1:x + 2] == [">"]:
                break
            else:
                c = ord(c) - 33
                assert c >= 0 and c < 85
                group.append(c)
                if len(group) == 5:
                    chunks.append(_ascii85_group_to_string(group))
                    group = []
            x += 1
        if group:
            # cannot have a final group of just 1 char
            assert len(group) > 1
            cnt = len(group) - 1
            # pad with the highest digit ('u') and keep only the bytes that
            # were actually encoded
            padded = group + [84] * (5 - len(group))
            chunks.append(_ascii85_group_to_string(padded)[:cnt])
        return "".join(chunks)
    decode = staticmethod(decode)


def decodeStreamData(stream):
    """Apply the filters named by ``stream``'s /Filter entry to its raw data.

    @param stream: A stream object offering ``get`` and a ``_data``
        attribute.  /Filter may hold a single filter name or a sequence of
        names, which are applied in order.
    @return: The decoded data.
    """
    filters = stream.get("/Filter", ())
    if isinstance(filters, str):
        # we have a single filter instance (names are str subclasses)
        filters = (filters,)
    data = stream._data
    for filterType in filters:
        if filterType == "/FlateDecode":
            data = FlateDecode.decode(data, stream.get("/DecodeParms"))
        elif filterType == "/ASCIIHexDecode":
            data = ASCIIHexDecode.decode(data)
        elif filterType == "/ASCII85Decode":
            data = ASCII85Decode.decode(data)
        else:
            # unsupported filter
            assert False
    return data


if __name__ == "__main__":
    assert "abc" == ASCIIHexDecode.decode('61\n626\n3>')
    assert "Man " == ASCII85Decode.decode("<~9jqo^~>")
    assert "Man" == ASCII85Decode.decode("<~9jqo~>")
    assert "\x00\x00\x00\x00" == ASCII85Decode.decode("<~z~>")
# Escape sequences recognised after a backslash inside a literal string.
_STRING_ESCAPES = {
    "n": "\n",
    "r": "\r",
    "t": "\t",
    "b": "\b",
    "f": "\f",
    "(": "(",
    ")": ")",
    "\\": "\\",
}

_OCTAL_DIGITS = "01234567"

# Characters written verbatim inside a literal string; everything else is
# emitted as a three-digit octal escape.
_LITERAL_SAFE_CHARS = \
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "


def readObject(stream, pdf):
    """Read the PDF object that starts at the current stream position.

    The first character (two for ``<<``) selects the reader: boolean,
    literal or hexadecimal string, name, array, null, dictionary/stream,
    comment (skipped, then the following object is read), indirect reference
    or number.

    @param stream: A seekable file-like object.
    @param pdf: The document handed to the indirect references created.
    @return: The object read.
    @raise ValueError: If the stream ends inside a comment.
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # reset to start
    if tok == 't' or tok == 'f':
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif tok == '(':
        # string object
        return StringObject.readFromStream(stream)
    elif tok == '/':
        # name object
        return NameObject.readFromStream(stream)
    elif tok == '[':
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif tok == 'n':
        # null object
        return NullObject.readFromStream(stream)
    elif tok == '<':
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # reset to start
        if peek == '<<':
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return StringObject.readHexStringFromStream(stream)
    elif tok == '%':
        # comment: skip to the end of the line
        while tok not in ('\r', '\n'):
            tok = stream.read(1)
            if tok == '':
                # read() returns '' at end of stream; without this check
                # the loop would never finish
                raise ValueError("Unexpected end of stream inside a comment.")
        tok = readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        if tok == '+' or tok == '-':
            # number
            return NumberObject.readFromStream(stream)
        peek = stream.read(20)
        stream.seek(-len(peek), 1)  # reset to start
        if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)


class PdfObject(object):
    """Base class of all PDF object types."""

    def getObject(self):
        """Resolves indirect references."""
        return self


class NullObject(PdfObject):
    """The PDF ``null`` object."""

    def writeToStream(self, stream, encryption_key):
        stream.write("null")

    def readFromStream(stream):
        # read outside the assert so the keyword is still consumed when
        # assertions are disabled (python -O)
        word = stream.read(4)
        assert word == "null"
        return NullObject()
    readFromStream = staticmethod(readFromStream)


class BooleanObject(PdfObject):
    """A PDF ``true``/``false`` object; the flag is kept in ``value``."""

    def __init__(self, value):
        self.value = value

    def writeToStream(self, stream, encryption_key):
        if self.value:
            stream.write("true")
        else:
            stream.write("false")

    def readFromStream(stream):
        word = stream.read(4)
        if word == "true":
            return BooleanObject(True)
        elif word == "fals":
            # consume the trailing 'e'
            stream.read(1)
            return BooleanObject(False)
        assert False
    readFromStream = staticmethod(readFromStream)


class ArrayObject(list, PdfObject):
    """A PDF array; behaves like a list of PDF objects."""

    def writeToStream(self, stream, encryption_key):
        stream.write("[")
        for data in self:
            stream.write(" ")
            data.writeToStream(stream, encryption_key)
        stream.write(" ]")

    def readFromStream(stream, pdf):
        """Read ``[ obj obj ... ]`` starting at the opening bracket."""
        arr = ArrayObject()
        # read outside the assert so it survives python -O
        opener = stream.read(1)
        assert opener == "["
        while True:
            # skip leading whitespace
            tok = stream.read(1)
            while tok.isspace():
                tok = stream.read(1)
            stream.seek(-1, 1)
            # check for array ending
            peekahead = stream.read(1)
            if peekahead == "]":
                break
            stream.seek(-1, 1)
            # read and append obj
            arr.append(readObject(stream, pdf))
        return arr
    readFromStream = staticmethod(readFromStream)


class IndirectObject(PdfObject):
    """A reference (``idnum generation R``) to an object owned by ``pdf``."""

    def __init__(self, idnum, generation, pdf):
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def getObject(self):
        """Return the referenced object, following chained references."""
        return self.pdf.getObject(self).getObject()

    def __repr__(self):
        return "IndirectObject(%r, %r)" % (self.idnum, self.generation)

    def __eq__(self, other):
        return (
            isinstance(other, IndirectObject) and
            self.idnum == other.idnum and
            self.generation == other.generation and
            self.pdf is other.pdf
            )

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # kept consistent with __eq__, which compares the document by identity
        return hash((self.idnum, self.generation, id(self.pdf)))

    def writeToStream(self, stream, encryption_key):
        stream.write("%s %s R" % (self.idnum, self.generation))

    def readFromStream(stream, pdf):
        """Read ``idnum generation R`` from the stream."""
        idnum = ""
        while True:
            tok = stream.read(1)
            # '' means end of stream; stop instead of looping forever
            if not tok or tok.isspace():
                break
            idnum += tok
        generation = ""
        while True:
            tok = stream.read(1)
            if not tok or tok.isspace():
                break
            generation += tok
        r = stream.read(1)
        assert r == "R"
        return IndirectObject(int(idnum), int(generation), pdf)
    readFromStream = staticmethod(readFromStream)


class FloatObject(float, PdfObject):
    """A PDF real number."""

    def writeToStream(self, stream, encryption_key):
        stream.write(repr(self))


class NumberObject(int, PdfObject):
    """A PDF integer.

    The value is set by ``int.__new__``; no ``__init__`` is defined because
    forwarding the argument to ``int.__init__`` is rejected by newer
    interpreters.
    """

    def writeToStream(self, stream, encryption_key):
        stream.write(repr(self))

    def readFromStream(stream):
        """Read a number; returns a FloatObject when it contains a '.'."""
        name = ""
        while True:
            tok = stream.read(1)
            if not tok:
                # end of stream: there is no character to push back
                break
            if tok != '+' and tok != '-' and tok != '.' and not tok.isdigit():
                stream.seek(-1, 1)
                break
            name += tok
        if name.find(".") != -1:
            return FloatObject(name)
        else:
            return NumberObject(name)
    readFromStream = staticmethod(readFromStream)


class StringObject(str, PdfObject):
    """A PDF string object (literal or hexadecimal form when read)."""

    def writeToStream(self, stream, encryption_key):
        """Write the string in literal form, RC4-encrypted when a key is given.

        Only ASCII letters, digits and the space character are written as
        is; every other character becomes an octal escape.  This also covers
        CR/LF, which a reader would otherwise normalise to a single LF.
        """
        string = self
        if encryption_key:
            string = RC4_encrypt(encryption_key, string)
        stream.write("(")
        for c in string:
            if c in _LITERAL_SAFE_CHARS:
                stream.write(c)
            else:
                stream.write("\\%03o" % ord(c))
        stream.write(")")

    def readHexStringFromStream(stream):
        """Read a ``<hex digits>`` string; an odd last digit is padded with 0."""
        stream.read(1)
        txt = ""
        x = ""
        while True:
            tok = readNonWhitespace(stream)
            if tok == ">":
                break
            x += tok
            if len(x) == 2:
                txt += chr(int(x, base=16))
                x = ""
        if len(x) == 1:
            x += "0"
        if len(x) == 2:
            txt += chr(int(x, base=16))
        return StringObject(txt)
    readHexStringFromStream = staticmethod(readHexStringFromStream)

    def readFromStream(stream):
        """Read a ``(literal)`` string, resolving backslash escapes.

        Balanced nested parentheses are kept as part of the text.  Octal
        escapes may have one to three digits, and a backslash followed by an
        end-of-line is a line continuation that produces no character.

        @raise ValueError: If the stream ends before the closing parenthesis.
        """
        stream.read(1)  # opening parenthesis
        parens = 1
        parts = []
        while True:
            tok = stream.read(1)
            if not tok:
                raise ValueError("Unexpected end of stream inside a string.")
            if tok == "(":
                parens += 1
            elif tok == ")":
                parens -= 1
                if parens == 0:
                    break
            elif tok == "\\":
                tok = stream.read(1)
                if not tok:
                    raise ValueError("Unexpected end of stream inside a string.")
                if tok in _STRING_ESCAPES:
                    tok = _STRING_ESCAPES[tok]
                elif tok in _OCTAL_DIGITS:
                    # one to three octal digits
                    while len(tok) < 3:
                        digit = stream.read(1)
                        if digit and digit in _OCTAL_DIGITS:
                            tok += digit
                        else:
                            if digit:
                                # not part of the escape: push it back
                                stream.seek(-1, 1)
                            break
                    # high-order overflow is ignored
                    tok = chr(int(tok, base=8) % 256)
                elif tok == "\n":
                    # line continuation
                    tok = ""
                elif tok == "\r":
                    # line continuation; swallow the LF of a CR LF pair
                    follower = stream.read(1)
                    if follower and follower != "\n":
                        stream.seek(-1, 1)
                    tok = ""
                # any other character: the backslash is simply dropped
            parts.append(tok)
        return StringObject("".join(parts))
    readFromStream = staticmethod(readFromStream)


class NameObject(str, PdfObject):
    """A PDF name such as ``/Type``; the leading slash is part of the value."""

    delimiterCharacters = "(", ")", "<", ">", "[", "]", "{", "}", "/", "%"

    def writeToStream(self, stream, encryption_key):
        stream.write(self)

    def readFromStream(stream):
        """Read a name up to the next whitespace or delimiter character."""
        name = stream.read(1)
        assert name == "/"
        while True:
            tok = stream.read(1)
            if not tok:
                # end of stream ends the name; nothing to push back
                break
            if tok.isspace() or tok in NameObject.delimiterCharacters:
                stream.seek(-1, 1)
                break
            name += tok
        return NameObject(name)
    readFromStream = staticmethod(readFromStream)


class DictionaryObject(dict, PdfObject):
    """A PDF dictionary mapping name objects to PDF objects."""

    def __init__(self):
        pass

    def writeToStream(self, stream, encryption_key):
        stream.write("<<\n")
        for key, value in self.items():
            key.writeToStream(stream, encryption_key)
            stream.write(" ")
            value.writeToStream(stream, encryption_key)
            stream.write("\n")
        stream.write(">>")

    def readFromStream(stream, pdf):
        """Read ``<< ... >>`` and, when one follows, the attached stream data.

        @return: A DictionaryObject, or a stream object built by
            StreamObject.initializeFromDictionary when the dictionary is
            followed by ``stream ... endstream``.
        @raise ValueError: If the ``endstream`` marker cannot be found.
        """
        # read outside the assert so it survives python -O
        opener = stream.read(2)
        assert opener == "<<"
        data = {}
        while True:
            tok = readNonWhitespace(stream)
            if tok == ">":
                stream.read(1)
                break
            stream.seek(-1, 1)
            key = readObject(stream, pdf)
            tok = readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, pdf)
            # multiple definitions of key not permitted
            assert key not in data
            data[key] = value
        pos = stream.tell()
        s = readNonWhitespace(stream)
        if s == 's' and stream.read(5) == 'tream':
            eol = stream.read(1)
            # odd PDF file output has spaces after 'stream' keyword but before EOL.
            # patch provided by Danial Sandler
            while eol == ' ':
                eol = stream.read(1)
            assert eol in ("\n", "\r")
            if eol == "\r":
                # read \n after
                stream.read(1)
            # this is a stream object, not a dictionary
            assert "/Length" in data
            length = data["/Length"]
            if isinstance(length, IndirectObject):
                # resolving the reference moves the stream; restore afterwards
                t = stream.tell()
                length = pdf.getObject(length)
                stream.seek(t, 0)
            data["__streamdata__"] = stream.read(length)
            e = readNonWhitespace(stream)
            ndstream = stream.read(8)
            if (e + ndstream) != "endstream":
                # (sigh) - the odd PDF file has a length that is too long, so
                # we need to read backwards to find the "endstream" ending.
                # ReportLab (unknown version) generates files with this bug,
                # and Python users into PDF files tend to be our audience.
                # we need to do this to correct the streamdata and chop off
                # an extra character.
                pos = stream.tell()
                stream.seek(-10, 1)
                end = stream.read(9)
                if end == "endstream":
                    # we found it by looking back one character further.
                    data["__streamdata__"] = data["__streamdata__"][:-1]
                else:
                    stream.seek(pos, 0)
                    raise ValueError("Unable to find 'endstream' marker after stream.")
        else:
            stream.seek(pos, 0)
        if "__streamdata__" in data:
            return StreamObject.initializeFromDictionary(data)
        else:
            retval = DictionaryObject()
            retval.update(data)
            return retval
    readFromStream = staticmethod(readFromStream)


class StreamObject(DictionaryObject):
    """A PDF stream: a dictionary plus the raw data held in ``_data``."""

    def __init__(self):
        self._data = None
        self.decodedSelf = None

    def writeToStream(self, stream, encryption_key):
        # /Length is only present while the dictionary is being written
        self[NameObject("/Length")] = NumberObject(len(self._data))
        DictionaryObject.writeToStream(self, stream, encryption_key)
        del self["/Length"]
        stream.write("\nstream\n")
        data = self._data
        if encryption_key:
            data = RC4_encrypt(encryption_key, data)
        stream.write(data)
        stream.write("\nendstream")

    def initializeFromDictionary(data):
        """Build a stream object from a parsed dictionary.

        ``data`` must contain "__streamdata__" and "/Length"; both are
        removed from it.  The result is an EncodedStreamObject when a
        /Filter entry is present and a DecodedStreamObject otherwise.
        """
        if "/Filter" in data:
            retval = EncodedStreamObject()
        else:
            retval = DecodedStreamObject()
        retval._data = data["__streamdata__"]
        del data["__streamdata__"]
        del data["/Length"]
        retval.update(data)
        return retval
    initializeFromDictionary = staticmethod(initializeFromDictionary)

    def flateEncode(self):
        """Return a new EncodedStreamObject holding this data Flate-compressed.

        /FlateDecode is placed in front of any filters this stream already
        names; this stream's own /Filter entry is left untouched.
        """
        if "/Filter" in self:
            f = self["/Filter"]
            newf = ArrayObject()
            newf.append(NameObject("/FlateDecode"))
            if isinstance(f, ArrayObject):
                # copy instead of inserting into the array this stream owns
                newf.extend(f)
            else:
                newf.append(f)
            f = newf
        else:
            f = NameObject("/FlateDecode")
        retval = EncodedStreamObject()
        retval[NameObject("/Filter")] = f
        retval._data = filters.FlateDecode.encode(self._data)
        return retval


class DecodedStreamObject(StreamObject):
    """A stream whose ``_data`` is not filter-encoded."""

    def getData(self):
        return self._data

    def setData(self, data):
        self._data = data


class EncodedStreamObject(StreamObject):
    """A stream whose ``_data`` is still encoded by its /Filter entries."""

    def __init__(self):
        self.decodedSelf = None

    def getData(self):
        """Return the decoded data, decoding once and caching the result."""
        # compare with None: a decoded stream without extra dictionary
        # entries is an empty dict and therefore false
        if self.decodedSelf is not None:
            # cached version of decoded object
            return self.decodedSelf.getData()
        else:
            # create decoded object; it must provide getData() for the
            # cached branch above
            decoded = DecodedStreamObject()
            decoded._data = filters.decodeStreamData(self)
            for key, value in self.items():
                if not key in ("/Length", "/Filter", "/DecodeParms"):
                    decoded[key] = value
            self.decodedSelf = decoded
            return decoded._data

    def setData(self, data):
        raise NotImplementedError("Creating EncodedStreamObject is not currently supported")


class RectangleObject(ArrayObject):
    """A four-number array: lower-left x, lower-left y, upper-right x, upper-right y."""

    def __init__(self, arr):
        # must have four points
        assert len(arr) == 4
        # automatically convert arr[x] into NumberObject(arr[x]) if necessary
        ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr])

    def ensureIsNumber(self, value):
        if not isinstance(value, NumberObject):
            value = NumberObject(value)
        return value

    def __repr__(self):
        return "RectangleObject(%s)" % repr(list(self))

    def getLowerLeft_x(self):
        return self[0]

    def getLowerLeft_y(self):
        return self[1]

    def getUpperRight_x(self):
        return self[2]

    def getUpperRight_y(self):
        return self[3]

    def getUpperLeft_x(self):
        return self.getLowerLeft_x()

    def getUpperLeft_y(self):
        return self.getUpperRight_y()

    def getLowerRight_x(self):
        return self.getUpperRight_x()

    def getLowerRight_y(self):
        return self.getLowerLeft_y()

    def getLowerLeft(self):
        return self.getLowerLeft_x(), self.getLowerLeft_y()

    def getLowerRight(self):
        return self.getLowerRight_x(), self.getLowerRight_y()

    def getUpperLeft(self):
        return self.getUpperLeft_x(), self.getUpperLeft_y()

    def getUpperRight(self):
        return self.getUpperRight_x(), self.getUpperRight_y()

    def setLowerLeft(self, value):
        self[0], self[1] = [self.ensureIsNumber(x) for x in value]

    def setLowerRight(self, value):
        self[2], self[1] = [self.ensureIsNumber(x) for x in value]

    def setUpperLeft(self, value):
        self[0], self[3] = [self.ensureIsNumber(x) for x in value]

    def setUpperRight(self, value):
        self[2], self[3] = [self.ensureIsNumber(x) for x in value]

    lowerLeft = property(getLowerLeft, setLowerLeft, None, None)
    lowerRight = property(getLowerRight, setLowerRight, None, None)
    upperLeft = property(getUpperLeft, setUpperLeft, None, None)
    upperRight = property(getUpperRight, setUpperRight, None, None)
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +""" +A pure-Python PDF library with very minimal capabilities. It was designed to +be able to split and merge PDF files by page, and that's about all it can do. +It may be a solid base for future PDF file work in Python. +""" +__author__ = "Mathieu Fenniak" +__author_email__ = "mfenniak@pobox.com" + +import struct +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + +import filters +import utils +from generic import * +from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList +from sets import ImmutableSet + +## +# This class supports writing PDF files out, given pages produced by another +# class (typically {@link #PdfFileReader PdfFileReader}). +class PdfFileWriter(object): + def __init__(self): + self._header = "%PDF-1.3" + self._objects = [] # array of indirect objects + + # The root of our page tree node. 
+ pages = DictionaryObject() + pages.update({ + NameObject("/Type"): NameObject("/Pages"), + NameObject("/Count"): NumberObject(0), + NameObject("/Kids"): ArrayObject(), + }) + self._pages = self._addObject(pages) + + # info object + info = DictionaryObject() + info.update({ + NameObject("/Producer"): StringObject("Python PDF Library - http://pybrary.net/pyPdf/") + }) + self._info = self._addObject(info) + + # root object + root = DictionaryObject() + root.update({ + NameObject("/Type"): NameObject("/Catalog"), + NameObject("/Pages"): self._pages, + }) + self._root = self._addObject(root) + + def _addObject(self, obj): + self._objects.append(obj) + return IndirectObject(len(self._objects), 0, self) + + def getObject(self, ido): + assert ido.pdf == self + return self._objects[ido.idnum - 1] + + ## + # Adds a page to this PDF file. The page is usually acquired from a + # {@link #PdfFileReader PdfFileReader} instance. + #

+ # Stability: Added in v1.0, will exist for all v1.x releases. + # + # @param page The page to add to the document. This argument should be + # an instance of {@link #PageObject PageObject}. + def addPage(self, page): + assert page["/Type"] == "/Page" + page[NameObject("/Parent")] = self._pages + page = self._addObject(page) + pages = self.getObject(self._pages) + pages["/Kids"].append(page) + pages["/Count"] = NumberObject(pages["/Count"] + 1) + + ## + # Encrypt this PDF file with the PDF Standard encryption handler. + # @param user_pwd The "user password", which allows for opening and reading + # the PDF file with the restrictions provided. + # @param owner_pwd The "owner password", which allows for opening the PDF + # files without any restrictions. By default, the owner password is the + # same as the user password. + # @param use_128bit Boolean argument as to whether to use 128bit + # encryption. When false, 40bit encryption will be used. By default, this + # flag is on. + def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True): + import md5, time, random + if owner_pwd == None: + owner_pwd = user_pwd + if use_128bit: + V = 2 + rev = 3 + keylen = 128 / 8 + else: + V = 1 + rev = 2 + keylen = 40 / 8 + # permit everything: + P = -1 + O = StringObject(_alg33(owner_pwd, user_pwd, rev, keylen)) + ID_1 = md5.new(repr(time.time())).digest() + ID_2 = md5.new(repr(random.random())).digest() + self._ID = ArrayObject((StringObject(ID_1), StringObject(ID_2))) + if rev == 2: + U, key = _alg34(user_pwd, O, P, ID_1) + else: + assert rev == 3 + U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False) + encrypt = DictionaryObject() + encrypt[NameObject("/Filter")] = NameObject("/Standard") + encrypt[NameObject("/V")] = NumberObject(V) + if V == 2: + encrypt[NameObject("/Length")] = NumberObject(keylen * 8) + encrypt[NameObject("/R")] = NumberObject(rev) + encrypt[NameObject("/O")] = StringObject(O) + encrypt[NameObject("/U")] = StringObject(U) + 
encrypt[NameObject("/P")] = NumberObject(P) + self._encrypt = self._addObject(encrypt) + self._encrypt_key = key + + ## + # Writes the collection of pages added to this object out as a PDF file. + #

+ # Stability: Added in v1.0, will exist for all v1.x releases. + # @param stream An object to write the file to. The object must support + # the write method, and the tell method, similar to a file object. + def write(self, stream): + import struct, md5 + + externalReferenceMap = {} + self.stack = [] + self._sweepIndirectReferences(externalReferenceMap, self._root) + del self.stack + + # Begin writing: + object_positions = [] + stream.write(self._header + "\n") + for i in range(len(self._objects)): + idnum = (i + 1) + obj = self._objects[i] + object_positions.append(stream.tell()) + stream.write(str(idnum) + " 0 obj\n") + key = None + if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum: + pack1 = struct.pack(" +# Stability: Added in v1.0, will exist for all v1.x releases. +# +# @param stream An object that supports the standard read and seek methods +# similar to a file object. +class PdfFileReader(object): + def __init__(self, stream): + self.flattenedPages = None + self.resolvedObjects = {} + self.read(stream) + self.stream = stream + self._override_encryption = False + + ## + # Retrieves the PDF file's document information dictionary, if it exists. + # Note that some PDF files use metadata streams instead of docinfo + # dictionaries, and these metadata streams will not be accessed by this + # function. + #

+ # Stability: Added in v1.6, will exist for all future v1.x releases. + # @return Returns a {@link #DocumentInformation DocumentInformation} + # instance, or None if none exists. + def getDocumentInfo(self): + if not self.trailer.has_key("/Info"): + return None + obj = self.getObject(self.trailer['/Info']) + retval = DocumentInformation() + retval.update(obj) + return retval + + ## + # Read-only property that accesses the {@link + # #PdfFileReader.getDocumentInfo getDocumentInfo} function. + #

+ # Stability: Added in v1.7, will exist for all future v1.x releases. + documentInfo = property(lambda self: self.getDocumentInfo(), None, None) + + ## + # Calculates the number of pages in this PDF file. + #

+ # Stability: Added in v1.0, will exist for all v1.x releases. + # @return Returns an integer. + def getNumPages(self): + if self.flattenedPages == None: + self._flatten() + return len(self.flattenedPages) + + ## + # Read-only property that accesses the {@link #PdfFileReader.getNumPages + # getNumPages} function. + #

+ # Stability: Added in v1.7, will exist for all future v1.x releases. + numPages = property(lambda self: self.getNumPages(), None, None) + + ## + # Retrieves a page by number from this PDF file. + #

+ # Stability: Added in v1.0, will exist for all v1.x releases. + # @return Returns a {@link #PageObject PageObject} instance. + def getPage(self, pageNumber): + ## ensure that we're not trying to access an encrypted PDF + #assert not self.trailer.has_key("/Encrypt") + if self.flattenedPages == None: + self._flatten() + return self.flattenedPages[pageNumber] + + ## + # Read-only property that emulates a list based upon the {@link + # #PdfFileReader.getNumPages getNumPages} and {@link #PdfFileReader.getPage + # getPage} functions. + #

+ # Stability: Added in v1.7, and will exist for all future v1.x releases. + pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage), + None, None) + + def _flatten(self, pages = None, inherit = None): + inheritablePageAttributes = ( + NameObject("/Resources"), NameObject("/MediaBox"), + NameObject("/CropBox"), NameObject("/Rotate") + ) + if inherit == None: + inherit = dict() + if pages == None: + self.flattenedPages = [] + catalog = self.getObject(self.trailer["/Root"]) + pages = self.getObject(catalog["/Pages"]) + if isinstance(pages, IndirectObject): + pages = self.getObject(pages) + t = pages["/Type"] + if t == "/Pages": + for attr in inheritablePageAttributes: + if pages.has_key(attr): + inherit[attr] = pages[attr] + for page in pages["/Kids"]: + self._flatten(page, inherit) + elif t == "/Page": + for attr,value in inherit.items(): + # if the page has it's own value, it does not inherit the + # parent's value: + if not pages.has_key(attr): + pages[attr] = value + pageObj = PageObject(self) + pageObj.update(pages) + self.flattenedPages.append(pageObj) + + def safeGetObject(self, obj): + if isinstance(obj, IndirectObject): + return self.safeGetObject(self.getObject(obj)) + return obj + + def getObject(self, indirectReference): + retval = self.resolvedObjects.get(indirectReference.generation, {}).get(indirectReference.idnum, None) + if retval != None: + return retval + if indirectReference.generation == 0 and \ + self.xref_objStm.has_key(indirectReference.idnum): + # indirect reference to object in object stream + # read the entire object stream into memory + stmnum,idx = self.xref_objStm[indirectReference.idnum] + objStm = self.getObject(IndirectObject(stmnum, 0, self)) + assert objStm['/Type'] == '/ObjStm' + assert idx < objStm['/N'] + streamData = StringIO(objStm.getData()) + for i in range(objStm['/N']): + objnum = NumberObject.readFromStream(streamData) + readNonWhitespace(streamData) + streamData.seek(-1, 1) + offset = 
NumberObject.readFromStream(streamData) + readNonWhitespace(streamData) + streamData.seek(-1, 1) + t = streamData.tell() + streamData.seek(objStm['/First']+offset, 0) + obj = readObject(streamData, self) + self.resolvedObjects[0][objnum] = obj + streamData.seek(t, 0) + return self.resolvedObjects[0][indirectReference.idnum] + start = self.xref[indirectReference.generation][indirectReference.idnum] + self.stream.seek(start, 0) + idnum, generation = self.readObjectHeader(self.stream) + assert idnum == indirectReference.idnum + assert generation == indirectReference.generation + retval = readObject(self.stream, self) + + # override encryption is used for the /Encrypt dictionary + if not self._override_encryption and self.isEncrypted: + # if we don't have the encryption key: + if not hasattr(self, '_decryption_key'): + raise Exception, "file has not been decrypted" + # otherwise, decrypt here... + import struct, md5 + pack1 = struct.pack(" + # It does not matter which password was matched. Both passwords provide + # the correct decryption key that will allow the document to be used with + # this library. + #

+ # Stability: Added in v1.8, will exist for all future v1.x releases. + # + # @return 0 if the password failed, 1 if the password matched the user + # password, and 2 if the password matched the owner password. + # + # @exception NotImplementedError Document uses an unsupported encryption + # method. + def decrypt(self, password): + self._override_encryption = True + try: + return self._decrypt(password) + finally: + self._override_encryption = False + + def _decrypt(self, password): + encrypt = self.safeGetObject(self.trailer['/Encrypt']) + if encrypt['/Filter'] != '/Standard': + raise NotImplementedError, "only Standard PDF encryption handler is available" + if not (encrypt['/V'] in (1, 2)): + raise NotImplementedError, "only algorithm code 1 and 2 are supported" + user_password, key = self._authenticateUserPassword(password) + if user_password: + self._decryption_key = key + return 1 + else: + rev = self.safeGetObject(encrypt['/R']) + if rev == 2: + keylen = 5 + else: + keylen = self.safeGetObject(encrypt['/Length']) / 8 + key = _alg33_1(password, rev, keylen) + real_O = self.safeGetObject(encrypt["/O"]) + if rev == 2: + userpass = utils.RC4_encrypt(key, real_O) + else: + val = real_O + for i in range(19, -1, -1): + new_key = '' + for l in range(len(key)): + new_key += chr(ord(key[l]) ^ i) + val = utils.RC4_encrypt(new_key, val) + userpass = val + owner_password, key = self._authenticateUserPassword(userpass) + if owner_password: + self._decryption_key = key + return 2 + return 0 + + def _authenticateUserPassword(self, password): + encrypt = self.safeGetObject(self.trailer['/Encrypt']) + rev = self.safeGetObject(encrypt['/R']) + owner_entry = self.safeGetObject(encrypt['/O']) + p_entry = self.safeGetObject(encrypt['/P']) + id_entry = self.safeGetObject(self.trailer['/ID']) + id1_entry = self.safeGetObject(id_entry[0]) + if rev == 2: + U, key = _alg34(password, owner_entry, p_entry, id1_entry) + elif rev >= 3: + U, key = _alg35(password, rev, + 
self.safeGetObject(encrypt["/Length"]) / 8, owner_entry, + p_entry, id1_entry, + self.safeGetObject(encrypt.get("/EncryptMetadata", False))) + real_U = self.safeGetObject(encrypt['/U']) + return U == real_U, key + + def getIsEncrypted(self): + return self.trailer.has_key("/Encrypt") + + ## + # Read-only boolean property showing whether this PDF file is encrypted. + # Note that this property, if true, will remain true even after the {@link + # #PdfFileReader.decrypt decrypt} function is called. + isEncrypted = property(lambda self: self.getIsEncrypted(), None, None) + + +def getRectangle(self, name, defaults): + retval = self.get(name) + if isinstance(retval, RectangleObject): + return retval + if retval == None: + for d in defaults: + retval = self.get(d) + if retval != None: + break + if isinstance(retval, IndirectObject): + retval = self.pdf.getObject(retval) + retval = RectangleObject(retval) + setRectangle(self, name, retval) + return retval + +def setRectangle(self, name, value): + if not isinstance(name, NameObject): + name = NameObject(name) + self[name] = value + +def deleteRectangle(self, name): + del self[name] + +def createRectangleAccessor(name, fallback): + return \ + property( + lambda self: getRectangle(self, name, fallback), + lambda self, value: setRectangle(self, name, value), + lambda self: deleteRectangle(self, name) + ) + +## +# This class represents a single page within a PDF file. Typically this object +# will be created by accessing the {@link #PdfFileReader.getPage getPage} +# function of the {@link #PdfFileReader PdfFileReader} class. +class PageObject(DictionaryObject): + def __init__(self, pdf): + DictionaryObject.__init__(self) + self.pdf = pdf + + ## + # Rotates a page clockwise by increments of 90 degrees. + #

+ # Stability: Added in v1.1, will exist for all future v1.x releases. + # @param angle Angle to rotate the page. Must be an increment of 90 deg. + def rotateClockwise(self, angle): + assert angle % 90 == 0 + self._rotate(angle) + return self + + ## + # Rotates a page counter-clockwise by increments of 90 degrees. + #

+ # Stability: Added in v1.1, will exist for all future v1.x releases. + # @param angle Angle to rotate the page. Must be an increment of 90 deg. + def rotateCounterClockwise(self, angle): + assert angle % 90 == 0 + self._rotate(-angle) + return self + + def _rotate(self, angle): + currentAngle = self.get("/Rotate", 0) + self[NameObject("/Rotate")] = NumberObject(currentAngle + angle) + + def _mergeResources(res1, res2, resource): + newRes = DictionaryObject() + newRes.update(res1.get(resource, DictionaryObject()).getObject()) + page2Res = res2.get(resource, DictionaryObject()).getObject() + renameRes = {} + for key in page2Res.keys(): + if newRes.has_key(key) and newRes[key] != page2Res[key]: + newname = NameObject(key + "renamed") + renameRes[key] = newname + newRes[newname] = page2Res[key] + elif not newRes.has_key(key): + newRes[key] = page2Res[key] + return newRes, renameRes + _mergeResources = staticmethod(_mergeResources) + + def _contentStreamRename(stream, rename, pdf): + if not rename: + return stream + stream = ContentStream(stream, pdf) + for operands,operator in stream.operations: + for i in range(len(operands)): + op = operands[i] + if isinstance(op, NameObject): + operands[i] = rename.get(op, op) + return stream + _contentStreamRename = staticmethod(_contentStreamRename) + + def _pushPopGS(contents, pdf): + # adds a graphics state "push" and "pop" to the beginning and end + # of a content stream. This isolates it from changes such as + # transformation matricies. + stream = ContentStream(contents, pdf) + stream.operations.insert(0, [[], "q"]) + stream.operations.append([[], "Q"]) + return stream + _pushPopGS = staticmethod(_pushPopGS) + + ## + # Merges the content streams of two pages into one. Resource references + # (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc + # of this page are not altered. 
The parameter page's content stream will + # be added to the end of this page's content stream, meaning that it will + # be drawn after, or "on top" of this page. + #

+ # Stability: Added in v1.4, will exist for all future 1.x releases. + # @param page2 An instance of {@link #PageObject PageObject} to be merged + # into this one. + def mergePage(self, page2): + + # First we work on merging the resource dictionaries. This allows us + # to find out what symbols in the content streams we might need to + # rename. + + newResources = DictionaryObject() + rename = {} + originalResources = self["/Resources"].getObject() + page2Resources = page2["/Resources"].getObject() + + for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading": + new, newrename = PageObject._mergeResources(originalResources, page2Resources, res) + if new: + newResources[NameObject(res)] = new + rename.update(newrename) + + # Combine /ProcSet sets. + newResources[NameObject("/ProcSet")] = ArrayObject( + ImmutableSet(originalResources.get("/ProcSet", ArrayObject()).getObject()).union( + ImmutableSet(page2Resources.get("/ProcSet", ArrayObject()).getObject()) + ) + ) + + newContentArray = ArrayObject() + + originalContent = self["/Contents"].getObject() + newContentArray.append(PageObject._pushPopGS(originalContent, self.pdf)) + + page2Content = page2['/Contents'].getObject() + page2Content = PageObject._contentStreamRename(page2Content, rename, self.pdf) + page2Content = PageObject._pushPopGS(page2Content, self.pdf) + newContentArray.append(page2Content) + + self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf) + self[NameObject('/Resources')] = newResources + + ## + # Compresses the size of this page by joining all content streams and + # applying a FlateDecode filter. + #

+ # Stability: Added in v1.6, will exist for all future v1.x releases. + # However, it is possible that this function will perform no action if + # content stream compression becomes "automatic" for some reason. + def compressContentStreams(self): + content = self["/Contents"].getObject() + if not isinstance(content, ContentStream): + content = ContentStream(content, self.pdf) + self[NameObject("/Contents")] = content.flateEncode() + + ## + # Locate all text drawing commands, in the order they are provided in the + # content stream, and extract the text. This works well for some PDF + # files, but poorly for others, depending on the generator used. This will + # be refined in the future. Do not rely on the order of text coming out of + # this function, as it will change if this function is made more + # sophisticated. + #

+ # Stability: Added in v1.7, will exist for all future v1.x releases. May + # be overhauled to provide more ordered text in the future. + # @return a string object + def extractText(self): + text = "" + content = self["/Contents"].getObject() + if not isinstance(content, ContentStream): + content = ContentStream(content, self.pdf) + for operands,operator in content.operations: + if operator == "Tj": + text += operands[0] + elif operator == "T*": + text += "\n" + elif operator == "'": + text += "\n" + text += operands[0] + elif operator == "\"": + text += "\n" + text += operands[2] + elif operator == "TJ": + for i in operands[0]: + if isinstance(i, StringObject): + text += i + return text + + ## + # A rectangle (RectangleObject), expressed in default user space units, + # defining the boundaries of the physical medium on which the page is + # intended to be displayed or printed. + #

+ # Stability: Added in v1.4, will exist for all future v1.x releases. + mediaBox = createRectangleAccessor("/MediaBox", ()) + + ## + # A rectangle (RectangleObject), expressed in default user space units, + # defining the visible region of default user space. When the page is + # displayed or printed, its contents are to be clipped (cropped) to this + # rectangle and then imposed on the output medium in some + # implementation-defined manner. Default value: same as MediaBox. + #

+ # Stability: Added in v1.4, will exist for all future v1.x releases. + cropBox = createRectangleAccessor("/CropBox", ("/CropBox",)) + + ## + # A rectangle (RectangleObject), expressed in default user space units, + # defining the region to which the contents of the page should be clipped + # when output in a production enviroment. + #

+ # Stability: Added in v1.4, will exist for all future v1.x releases. + bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox")) + + ## + # A rectangle (RectangleObject), expressed in default user space units, + # defining the intended dimensions of the finished page after trimming. + #

+ # Stability: Added in v1.4, will exist for all future v1.x releases. + trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox")) + + ## + # A rectangle (RectangleObject), expressed in default user space units, + # defining the extent of the page's meaningful content as intended by the + # page's creator. + #

+ # Stability: Added in v1.4, will exist for all future v1.x releases. + artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox")) + + +class ContentStream(DecodedStreamObject): + def __init__(self, stream, pdf): + self.pdf = pdf + self.operations = [] + # stream may be a StreamObject or an ArrayObject containing + # multiple StreamObjects to be cat'd together. + stream = stream.getObject() + if isinstance(stream, ArrayObject): + data = "" + for s in stream: + data += s.getObject().getData() + stream = StringIO(data) + else: + stream = StringIO(stream.getData()) + self.__parseContentStream(stream) + + def __parseContentStream(self, stream): + # file("f:\\tmp.txt", "w").write(stream.read()) + stream.seek(0, 0) + operands = [] + while True: + peek = readNonWhitespace(stream) + if peek == '': + break + stream.seek(-1, 1) + if peek.isalpha() or peek == "'" or peek == "\"": + operator = readUntilWhitespace(stream, maxchars=2) + if operator == "BI": + # begin inline image - a completely different parsing + # mechanism is required, of course... thanks buddy... + assert operands == [] + ii = self._readInlineImage(stream) + self.operations.append((ii, "INLINE IMAGE")) + else: + self.operations.append((operands, operator)) + operands = [] + else: + operands.append(readObject(stream, None)) + + def _readInlineImage(self, stream): + # begin reading just after the "BI" - begin image + # first read the dictionary of settings. 
+ settings = DictionaryObject() + while True: + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + if tok == "I": + # "ID" - begin of image data + break + key = readObject(stream, self.pdf) + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + value = readObject(stream, self.pdf) + settings[key] = value + # left at beginning of ID + tmp = stream.read(3) + assert tmp[:2] == "ID" + data = "" + while True: + tok = stream.read(1) + if tok == "E": + next = stream.read(1) + if next == "I": + break + else: + stream.seek(-1, 1) + data += tok + else: + data += tok + x = readNonWhitespace(stream) + stream.seek(-1, 1) + return {"settings": settings, "data": data} + + def _getData(self): + newdata = StringIO() + for operands,operator in self.operations: + if operator == "INLINE IMAGE": + newdata.write("BI") + dicttext = StringIO() + operands["settings"].writeToStream(dicttext, None) + newdata.write(dicttext.getvalue()[2:-2]) + newdata.write("ID ") + newdata.write(operands["data"]) + newdata.write("EI") + else: + for op in operands: + op.writeToStream(newdata, None) + newdata.write(" ") + newdata.write(operator) + newdata.write("\n") + return newdata.getvalue() + + def _setData(self, value): + self.__parseContentStream(StringIO(value)) + + _data = property(_getData, _setData) + + +## +# A class representing the basic document metadata provided in a PDF File. +class DocumentInformation(DictionaryObject): + def __init__(self): + DictionaryObject.__init__(self) + + ## + # Read-only property accessing the document's title. Added in v1.6, will + # exist for all future v1.x releases. + # @return A string, or None if the title is not provided. + title = property(lambda self: self.get("/Title", None), None, None) + + ## + # Read-only property accessing the document's author. Added in v1.6, will + # exist for all future v1.x releases. + # @return A string, or None if the author is not provided. 
+ author = property(lambda self: self.get("/Author", None), None, None) + + ## + # Read-only property accessing the subject of the document. Added in v1.6, + # will exist for all future v1.x releases. + # @return A string, or None if the subject is not provided. + subject = property(lambda self: self.get("/Subject", None), None, None) + + ## + # Read-only property accessing the document's creator. If the document was + # converted to PDF from another format, the name of the application (for + # example, OpenOffice) that created the original document from which it was + # converted. Added in v1.6, will exist for all future v1.x releases. + # @return A string, or None if the creator is not provided. + creator = property(lambda self: self.get("/Creator", None), None, None) + + ## + # Read-only property accessing the document's producer. If the document + # was converted to PDF from another format, the name of the application + # (for example, OSX Quartz) that converted it to PDF. Added in v1.6, will + # exist for all future v1.x releases. + # @return A string, or None if the producer is not provided. 
+ producer = property(lambda self: self.get("/Producer", None), None, None) + + +def convertToInt(d, size): + if size <= 4: + d = "\x00\x00\x00\x00" + d + d = d[-4:] + return struct.unpack(">l", d)[0] + elif size <= 8: + d = "\x00\x00\x00\x00\x00\x00\x00\x00" + d + d = d[-8:] + return struct.unpack(">q", d)[0] + else: + # size too big + assert False + +# ref: pdf1.8 spec section 3.5.2 algorithm 3.2 +_encryption_padding = '\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56' + \ + '\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c' + \ + '\xa9\xfe\x64\x53\x69\x7a' + +def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True): + import md5, struct + m = md5.new() + password = (password + _encryption_padding)[:32] + m.update(password) + m.update(owner_entry) + p_entry = struct.pack('= 3 and not metadata_encrypt: + m.update("\xff\xff\xff\xff") + md5_hash = m.digest() + if rev >= 3: + for i in range(50): + md5_hash = md5.new(md5_hash[:keylen]).digest() + return md5_hash[:keylen] + +def _alg33(owner_pwd, user_pwd, rev, keylen): + key = _alg33_1(owner_pwd, rev, keylen) + user_pwd = (user_pwd + _encryption_padding)[:32] + val = utils.RC4_encrypt(key, user_pwd) + if rev >= 3: + for i in range(1, 20): + new_key = '' + for l in range(len(key)): + new_key += chr(ord(key[l]) ^ i) + val = utils.RC4_encrypt(new_key, val) + return val + +def _alg33_1(password, rev, keylen): + import md5 + m = md5.new() + password = (password + _encryption_padding)[:32] + m.update(password) + md5_hash = m.digest() + if rev >= 3: + for i in range(50): + md5_hash = md5.new(md5_hash).digest() + key = md5_hash[:keylen] + return key + +def _alg34(password, owner_entry, p_entry, id1_entry): + key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry) + U = utils.RC4_encrypt(key, _encryption_padding) + return U, key + +def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt): + import md5 + m = md5.new() + m.update(_encryption_padding) + 
m.update(id1_entry) + md5_hash = m.digest() + key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) + val = utils.RC4_encrypt(key, md5_hash) + for i in range(1, 20): + new_key = '' + for l in range(len(key)): + new_key += chr(ord(key[l]) ^ i) + val = utils.RC4_encrypt(new_key, val) + return val + ('\x00' * 16), key + +#if __name__ == "__main__": +# output = PdfFileWriter() +# +# input1 = PdfFileReader(file("test\\5000-s1-05e.pdf", "rb")) +# page1 = input1.getPage(0) +# +# input2 = PdfFileReader(file("test\\PDFReference16.pdf", "rb")) +# page2 = input2.getPage(0) +# page3 = input2.getPage(1) +# page1.mergePage(page2) +# page1.mergePage(page3) +# +# input3 = PdfFileReader(file("test\\cc-cc.pdf", "rb")) +# page1.mergePage(input3.getPage(0)) +# +# page1.compressContentStreams() +# +# output.addPage(page1) +# output.write(file("test\\merge-test.pdf", "wb")) + + diff --git a/src/libprs500/ebooks/pyPdf/utils.py b/src/libprs500/ebooks/pyPdf/utils.py new file mode 100644 index 0000000000..d6769c248f --- /dev/null +++ b/src/libprs500/ebooks/pyPdf/utils.py @@ -0,0 +1,94 @@ +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. 
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.


"""
Utility functions for PDF library.
"""
__author__ = "Mathieu Fenniak"
__author_email__ = "mfenniak@pobox.com"


def readUntilWhitespace(stream, maxchars=None):
    """Read characters from stream up to the next whitespace or end of data.

    The terminating whitespace character is consumed but not returned.
    When maxchars is given, reading also stops as soon as that many
    characters have been collected.
    """
    chars = []
    while True:
        tok = stream.read(1)
        if not tok or tok.isspace():
            break
        chars.append(tok)
        if len(chars) == maxchars:
            break
    return "".join(chars)


def readNonWhitespace(stream):
    """Skip newlines, carriage returns, spaces and tabs.

    Returns the first other character read, or the empty value that
    stream.read(1) yields once the data is exhausted.
    """
    tok = ' '
    while tok in ('\n', '\r', ' ', '\t'):
        tok = stream.read(1)
    return tok


class ConvertFunctionsToVirtualList(object):
    """Read-only sequence facade over a length function and a getter."""

    def __init__(self, lengthFunction, getFunction):
        self.lengthFunction = lengthFunction
        self.getFunction = getFunction

    def __len__(self):
        return self.lengthFunction()

    def __getitem__(self, index):
        """Return getFunction(index); negative indexes count from the end.

        Raises TypeError for non-integer indexes and IndexError when the
        index falls outside the current length.
        """
        if not isinstance(index, int):
            raise TypeError("sequence indices must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index = len_self + index
        if index < 0 or index >= len_self:
            raise IndexError("sequence index out of range")
        return self.getFunction(index)


def RC4_encrypt(key, plaintext):
    """Apply RC4 with key to plaintext and return the result as a string.

    RC4 is symmetric, so calling this again with the same key decrypts.
    key must be non-empty: the key schedule indexes it modulo its length.
    """
    # Key-scheduling: permute the 256-entry state table using the key.
    S = list(range(256))
    j = 0
    for i in range(256):
        j = (j + S[i] + ord(key[i % len(key)])) % 256
        S[i], S[j] = S[j], S[i]
    # Keystream generation, XORed character by character onto the input.
    i, j = 0, 0
    out = []
    for ch in plaintext:
        i = (i + 1) % 256
        j = (j + S[i]) % 256
        S[i], S[j] = S[j], S[i]
        t = S[(S[i] + S[j]) % 256]
        out.append(chr(ord(ch) ^ t))
    return "".join(out)


if __name__ == "__main__":
    # test RC4
    out = RC4_encrypt("Key", "Plaintext")
    print(repr(out))
    pt = RC4_encrypt("Key", out)
    print(repr(pt))