From f7332494ae54e29c5928b5443a2cd7d5ce3954c3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 7 Sep 2007 15:43:39 +0000
Subject: [PATCH] Implement pure python solution for reading PDF metadata

---
 src/libprs500/ebooks/metadata/pdf.py   |   82 +-
 src/libprs500/ebooks/pyPdf/__init__.py |    2 +
 src/libprs500/ebooks/pyPdf/filters.py  |  239 +++++
 src/libprs500/ebooks/pyPdf/generic.py  |  542 +++++++++++
 src/libprs500/ebooks/pyPdf/pdf.py      | 1162 ++++++++++++++++++++++++
 src/libprs500/ebooks/pyPdf/utils.py    |   94 ++
 6 files changed, 2059 insertions(+), 62 deletions(-)
 create mode 100644 src/libprs500/ebooks/pyPdf/__init__.py
 create mode 100644 src/libprs500/ebooks/pyPdf/filters.py
 create mode 100644 src/libprs500/ebooks/pyPdf/generic.py
 create mode 100644 src/libprs500/ebooks/pyPdf/pdf.py
 create mode 100644 src/libprs500/ebooks/pyPdf/utils.py

diff --git a/src/libprs500/ebooks/metadata/pdf.py b/src/libprs500/ebooks/metadata/pdf.py
index c0596e930e..1b7880ce39 100644
--- a/src/libprs500/ebooks/metadata/pdf.py
+++ b/src/libprs500/ebooks/metadata/pdf.py
@@ -14,83 +14,41 @@
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''Read meta information from PDF files'''
 
-import sys, os, copy
+import sys, os
 
-from libprs500.ebooks.metadata import MetaInformation, get_parser
-from libprs500.ptempfile import PersistentTemporaryFile
+from libprs500.ebooks.metadata import MetaInformation
+from libprs500.ebooks.pyPdf import PdfFileReader
 
 def get_metadata(stream):
     """ Return metadata as a L{MetaInfo} object """
     if hasattr(stream, 'name'):
-        title = stream.name
+        title = os.path.splitext(os.path.basename(stream.name))[0]
     else:
         title = 'Unknown'
     mi = MetaInformation(title, 'Unknown')
-
     stream.seek(0)
-    pt = PersistentTemporaryFile('.pdf')
-    pt.write(stream.read())
-    pt.close()
-    return get_metadata_from_file(pt.name, mi)
-    
-def set_metadata(path, options):
-    try:
-        import podofo
-        doc = podofo.PdfDocument()
-        doc.Load(path)
-        info = doc.GetInfo()
-        if options.title:
-            info.SetTitle(options.title)
-        if options.authors:
-            info.SetAuthor(options.authors)
-        if options.category:
-            info.SetSubject(options.category)
-        pt = PersistentTemporaryFile('.pdf')
-        pt.close() 
-        doc.Write(pt.name)
-        stream = open(path, 'wb')
-        stream.write(open(pt.name, 'rb').read())
-        stream.close()
-    except ImportError:
-        return False
-    return True
-
-def get_metadata_from_file(path, default_mi=None):
-    if default_mi is None:
-        title = os.path.splitext(os.path.basename(path))[0]
-        mi = MetaInformation(title, 'Unknown')
-    else:
-        mi = copy.copy(default_mi)
-    try:
-        import podofo
-        doc = podofo.PdfDocument()
-        doc.Load(path)
-        info = doc.GetInfo()
-        if info.GetTitle():
-            mi.title = info.GetTitle()
-        if info.GetAuthor():
-            mi.authors = info.GetAuthor().split(',')
-        if info.GetSubject():
-            mi.category = info.GetSubject()
-    except ImportError:        
-        pass
-    finally:
-        return mi
-    
-
+    info = PdfFileReader(stream).getDocumentInfo()
+    if info.title:
+        mi.title = info.title  # prefer the title stored in the PDF over the filename-derived default
+    if info.author:
+        # Authors may be separated by '&' and/or ','; strip the padding around each name.
+        authors = []
+        for group in info.author.split('&'):
+            authors += [au.strip() for au in group.split(',')]
+        mi.authors = authors
+        mi.author = info.author
+    if info.subject:
+        mi.category = info.subject
+    return mi
+        
+            
 def main(args=sys.argv):
-    parser = get_parser('pdf')
-    options, args = parser.parse_args(args)
     if len(args) != 2:
         print >>sys.stderr, 'No filename specified.'
         return 1
     
     path = os.path.abspath(os.path.expanduser(args[1]))
-    if not set_metadata(path, options):
-        print >>sys.stderr, 'You do not have the podofo python extension installed. Cannot read PDF files.'
-        return 1
-    
-    print get_metadata_from_file(path)
+    print get_metadata(open(path, 'rb'))
     return 0
 
 if __name__ == '__main__':
diff --git a/src/libprs500/ebooks/pyPdf/__init__.py b/src/libprs500/ebooks/pyPdf/__init__.py
new file mode 100644
index 0000000000..af02553da6
--- /dev/null
+++ b/src/libprs500/ebooks/pyPdf/__init__.py
@@ -0,0 +1,2 @@
+from pdf import PdfFileReader, PdfFileWriter
+__all__ = ["pdf"]
diff --git a/src/libprs500/ebooks/pyPdf/filters.py b/src/libprs500/ebooks/pyPdf/filters.py
new file mode 100644
index 0000000000..17a325f76f
--- /dev/null
+++ b/src/libprs500/ebooks/pyPdf/filters.py
@@ -0,0 +1,239 @@
+# vim: sw=4:expandtab:foldmethod=marker
+#
+# Copyright (c) 2006, Mathieu Fenniak
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+"""
+Implementation of stream filters for PDF.
+"""
+__author__ = "Mathieu Fenniak"
+__author_email__ = "mfenniak@pobox.com"
+
+from generic import NameObject
+
+try:
+    import zlib
+    def decompress(data):  # same name is defined in the fallback branch below
+        return zlib.decompress(data)
+    def compress(data):  # same name is defined in the fallback branch below
+        return zlib.compress(data)
+except ImportError:
+    # Unable to import zlib.  Attempt to use the System.IO.Compression
+    # library from the .NET framework. (IronPython only)
+    import System
+    from System import IO, Collections, Array
+    def _string_to_bytearr(buf):  # Python string -> .NET Byte array, one ord() per character
+        retval = Array.CreateInstance(System.Byte, len(buf))
+        for i in range(len(buf)):
+            retval[i] = ord(buf[i])
+        return retval
+    def _bytearr_to_string(bytes):  # .NET Byte array -> Python string, one chr() per byte
+        retval = ""
+        for i in range(bytes.Length):
+            retval += chr(bytes[i])
+        return retval
+    def _read_bytes(stream):  # drain a .NET stream into a Byte array, 2048 bytes at a time
+        ms = IO.MemoryStream()
+        buf = Array.CreateInstance(System.Byte, 2048)
+        while True:
+            bytes = stream.Read(buf, 0, buf.Length)  # number of bytes actually read
+            if bytes == 0:
+                break
+            else:
+                ms.Write(buf, 0, bytes)
+        retval = ms.ToArray()
+        ms.Close()
+        return retval
+    def decompress(data):
+        bytes = _string_to_bytearr(data)
+        ms = IO.MemoryStream()
+        ms.Write(bytes, 0, bytes.Length)
+        ms.Position = 0  # rewind so DeflateStream reads from the start
+        gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Decompress)  # NOTE(review): may expect raw deflate without the zlib header zlib.decompress accepts - confirm
+        bytes = _read_bytes(gz)
+        retval = _bytearr_to_string(bytes)
+        gz.Close()
+        return retval
+    def compress(data):
+        bytes = _string_to_bytearr(data)
+        ms = IO.MemoryStream()
+        gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Compress, True)
+        gz.Write(bytes, 0, bytes.Length)
+        gz.Close()  # closed before ms is read back; presumably this flushes the compressed data - confirm
+        ms.Position = 0 # fseek 0
+        bytes = ms.ToArray()
+        retval = _bytearr_to_string(bytes)
+        ms.Close()
+        return retval
+
+
+class FlateDecode(object):
+    def decode(data, decodeParms):  # inflate data, then undo the /Predictor transform if one is set
+        data = decompress(data)
+        predictor = 1
+        if decodeParms:
+            predictor = decodeParms.get("/Predictor", 1)
+        # predictor 1 == no predictor
+        if predictor != 1:
+            columns = decodeParms["/Columns"]
+            if predictor >= 10:
+                newdata = ""
+                # PNG prediction can vary from row to row
+                rowlength = columns + 1  # +1 for the filter-type byte that starts every row
+                assert len(data) % rowlength == 0
+                prev_rowdata = "\x00"*rowlength  # the row above the first one counts as all zeros
+                for row in range(len(data) / rowlength):
+                    rowdata = list(data[(row*rowlength):((row+1)*rowlength)])
+                    filterByte = ord(rowdata[0])
+                    if filterByte == 0:
+                        pass  # row is stored unfiltered
+                    elif filterByte == 1:
+                        for i in range(2, rowlength):  # add the already-decoded byte to the left
+                            rowdata[i] = chr((ord(rowdata[i]) + ord(rowdata[i-1])) % 256)
+                    elif filterByte == 2:
+                        for i in range(1, rowlength):  # add the byte at the same index in the previous row
+                            rowdata[i] = chr((ord(rowdata[i]) + ord(prev_rowdata[i])) % 256)
+                    else:
+                        # unsupported PNG filter
+                        assert False
+                    prev_rowdata = rowdata
+                    newdata += ''.join(rowdata[1:])  # the filter byte is not part of the output
+                data = newdata
+            else:
+                # unsupported predictor
+                assert False
+        return data
+    decode = staticmethod(decode)
+
+    def encode(data):  # compresses only; no predictor is applied
+        return compress(data)
+    encode = staticmethod(encode)
+
+class ASCIIHexDecode(object):
+    def decode(data, decodeParms=None):
+        """Decode pairs of hex digits; whitespace is skipped and '>' ends the data."""
+        retval = ""
+        char = ""
+        for c in data:  # iterating also copes with data that lacks the closing '>'
+            if c == ">":
+                break
+            elif c.isspace():
+                continue
+            char += c
+            if len(char) == 2:
+                retval += chr(int(char, base=16))
+                char = ""
+        # An odd trailing digit is read as if followed by '0', matching
+        # StringObject.readHexStringFromStream, instead of failing an assert.
+        if char:
+            retval += chr(int(char + "0", base=16))
+        return retval
+    decode = staticmethod(decode)
+
+class ASCII85Decode(object):
+    def decode(data, decodeParms=None):  # decodes up to the "~>" end marker; a leading "<~" is optional
+        retval = ""
+        group = []
+        x = 0
+        hitEod = False  # set to the byte count of the final partial group once "~>" is seen
+        # remove all whitespace from data
+        data = [y for y in data if not (y in ' \n\r\t')]
+        while not hitEod:
+            c = data[x]
+            if len(retval) == 0 and c == "<" and data[x+1] == "~":
+                x += 2
+                continue
+            elif c == 'z':
+                # 'z' is shorthand for four zero bytes.  It has to be consumed
+                # (x advanced) before continuing, or the loop never terminates.
+                assert len(group) == 0
+                retval += '\x00\x00\x00\x00'
+                x += 1
+                continue
+            elif c == "~" and data[x+1] == ">":
+                if len(group) != 0:
+                    # cannot have a final group of just 1 char
+                    assert len(group) > 1
+                    cnt = len(group) - 1
+                    group += [ 85, 85, 85 ]
+                    hitEod = cnt
+                else:
+                    break
+            else:
+                c = ord(c) - 33
+                assert c >= 0 and c < 85
+                group += [ c ]
+            if len(group) >= 5:
+                b = group[0] * (85**4) + \
+                    group[1] * (85**3) + \
+                    group[2] * (85**2) + \
+                    group[3] * 85 + \
+                    group[4]
+                assert b < 2**32  # 0xFFFFFFFF ("s8W-!") is a legal group value
+                c4 = chr((b >> 0) % 256)
+                c3 = chr((b >> 8) % 256)
+                c2 = chr((b >> 16) % 256)
+                c1 = chr(b >> 24)
+                retval += (c1 + c2 + c3 + c4)
+                if hitEod:
+                    retval = retval[:-4+hitEod]  # keep only the bytes the partial group encodes
+                group = []
+            x += 1
+        return retval
+    decode = staticmethod(decode)
+
+def decodeStreamData(stream):  # run stream._data through each filter listed under /Filter, in order
+    filters = stream.get("/Filter", ())
+    if len(filters) and not isinstance(filters[0], NameObject):
+        # a single filter name (indexing it gives a plain character, not a NameObject)
+        filters = (filters,)
+    data = stream._data
+    for filterType in filters:
+        if filterType == "/FlateDecode":
+            data = FlateDecode.decode(data, stream.get("/DecodeParms"))  # the only filter that receives /DecodeParms
+        elif filterType == "/ASCIIHexDecode":
+            data = ASCIIHexDecode.decode(data)
+        elif filterType == "/ASCII85Decode":
+            data = ASCII85Decode.decode(data)
+        else:
+            # unsupported filter
+            assert False
+    return data
+
+if __name__ == "__main__":  # self-test of the two ASCII decoders when the module is run directly
+    assert "abc" == ASCIIHexDecode.decode('61\n626\n3>')  # whitespace may split a digit pair
+
+    ascii85Test = """
+     <~9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
+     O<DJ+*.@<*K0@<6L(Df-\\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
+     i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
+     l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
+     >uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
+    """
+    ascii85_originalText="Man is distinguished, not only by his reason, but by this singular passion from other animals, which is a lust of the mind, that by a perseverance of delight in the continued and indefatigable generation of knowledge, exceeds the short vehemence of any carnal pleasure."
+    assert ASCII85Decode.decode(ascii85Test) == ascii85_originalText  # the test data ends in a partial final group
diff --git a/src/libprs500/ebooks/pyPdf/generic.py b/src/libprs500/ebooks/pyPdf/generic.py
new file mode 100644
index 0000000000..4fea8fa640
--- /dev/null
+++ b/src/libprs500/ebooks/pyPdf/generic.py
@@ -0,0 +1,542 @@
+# vim: sw=4:expandtab:foldmethod=marker
+#
+# Copyright (c) 2006, Mathieu Fenniak
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+"""
+Implementation of generic PDF objects (dictionary, number, string, and so on)
+"""
+__author__ = "Mathieu Fenniak"
+__author_email__ = "mfenniak@pobox.com"
+
+import re
+from utils import readNonWhitespace, RC4_encrypt
+import filters
+
+def readObject(stream, pdf):  # peek at the next byte(s) and hand off to the matching object parser
+    tok = stream.read(1)
+    stream.seek(-1, 1) # reset to start
+    if tok == 't' or tok == 'f':
+        # boolean object
+        return BooleanObject.readFromStream(stream)
+    elif tok == '(':
+        # string object
+        return StringObject.readFromStream(stream)
+    elif tok == '/':
+        # name object
+        return NameObject.readFromStream(stream)
+    elif tok == '[':
+        # array object
+        return ArrayObject.readFromStream(stream, pdf)
+    elif tok == 'n':
+        # null object
+        return NullObject.readFromStream(stream)
+    elif tok == '<':
+        # hexadecimal string OR dictionary
+        peek = stream.read(2)
+        stream.seek(-2, 1) # reset to start
+        if peek == '<<':
+            return DictionaryObject.readFromStream(stream, pdf)
+        else:
+            return StringObject.readHexStringFromStream(stream)
+    elif tok == '%':
+        # comment: read up to the end of the line, then parse whatever follows it
+        while tok not in ('\r', '\n'):
+            tok = stream.read(1)
+        tok = readNonWhitespace(stream)
+        stream.seek(-1, 1)  # un-read the byte readNonWhitespace returned
+        return readObject(stream, pdf)
+    else:
+        # number object OR indirect reference
+        if tok == '+' or tok == '-':
+            # number
+            return NumberObject.readFromStream(stream)
+        peek = stream.read(20)  # look-ahead window for the "<id> <gen> R" pattern tested below
+        stream.seek(-len(peek), 1) # reset to start
+        if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) != None:
+            return IndirectObject.readFromStream(stream, pdf)
+        else:
+            return NumberObject.readFromStream(stream)
+
+class PdfObject(object):  # base class of every object type defined in this module
+    def getObject(self):
+        """Resolves indirect references."""
+        return self  # a direct object is its own target; IndirectObject overrides this
+
+
+class NullObject(PdfObject):  # the "null" keyword
+    def writeToStream(self, stream, encryption_key):
+        stream.write("null")  # encryption_key is not used here
+
+    def readFromStream(stream):
+        assert stream.read(4) == "null"  # NOTE(review): the read itself is skipped when asserts are stripped (python -O)
+        return NullObject()
+    readFromStream = staticmethod(readFromStream)
+
+
+class BooleanObject(PdfObject):  # the "true" / "false" keywords
+    def __init__(self, value):
+        self.value = value
+
+    def writeToStream(self, stream, encryption_key):
+        if self.value:
+            stream.write("true")
+        else:
+            stream.write("false")
+
+    def readFromStream(stream):
+        word = stream.read(4)  # four bytes cover "true" and the start of "false"
+        if word == "true":
+            return BooleanObject(True)
+        elif word == "fals":
+            stream.read(1)  # consume the fifth byte, presumably the trailing "e"
+            return BooleanObject(False)
+        assert False  # neither keyword matched
+    readFromStream = staticmethod(readFromStream)
+
+
+class ArrayObject(list, PdfObject):  # "[ ... ]" array; the elements are the list items
+    def writeToStream(self, stream, encryption_key):
+        stream.write("[")
+        for data in self:
+            stream.write(" ")
+            data.writeToStream(stream, encryption_key)
+        stream.write(" ]")
+
+    def readFromStream(stream, pdf):
+        arr = ArrayObject()
+        assert stream.read(1) == "["  # NOTE(review): the read itself is skipped when asserts are stripped (python -O)
+        while True:
+            # skip leading whitespace
+            tok = stream.read(1)
+            while tok.isspace():
+                tok = stream.read(1)
+            stream.seek(-1, 1)  # un-read the first non-whitespace byte
+            # check for array ending
+            peekahead = stream.read(1)
+            if peekahead == "]":
+                break
+            stream.seek(-1, 1)
+            # read and append obj
+            arr.append(readObject(stream, pdf))
+        return arr
+    readFromStream = staticmethod(readFromStream)
+
+
+class IndirectObject(PdfObject):  # an "<idnum> <generation> R" reference, resolved through self.pdf
+    def __init__(self, idnum, generation, pdf):
+        self.idnum = idnum
+        self.generation = generation
+        self.pdf = pdf
+
+    def getObject(self):
+        return self.pdf.getObject(self).getObject()  # resolve again in case the target is itself a reference
+
+    def __repr__(self):
+        return "IndirectObject(%r, %r)" % (self.idnum, self.generation)
+
+    def __eq__(self, other):  # equal only for the same idnum and generation in the very same pdf object
+        return (
+            other != None and
+            isinstance(other, IndirectObject) and
+            self.idnum == other.idnum and
+            self.generation == other.generation and
+            self.pdf is other.pdf
+            )
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def writeToStream(self, stream, encryption_key):
+        stream.write("%s %s R" % (self.idnum, self.generation))
+
+    def readFromStream(stream, pdf):
+        idnum = ""
+        while True:  # idnum runs up to the first whitespace byte
+            tok = stream.read(1)
+            if tok.isspace():
+                break
+            idnum += tok
+        generation = ""
+        while True:  # generation runs up to the next whitespace byte
+            tok = stream.read(1)
+            if tok.isspace():
+                break
+            generation += tok
+        r = stream.read(1)
+        #if r != "R":
+        #    stream.seek(-20, 1)
+        #    print idnum, generation
+        #    print repr(stream.read(40))
+        assert r == "R"
+        return IndirectObject(int(idnum), int(generation), pdf)
+    readFromStream = staticmethod(readFromStream)
+
+
+class FloatObject(float, PdfObject):  # created by NumberObject.readFromStream when the token contains '.'
+    def writeToStream(self, stream, encryption_key):
+        stream.write(repr(self))  # encryption_key is not used here
+
+
+class NumberObject(int, PdfObject):  # integer number
+    def __init__(self, value):
+        int.__init__(self, value)
+
+    def writeToStream(self, stream, encryption_key):
+        stream.write(repr(self))  # encryption_key is not used here
+
+    def readFromStream(stream):
+        name = ""
+        while True:  # collect sign, '.' and digit characters
+            tok = stream.read(1)
+            if tok != '+' and tok != '-' and tok != '.' and not tok.isdigit():
+                stream.seek(-1, 1)  # leave the terminating byte unread
+                break
+            name += tok
+        if name.find(".") != -1:
+            return FloatObject(name)  # a '.' in the token makes it a real number
+        else:
+            return NumberObject(name)
+    readFromStream = staticmethod(readFromStream)
+
+
+class StringObject(str, PdfObject):
+    def writeToStream(self, stream, encryption_key):  # write as "( ... )", RC4-encrypting first when a key is given
+        string = self
+        if encryption_key:
+            string = RC4_encrypt(encryption_key, string)
+        stream.write("(")
+        for c in string:
+            if not c.isalnum() and not c.isspace():
+                stream.write("\\%03o" % ord(c))  # everything but alphanumerics and whitespace becomes \ooo
+            else:
+                stream.write(c)
+        stream.write(")")
+
+    def readHexStringFromStream(stream):  # parse "<...>", taking digits via readNonWhitespace
+        stream.read(1)
+        txt = ""
+        x = ""
+        while True:
+            tok = readNonWhitespace(stream)
+            if tok == ">":
+                break
+            x += tok
+            if len(x) == 2:
+                txt += chr(int(x, base=16))
+                x = ""
+        if len(x) == 1:
+            x += "0"  # an odd final digit is read as if followed by 0
+        if len(x) == 2:
+            txt += chr(int(x, base=16))
+        return StringObject(txt)
+    readHexStringFromStream = staticmethod(readHexStringFromStream)
+
+    def readFromStream(stream):  # parse "( ... )" with balanced nested parentheses and backslash escapes
+        tok = stream.read(1)
+        parens = 1  # nesting depth; the opening "(" was just consumed
+        txt = ""
+        while True:
+            tok = stream.read(1)
+            if tok == "(":
+                parens += 1
+            elif tok == ")":
+                parens -= 1
+                if parens == 0:
+                    break
+            elif tok == "\\":
+                tok = stream.read(1)
+                if tok == "n":
+                    tok = "\n"
+                elif tok == "r":
+                    tok = "\r"
+                elif tok == "t":
+                    tok = "\t"
+                elif tok == "b":
+                    tok = "\b"  # assignment (not ==), so \b decodes to backspace rather than the letter "b"
+                elif tok == "f":
+                    tok = "\f"
+                elif tok == "(":
+                    tok = "("
+                elif tok == ")":
+                    tok = ")"
+                elif tok == "\\":
+                    tok = "\\"
+                elif tok.isdigit():
+                    tok += stream.read(2)  # NOTE(review): assumes exactly three octal digits follow the backslash
+                    tok = chr(int(tok, base=8))
+            txt += tok
+        return StringObject(txt)
+    readFromStream = staticmethod(readFromStream)
+
+
+class NameObject(str, PdfObject):  # "/Name" token; the stored string keeps its leading "/"
+    delimiterCharacters = "(", ")", "<", ">", "[", "]", "{", "}", "/", "%"  # any of these ends a name
+
+    def __init__(self, data):
+        str.__init__(self, data)
+
+    def writeToStream(self, stream, encryption_key):
+        stream.write(self)  # encryption_key is not used here
+
+    def readFromStream(stream):
+        name = stream.read(1)
+        assert name == "/"
+        while True:  # the name runs up to the next whitespace or delimiter
+            tok = stream.read(1)
+            if tok.isspace() or tok in NameObject.delimiterCharacters:
+                stream.seek(-1, 1)  # leave the terminating byte unread
+                break
+            name += tok
+        return NameObject(name)
+    readFromStream = staticmethod(readFromStream)
+
+
+class DictionaryObject(dict, PdfObject):
+    def __init__(self):
+        pass
+
+    def writeToStream(self, stream, encryption_key):  # write "<< key value ... >>", one pair per line
+        stream.write("<<\n")
+        for key, value in self.items():
+            key.writeToStream(stream, encryption_key)
+            stream.write(" ")
+            value.writeToStream(stream, encryption_key)
+            stream.write("\n")
+        stream.write(">>")
+
+    def readFromStream(stream, pdf):  # returns a DictionaryObject, or a stream object if "stream" follows ">>"
+        assert stream.read(2) == "<<"
+        data = {}
+        while True:
+            tok = readNonWhitespace(stream)
+            if tok == ">":
+                stream.read(1)  # consume the second ">" of ">>"
+                break
+            stream.seek(-1, 1)
+            key = readObject(stream, pdf)
+            tok = readNonWhitespace(stream)
+            stream.seek(-1, 1)
+            value = readObject(stream, pdf)
+            if data.has_key(key):
+                # multiple definitions of key not permitted
+                assert False
+            data[key] = value
+        pos = stream.tell()  # position to return to if no "stream" keyword follows
+        s = readNonWhitespace(stream)
+        if s == 's' and stream.read(5) == 'tream':
+            eol = stream.read(1)
+            # odd PDF file output has spaces after 'stream' keyword but before EOL.
+            # patch provided by Danial Sandler
+            while eol == ' ':
+                eol = stream.read(1)
+            assert eol in ("\n", "\r")
+            if eol == "\r":
+                # read \n after
+                stream.read(1)
+            # this is a stream object, not a dictionary
+            assert data.has_key("/Length")
+            length = data["/Length"]
+            if isinstance(length, IndirectObject):
+                t = stream.tell()  # remember our place before pdf.getObject is called
+                length = pdf.getObject(length)
+                stream.seek(t, 0)
+            data["__streamdata__"] = stream.read(length)
+            e = readNonWhitespace(stream)
+            ndstream = stream.read(8)
+            if (e + ndstream) != "endstream":
+                # (sigh) - the odd PDF file has a length that is too long, so
+                # we need to read backwards to find the "endstream" ending.
+                # ReportLab (unknown version) generates files with this bug,
+                # and Python users into PDF files tend to be our audience.
+                # we need to do this to correct the streamdata and chop off
+                # an extra character.
+                pos = stream.tell()
+                stream.seek(-10, 1)
+                end = stream.read(9)
+                if end == "endstream":
+                    # we found it by looking back one character further.
+                    data["__streamdata__"] = data["__streamdata__"][:-1]
+                else:
+                    stream.seek(pos, 0)
+                    raise ValueError("Unable to find 'endstream' marker after stream.")  # a real exception, not a string
+        else:
+            stream.seek(pos, 0)
+        if data.has_key("__streamdata__"):
+            return StreamObject.initializeFromDictionary(data)
+        else:
+            retval = DictionaryObject()
+            retval.update(data)
+            return retval
+    readFromStream = staticmethod(readFromStream)
+
+
+class StreamObject(DictionaryObject):
+    def __init__(self):
+        self._data = None
+        self.decodedSelf = None
+
+    def writeToStream(self, stream, encryption_key):  # write the dictionary, then the data between stream/endstream
+        self[NameObject("/Length")] = NumberObject(len(self._data))  # present only while the dictionary is written
+        DictionaryObject.writeToStream(self, stream, encryption_key)
+        del self["/Length"]
+        stream.write("\nstream\n")
+        data = self._data
+        if encryption_key:
+            data = RC4_encrypt(encryption_key, data)
+        stream.write(data)
+        stream.write("\nendstream")
+
+    def initializeFromDictionary(data):  # build the stream object for a dict parsed by DictionaryObject.readFromStream
+        if data.has_key("/Filter"):
+            retval = EncodedStreamObject()
+        else:
+            retval = DecodedStreamObject()
+        retval._data = data["__streamdata__"]
+        del data["__streamdata__"]
+        del data["/Length"]
+        retval.update(data)
+        return retval
+    initializeFromDictionary = staticmethod(initializeFromDictionary)
+
+    def flateEncode(self):  # return a new EncodedStreamObject holding the compressed data; self is left untouched
+        if self.has_key("/Filter"):
+            f = self["/Filter"]
+            if not isinstance(f, ArrayObject):
+                f = [f]
+            # Build a fresh array so this stream's own /Filter list is never modified in place.
+            newf = ArrayObject()
+            newf.append(NameObject("/FlateDecode"))
+            newf.extend(f)
+            f = newf
+        else:
+            f = NameObject("/FlateDecode")
+        retval = EncodedStreamObject()
+        retval[NameObject("/Filter")] = f
+        retval._data = filters.FlateDecode.encode(self._data)
+        return retval
+
+
+class DecodedStreamObject(StreamObject):  # used by initializeFromDictionary when the dict has no /Filter
+    def getData(self):
+        return self._data  # returned as stored, with no decoding step
+
+    def setData(self, data):
+        self._data = data
+
+
+class EncodedStreamObject(StreamObject):
+    def __init__(self):
+        self.decodedSelf = None  # cache for the decoded copy built by getData
+
+    def getData(self):  # return the filter-decoded data, decoding once and caching the result
+        if self.decodedSelf is not None:  # identity test: a decoded copy with no keys is an empty (falsy) dict
+            # cached version of decoded object
+            return self.decodedSelf.getData()
+        else:
+            # create decoded object
+            decoded = DecodedStreamObject()  # has getData(), which the cached branch above calls
+            decoded._data = filters.decodeStreamData(self)
+            for key, value in self.items():
+                if not key in ("/Length", "/Filter", "/DecodeParms"):
+                    decoded[key] = value
+            self.decodedSelf = decoded
+            return decoded._data
+
+    def setData(self, data):
+        raise NotImplementedError("Creating EncodedStreamObject is not currently supported")  # a real exception, not a string
+
+
+class RectangleObject(ArrayObject):  # four numbers: [lower-left x, lower-left y, upper-right x, upper-right y]
+    def __init__(self, arr):
+        # must have four points
+        assert len(arr) == 4
+        # automatically convert arr[x] into NumberObject(arr[x]) if necessary
+        ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr])
+
+    def ensureIsNumber(self, value):  # wrap anything that is not already a NumberObject
+        if not isinstance(value, NumberObject):
+            value = NumberObject(value)
+        return value
+
+    def __repr__(self):
+        return "RectangleObject(%s)" % repr(list(self))
+
+    def getLowerLeft_x(self):
+        return self[0]
+
+    def getLowerLeft_y(self):
+        return self[1]
+
+    def getUpperRight_x(self):
+        return self[2]
+
+    def getUpperRight_y(self):
+        return self[3]
+
+    def getUpperLeft_x(self):
+        return self.getLowerLeft_x()  # left edge: same x as the lower-left corner
+    
+    def getUpperLeft_y(self):
+        return self.getUpperRight_y()  # top edge: same y as the upper-right corner
+
+    def getLowerRight_x(self):
+        return self.getUpperRight_x()  # right edge: same x as the upper-right corner
+
+    def getLowerRight_y(self):
+        return self.getLowerLeft_y()  # bottom edge: same y as the lower-left corner
+
+    def getLowerLeft(self):
+        return self.getLowerLeft_x(), self.getLowerLeft_y()
+
+    def getLowerRight(self):
+        return self.getLowerRight_x(), self.getLowerRight_y()
+
+    def getUpperLeft(self):
+        return self.getUpperLeft_x(), self.getUpperLeft_y()
+
+    def getUpperRight(self):
+        return self.getUpperRight_x(), self.getUpperRight_y()
+
+    def setLowerLeft(self, value):  # value is an (x, y) pair
+        self[0], self[1] = [self.ensureIsNumber(x) for x in value]
+
+    def setLowerRight(self, value):  # writes the right x (index 2) and the bottom y (index 1)
+        self[2], self[1] = [self.ensureIsNumber(x) for x in value]
+
+    def setUpperLeft(self, value):  # writes the left x (index 0) and the top y (index 3)
+        self[0], self[3] = [self.ensureIsNumber(x) for x in value]
+
+    def setUpperRight(self, value):  # value is an (x, y) pair
+        self[2], self[3] = [self.ensureIsNumber(x) for x in value]
+
+    lowerLeft = property(getLowerLeft, setLowerLeft, None, None)  # corner pairs exposed as read/write properties
+    lowerRight = property(getLowerRight, setLowerRight, None, None)
+    upperLeft = property(getUpperLeft, setUpperLeft, None, None)
+    upperRight = property(getUpperRight, setUpperRight, None, None)
+
diff --git a/src/libprs500/ebooks/pyPdf/pdf.py b/src/libprs500/ebooks/pyPdf/pdf.py
new file mode 100644
index 0000000000..fdaacaf574
--- /dev/null
+++ b/src/libprs500/ebooks/pyPdf/pdf.py
@@ -0,0 +1,1162 @@
+# vim: sw=4:expandtab:foldmethod=marker
+#
+# Copyright (c) 2006, Mathieu Fenniak
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+"""
+A pure-Python PDF library with very minimal capabilities.  It was designed to
+be able to split and merge PDF files by page, and that's about all it can do.
+It may be a solid base for future PDF file work in Python.
+"""
+__author__ = "Mathieu Fenniak"
+__author_email__ = "mfenniak@pobox.com"
+
+import struct
+try:
+    from cStringIO import StringIO
+except ImportError:
+    from StringIO import StringIO
+
+import filters
+import utils
+from generic import *
+from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
+from sets import ImmutableSet
+
+##
+# This class supports writing PDF files out, given pages produced by another
+# class (typically {@link #PdfFileReader PdfFileReader}).
+class PdfFileWriter(object):
+    def __init__(self):
+        self._header = "%PDF-1.3"
+        self._objects = []  # indirect objects; object number N is stored at index N - 1
+
+        # Root node of the page tree; it starts empty and addPage() fills it.
+        pages = DictionaryObject()
+        pages.update({
+                NameObject("/Type"): NameObject("/Pages"),
+                NameObject("/Count"): NumberObject(0),
+                NameObject("/Kids"): ArrayObject(),
+                })
+        self._pages = self._addObject(pages)
+
+        # Document information dictionary (referenced as /Info by write()).
+        info = DictionaryObject()
+        info.update({
+                NameObject("/Producer"): StringObject("Python PDF Library - http://pybrary.net/pyPdf/")
+                })
+        self._info = self._addObject(info)
+
+        # Catalog object (referenced as /Root by write()); it points at the page tree.
+        root = DictionaryObject()
+        root.update({
+            NameObject("/Type"): NameObject("/Catalog"),
+            NameObject("/Pages"): self._pages,
+            })
+        self._root = self._addObject(root)
+
+    def _addObject(self, obj):
+        self._objects.append(obj)  # register obj as the next indirect object
+        return IndirectObject(len(self._objects), 0, self)  # 1-based id, generation 0
+
+    def getObject(self, ido):
+        assert ido.pdf == self  # only references that point into this writer are accepted
+        return self._objects[ido.idnum - 1]  # ids are 1-based, the list is 0-based
+
+    ##
+    # Adds a page to this PDF file.  The page is usually acquired from a
+    # {@link #PdfFileReader PdfFileReader} instance.
+    # <p>
+    # Stability: Added in v1.0, will exist for all v1.x releases.
+    #
+    # @param page The page to add to the document.  This argument should be
+    #             an instance of {@link #PageObject PageObject}.
+    def addPage(self, page):
+        assert page["/Type"] == "/Page"
+        page[NameObject("/Parent")] = self._pages  # re-parent the page under this writer's tree
+        page = self._addObject(page)  # from here on `page` is the indirect reference
+        pages = self.getObject(self._pages)
+        pages["/Kids"].append(page)
+        pages["/Count"] = NumberObject(pages["/Count"] + 1)  # keep /Count in step with /Kids
+
+    ##
+    # Encrypt this PDF file with the PDF Standard encryption handler.
+    # @param user_pwd The "user password", which allows for opening and reading
+    # the PDF file with the restrictions provided.
+    # @param owner_pwd The "owner password", which allows for opening the PDF
+    # files without any restrictions.  By default, the owner password is the
+    # same as the user password.
+    # @param use_128bit Boolean argument as to whether to use 128bit
+    # encryption.  When false, 40bit encryption will be used.  By default, this
+    # flag is on.
+    def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
+        import md5, time, random
+        if owner_pwd == None:
+            owner_pwd = user_pwd  # documented default: owner password equals user password
+        if use_128bit:
+            V = 2
+            rev = 3
+            keylen = 128 / 8  # key length in bytes
+        else:
+            V = 1
+            rev = 2
+            keylen = 40 / 8  # key length in bytes
+        # permit everything:
+        P = -1
+        O = StringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
+        ID_1 = md5.new(repr(time.time())).digest()  # file identifier parts: MD5 of the
+        ID_2 = md5.new(repr(random.random())).digest()  # current time / of a random float
+        self._ID = ArrayObject((StringObject(ID_1), StringObject(ID_2)))  # emitted as /ID by write()
+        if rev == 2:
+            U, key = _alg34(user_pwd, O, P, ID_1)
+        else:
+            assert rev == 3
+            U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)
+        encrypt = DictionaryObject()
+        encrypt[NameObject("/Filter")] = NameObject("/Standard")
+        encrypt[NameObject("/V")] = NumberObject(V)
+        if V == 2:
+            encrypt[NameObject("/Length")] = NumberObject(keylen * 8)  # stored in bits
+        encrypt[NameObject("/R")] = NumberObject(rev)
+        encrypt[NameObject("/O")] = StringObject(O)
+        encrypt[NameObject("/U")] = StringObject(U)
+        encrypt[NameObject("/P")] = NumberObject(P)
+        self._encrypt = self._addObject(encrypt)  # write() checks for this attribute
+        self._encrypt_key = key  # base key from which write() derives per-object keys
+
+    ##
+    # Writes the collection of pages added to this object out as a PDF file.
+    # <p>
+    # Stability: Added in v1.0, will exist for all v1.x releases.
+    # @param stream An object to write the file to.  The object must support
+    # the write method, and the tell method, similar to a file object.
+    def write(self, stream):
+        import struct, md5
+
+        externalReferenceMap = {}  # filled by the sweep: source pdf -> generation -> idnum -> new ref
+        self.stack = []  # ids currently being swept; only exists for the duration of the sweep
+        self._sweepIndirectReferences(externalReferenceMap, self._root)
+        del self.stack
+
+        # Begin writing:
+        object_positions = []  # byte offset of every object, in id order, for the xref table
+        stream.write(self._header + "\n")
+        for i in range(len(self._objects)):
+            idnum = (i + 1)
+            obj = self._objects[i]
+            object_positions.append(stream.tell())
+            stream.write(str(idnum) + " 0 obj\n")
+            key = None  # None means "write this object unencrypted"
+            if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum:
+                pack1 = struct.pack("<i", i + 1)[:3]  # low 3 bytes of the object number
+                pack2 = struct.pack("<i", 0)[:2]  # low 2 bytes of the generation (always 0)
+                key = self._encrypt_key + pack1 + pack2
+                assert len(key) == (len(self._encrypt_key) + 5)
+                md5_hash = md5.new(key).digest()
+                key = md5_hash[:min(16, len(self._encrypt_key) + 5)]  # per-object key, at most 16 bytes
+            obj.writeToStream(stream, key)
+            stream.write("\nendobj\n")
+
+        # xref table
+        xref_location = stream.tell()
+        stream.write("xref\n")
+        stream.write("0 %s\n" % (len(self._objects) + 1))  # +1 for the free entry for object 0
+        stream.write("%010d %05d f \n" % (0, 65535))
+        for offset in object_positions:
+            stream.write("%010d %05d n \n" % (offset, 0))
+
+        # trailer
+        stream.write("trailer\n")
+        trailer = DictionaryObject()
+        trailer.update({
+                NameObject("/Size"): NumberObject(len(self._objects) + 1),
+                NameObject("/Root"): self._root,
+                NameObject("/Info"): self._info,
+                })
+        if hasattr(self, "_ID"):  # set by encrypt()
+            trailer[NameObject("/ID")] = self._ID
+        if hasattr(self, "_encrypt"):  # set by encrypt()
+            trailer[NameObject("/Encrypt")] = self._encrypt
+        trailer.writeToStream(stream, None)
+        
+        # eof
+        stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))
+
+    def _sweepIndirectReferences(self, externMap, data):
+        if isinstance(data, DictionaryObject):
+            for key, value in data.items():
+                origvalue = value
+                value = self._sweepIndirectReferences(externMap, value)
+                if value == None:
+                    print objects, value, origvalue
+                if isinstance(value, StreamObject):
+                    # a dictionary value is a stream.  streams must be indirect
+                    # objects, so we need to change this value.
+                    value = self._addObject(value)
+                data[key] = value
+            return data
+        elif isinstance(data, ArrayObject):
+            for i in range(len(data)):
+                value = self._sweepIndirectReferences(externMap, data[i])
+                if isinstance(value, StreamObject):
+                    # an array value is a stream.  streams must be indirect
+                    # objects, so we need to change this value
+                    value = self._addObject(value)
+                data[i] = value
+            return data
+        elif isinstance(data, IndirectObject):
+            # internal indirect references are fine
+            if data.pdf == self:
+                if data.idnum in self.stack:
+                    return data
+                else:
+                    self.stack.append(data.idnum)
+                    realdata = self.getObject(data)
+                    self._sweepIndirectReferences(externMap, realdata)
+                    self.stack.pop()
+                    return data
+            else:
+                newobj = externMap.get(data.pdf, {}).get(data.generation, {}).get(data.idnum, None)
+                if newobj == None:
+                    newobj = data.pdf.getObject(data)
+                    self._objects.append(None) # placeholder
+                    idnum = len(self._objects)
+                    newobj_ido = IndirectObject(idnum, 0, self)
+                    if not externMap.has_key(data.pdf):
+                        externMap[data.pdf] = {}
+                    if not externMap[data.pdf].has_key(data.generation):
+                        externMap[data.pdf][data.generation] = {}
+                    externMap[data.pdf][data.generation][data.idnum] = newobj_ido
+                    newobj = self._sweepIndirectReferences(externMap, newobj)
+                    self._objects[idnum-1] = newobj
+                    return newobj_ido
+                return newobj
+        else:
+            return data
+
+
+##
+# Initializes a PdfFileReader object.  This operation can take some time, as
+# the PDF stream's cross-reference tables are read into memory.
+# <p>
+# Stability: Added in v1.0, will exist for all v1.x releases.
+#
+# @param stream An object that supports the standard read and seek methods
+#               similar to a file object.
+class PdfFileReader(object):
+    def __init__(self, stream):
+        self.flattenedPages = None  # built lazily by _flatten()
+        self.resolvedObjects = {}  # cache: generation -> {idnum: object}
+        self.read(stream)  # parses xref tables and trailers; runs before self.stream is set
+        self.stream = stream
+        self._override_encryption = False  # True only while decrypt() is running
+
+    ##
+    # Retrieves the PDF file's document information dictionary, if it exists.
+    # Note that some PDF files use metadata streams instead of docinfo
+    # dictionaries, and these metadata streams will not be accessed by this
+    # function.
+    # <p>
+    # Stability: Added in v1.6, will exist for all future v1.x releases.
+    # @return Returns a {@link #DocumentInformation DocumentInformation}
+    #         instance, or None if none exists.
+    def getDocumentInfo(self):
+        if not self.trailer.has_key("/Info"):
+            return None  # the file has no document information dictionary
+        obj = self.getObject(self.trailer['/Info'])
+        retval = DocumentInformation()
+        retval.update(obj)  # copy the entries into a DocumentInformation instance
+        return retval
+
+    ##
+    # Read-only property that accesses the {@link
+    # #PdfFileReader.getDocumentInfo getDocumentInfo} function.
+    # <p>
+    # Stability: Added in v1.7, will exist for all future v1.x releases.
+    documentInfo = property(lambda self: self.getDocumentInfo(), None, None)
+
+    ##
+    # Calculates the number of pages in this PDF file.
+    # <p>
+    # Stability: Added in v1.0, will exist for all v1.x releases.
+    # @return Returns an integer.
+    def getNumPages(self):
+        if self.flattenedPages == None:
+            self._flatten()
+        return len(self.flattenedPages)
+
+    ##
+    # Read-only property that accesses the {@link #PdfFileReader.getNumPages
+    # getNumPages} function.
+    # <p>
+    # Stability: Added in v1.7, will exist for all future v1.x releases.
+    numPages = property(lambda self: self.getNumPages(), None, None)
+
+    ##
+    # Retrieves a page by number from this PDF file.
+    # <p>
+    # Stability: Added in v1.0, will exist for all v1.x releases.
+    # @return Returns a {@link #PageObject PageObject} instance.
+    def getPage(self, pageNumber):
+        ## ensure that we're not trying to access an encrypted PDF
+        #assert not self.trailer.has_key("/Encrypt")
+        if self.flattenedPages == None:
+            self._flatten()
+        return self.flattenedPages[pageNumber]
+
+    ##
+    # Read-only property that emulates a list based upon the {@link
+    # #PdfFileReader.getNumPages getNumPages} and {@link #PdfFileReader.getPage
+    # getPage} functions.
+    # <p>
+    # Stability: Added in v1.7, and will exist for all future v1.x releases.
+    pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage),
+            None, None)
+
+    def _flatten(self, pages = None, inherit = None):
+        inheritablePageAttributes = (
+            NameObject("/Resources"), NameObject("/MediaBox"),
+            NameObject("/CropBox"), NameObject("/Rotate")
+            )
+        if inherit == None:
+            inherit = dict()
+        if pages == None:
+            self.flattenedPages = []
+            catalog = self.getObject(self.trailer["/Root"])
+            pages = self.getObject(catalog["/Pages"])
+        if isinstance(pages, IndirectObject):
+            pages = self.getObject(pages)
+        t = pages["/Type"]
+        if t == "/Pages":
+            for attr in inheritablePageAttributes:
+                if pages.has_key(attr):
+                    inherit[attr] = pages[attr]
+            for page in pages["/Kids"]:
+                self._flatten(page, inherit)
+        elif t == "/Page":
+            for attr,value in inherit.items():
+                # if the page has it's own value, it does not inherit the
+                # parent's value:
+                if not pages.has_key(attr):
+                    pages[attr] = value
+            pageObj = PageObject(self)
+            pageObj.update(pages)
+            self.flattenedPages.append(pageObj)
+
+    def safeGetObject(self, obj):
+        if isinstance(obj, IndirectObject):
+            return self.safeGetObject(self.getObject(obj))
+        return obj
+
+    def getObject(self, indirectReference):
+        retval = self.resolvedObjects.get(indirectReference.generation, {}).get(indirectReference.idnum, None)
+        if retval != None:
+            return retval  # already resolved: serve it from the cache
+        if indirectReference.generation == 0 and \
+           self.xref_objStm.has_key(indirectReference.idnum):
+            # indirect reference to object in object stream
+            # read the entire object stream into memory
+            stmnum,idx = self.xref_objStm[indirectReference.idnum]
+            objStm = self.getObject(IndirectObject(stmnum, 0, self))
+            assert objStm['/Type'] == '/ObjStm'
+            assert idx < objStm['/N']
+            streamData = StringIO(objStm.getData())
+            for i in range(objStm['/N']):
+                objnum = NumberObject.readFromStream(streamData)
+                readNonWhitespace(streamData)
+                streamData.seek(-1, 1)
+                offset = NumberObject.readFromStream(streamData)
+                readNonWhitespace(streamData)
+                streamData.seek(-1, 1)
+                t = streamData.tell()  # remember where the (objnum, offset) table continues
+                streamData.seek(objStm['/First']+offset, 0)  # offsets are relative to /First
+                obj = readObject(streamData, self)
+                self.resolvedObjects[0][objnum] = obj  # every object in the stream gets cached
+                streamData.seek(t, 0)
+            return self.resolvedObjects[0][indirectReference.idnum]
+        start = self.xref[indirectReference.generation][indirectReference.idnum]  # byte offset in the file
+        self.stream.seek(start, 0)
+        idnum, generation = self.readObjectHeader(self.stream)
+        assert idnum == indirectReference.idnum
+        assert generation == indirectReference.generation
+        retval = readObject(self.stream, self)
+
+        # override encryption is used for the /Encrypt dictionary
+        if not self._override_encryption and self.isEncrypted:
+            # if we don't have the encryption key:
+            if not hasattr(self, '_decryption_key'):
+                raise Exception, "file has not been decrypted"
+            # otherwise, decrypt here...
+            import struct, md5
+            pack1 = struct.pack("<i", indirectReference.idnum)[:3]  # low 3 bytes of the object number
+            pack2 = struct.pack("<i", indirectReference.generation)[:2]  # low 2 bytes of the generation
+            key = self._decryption_key + pack1 + pack2
+            assert len(key) == (len(self._decryption_key) + 5)
+            md5_hash = md5.new(key).digest()
+            key = md5_hash[:min(16, len(self._decryption_key) + 5)]  # per-object key, at most 16 bytes
+            retval = self._decryptObject(retval, key)
+
+        self.cacheIndirectObject(generation, idnum, retval)
+        return retval
+
+    def _decryptObject(self, obj, key):
+        if isinstance(obj, StringObject):
+            obj = StringObject(utils.RC4_encrypt(key, obj))  # strings are replaced by a new object
+        elif isinstance(obj, StreamObject):
+            obj._data = utils.RC4_encrypt(key, obj._data)  # streams: only the data is rewritten
+        elif isinstance(obj, DictionaryObject):
+            for dictkey, value in obj.items():
+                obj[dictkey] = self._decryptObject(value, key)  # recurse with the same key
+        elif isinstance(obj, ArrayObject):
+            for i in range(len(obj)):
+                obj[i] = self._decryptObject(obj[i], key)  # recurse with the same key
+        return obj  # any other type is returned untouched
+
+    def readObjectHeader(self, stream):
+        idnum = readUntilWhitespace(stream)
+        generation = readUntilWhitespace(stream)
+        obj = stream.read(3)
+        readNonWhitespace(stream)
+        stream.seek(-1, 1)
+        return int(idnum), int(generation)
+
+    def cacheIndirectObject(self, generation, idnum, obj):
+        if not self.resolvedObjects.has_key(generation):
+            self.resolvedObjects[generation] = {}
+        self.resolvedObjects[generation][idnum] = obj
+
+    def read(self, stream):
+        # start at the end:
+        stream.seek(-1, 2)
+        line = ''
+        while not line:  # skip empty lines that follow the end-of-file marker
+            line = self.readNextEndLine(stream)
+        assert line[:5] == "%%EOF"
+
+        # find startxref entry - the location of the xref table
+        line = self.readNextEndLine(stream)
+        startxref = int(line)
+        line = self.readNextEndLine(stream)
+        assert line[:9] == "startxref"
+
+        # read all cross reference tables and their trailers
+        self.xref = {}  # generation -> {idnum: byte offset in the file}
+        self.xref_objStm = {}  # idnum -> [object stream number, index inside that stream]
+        self.trailer = {}  # merged trailer; entries read first win
+        while 1:
+            # load the xref table
+            stream.seek(startxref, 0)
+            x = stream.read(1)
+            if x == "x":
+                # standard cross-reference table
+                ref = stream.read(4)
+                assert ref[:3] == "ref"
+                readNonWhitespace(stream)
+                stream.seek(-1, 1)
+                while 1:
+                    num = readObject(stream, self)  # first object number of this subsection
+                    readNonWhitespace(stream)
+                    stream.seek(-1, 1)
+                    size = readObject(stream, self)  # number of entries in this subsection
+                    readNonWhitespace(stream)
+                    stream.seek(-1, 1)
+                    cnt = 0
+                    while cnt < size:
+                        line = stream.read(20)  # each entry is read as a fixed 20-byte record
+                        offset, generation = line[:16].split(" ")
+                        offset, generation = int(offset), int(generation)
+                        if not self.xref.has_key(generation):
+                            self.xref[generation] = {}
+                        if self.xref[generation].has_key(num):
+                            # It really seems like we should allow the last
+                            # xref table in the file to override previous
+                            # ones. Since we read the file backwards, assume
+                            # any existing key is already set correctly.
+                            pass
+                        else:
+                            self.xref[generation][num] = offset
+                        cnt += 1
+                        num += 1
+                    readNonWhitespace(stream)
+                    stream.seek(-1, 1)
+                    trailertag = stream.read(7)
+                    if trailertag != "trailer":
+                        # more xrefs!
+                        stream.seek(-7, 1)  # not the trailer yet: rewind and parse another subsection
+                    else:
+                        break
+                readNonWhitespace(stream)
+                stream.seek(-1, 1)
+                newTrailer = readObject(stream, self)
+                for key, value in newTrailer.items():
+                    if not self.trailer.has_key(key):  # keep the value from the trailer read first
+                        self.trailer[key] = value
+                if newTrailer.has_key(NameObject("/Prev")):
+                    startxref = newTrailer[NameObject("/Prev")]  # continue with the previous xref section
+                else:
+                    break
+            elif x.isdigit():
+                # PDF 1.5+ Cross-Reference Stream
+                stream.seek(-1, 1)
+                idnum, generation = self.readObjectHeader(stream)
+                xrefstream = readObject(stream, self)
+                assert xrefstream["/Type"] == "/XRef"
+                self.cacheIndirectObject(generation, idnum, xrefstream)
+                streamData = StringIO(xrefstream.getData())
+                num, size = xrefstream.get("/Index", [0, xrefstream.get("/Size")])  # NOTE(review): unpacks exactly one (start, count) pair
+                entrySizes = xrefstream.get("/W")  # byte width of each of the entry's fields
+                cnt = 0
+                while cnt < size:
+                    for i in range(len(entrySizes)):
+                        d = streamData.read(entrySizes[i])
+                        di = convertToInt(d, entrySizes[i])
+                        if i == 0:
+                            xref_type = di  # 0, 1 or 2; selects how fields 1 and 2 are interpreted
+                        elif i == 1:
+                            if xref_type == 0:
+                                next_free_object = di
+                            elif xref_type == 1:
+                                byte_offset = di
+                            elif xref_type == 2:
+                                objstr_num = di
+                        elif i == 2:
+                            if xref_type == 0:
+                                next_generation = di
+                            elif xref_type == 1:
+                                generation = di
+                            elif xref_type == 2:
+                                obstr_idx = di
+                    if xref_type == 0:
+                        pass  # type 0 entries are parsed but not recorded
+                    elif xref_type == 1:
+                        if not self.xref.has_key(generation):
+                            self.xref[generation] = {}
+                        self.xref[generation][num] = byte_offset
+                    elif xref_type == 2:
+                        self.xref_objStm[num] = [objstr_num, obstr_idx]
+                    cnt += 1
+                    num += 1
+                trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"  # trailer entries copied from the stream dictionary
+                for key in trailerKeys:
+                    if xrefstream.has_key(key) and not self.trailer.has_key(key):
+                        self.trailer[NameObject(key)] = xrefstream[key]
+                if xrefstream.has_key("/Prev"):
+                    startxref = xrefstream["/Prev"]
+                else:
+                    break
+            else:
+                # bad xref character at startxref.  Let's see if we can find
+                # the xref table nearby, as we've observed this error with an
+                # off-by-one before.
+                stream.seek(-11, 1)  # one byte was read above, so this lands 10 bytes before startxref
+                tmp = stream.read(20)
+                xref_loc = tmp.find("xref")
+                if xref_loc != -1:
+                    startxref -= (10 - xref_loc)  # tmp[0] sits at startxref - 10
+                    continue
+                else:
+                    # no xref table found at specified location
+                    assert False
+                    break
+
+    def readNextEndLine(self, stream):
+        line = ""  # the line is collected while walking backwards through the stream
+        while True:
+            x = stream.read(1)
+            stream.seek(-2, 1)  # undo the read and step one more byte towards the start
+            if x == '\n' or x == '\r':
+                while x == '\n' or x == '\r':  # swallow the whole run of end-of-line bytes
+                    x = stream.read(1)
+                    stream.seek(-2, 1)
+                stream.seek(1, 1)  # back onto the non-EOL byte just read, so the next call sees it
+                break
+            else:
+                line = x + line  # reading backwards, hence prepend
+        return line
+
+    ##
+    # When using an encrypted / secured PDF file with the PDF Standard
+    # encryption handler, this function will allow the file to be decrypted.
+    # It checks the given password against the document's user password and
+    # owner password, and then stores the resulting decryption key if either
+    # password is correct.
+    # <p>
+    # It does not matter which password was matched.  Both passwords provide
+    # the correct decryption key that will allow the document to be used with
+    # this library.
+    # <p>
+    # Stability: Added in v1.8, will exist for all future v1.x releases.
+    #
+    # @return 0 if the password failed, 1 if the password matched the user
+    # password, and 2 if the password matched the owner password.
+    #
+    # @exception NotImplementedError Document uses an unsupported encryption
+    # method.
+    def decrypt(self, password):
+        self._override_encryption = True  # lets getObject() return objects without decrypting them
+        try:
+            return self._decrypt(password)
+        finally:
+            self._override_encryption = False  # restored even when _decrypt raises
+
+    def _decrypt(self, password):
+        encrypt = self.safeGetObject(self.trailer['/Encrypt'])
+        if encrypt['/Filter'] != '/Standard':
+            raise NotImplementedError, "only Standard PDF encryption handler is available"
+        if not (encrypt['/V'] in (1, 2)):
+            raise NotImplementedError, "only algorithm code 1 and 2 are supported"
+        user_password, key = self._authenticateUserPassword(password)
+        if user_password:
+            self._decryption_key = key
+            return 1
+        else:
+            rev = self.safeGetObject(encrypt['/R'])
+            if rev == 2:
+                keylen = 5
+            else:
+                keylen = self.safeGetObject(encrypt['/Length']) / 8
+            key = _alg33_1(password, rev, keylen)
+            real_O = self.safeGetObject(encrypt["/O"])
+            if rev == 2:
+                userpass = utils.RC4_encrypt(key, real_O)
+            else:
+                val = real_O
+                for i in range(19, -1, -1):
+                    new_key = ''
+                    for l in range(len(key)):
+                        new_key += chr(ord(key[l]) ^ i)
+                    val = utils.RC4_encrypt(new_key, val)
+                userpass = val
+            owner_password, key = self._authenticateUserPassword(userpass)
+            if owner_password:
+                self._decryption_key = key
+                return 2
+        return 0
+
+    def _authenticateUserPassword(self, password):
+        encrypt = self.safeGetObject(self.trailer['/Encrypt'])
+        rev = self.safeGetObject(encrypt['/R'])
+        owner_entry = self.safeGetObject(encrypt['/O'])
+        p_entry = self.safeGetObject(encrypt['/P'])
+        id_entry = self.safeGetObject(self.trailer['/ID'])
+        id1_entry = self.safeGetObject(id_entry[0])
+        if rev == 2:
+            U, key = _alg34(password, owner_entry, p_entry, id1_entry)
+        elif rev >= 3:
+            U, key = _alg35(password, rev,
+                    self.safeGetObject(encrypt["/Length"]) / 8, owner_entry,
+                    p_entry, id1_entry,
+                    self.safeGetObject(encrypt.get("/EncryptMetadata", False)))
+        real_U = self.safeGetObject(encrypt['/U'])
+        return U == real_U, key
+
+    def getIsEncrypted(self):
+        return self.trailer.has_key("/Encrypt")
+
+    ##
+    # Read-only boolean property showing whether this PDF file is encrypted.
+    # Note that this property, if true, will remain true even after the {@link
+    # #PdfFileReader.decrypt decrypt} function is called.
+    isEncrypted = property(lambda self: self.getIsEncrypted(), None, None)
+
+
+def getRectangle(self, name, defaults):
+    retval = self.get(name)
+    if isinstance(retval, RectangleObject):
+        return retval  # already converted on an earlier access
+    if retval == None:
+        for d in defaults:  # fall back to the first default key that has a value
+            retval = self.get(d)
+            if retval != None:
+                break
+    if isinstance(retval, IndirectObject):
+        retval = self.pdf.getObject(retval)
+    retval = RectangleObject(retval)  # NOTE(review): retval is still None here if no key matched
+    setRectangle(self, name, retval)  # store the converted value back under `name`
+    return retval
+
+def setRectangle(self, name, value):
+    if not isinstance(name, NameObject):
+        name = NameObject(name)  # the key is always stored as a NameObject
+    self[name] = value
+
+def deleteRectangle(self, name):
+    del self[name]  # removes the entry stored under `name`
+
+def createRectangleAccessor(name, fallback):
+    return \
+        property(
+            lambda self: getRectangle(self, name, fallback),  # getter; `fallback` lists the default keys
+            lambda self, value: setRectangle(self, name, value),  # setter
+            lambda self: deleteRectangle(self, name)  # deleter
+            )
+
+##
+# This class represents a single page within a PDF file.  Typically this object
+# will be created by accessing the {@link #PdfFileReader.getPage getPage}
+# function of the {@link #PdfFileReader PdfFileReader} class.
+class PageObject(DictionaryObject):
+    def __init__(self, pdf):
+        DictionaryObject.__init__(self)
+        self.pdf = pdf  # owning document; used by getRectangle() to resolve indirect references
+
+    ##
+    # Rotates a page clockwise by increments of 90 degrees.
+    # <p>
+    # Stability: Added in v1.1, will exist for all future v1.x releases.
+    # @param angle Angle to rotate the page.  Must be an increment of 90 deg.
+    def rotateClockwise(self, angle):
+        assert angle % 90 == 0  # only multiples of 90 degrees are accepted
+        self._rotate(angle)
+        return self  # returns the page itself so calls can be chained
+
+    ##
+    # Rotates a page counter-clockwise by increments of 90 degrees.
+    # <p>
+    # Stability: Added in v1.1, will exist for all future v1.x releases.
+    # @param angle Angle to rotate the page.  Must be an increment of 90 deg.
+    def rotateCounterClockwise(self, angle):
+        assert angle % 90 == 0  # only multiples of 90 degrees are accepted
+        self._rotate(-angle)  # counter-clockwise is a negative clockwise rotation
+        return self  # returns the page itself so calls can be chained
+
+    def _rotate(self, angle):
+        currentAngle = self.get("/Rotate", 0)  # a page without /Rotate counts as 0 degrees
+        self[NameObject("/Rotate")] = NumberObject(currentAngle + angle)  # the sum is stored as-is
+
+    def _mergeResources(res1, res2, resource):
+        newRes = DictionaryObject()  # merged entries for the `resource` category
+        newRes.update(res1.get(resource, DictionaryObject()).getObject())
+        page2Res = res2.get(resource, DictionaryObject()).getObject()
+        renameRes = {}  # old name -> new name for the entries of res2 that had to be renamed
+        for key in page2Res.keys():
+            if newRes.has_key(key) and newRes[key] != page2Res[key]:
+                newname = NameObject(key + "renamed")  # same name, different value: rename res2's entry
+                renameRes[key] = newname
+                newRes[newname] = page2Res[key]
+            elif not newRes.has_key(key):
+                newRes[key] = page2Res[key]
+        return newRes, renameRes
+    _mergeResources = staticmethod(_mergeResources)
+
+    def _contentStreamRename(stream, rename, pdf):
+        if not rename:
+            return stream
+        stream = ContentStream(stream, pdf)
+        for operands,operator in stream.operations:
+            for i in range(len(operands)):
+                op = operands[i]
+                if isinstance(op, NameObject):
+                    operands[i] = rename.get(op, op)
+        return stream
+    _contentStreamRename = staticmethod(_contentStreamRename)
+
+    def _pushPopGS(contents, pdf):
+        # adds a graphics state "push" and "pop" to the beginning and end
+        # of a content stream.  This isolates it from changes such as 
+        # transformation matricies.
+        stream = ContentStream(contents, pdf)
+        stream.operations.insert(0, [[], "q"])
+        stream.operations.append([[], "Q"])
+        return stream
+    _pushPopGS = staticmethod(_pushPopGS)
+
+    ##
+    # Merges the content streams of two pages into one.  Resource references
+    # (i.e. fonts) are maintained from both pages.  The mediabox/cropbox/etc
+    # of this page are not altered.  The parameter page's content stream will
+    # be added to the end of this page's content stream, meaning that it will
+    # be drawn after, or "on top" of this page.
+    # <p>
+    # Stability: Added in v1.4, will exist for all future 1.x releases.
+    # @param page2 An instance of {@link #PageObject PageObject} to be merged
+    #              into this one.
+    def mergePage(self, page2):
+
+        # First we work on merging the resource dictionaries.  This allows us
+        # to find out what symbols in the content streams we might need to
+        # rename.
+
+        newResources = DictionaryObject()
+        rename = {}
+        originalResources = self["/Resources"].getObject()
+        page2Resources = page2["/Resources"].getObject()
+
+        for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading":
+            new, newrename = PageObject._mergeResources(originalResources, page2Resources, res)
+            if new:
+                newResources[NameObject(res)] = new
+                rename.update(newrename)
+
+        # Combine /ProcSet sets.
+        newResources[NameObject("/ProcSet")] = ArrayObject(
+            ImmutableSet(originalResources.get("/ProcSet", ArrayObject()).getObject()).union(
+                ImmutableSet(page2Resources.get("/ProcSet", ArrayObject()).getObject())
+            )
+        )
+
+        newContentArray = ArrayObject()
+
+        originalContent = self["/Contents"].getObject()
+        newContentArray.append(PageObject._pushPopGS(originalContent, self.pdf))
+
+        page2Content = page2['/Contents'].getObject()
+        page2Content = PageObject._contentStreamRename(page2Content, rename, self.pdf)
+        page2Content = PageObject._pushPopGS(page2Content, self.pdf)
+        newContentArray.append(page2Content)
+
+        self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf)
+        self[NameObject('/Resources')] = newResources
+
+    ##
+    # Compresses the size of this page by joining all content streams and
+    # applying a FlateDecode filter.
+    # <p>
+    # Stability: Added in v1.6, will exist for all future v1.x releases.
+    # However, it is possible that this function will perform no action if
+    # content stream compression becomes "automatic" for some reason.
+    def compressContentStreams(self):
+        content = self["/Contents"].getObject()
+        if not isinstance(content, ContentStream):
+            content = ContentStream(content, self.pdf)
+        self[NameObject("/Contents")] = content.flateEncode()
+
+    ##
+    # Locate all text drawing commands, in the order they are provided in the
+    # content stream, and extract the text.  This works well for some PDF
+    # files, but poorly for others, depending on the generator used.  This will
+    # be refined in the future.  Do not rely on the order of text coming out of
+    # this function, as it will change if this function is made more
+    # sophisticated.
+    # <p>
+    # Stability: Added in v1.7, will exist for all future v1.x releases.  May
+    # be overhauled to provide more ordered text in the future.
+    # @return a string object
+    def extractText(self):
+        text = ""
+        content = self["/Contents"].getObject()
+        if not isinstance(content, ContentStream):
+            content = ContentStream(content, self.pdf)
+        for operands,operator in content.operations:
+            if operator == "Tj":
+                text += operands[0]
+            elif operator == "T*":
+                text += "\n"
+            elif operator == "'":
+                text += "\n"
+                text += operands[0]
+            elif operator == "\"":
+                text += "\n"
+                text += operands[2]
+            elif operator == "TJ":
+                for i in operands[0]:
+                    if isinstance(i, StringObject):
+                        text += i
+        return text
+
+    ##
+    # A rectangle (RectangleObject), expressed in default user space units,
+    # defining the boundaries of the physical medium on which the page is
+    # intended to be displayed or printed.
+    # <p>
+    # Stability: Added in v1.4, will exist for all future v1.x releases.
+    mediaBox = createRectangleAccessor("/MediaBox", ())
+
+    ##
+    # A rectangle (RectangleObject), expressed in default user space units,
+    # defining the visible region of default user space.  When the page is
+    # displayed or printed, its contents are to be clipped (cropped) to this
+    # rectangle and then imposed on the output medium in some
+    # implementation-defined manner.  Default value: same as MediaBox.
+    # <p>
+    # Stability: Added in v1.4, will exist for all future v1.x releases.
+    cropBox = createRectangleAccessor("/CropBox", ("/CropBox",))
+
+    ##
+    # A rectangle (RectangleObject), expressed in default user space units,
+    # defining the region to which the contents of the page should be clipped
+    # when output in a production enviroment.
+    # <p>
+    # Stability: Added in v1.4, will exist for all future v1.x releases.
+    bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox"))
+
+    ##
+    # A rectangle (RectangleObject), expressed in default user space units,
+    # defining the intended dimensions of the finished page after trimming.
+    # <p>
+    # Stability: Added in v1.4, will exist for all future v1.x releases.
+    trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox"))
+
+    ##
+    # A rectangle (RectangleObject), expressed in default user space units,
+    # defining the extent of the page's meaningful content as intended by the
+    # page's creator.
+    # <p>
+    # Stability: Added in v1.4, will exist for all future v1.x releases.
+    artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox"))
+
+
+class ContentStream(DecodedStreamObject):
+    # Parsed page content: self.operations is a list of (operands, operator)
+    # entries; an inline image is kept as (settings/data dict, "INLINE IMAGE").
+    def __init__(self, stream, pdf):
+        self.pdf = pdf
+        self.operations = []
+        # stream may be a StreamObject or an ArrayObject containing
+        # multiple StreamObjects to be cat'd together.
+        stream = stream.getObject()
+        if isinstance(stream, ArrayObject):
+            data = "".join([s.getObject().getData() for s in stream])
+            stream = StringIO(data)
+        else:
+            stream = StringIO(stream.getData())
+        self.__parseContentStream(stream)
+
+    def __parseContentStream(self, stream):
+        # Tokenise the stream from its start, appending to self.operations.
+        stream.seek(0, 0)
+        operands = []
+        while True:
+            peek = readNonWhitespace(stream)
+            if peek == '':
+                break
+            stream.seek(-1, 1)
+            if peek.isalpha() or peek == "'" or peek == "\"":
+                # NOTE(review): maxchars=2 would split 3-letter operators (BDC) -- confirm
+                operator = readUntilWhitespace(stream, maxchars=2)
+                if operator == "BI":
+                    # begin inline image - needs its own parsing mechanism
+                    assert operands == []
+                    ii = self._readInlineImage(stream)
+                    self.operations.append((ii, "INLINE IMAGE"))
+                else:
+                    self.operations.append((operands, operator))
+                    operands = []
+            else:
+                operands.append(readObject(stream, None))
+
+    def _readInlineImage(self, stream):
+        # Called just after "BI": read the settings dictionary, then the data.
+        settings = DictionaryObject()
+        while True:
+            tok = readNonWhitespace(stream)
+            if not tok:
+                raise ValueError("Unexpected end of stream in inline image")
+            stream.seek(-1, 1)
+            if tok == "I":  # "ID" - begin of image data
+                break
+            key = readObject(stream, self.pdf)
+            readNonWhitespace(stream)  # skip whitespace before the value
+            stream.seek(-1, 1)
+            value = readObject(stream, self.pdf)
+            settings[key] = value
+        tmp = stream.read(3)  # left at beginning of ID
+        assert tmp[:2] == "ID"
+        data = ""
+        while True:
+            tok = stream.read(1)
+            if not tok:  # EOF before "EI": stop instead of looping forever
+                raise ValueError("Inline image data is not terminated by EI")
+            if tok == "E":
+                nxt = stream.read(1)
+                if nxt == "I":
+                    break
+                if nxt:
+                    stream.seek(-1, 1)
+            data += tok
+        if readNonWhitespace(stream):  # at EOF nothing was read: do not step back
+            stream.seek(-1, 1)
+        return {"settings": settings, "data": data}
+
+    def _getData(self):  # serialise self.operations back to content-stream text
+        newdata = StringIO()
+        for operands,operator in self.operations:
+            if operator == "INLINE IMAGE":
+                newdata.write("BI")
+                dicttext = StringIO()
+                operands["settings"].writeToStream(dicttext, None)
+                newdata.write(dicttext.getvalue()[2:-2])
+                newdata.write("ID ")
+                newdata.write(operands["data"])
+                newdata.write("EI")
+            else:
+                for op in operands:
+                    op.writeToStream(newdata, None)
+                    newdata.write(" ")
+                newdata.write(operator)
+            newdata.write("\n")
+        return newdata.getvalue()
+
+    def _setData(self, value):
+        self.operations = []  # replace, rather than extend, the parsed operations
+        self.__parseContentStream(StringIO(value))
+
+    _data = property(_getData, _setData)
+
+
+##
+# Dictionary holding the basic document metadata provided in a PDF file.
+class DocumentInformation(DictionaryObject):
+    def __init__(self):
+        DictionaryObject.__init__(self)
+
+    ##
+    # The document's title, as a getter-only property.  Added in v1.6, will
+    # exist for all future v1.x releases.
+    # @return A string, or None if the title is not provided.
+    title = property(lambda self: self.get("/Title", None))
+
+    ##
+    # The document's author, as a getter-only property.  Added in v1.6, will
+    # exist for all future v1.x releases.
+    # @return A string, or None if the author is not provided.
+    author = property(lambda self: self.get("/Author", None))
+
+    ##
+    # The subject of the document, as a getter-only property.  Added in v1.6,
+    # will exist for all future v1.x releases.
+    # @return A string, or None if the subject is not provided.
+    subject = property(lambda self: self.get("/Subject", None))
+
+    ##
+    # The document's creator, as a getter-only property.  If the document was
+    # converted to PDF from another format, the name of the application (for
+    # example, OpenOffice) that created the original document from which it was
+    # converted.  Added in v1.6, will exist for all future v1.x releases.
+    # @return A string, or None if the creator is not provided.
+    creator = property(lambda self: self.get("/Creator", None))
+
+    ##
+    # The document's producer, as a getter-only property.  If the document
+    # was converted to PDF from another format, the name of the application
+    # (for example, OSX Quartz) that converted it to PDF.  Added in v1.6, will
+    # exist for all future v1.x releases.
+    # @return A string, or None if the producer is not provided.
+    producer = property(lambda self: self.get("/Producer", None))
+
+
+def convertToInt(d, size):
+    """Decode the big-endian byte string ``d`` into an integer.
+
+    ``size`` is the declared field width in bytes (at most 8).  The value is
+    left-padded with NUL bytes to 8 bytes before unpacking, so a 4-byte field
+    with its top bit set is returned as a positive number instead of being
+    sign-extended to a negative one.  Raises ValueError when ``size`` > 8.
+    """
+    if size > 8:
+        # raise explicitly: an ``assert`` would vanish under ``python -O``
+        raise ValueError("convertToInt: field size %d is too big" % size)
+    return struct.unpack(">q", ("\x00" * 8 + d)[-8:])[0]
+
+# 32-byte password padding string; ref: pdf1.8 spec section 3.5.2 algorithm 3.2
+_encryption_padding = ('\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56'
+        '\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c'
+        '\xa9\xfe\x64\x53\x69\x7a')
+
+def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True):
+    # Key derivation: MD5 over the password padded/cut to 32 bytes, the
+    # owner entry, p_entry packed as a little-endian 32-bit int and the
+    # id1 entry; the digest is truncated to keylen bytes.
+    import md5, struct
+    digest = md5.new((password + _encryption_padding)[:32])
+    for chunk in (owner_entry, struct.pack('<i', p_entry), id1_entry):
+        digest.update(chunk)
+    # NOTE(review): the spec ties this step to revision 4+ -- confirm rev >= 3
+    if rev >= 3 and not metadata_encrypt:
+        digest.update("\xff\xff\xff\xff")
+    md5_hash = digest.digest()
+    if rev >= 3:
+        for _ in range(50):
+            md5_hash = md5.new(md5_hash[:keylen]).digest()
+    return md5_hash[:keylen]
+
+def _alg33(owner_pwd, user_pwd, rev, keylen):
+    # RC4-encrypt the padded user password with the owner-password key.
+    key = _alg33_1(owner_pwd, rev, keylen)
+    val = utils.RC4_encrypt(key, (user_pwd + _encryption_padding)[:32])
+    if rev < 3:
+        return val
+    # revision 3+: 19 further RC4 passes, each keyed with key XOR pass number
+    for i in range(1, 20):
+        new_key = ''.join([chr(ord(c) ^ i) for c in key])
+        val = utils.RC4_encrypt(new_key, val)
+    return val
+
+def _alg33_1(password, rev, keylen):
+    # Derive an RC4 key from a password: MD5 of the password padded/cut to
+    # 32 bytes with _encryption_padding, truncated to keylen bytes on
+    # return.
+    import md5
+    md5_hash = md5.new((password + _encryption_padding)[:32]).digest()
+    if rev >= 3:
+        # revision 3+: feed the whole digest back through MD5 50 more times
+        for _ in range(50):
+            md5_hash = md5.new(md5_hash).digest()
+    return md5_hash[:keylen]
+
+def _alg34(password, owner_entry, p_entry, id1_entry):
+    # RC4 of the padding string under the revision-2, 5-byte _alg32 key.
+    key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry)
+    return utils.RC4_encrypt(key, _encryption_padding), key
+
+def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):
+    # RC4 over MD5(padding + id1 entry) with the _alg32 key, then 19 further
+    # passes keyed with key XOR pass number; 16 NUL bytes are appended.
+    # Returns (value, key).
+    # NOTE(review): metadata_encrypt is accepted but not forwarded to _alg32.
+    import md5
+    seed = md5.new(_encryption_padding)
+    seed.update(id1_entry)
+    key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
+    val = utils.RC4_encrypt(key, seed.digest())
+    for i in range(1, 20):
+        new_key = ''.join([chr(ord(c) ^ i) for c in key])
+        val = utils.RC4_encrypt(new_key, val)
+    return val + ('\x00' * 16), key
+
+#if __name__ == "__main__":
+#    output = PdfFileWriter()
+#
+#    input1 = PdfFileReader(file("test\\5000-s1-05e.pdf", "rb"))
+#    page1 = input1.getPage(0)
+#
+#    input2 = PdfFileReader(file("test\\PDFReference16.pdf", "rb"))
+#    page2 = input2.getPage(0)
+#    page3 = input2.getPage(1)
+#    page1.mergePage(page2)
+#    page1.mergePage(page3)
+#
+#    input3 = PdfFileReader(file("test\\cc-cc.pdf", "rb"))
+#    page1.mergePage(input3.getPage(0))
+#
+#    page1.compressContentStreams()
+#
+#    output.addPage(page1)
+#    output.write(file("test\\merge-test.pdf", "wb"))
+
+
diff --git a/src/libprs500/ebooks/pyPdf/utils.py b/src/libprs500/ebooks/pyPdf/utils.py
new file mode 100644
index 0000000000..d6769c248f
--- /dev/null
+++ b/src/libprs500/ebooks/pyPdf/utils.py
@@ -0,0 +1,94 @@
+# vim: sw=4:expandtab:foldmethod=marker
+#
+# Copyright (c) 2006, Mathieu Fenniak
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+"""
+Utility functions for PDF library.
+"""
+__author__ = "Mathieu Fenniak"
+__author_email__ = "mfenniak@pobox.com"
+
+def readUntilWhitespace(stream, maxchars=None):
+    # Gather chars until whitespace (consumed), end of stream, or maxchars.
+    chars = []
+    for tok in iter(lambda: stream.read(1), ""):
+        if tok.isspace():
+            break
+        chars.append(tok)
+        if len(chars) == maxchars:
+            break
+    return "".join(chars)
+
+def readNonWhitespace(stream):
+    tok = stream.read(1)  # first character, then skip \n, \r, space and tab
+    while tok in ('\n', '\r', ' ', '\t'):
+        tok = stream.read(1)
+    return tok
+
+class ConvertFunctionsToVirtualList(object):
+    # Read-only sequence facade over a length callback and an item callback.
+    def __init__(self, lengthFunction, getFunction):
+        self.lengthFunction = lengthFunction
+        self.getFunction = getFunction
+
+    def __len__(self):
+        return self.lengthFunction()
+
+    def __getitem__(self, index):
+        if not isinstance(index, int):
+            raise TypeError("sequence indices must be integers")
+        size = len(self)
+        if index < 0:
+            index += size  # negative indexes count back from the end
+        if not 0 <= index < size:
+            raise IndexError("sequence index out of range")
+        return self.getFunction(index)
+
+def RC4_encrypt(key, plaintext):
+    # RC4: key-schedule the table S, then XOR a keystream byte into each char.
+    S = list(range(256))
+    j = 0
+    for i in range(256):
+        j = (j + S[i] + ord(key[i % len(key)])) % 256
+        S[i], S[j] = S[j], S[i]
+    i = j = 0
+    out = []
+    for ch in plaintext:
+        i = (i + 1) % 256
+        j = (j + S[i]) % 256
+        S[i], S[j] = S[j], S[i]
+        out.append(chr(ord(ch) ^ S[(S[i] + S[j]) % 256]))
+    return "".join(out)
+
+if __name__ == "__main__":
+    # test RC4: the second call must turn the ciphertext back into plaintext
+    ciphertext = RC4_encrypt("Key", "Plaintext")
+    print(repr(ciphertext))
+    roundtrip = RC4_encrypt("Key", ciphertext)
+    print(repr(roundtrip))