Implement pure python solution for reading PDF metadata

2025-07-09 03:04:10 -04:00 · 2007-09-07 15:43:39 +00:00 · 2007-09-07 15:43:39 +00:00 · f7332494ae
commit f7332494ae
parent 76af4c11d0
6 changed files with 2059 additions and 62 deletions
--- a/src/libprs500/ebooks/metadata/pdf.py
+++ b/src/libprs500/ebooks/metadata/pdf.py
@@ -14,83 +14,41 @@
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''Read meta information from PDF files'''

-import sys, os, copy
+import sys, os

-from libprs500.ebooks.metadata import MetaInformation, get_parser
-from libprs500.ptempfile import PersistentTemporaryFile
+from libprs500.ebooks.metadata import MetaInformation
+from libprs500.ebooks.pyPdf import PdfFileReader

 def get_metadata(stream):
    """ Return metadata as a L{MetaInfo} object """
    if hasattr(stream, 'name'):
-        title = stream.name
+        title = os.path.splitext(os.path.basename(stream.name))[0]
    else:
        title = 'Unknown'
    mi = MetaInformation(title, 'Unknown')
-
    stream.seek(0)
-    pt = PersistentTemporaryFile('.pdf')
-    pt.write(stream.read())
-    pt.close()
-    return get_metadata_from_file(pt.name, mi)
-    
-def set_metadata(path, options):
-    try:
-        import podofo
-        doc = podofo.PdfDocument()
-        doc.Load(path)
-        info = doc.GetInfo()
-        if options.title:
-            info.SetTitle(options.title)
-        if options.authors:
-            info.SetAuthor(options.authors)
-        if options.category:
-            info.SetSubject(options.category)
-        pt = PersistentTemporaryFile('.pdf')
-        pt.close() 
-        doc.Write(pt.name)
-        stream = open(path, 'wb')
-        stream.write(open(pt.name, 'rb').read())
-        stream.close()
-    except ImportError:
-        return False
-    return True
-
-def get_metadata_from_file(path, default_mi=None):
-    if default_mi is None:
-        title = os.path.splitext(os.path.basename(path))[0]
-        mi = MetaInformation(title, 'Unknown')
-    else:
-        mi = copy.copy(default_mi)
-    try:
-        import podofo
-        doc = podofo.PdfDocument()
-        doc.Load(path)
-        info = doc.GetInfo()
-        if info.GetTitle():
-            mi.title = info.GetTitle()
-        if info.GetAuthor():
-            mi.authors = info.GetAuthor().split(',')
-        if info.GetSubject():
-            mi.category = info.GetSubject()
-    except ImportError:        
-        pass
-    finally:
-        return mi
-    
-
+    info = PdfFileReader(stream).getDocumentInfo()
+    if info is None:
+        # NOTE(review): pyPdf appears to return None when the file has no
+        # /Info dictionary -- confirm against pyPdf/pdf.py.  The guard is
+        # harmless otherwise and keeps the file-name based defaults.
+        return mi
+    if info.title:
+        # Prefer the title stored in the document over the file-name fallback
+        # computed above (the previous code re-assigned the fallback here).
+        mi.title = info.title
+    if info.author:
+        # Several authors may be separated by '&' and/or ','.  Surrounding
+        # whitespace is dropped so "A & B" yields ['A', 'B'].
+        authors = []
+        for group in info.author.split('&'):
+            for name in group.split(','):
+                name = name.strip()
+                if name:
+                    authors.append(name)
+        if authors:
+            mi.authors = authors
+        mi.author = info.author
+    if info.subject:
+        mi.category = info.subject
+    return mi
+        
+            
 def main(args=sys.argv):
-    parser = get_parser('pdf')
-    options, args = parser.parse_args(args)
    if len(args) != 2:
        print >>sys.stderr, 'No filename specified.'
        return 1
    
    path = os.path.abspath(os.path.expanduser(args[1]))
-    if not set_metadata(path, options):
-        print >>sys.stderr, 'You do not have the podofo python extension installed. Cannot read PDF files.'
-        return 1
-    
-    print get_metadata_from_file(path)
+    stream = open(path, 'rb')
+    try:
+        print get_metadata(stream)
+    finally:
+        # Close the handle explicitly instead of leaving it to the garbage
+        # collector.
+        stream.close()
    return 0

 if __name__ == '__main__':
--- a/src/libprs500/ebooks/pyPdf/__init__.py
+++ b/src/libprs500/ebooks/pyPdf/__init__.py
@@ -0,0 +1,2 @@
+from pdf import PdfFileReader, PdfFileWriter
+__all__ = ["pdf"]
--- a/src/libprs500/ebooks/pyPdf/filters.py
+++ b/src/libprs500/ebooks/pyPdf/filters.py
@@ -0,0 +1,239 @@
+# vim: sw=4:expandtab:foldmethod=marker
+#
+# Copyright (c) 2006, Mathieu Fenniak
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+"""
+Implementation of stream filters for PDF.
+"""
+__author__ = "Mathieu Fenniak"
+__author_email__ = "mfenniak@pobox.com"
+
+from generic import NameObject
+
+# Provide module-level decompress(data) / compress(data) helpers.  zlib is
+# used when it can be imported; otherwise the .NET DeflateStream class is
+# used instead.
+try:
+    import zlib
+    def decompress(data):
+        # Thin wrapper around zlib.decompress.
+        return zlib.decompress(data)
+    def compress(data):
+        # Thin wrapper around zlib.compress.
+        return zlib.compress(data)
+except ImportError:
+    # Unable to import zlib.  Attempt to use the System.IO.Compression
+    # library from the .NET framework. (IronPython only)
+    import System
+    from System import IO, Collections, Array
+    def _string_to_bytearr(buf):
+        # Copy a Python string into a new System.Byte array, one character
+        # (via ord) per element.
+        retval = Array.CreateInstance(System.Byte, len(buf))
+        for i in range(len(buf)):
+            retval[i] = ord(buf[i])
+        return retval
+    def _bytearr_to_string(bytes):
+        # Inverse of _string_to_bytearr: build a string with one chr() per
+        # array element.
+        retval = ""
+        for i in range(bytes.Length):
+            retval += chr(bytes[i])
+        return retval
+    def _read_bytes(stream):
+        # Drain `stream` in 2048-byte reads into a MemoryStream and return
+        # its contents as an array; a read of 0 bytes ends the loop.
+        ms = IO.MemoryStream()
+        buf = Array.CreateInstance(System.Byte, 2048)
+        while True:
+            bytes = stream.Read(buf, 0, buf.Length)
+            if bytes == 0:
+                break
+            else:
+                ms.Write(buf, 0, bytes)
+        retval = ms.ToArray()
+        ms.Close()
+        return retval
+    def decompress(data):
+        # Load `data` into a MemoryStream, rewind it, and read it back
+        # through a DeflateStream opened in Decompress mode.
+        # NOTE(review): DeflateStream presumably expects raw deflate data
+        # without the zlib header that zlib.decompress handles -- confirm.
+        bytes = _string_to_bytearr(data)
+        ms = IO.MemoryStream()
+        ms.Write(bytes, 0, bytes.Length)
+        ms.Position = 0  # fseek 0
+        gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Decompress)
+        bytes = _read_bytes(gz)
+        retval = _bytearr_to_string(bytes)
+        gz.Close()
+        return retval
+    def compress(data):
+        # Write `data` through a DeflateStream in Compress mode into a
+        # MemoryStream, then return the MemoryStream contents as a string.
+        # The DeflateStream is closed before the MemoryStream is read.
+        bytes = _string_to_bytearr(data)
+        ms = IO.MemoryStream()
+        gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Compress, True)
+        gz.Write(bytes, 0, bytes.Length)
+        gz.Close()
+        ms.Position = 0 # fseek 0
+        bytes = ms.ToArray()
+        retval = _bytearr_to_string(bytes)
+        ms.Close()
+        return retval
+
+
+class FlateDecode(object):
+    """Stream filter for deflate-compressed data (PDF ``/FlateDecode``)."""
+
+    def decode(data, decodeParms):
+        """Decompress ``data`` and undo the PNG row predictor, if one is set.
+
+        ``decodeParms`` is the stream's ``/DecodeParms`` mapping or a false
+        value.  ``/Predictor`` 1 (the default) means no prediction.  Values
+        of 10 and above select per-row PNG prediction: every row consists of
+        one filter-type byte followed by ``/Columns`` data bytes (``/Columns``
+        defaults to 1).  Filter types 0 (None), 1 (Sub, adds the byte to the
+        left) and 2 (Up, adds the byte above) are implemented.
+
+        Raises AssertionError for any other predictor or PNG filter type and
+        for data that is not a whole number of rows.  The checks are raised
+        explicitly rather than with ``assert`` so they survive ``python -O``.
+        """
+        data = decompress(data)
+        predictor = 1
+        if decodeParms:
+            predictor = decodeParms.get("/Predictor", 1)
+        # predictor 1 == no predictor
+        if predictor == 1:
+            return data
+        if predictor < 10:
+            raise AssertionError("Unsupported predictor: %r" % (predictor,))
+        columns = decodeParms.get("/Columns", 1)
+        # PNG prediction can vary from row to row
+        rowlength = columns + 1
+        if len(data) % rowlength != 0:
+            raise AssertionError(
+                "Predicted data is not a whole number of %d-byte rows"
+                % rowlength)
+        rows = []
+        prev_row = [0] * rowlength
+        for start in range(0, len(data), rowlength):
+            row = [ord(c) for c in data[start:start + rowlength]]
+            filterByte = row[0]
+            if filterByte == 0:
+                pass
+            elif filterByte == 1:
+                # Index 1 is the first data byte and has no left neighbour.
+                for i in range(2, rowlength):
+                    row[i] = (row[i] + row[i - 1]) % 256
+            elif filterByte == 2:
+                for i in range(1, rowlength):
+                    row[i] = (row[i] + prev_row[i]) % 256
+            else:
+                raise AssertionError(
+                    "Unsupported PNG filter type: %r" % (filterByte,))
+            prev_row = row
+            # Drop the filter byte; collect rows and join once at the end.
+            rows.append(''.join([chr(b) for b in row[1:]]))
+        return ''.join(rows)
+    decode = staticmethod(decode)
+
+    def encode(data):
+        """Return ``data`` compressed with the module's ``compress``."""
+        return compress(data)
+    encode = staticmethod(encode)
+
+class ASCIIHexDecode(object):
+    """Stream filter for hexadecimal text (PDF ``/ASCIIHexDecode``)."""
+
+    def decode(data, decodeParms=None):
+        """Decode pairs of hex digits into characters.
+
+        Whitespace is ignored and ``>`` marks the end of the data; a missing
+        ``>`` is treated as end of data instead of raising IndexError.  An
+        odd trailing digit is completed with ``0``, as the PDF filter
+        definition requires.  ``decodeParms`` is accepted for interface
+        compatibility and not used.
+
+        Raises ValueError (from ``int``) for characters that are not
+        hexadecimal digits.
+        """
+        digits = []
+        for c in data:
+            if c == ">":
+                break
+            if c.isspace():
+                continue
+            digits.append(c)
+        if len(digits) % 2:
+            digits.append("0")
+        hexdigits = "".join(digits)
+        return "".join([chr(int(hexdigits[i:i + 2], 16))
+                        for i in range(0, len(hexdigits), 2)])
+    decode = staticmethod(decode)
+
+class ASCII85Decode(object):
+    """Stream filter for ASCII base-85 text (PDF ``/ASCII85Decode``)."""
+
+    def decode(data, decodeParms=None):
+        """Decode base-85 text into the characters it represents.
+
+        Whitespace is ignored, a leading ``<~`` is skipped and ``~>`` ends
+        the data (a missing ``~>`` is treated as end of data).  ``z`` stands
+        for four zero bytes.  A final group of 2-4 characters yields 1-3
+        bytes.  ``decodeParms`` is accepted for interface compatibility and
+        not used.
+
+        Raises AssertionError for characters outside ``!``..``u``, a ``z``
+        inside a group, a final group of a single character, or a group
+        whose value does not fit in 32 bits.  The checks are raised
+        explicitly rather than with ``assert`` so they survive ``python -O``.
+        """
+        text = ''.join([c for c in data if not c.isspace()])
+        if text.startswith("<~"):
+            text = text[2:]
+        eod = text.find("~>")
+        if eod != -1:
+            text = text[:eod]
+        chunks = []
+        group = []
+        for c in text:
+            if c == 'z':
+                if group:
+                    raise AssertionError("'z' found inside a 5-character group")
+                chunks.append('\x00\x00\x00\x00')
+                continue
+            digit = ord(c) - 33
+            if digit < 0 or digit >= 85:
+                raise AssertionError("Invalid ASCII85 character: %r" % (c,))
+            group.append(digit)
+            if len(group) == 5:
+                chunks.append(ASCII85Decode._unpack(group))
+                group = []
+        if group:
+            # cannot have a final group of just 1 char
+            if len(group) == 1:
+                raise AssertionError("Final ASCII85 group has a single character")
+            keep = len(group) - 1
+            # Pad with the highest digit ('u'), then keep only the bytes
+            # that were really encoded.
+            group += [84] * (5 - len(group))
+            chunks.append(ASCII85Decode._unpack(group)[:keep])
+        return ''.join(chunks)
+    decode = staticmethod(decode)
+
+    def _unpack(group):
+        """Convert five base-85 digits into four characters, high byte first."""
+        b = 0
+        for digit in group:
+            b = b * 85 + digit
+        # 2**32 - 1 itself (encoded as "s8W-!") is a valid value.
+        if b > 0xFFFFFFFF:
+            raise AssertionError("ASCII85 group exceeds 32 bits")
+        return (chr(b >> 24) + chr((b >> 16) % 256) +
+                chr((b >> 8) % 256) + chr(b % 256))
+    _unpack = staticmethod(_unpack)
+
+def decodeStreamData(stream):
+    """Run the stream's raw ``_data`` through each filter named in ``/Filter``.
+
+    ``/Filter`` may be a single name or a sequence of names; filters are
+    applied in the order listed.  Only ``/FlateDecode`` receives the stream's
+    ``/DecodeParms``.  An unknown filter name trips an assertion.
+    """
+    chain = stream.get("/Filter", ())
+    is_single = len(chain) and not isinstance(chain[0], NameObject)
+    if is_single:
+        # a lone filter name, not a sequence of them
+        chain = (chain,)
+    decoded = stream._data
+    for filterName in chain:
+        if filterName == "/FlateDecode":
+            decoded = FlateDecode.decode(decoded, stream.get("/DecodeParms"))
+            continue
+        if filterName == "/ASCIIHexDecode":
+            decoded = ASCIIHexDecode.decode(decoded)
+            continue
+        if filterName == "/ASCII85Decode":
+            decoded = ASCII85Decode.decode(decoded)
+            continue
+        # unsupported filter
+        assert False
+    return decoded
+
+# Self-test, run only when this module is executed as a script.
+if __name__ == "__main__":
+    # Hex digits split across lines must still decode to "abc".
+    assert "abc" == ASCIIHexDecode.decode('61\n626\n3>')
+
+    # Multi-line base-85 sample; it must decode to ascii85_originalText.
+    ascii85Test = """
+     <~9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
+     O<DJ+*.@<*K0@<6L(Df-\\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
+     i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
+     l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
+     >uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
+    """
+    ascii85_originalText="Man is distinguished, not only by his reason, but by this singular passion from other animals, which is a lust of the mind, that by a perseverance of delight in the continued and indefatigable generation of knowledge, exceeds the short vehemence of any carnal pleasure."
+    assert ASCII85Decode.decode(ascii85Test) == ascii85_originalText
--- a/src/libprs500/ebooks/pyPdf/generic.py
+++ b/src/libprs500/ebooks/pyPdf/generic.py
@@ -0,0 +1,542 @@
+# vim: sw=4:expandtab:foldmethod=marker
+#
+# Copyright (c) 2006, Mathieu Fenniak
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+"""
+Implementation of generic PDF objects (dictionary, number, string, and so on)
+"""
+__author__ = "Mathieu Fenniak"
+__author_email__ = "mfenniak@pobox.com"
+
+import re
+from utils import readNonWhitespace, RC4_encrypt
+import filters
+
+def readObject(stream, pdf):
+    """Read one PDF object at the current position of ``stream``.
+
+    The first character (and, for ``<`` and digits, a short look-ahead)
+    selects the object type; the stream is rewound before the matching
+    ``readFromStream`` method parses the object.  ``pdf`` is handed on to
+    the container and indirect-reference readers.  A ``%`` comment is
+    skipped and the object following it is returned.
+
+    Raises ValueError when the stream ends inside a comment (this used to
+    loop forever).
+    """
+    tok = stream.read(1)
+    stream.seek(-1, 1) # reset to start
+    if tok == 't' or tok == 'f':
+        # boolean object
+        return BooleanObject.readFromStream(stream)
+    elif tok == '(':
+        # string object
+        return StringObject.readFromStream(stream)
+    elif tok == '/':
+        # name object
+        return NameObject.readFromStream(stream)
+    elif tok == '[':
+        # array object
+        return ArrayObject.readFromStream(stream, pdf)
+    elif tok == 'n':
+        # null object
+        return NullObject.readFromStream(stream)
+    elif tok == '<':
+        # hexadecimal string OR dictionary
+        peek = stream.read(2)
+        stream.seek(-2, 1) # reset to start
+        if peek == '<<':
+            return DictionaryObject.readFromStream(stream, pdf)
+        else:
+            return StringObject.readHexStringFromStream(stream)
+    elif tok == '%':
+        # comment: runs to the end of the line
+        while tok not in ('\r', '\n'):
+            tok = stream.read(1)
+            if not tok:
+                raise ValueError("Unexpected end of stream inside a comment")
+        tok = readNonWhitespace(stream)
+        stream.seek(-1, 1)
+        return readObject(stream, pdf)
+    else:
+        # number object OR indirect reference
+        if tok == '+' or tok == '-':
+            # number
+            return NumberObject.readFromStream(stream)
+        peek = stream.read(20)
+        stream.seek(-len(peek), 1) # reset to start
+        # "<id> <generation> R" followed by a non-letter is a reference
+        if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) is not None:
+            return IndirectObject.readFromStream(stream, pdf)
+        else:
+            return NumberObject.readFromStream(stream)
+
+class PdfObject(object):
+    """Base class shared by the PDF object types defined in this module."""
+
+    def getObject(self):
+        """Return the object itself; direct objects need no resolution."""
+        resolved = self
+        return resolved
+
+
+class NullObject(PdfObject):
+    """The PDF ``null`` object."""
+
+    def writeToStream(self, stream, encryption_key):
+        """Write the keyword ``null``; ``encryption_key`` is not used."""
+        stream.write("null")
+
+    def readFromStream(stream):
+        """Consume the keyword ``null`` and return a NullObject.
+
+        Raises AssertionError if the next four characters are anything else.
+        """
+        # Read outside of any assert: under ``python -O`` an assert wrapped
+        # around the read would be removed and the keyword never consumed.
+        word = stream.read(4)
+        if word != "null":
+            raise AssertionError("Expected 'null', found %r" % (word,))
+        return NullObject()
+    readFromStream = staticmethod(readFromStream)
+
+
+class BooleanObject(PdfObject):
+    """PDF boolean object wrapping a Python truth value in ``value``."""
+
+    def __init__(self, value):
+        self.value = value
+
+    def writeToStream(self, stream, encryption_key):
+        """Write ``true`` or ``false``; ``encryption_key`` is not used."""
+        if self.value:
+            stream.write("true")
+        else:
+            stream.write("false")
+
+    def readFromStream(stream):
+        """Consume the keyword ``true`` or ``false`` and return the object.
+
+        Raises AssertionError for any other input.  The error is raised
+        explicitly so that ``python -O`` cannot turn it into a silent
+        ``None`` return.
+        """
+        word = stream.read(4)
+        if word == "true":
+            return BooleanObject(True)
+        # "false" is five characters long: check the final one as well.
+        if word == "fals" and stream.read(1) == "e":
+            return BooleanObject(False)
+        raise AssertionError(
+            "Expected 'true' or 'false', found text starting with %r" % (word,))
+    readFromStream = staticmethod(readFromStream)
+
+
+class ArrayObject(list, PdfObject):
+    """PDF array object: a list of PDF objects."""
+
+    def writeToStream(self, stream, encryption_key):
+        """Write ``[ item item ... ]``, handing ``encryption_key`` to each item."""
+        stream.write("[")
+        for data in self:
+            stream.write(" ")
+            data.writeToStream(stream, encryption_key)
+        stream.write(" ]")
+
+    def readFromStream(stream, pdf):
+        """Parse ``[ ... ]`` at the current position and return an ArrayObject.
+
+        Raises AssertionError if the stream is not positioned at ``[`` and
+        ValueError if the stream ends before the closing ``]``.
+        """
+        arr = ArrayObject()
+        # Read outside of any assert so the bracket is consumed under -O too.
+        opener = stream.read(1)
+        if opener != "[":
+            raise AssertionError("Expected '[', found %r" % (opener,))
+        while True:
+            # skip leading whitespace
+            tok = stream.read(1)
+            while tok.isspace():
+                tok = stream.read(1)
+            if not tok:
+                raise ValueError("Unexpected end of stream inside an array")
+            # check for array ending
+            if tok == "]":
+                break
+            # step back onto the first character of the element
+            stream.seek(-1, 1)
+            # read and append obj
+            arr.append(readObject(stream, pdf))
+        return arr
+    readFromStream = staticmethod(readFromStream)
+
+
+class IndirectObject(PdfObject):
+    """Reference (``<id> <generation> R``) to an object held by ``pdf``."""
+
+    def __init__(self, idnum, generation, pdf):
+        self.idnum = idnum
+        self.generation = generation
+        self.pdf = pdf
+
+    def getObject(self):
+        """Ask ``pdf`` for the referenced object and resolve it fully."""
+        return self.pdf.getObject(self).getObject()
+
+    def __repr__(self):
+        return "IndirectObject(%r, %r)" % (self.idnum, self.generation)
+
+    def __eq__(self, other):
+        # Equal only to another reference with the same numbers into the
+        # very same pdf object.
+        return (
+            isinstance(other, IndirectObject) and
+            self.idnum == other.idnum and
+            self.generation == other.generation and
+            self.pdf is other.pdf
+            )
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def writeToStream(self, stream, encryption_key):
+        """Write ``<id> <generation> R``; ``encryption_key`` is not used."""
+        stream.write("%s %s R" % (self.idnum, self.generation))
+
+    def readFromStream(stream, pdf):
+        """Parse ``<id> <generation> R`` and return an IndirectObject.
+
+        Raises ValueError if the stream ends inside either number (this used
+        to loop forever) and AssertionError if the numbers are not followed
+        by ``R``.
+        """
+        fields = []
+        for label in ("object number", "generation number"):
+            digits = ""
+            while True:
+                tok = stream.read(1)
+                if not tok:
+                    raise ValueError(
+                        "Unexpected end of stream while reading the %s of "
+                        "an indirect reference" % label)
+                if tok.isspace():
+                    break
+                digits += tok
+            fields.append(digits)
+        r = stream.read(1)
+        if r != "R":
+            raise AssertionError(
+                "Expected 'R' after %r %r, found %r"
+                % (fields[0], fields[1], r))
+        return IndirectObject(int(fields[0]), int(fields[1]), pdf)
+    readFromStream = staticmethod(readFromStream)
+
+
+class FloatObject(float, PdfObject):
+    """PDF real-number object."""
+
+    def writeToStream(self, stream, encryption_key):
+        """Write the ``repr`` of the value; ``encryption_key`` is not used."""
+        text = repr(self)
+        stream.write(text)
+
+
+class NumberObject(int, PdfObject):
+    """PDF integer object."""
+
+    # No __init__ override: int takes its value in __new__.  Forwarding the
+    # value to int.__init__ ends up in object.__init__, which newer Python
+    # versions reject when extra arguments are passed.
+
+    def writeToStream(self, stream, encryption_key):
+        """Write the ``repr`` of the value; ``encryption_key`` is not used."""
+        stream.write(repr(self))
+
+    def readFromStream(stream):
+        """Read a run of sign, digit and ``.`` characters as a number.
+
+        Returns a FloatObject when the text contains a ``.`` and a
+        NumberObject otherwise.  The character that ends the number is left
+        unread.
+        """
+        name = ""
+        while True:
+            tok = stream.read(1)
+            if tok != '+' and tok != '-' and tok != '.' and not tok.isdigit():
+                # Only step back over a real character; at end of stream
+                # nothing was read and seeking would rewind onto the last
+                # digit.
+                if tok:
+                    stream.seek(-1, 1)
+                break
+            name += tok
+        if name.find(".") != -1:
+            return FloatObject(name)
+        else:
+            return NumberObject(name)
+    readFromStream = staticmethod(readFromStream)
+
+
+class StringObject(str, PdfObject):
+    """PDF string object; the value is the decoded string content."""
+
+    # Single-character escapes that stand for another character.  Any other
+    # escaped character stands for itself, which covers "(", ")" and "\\".
+    _escapes = {"n": "\n", "r": "\r", "t": "\t", "b": "\b", "f": "\f"}
+
+    def writeToStream(self, stream, encryption_key):
+        """Write the value as a literal ``( ... )`` string.
+
+        The value is RC4-encrypted first when ``encryption_key`` is given.
+        Every character that is neither alphanumeric nor whitespace is
+        written as a three-digit octal escape.
+        """
+        string = self
+        if encryption_key:
+            string = RC4_encrypt(encryption_key, string)
+        stream.write("(")
+        for c in string:
+            if not c.isalnum() and not c.isspace():
+                stream.write("\\%03o" % ord(c))
+            else:
+                stream.write(c)
+        stream.write(")")
+
+    def readHexStringFromStream(stream):
+        """Parse a ``<hex digits>`` string and return a StringObject.
+
+        Whitespace between digits is skipped and an odd trailing digit is
+        completed with ``0``.  Raises ValueError if the stream ends before
+        the closing ``>`` (this used to loop forever).
+        """
+        stream.read(1)
+        txt = ""
+        x = ""
+        while True:
+            tok = readNonWhitespace(stream)
+            if not tok:
+                raise ValueError(
+                    "Unexpected end of stream inside a hexadecimal string")
+            if tok == ">":
+                break
+            x += tok
+            if len(x) == 2:
+                txt += chr(int(x, base=16))
+                x = ""
+        if len(x) == 1:
+            x += "0"
+        if len(x) == 2:
+            txt += chr(int(x, base=16))
+        return StringObject(txt)
+    readHexStringFromStream = staticmethod(readHexStringFromStream)
+
+    def readFromStream(stream):
+        """Parse a literal ``( ... )`` string and return a StringObject.
+
+        Balanced unescaped parentheses are kept as part of the text.
+        Backslash escapes are resolved: the named escapes in ``_escapes``
+        (``\\b`` used to be left as the letter ``b``), octal escapes of one
+        to three digits, and a backslash before an end-of-line, which joins
+        the two lines.  A backslash before any other character is dropped.
+
+        Raises ValueError if the stream ends before the closing parenthesis
+        (this used to loop forever).
+        """
+        stream.read(1)  # the opening "("
+        parens = 1
+        chunks = []
+        while True:
+            tok = stream.read(1)
+            if not tok:
+                raise ValueError(
+                    "Unexpected end of stream inside a literal string")
+            if tok == "(":
+                parens += 1
+            elif tok == ")":
+                parens -= 1
+                if parens == 0:
+                    break
+            elif tok == "\\":
+                tok = stream.read(1)
+                if tok in StringObject._escapes:
+                    tok = StringObject._escapes[tok]
+                elif tok and tok in "01234567":
+                    # Up to two more octal digits may follow.
+                    digits = tok
+                    while len(digits) < 3:
+                        nxt = stream.read(1)
+                        if nxt and nxt in "01234567":
+                            digits += nxt
+                        else:
+                            if nxt:
+                                stream.seek(-1, 1)
+                            break
+                    # Values above 255 wrap around, keeping chr() in range.
+                    tok = chr(int(digits, 8) % 256)
+                elif tok == "\r":
+                    # Line continuation; swallow the "\n" of a "\r\n" pair.
+                    nxt = stream.read(1)
+                    if nxt and nxt != "\n":
+                        stream.seek(-1, 1)
+                    tok = ""
+                elif tok == "\n":
+                    tok = ""
+            chunks.append(tok)
+        return StringObject("".join(chunks))
+    readFromStream = staticmethod(readFromStream)
+
+
+class NameObject(str, PdfObject):
+    """PDF name object; the value includes the leading ``/``."""
+
+    delimiterCharacters = "(", ")", "<", ">", "[", "]", "{", "}", "/", "%"
+
+    # No __init__ override: str takes its value in __new__.  Forwarding the
+    # value to str.__init__ ends up in object.__init__, which newer Python
+    # versions reject when extra arguments are passed.
+
+    def writeToStream(self, stream, encryption_key):
+        """Write the name as-is; ``encryption_key`` is not used."""
+        stream.write(self)
+
+    def readFromStream(stream):
+        """Parse ``/Name`` and return a NameObject.
+
+        The name ends at whitespace, at a delimiter character (left unread)
+        or at the end of the stream (this used to loop forever).  Raises
+        AssertionError if the stream is not positioned at ``/``.
+        """
+        name = stream.read(1)
+        if name != "/":
+            raise AssertionError("Expected '/', found %r" % (name,))
+        while True:
+            tok = stream.read(1)
+            if not tok:
+                # end of stream: the name is complete, nothing to un-read
+                break
+            if tok.isspace() or tok in NameObject.delimiterCharacters:
+                stream.seek(-1, 1)
+                break
+            name += tok
+        return NameObject(name)
+    readFromStream = staticmethod(readFromStream)
+
+
+class DictionaryObject(dict, PdfObject):
+    """PDF dictionary object: a dict of PDF objects keyed by name objects."""
+
+    def __init__(self):
+        pass
+
+    def writeToStream(self, stream, encryption_key):
+        """Write ``<< key value ... >>``, one entry per line."""
+        stream.write("<<\n")
+        for key, value in self.items():
+            key.writeToStream(stream, encryption_key)
+            stream.write(" ")
+            value.writeToStream(stream, encryption_key)
+            stream.write("\n")
+        stream.write(">>")
+
+    def readFromStream(stream, pdf):
+        """Parse ``<< ... >>`` and, if present, the stream data that follows.
+
+        Returns a DictionaryObject, or the result of
+        ``StreamObject.initializeFromDictionary`` when the dictionary is
+        followed by a ``stream`` keyword.
+
+        Raises AssertionError for a missing ``<<``, a repeated key, a
+        ``stream`` keyword that is not followed by an end-of-line, or a
+        stream without ``/Length``; ValueError if the input ends inside the
+        dictionary or no ``endstream`` marker is found.  All checks are
+        raised explicitly so they survive ``python -O``.
+        """
+        opener = stream.read(2)
+        if opener != "<<":
+            raise AssertionError("Expected '<<', found %r" % (opener,))
+        data = {}
+        while True:
+            tok = readNonWhitespace(stream)
+            if not tok:
+                raise ValueError("Unexpected end of stream inside a dictionary")
+            if tok == ">":
+                stream.read(1)
+                break
+            stream.seek(-1, 1)
+            key = readObject(stream, pdf)
+            tok = readNonWhitespace(stream)
+            stream.seek(-1, 1)
+            value = readObject(stream, pdf)
+            if key in data:
+                # multiple definitions of key not permitted
+                raise AssertionError(
+                    "Multiple definitions of dictionary key %r" % (key,))
+            data[key] = value
+        pos = stream.tell()
+        s = readNonWhitespace(stream)
+        if s == 's' and stream.read(5) == 'tream':
+            eol = stream.read(1)
+            # odd PDF file output has spaces after 'stream' keyword but before EOL.
+            # patch provided by Danial Sandler
+            while eol == ' ':
+                eol = stream.read(1)
+            if eol not in ("\n", "\r"):
+                raise AssertionError(
+                    "Expected end-of-line after 'stream', found %r" % (eol,))
+            if eol == "\r":
+                # read \n after
+                stream.read(1)
+            # this is a stream object, not a dictionary
+            if "/Length" not in data:
+                raise AssertionError("Stream dictionary has no /Length entry")
+            length = data["/Length"]
+            if isinstance(length, IndirectObject):
+                # Resolving the reference moves the file position; restore it.
+                t = stream.tell()
+                length = pdf.getObject(length)
+                stream.seek(t, 0)
+            data["__streamdata__"] = stream.read(length)
+            e = readNonWhitespace(stream)
+            ndstream = stream.read(8)
+            if (e + ndstream) != "endstream":
+                # (sigh) - the odd PDF file has a length that is too long, so
+                # we need to read backwards to find the "endstream" ending.
+                # ReportLab (unknown version) generates files with this bug,
+                # and Python users into PDF files tend to be our audience.
+                # we need to do this to correct the streamdata and chop off
+                # an extra character.
+                pos = stream.tell()
+                stream.seek(-10, 1)
+                end = stream.read(9)
+                if end == "endstream":
+                    # we found it by looking back one character further.
+                    data["__streamdata__"] = data["__streamdata__"][:-1]
+                else:
+                    stream.seek(pos, 0)
+                    # A real exception instead of a string exception, which
+                    # newer Python versions refuse to raise.
+                    raise ValueError(
+                        "Unable to find 'endstream' marker after stream.")
+        else:
+            stream.seek(pos, 0)
+        if "__streamdata__" in data:
+            return StreamObject.initializeFromDictionary(data)
+        else:
+            retval = DictionaryObject()
+            retval.update(data)
+            return retval
+    readFromStream = staticmethod(readFromStream)
+
+
+class StreamObject(DictionaryObject):
+    """PDF stream object: a dictionary plus raw data held in ``_data``."""
+
+    def __init__(self):
+        self._data = None
+        self.decodedSelf = None
+
+    def writeToStream(self, stream, encryption_key):
+        """Write the dictionary (with ``/Length``) followed by the data.
+
+        ``/Length`` is only present while the dictionary is being written.
+        The data is RC4-encrypted when ``encryption_key`` is given.
+        """
+        self[NameObject("/Length")] = NumberObject(len(self._data))
+        try:
+            DictionaryObject.writeToStream(self, stream, encryption_key)
+        finally:
+            # Remove the temporary entry even if writing failed.
+            del self["/Length"]
+        stream.write("\nstream\n")
+        data = self._data
+        if encryption_key:
+            data = RC4_encrypt(encryption_key, data)
+        stream.write(data)
+        stream.write("\nendstream")
+
+    def initializeFromDictionary(data):
+        """Build a stream object from a parsed dictionary.
+
+        ``data`` must contain ``__streamdata__`` and ``/Length``; both are
+        removed from it.  The result is an EncodedStreamObject when
+        ``/Filter`` is present and a DecodedStreamObject otherwise.
+        """
+        if "/Filter" in data:
+            retval = EncodedStreamObject()
+        else:
+            retval = DecodedStreamObject()
+        retval._data = data["__streamdata__"]
+        del data["__streamdata__"]
+        del data["/Length"]
+        retval.update(data)
+        return retval
+    initializeFromDictionary = staticmethod(initializeFromDictionary)
+
+    def flateEncode(self):
+        """Return a new EncodedStreamObject with ``_data`` deflate-compressed.
+
+        ``/FlateDecode`` is placed in front of any filters this stream
+        already has.  This stream, including its own ``/Filter`` array, is
+        left unmodified.
+        """
+        if "/Filter" in self:
+            f = self["/Filter"]
+            newf = ArrayObject()
+            newf.append(NameObject("/FlateDecode"))
+            if isinstance(f, ArrayObject):
+                # Copy the entries; inserting into ``f`` itself would also
+                # change this stream, whose data is not compressed.
+                newf.extend(f)
+            else:
+                newf.append(f)
+            f = newf
+        else:
+            f = NameObject("/FlateDecode")
+        retval = EncodedStreamObject()
+        retval[NameObject("/Filter")] = f
+        retval._data = filters.FlateDecode.encode(self._data)
+        return retval
+
+
+class DecodedStreamObject(StreamObject):
+    """Stream whose ``_data`` is handed out and stored as-is."""
+
+    def getData(self):
+        """Return the stored data."""
+        payload = self._data
+        return payload
+
+    def setData(self, data):
+        """Replace the stored data with ``data``."""
+        self._data = data
+
+
+class EncodedStreamObject(StreamObject):
+    """Stream whose ``_data`` is still encoded by the filters in ``/Filter``."""
+
+    def __init__(self):
+        self.decodedSelf = None
+
+    def getData(self):
+        """Return the decoded data, decoding on first use and caching it.
+
+        The decoded copy is kept in ``decodedSelf`` together with every
+        dictionary entry except ``/Length``, ``/Filter`` and
+        ``/DecodeParms``.
+        """
+        # Compare with None: the cached object is a dict subclass and is
+        # falsy whenever no dictionary entries were copied into it.
+        if self.decodedSelf is not None:
+            # cached version of decoded object
+            return self.decodedSelf.getData()
+        # create decoded object; it must be a DecodedStreamObject because
+        # the plain StreamObject has no getData() for the cached path above.
+        decoded = DecodedStreamObject()
+        decoded._data = filters.decodeStreamData(self)
+        for key, value in self.items():
+            if not key in ("/Length", "/Filter", "/DecodeParms"):
+                decoded[key] = value
+        self.decodedSelf = decoded
+        return decoded._data
+
+    def setData(self, data):
+        """Not supported; always raises NotImplementedError."""
+        # A real exception instead of a string exception, which newer
+        # Python versions refuse to raise.
+        raise NotImplementedError(
+            "Creating EncodedStreamObject is not currently supported")
+
+
+class RectangleObject(ArrayObject):
+    """Four-element array ``[llx, lly, urx, ury]`` describing a rectangle.
+
+    Indexes 0/1 hold the lower-left x/y and indexes 2/3 the upper-right x/y.
+    """
+
+    def __init__(self, arr):
+        # must have four points
+        assert len(arr) == 4
+        # automatically convert arr[x] into a PDF number object if necessary
+        ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr])
+
+    def ensureIsNumber(self, value):
+        """Return ``value`` as a PDF number object.
+
+        NumberObject and FloatObject values are returned unchanged, plain
+        floats become FloatObject and everything else is converted with
+        NumberObject.  Real-valued coordinates used to be truncated to
+        integers here.
+        """
+        if isinstance(value, (NumberObject, FloatObject)):
+            return value
+        if isinstance(value, float):
+            return FloatObject(value)
+        return NumberObject(value)
+
+    def __repr__(self):
+        return "RectangleObject(%s)" % repr(list(self))
+
+    def getLowerLeft_x(self):
+        return self[0]
+
+    def getLowerLeft_y(self):
+        return self[1]
+
+    def getUpperRight_x(self):
+        return self[2]
+
+    def getUpperRight_y(self):
+        return self[3]
+
+    # The remaining two corners are derived from the two stored ones.
+    def getUpperLeft_x(self):
+        return self.getLowerLeft_x()
+
+    def getUpperLeft_y(self):
+        return self.getUpperRight_y()
+
+    def getLowerRight_x(self):
+        return self.getUpperRight_x()
+
+    def getLowerRight_y(self):
+        return self.getLowerLeft_y()
+
+    def getLowerLeft(self):
+        return self.getLowerLeft_x(), self.getLowerLeft_y()
+
+    def getLowerRight(self):
+        return self.getLowerRight_x(), self.getLowerRight_y()
+
+    def getUpperLeft(self):
+        return self.getUpperLeft_x(), self.getUpperLeft_y()
+
+    def getUpperRight(self):
+        return self.getUpperRight_x(), self.getUpperRight_y()
+
+    # Each setter takes an (x, y) pair and updates the two stored
+    # coordinates that define that corner.
+    def setLowerLeft(self, value):
+        self[0], self[1] = [self.ensureIsNumber(x) for x in value]
+
+    def setLowerRight(self, value):
+        self[2], self[1] = [self.ensureIsNumber(x) for x in value]
+
+    def setUpperLeft(self, value):
+        self[0], self[3] = [self.ensureIsNumber(x) for x in value]
+
+    def setUpperRight(self, value):
+        self[2], self[3] = [self.ensureIsNumber(x) for x in value]
+
+    lowerLeft = property(getLowerLeft, setLowerLeft, None, None)
+    lowerRight = property(getLowerRight, setLowerRight, None, None)
+    upperLeft = property(getUpperLeft, setUpperLeft, None, None)
+    upperRight = property(getUpperRight, setUpperRight, None, None)
+
--- a/src/libprs500/ebooks/pyPdf/pdf.py
+++ b/src/libprs500/ebooks/pyPdf/pdf.py
--- a/src/libprs500/ebooks/pyPdf/utils.py
+++ b/src/libprs500/ebooks/pyPdf/utils.py
@@ -0,0 +1,94 @@
+# vim: sw=4:expandtab:foldmethod=marker
+#
+# Copyright (c) 2006, Mathieu Fenniak
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+"""
+Utility functions for PDF library.
+"""
+__author__ = "Mathieu Fenniak"
+__author_email__ = "mfenniak@pobox.com"
+
+def readUntilWhitespace(stream, maxchars=None):
+    """Read characters up to whitespace, end of stream or ``maxchars``.
+
+    The terminating whitespace character is consumed but not returned.
+    """
+    collected = []
+    while True:
+        ch = stream.read(1)
+        if not ch or ch.isspace():
+            break
+        collected.append(ch)
+        if len(collected) == maxchars:
+            break
+    return "".join(collected)
+
+def readNonWhitespace(stream):
+    """Skip newline, carriage return, space and tab; return the next read.
+
+    At the end of the stream the empty result of ``read`` is returned.
+    """
+    while True:
+        ch = stream.read(1)
+        # A tuple, not a string: "" is "in" every string, but not in this.
+        if ch not in ('\n', '\r', ' ', '\t'):
+            return ch
+
+class ConvertFunctionsToVirtualList(object):
+    """Read-only sequence backed by a length function and an item function."""
+
+    def __init__(self, lengthFunction, getFunction):
+        # lengthFunction() -> number of items; getFunction(i) -> item i
+        self.lengthFunction = lengthFunction
+        self.getFunction = getFunction
+
+    def __len__(self):
+        size = self.lengthFunction()
+        return size
+
+    def __getitem__(self, index):
+        """Return item ``index``; negative indexes count from the end."""
+        if not isinstance(index, int):
+            raise TypeError("sequence indices must be integers")
+        size = len(self)
+        position = index
+        if position < 0:
+            # support negative indexes
+            position = size + position
+        if not (0 <= position < size):
+            raise IndexError("sequence index out of range")
+        return self.getFunction(position)
+
+def RC4_encrypt(key, plaintext):
+    """Return ``plaintext`` XORed with the RC4 key stream for ``key``.
+
+    Both arguments are strings; the result has the same length as
+    ``plaintext``.
+    """
+    # key scheduling: permute 0..255 under control of the key
+    state = list(range(256))
+    keylen = len(key)
+    j = 0
+    for i in range(256):
+        j = (j + state[i] + ord(key[i % keylen])) % 256
+        state[i], state[j] = state[j], state[i]
+    # key stream generation, one byte per input character
+    i = j = 0
+    out = []
+    for ch in plaintext:
+        i = (i + 1) % 256
+        j = (j + state[i]) % 256
+        state[i], state[j] = state[j], state[i]
+        k = state[(state[i] + state[j]) % 256]
+        out.append(chr(ord(ch) ^ k))
+    return "".join(out)
+
+if __name__ == "__main__":
+    # test RC4: print the ciphertext, then the result of running the
+    # ciphertext through RC4_encrypt again with the same key
+    ciphertext = RC4_encrypt("Key", "Plaintext")
+    print(repr(ciphertext))
+    roundtrip = RC4_encrypt("Key", ciphertext)
+    print(repr(roundtrip))