Newer pyPdf that is hopefully bug free.

2025-07-09 03:04:10 -04:00 · 2007-09-20 01:54:53 +00:00 · 2007-09-20 01:54:53 +00:00 · 75953a47d2
commit 75953a47d2
parent 40f5d79b61
4 changed files with 1109 additions and 593 deletions
--- a/src/libprs500/ebooks/pyPdf/filters.py
+++ b/src/libprs500/ebooks/pyPdf/filters.py
@ -32,9 +32,8 @@
 Implementation of stream filters for PDF.
 """
 __author__ = "Mathieu Fenniak"
-__author_email__ = "mfenniak@pobox.com"
+__author_email__ = "biziqe@mathieu.fenniak.net"
 from generic import NameObject
 try:
    import zlib
@ -208,6 +207,7 @@ class ASCII85Decode(object):
    decode = staticmethod(decode)
 def decodeStreamData(stream):
    from generic import NameObject
    filters = stream.get("/Filter", ())
    if len(filters) and not isinstance(filters[0], NameObject):
        # we have a single filter instance
--- a/src/libprs500/ebooks/pyPdf/generic.py
+++ b/src/libprs500/ebooks/pyPdf/generic.py
@ -32,11 +32,14 @@
 Implementation of generic PDF objects (dictionary, number, string, and so on)
 """
 __author__ = "Mathieu Fenniak"
-__author_email__ = "mfenniak@pobox.com"
+__author_email__ = "biziqe@mathieu.fenniak.net"
 import re
 from utils import readNonWhitespace, RC4_encrypt
 import filters
 import utils
 import decimal
 import codecs
 def readObject(stream, pdf):
    tok = stream.read(1)
@ -46,7 +49,7 @@ def readObject(stream, pdf):
        return BooleanObject.readFromStream(stream)
    elif tok == '(':
        # string object
-        return StringObject.readFromStream(stream)
+        return readStringFromStream(stream)
    elif tok == '/':
        # name object
        return NameObject.readFromStream(stream)
@ -63,7 +66,7 @@ def readObject(stream, pdf):
        if peek == '<<':
            return DictionaryObject.readFromStream(stream, pdf)
        else:
-            return StringObject.readHexStringFromStream(stream)
+            return readHexStringFromStream(stream)
    elif tok == '%':
        # comment
        while tok not in ('\r', '\n'):
@ -94,7 +97,9 @@ class NullObject(PdfObject):
        stream.write("null")
    def readFromStream(stream):
-        assert stream.read(4) == "null"
+        nulltxt = stream.read(4)
        if nulltxt != "null":
            raise utils.PdfReadError, "error reading null object"
        return NullObject()
    readFromStream = staticmethod(readFromStream)
@ -130,7 +135,9 @@ class ArrayObject(list, PdfObject):
    def readFromStream(stream, pdf):
        arr = ArrayObject()
-        assert stream.read(1) == "["
+        tmp = stream.read(1)
        if tmp != "[":
            raise utils.PdfReadError, "error reading array"
        while True:
            # skip leading whitespace
            tok = stream.read(1)
@ -189,18 +196,15 @@ class IndirectObject(PdfObject):
                break
            generation += tok
        r = stream.read(1)
-        #if r != "R":
+        if r != "R":
-        #    stream.seek(-20, 1)
+            raise utils.PdfReadError("error reading indirect object reference")
        #    print idnum, generation
        #    print repr(stream.read(40))
        assert r == "R"
        return IndirectObject(int(idnum), int(generation), pdf)
    readFromStream = staticmethod(readFromStream)
-class FloatObject(float, PdfObject):
+class FloatObject(decimal.Decimal, PdfObject):
    def writeToStream(self, stream, encryption_key):
-        stream.write(repr(self))
+        stream.write(str(self))
 class NumberObject(int, PdfObject):
@ -225,20 +229,33 @@ class NumberObject(int, PdfObject):
    readFromStream = staticmethod(readFromStream)
-class StringObject(str, PdfObject):
+##
-    def writeToStream(self, stream, encryption_key):
+# Given a string (either a "str" or "unicode"), create a ByteStringObject or a
-        string = self
+# TextStringObject to represent the string.
-        if encryption_key:
+def createStringObject(string):
-            string = RC4_encrypt(encryption_key, string)
+    if isinstance(string, unicode):
-        stream.write("(")
+        return TextStringObject(string)
-        for c in string:
+    elif isinstance(string, str):
-            if not c.isalnum() and not c.isspace():
+        if string.startswith(codecs.BOM_UTF16_BE):
-                stream.write("\\%03o" % ord(c))
+            retval = TextStringObject(string.decode("utf-16"))
            retval.autodetect_utf16 = True
            return retval
        else:
-                stream.write(c)
+            # This is probably a big performance hit here, but we need to
-        stream.write(")")
+            # convert string objects into the text/unicode-aware version if
            # possible... and the only way to check if that's possible is
            # to try.  Some strings are strings, some are just byte arrays.
            try:
                retval = TextStringObject(decode_pdfdocencoding(string))
                retval.autodetect_pdfdocencoding = True
                return retval
            except UnicodeDecodeError:
                return ByteStringObject(string)
    else:
        raise TypeError("createStringObject should have str or unicode arg")
-    def readHexStringFromStream(stream):
+
 def readHexStringFromStream(stream):
    stream.read(1)
    txt = ""
    x = ""
@ -254,10 +271,10 @@ class StringObject(str, PdfObject):
        x += "0"
    if len(x) == 2:
        txt += chr(int(x, base=16))
-        return StringObject(txt)
+    return createStringObject(txt)
    readHexStringFromStream = staticmethod(readHexStringFromStream)
-    def readFromStream(stream):
+
 def readStringFromStream(stream):
    tok = stream.read(1)
    parens = 1
    txt = ""
@ -290,9 +307,92 @@ class StringObject(str, PdfObject):
            elif tok.isdigit():
                tok += stream.read(2)
                tok = chr(int(tok, base=8))
            elif tok in "\n\r":
                # This case is hit when a backslash followed by a line
                # break occurs.  If it's a multi-char EOL, consume the
                # second character:
                tok = stream.read(1)
                if not tok in "\n\r":
                    stream.seek(-1, 1)
                # Then don't add anything to the actual string, since this
                # line break was escaped:
                tok = ''
            else:
                raise utils.PdfReadError("Unexpected escaped string")
        txt += tok
-        return StringObject(txt)
+    return createStringObject(txt)
-    readFromStream = staticmethod(readFromStream)
+
 ##
 # Represents a string object where the text encoding could not be determined.
 # This occurs quite often, as the PDF spec doesn't provide an alternate way to
 # represent strings -- for example, the encryption data stored in files (like
 # /O) is clearly not text, but is still stored in a "String" object.
 class ByteStringObject(str, PdfObject):
    ##
    # Mirrors TextStringObject.original_bytes so callers can ask either string
    # type for its raw bytes.  A byte string already is its raw bytes, so the
    # property simply returns the object itself.
    original_bytes = property(lambda obj: obj)
    ##
    # Writes the string as a PDF hexadecimal string ("<...>").  When an
    # encryption key is supplied the bytes are RC4-encrypted before being
    # hex-encoded.
    def writeToStream(self, stream, encryption_key):
        if encryption_key:
            payload = RC4_encrypt(encryption_key, self)
        else:
            payload = self
        stream.write("<")
        stream.write(payload.encode("hex"))
        stream.write(">")
 ##
 # Represents a string object that has been decoded into a real unicode string.
 # If read from a PDF document, this string appeared to match the
 # PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to
 # occur.
 class TextStringObject(unicode, PdfObject):
    # Record which autodetection path produced this text so that the original
    # bytes can be reconstructed later.  Both default to False.
    autodetect_pdfdocencoding = False
    autodetect_utf16 = False
    ##
    # The autodetection mechanism can produce a text string object where a
    # byte string object was really expected.  When that happens, this
    # "original_bytes" property back-calculates the encoded bytes the object
    # was created from.
    original_bytes = property(lambda obj: obj.get_original_bytes())
    def get_original_bytes(self):
        # The library wants raw bytes from something that was auto-detected
        # as text (a fairly common misdetection).  Re-encode using whichever
        # autodetect method was recorded on this object.
        if self.autodetect_utf16:
            return codecs.BOM_UTF16_BE + self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        # Neither flag set: nothing records how the text was encoded.
        raise Exception("no information about original bytes")
    def writeToStream(self, stream, encryption_key):
        # Prefer PDFDocEncoding since it is easier to read in the PDF file;
        # fall back to BOM-prefixed UTF-16BE for text it cannot represent.
        # Trying first costs some performance.
        try:
            encoded = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            encoded = codecs.BOM_UTF16_BE + self.encode("utf-16be")
        if encryption_key:
            # Encrypted output is written as a hex string; the key is not
            # passed on because the bytes are already encrypted here.
            encrypted = ByteStringObject(RC4_encrypt(encryption_key, encoded))
            encrypted.writeToStream(stream, None)
            return
        stream.write("(")
        for ch in encoded:
            if ch.isalnum() or ch == ' ':
                stream.write(ch)
            else:
                # Everything else is written as a three-digit octal escape.
                stream.write("\\%03o" % ord(ch))
        stream.write(")")
 class NameObject(str, PdfObject):
@ -306,7 +406,8 @@ class NameObject(str, PdfObject):
    def readFromStream(stream):
        name = stream.read(1)
-        assert name == "/"
+        if name != "/":
            raise utils.PdfReadError, "name read error"
        while True:
            tok = stream.read(1)
            if tok.isspace() or tok in NameObject.delimiterCharacters:
@ -331,7 +432,9 @@ class DictionaryObject(dict, PdfObject):
        stream.write(">>")
    def readFromStream(stream, pdf):
-        assert stream.read(2) == "<<"
+        tmp = stream.read(2)
        if tmp != "<<":
            raise utils.PdfReadError, "dictionary read error"
        data = {}
        while True:
            tok = readNonWhitespace(stream)
@ -345,7 +448,7 @@ class DictionaryObject(dict, PdfObject):
            value = readObject(stream, pdf)
            if data.has_key(key):
                # multiple definitions of key not permitted
-                assert False
+                raise utils.PdfReadError, "multiple definitions in dictionary"
            data[key] = value
        pos = stream.tell()
        s = readNonWhitespace(stream)
@ -384,7 +487,7 @@ class DictionaryObject(dict, PdfObject):
                    data["__streamdata__"] = data["__streamdata__"][:-1]
                else:
                    stream.seek(pos, 0)
-                    raise "Unable to find 'endstream' marker after stream."
+                    raise utils.PdfReadError, "Unable to find 'endstream' marker after stream."
        else:
            stream.seek(pos, 0)
        if data.has_key("__streamdata__"):
@ -469,7 +572,7 @@ class EncodedStreamObject(StreamObject):
            return decoded._data
    def setData(self, data):
-        raise "Creating EncodedStreamObject is not currently supported"
+        raise utils.PdfReadError, "Creating EncodedStreamObject is not currently supported"
 class RectangleObject(ArrayObject):
@ -540,3 +643,69 @@ class RectangleObject(ArrayObject):
    upperLeft = property(getUpperLeft, setUpperLeft, None, None)
    upperRight = property(getUpperRight, setUpperRight, None, None)
 def encode_pdfdocencoding(unicode_string):
    # Translate each character through the reverse PDFDocEncoding table and
    # return the resulting byte string.  Raises UnicodeEncodeError for any
    # character that has no entry in the table.
    encoded = []
    for ch in unicode_string:
        code = _pdfDocEncoding_rev.get(ch)
        if code is None:
            raise UnicodeEncodeError("pdfdocencoding", ch, -1, -1,
                    "does not exist in translation table")
        encoded.append(chr(code))
    return ''.join(encoded)
 def decode_pdfdocencoding(byte_array):
    # Map each byte through the PDFDocEncoding table and return the unicode
    # result.  A table entry of u'\u0000' marks a byte with no mapping, and
    # meeting one raises UnicodeDecodeError.
    decoded = []
    for byte in byte_array:
        char = _pdfDocEncoding[ord(byte)]
        if char == u'\u0000':
            raise UnicodeDecodeError("pdfdocencoding", byte, -1, -1,
                    "does not exist in translation table")
        decoded.append(char)
    return u''.join(decoded)
 _pdfDocEncoding = (
  u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000',
  u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000',
  u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000',
  u'\u02d8', u'\u02c7', u'\u02c6', u'\u02d9', u'\u02dd', u'\u02db', u'\u02da', u'\u02dc',
  u'\u0020', u'\u0021', u'\u0022', u'\u0023', u'\u0024', u'\u0025', u'\u0026', u'\u0027',
  u'\u0028', u'\u0029', u'\u002a', u'\u002b', u'\u002c', u'\u002d', u'\u002e', u'\u002f',
  u'\u0030', u'\u0031', u'\u0032', u'\u0033', u'\u0034', u'\u0035', u'\u0036', u'\u0037',
  u'\u0038', u'\u0039', u'\u003a', u'\u003b', u'\u003c', u'\u003d', u'\u003e', u'\u003f',
  u'\u0040', u'\u0041', u'\u0042', u'\u0043', u'\u0044', u'\u0045', u'\u0046', u'\u0047',
  u'\u0048', u'\u0049', u'\u004a', u'\u004b', u'\u004c', u'\u004d', u'\u004e', u'\u004f',
  u'\u0050', u'\u0051', u'\u0052', u'\u0053', u'\u0054', u'\u0055', u'\u0056', u'\u0057',
  u'\u0058', u'\u0059', u'\u005a', u'\u005b', u'\u005c', u'\u005d', u'\u005e', u'\u005f',
  u'\u0060', u'\u0061', u'\u0062', u'\u0063', u'\u0064', u'\u0065', u'\u0066', u'\u0067',
  u'\u0068', u'\u0069', u'\u006a', u'\u006b', u'\u006c', u'\u006d', u'\u006e', u'\u006f',
  u'\u0070', u'\u0071', u'\u0072', u'\u0073', u'\u0074', u'\u0075', u'\u0076', u'\u0077',
  u'\u0078', u'\u0079', u'\u007a', u'\u007b', u'\u007c', u'\u007d', u'\u007e', u'\u0000',
  u'\u2022', u'\u2020', u'\u2021', u'\u2026', u'\u2014', u'\u2013', u'\u0192', u'\u2044',
  u'\u2039', u'\u203a', u'\u2212', u'\u2030', u'\u201e', u'\u201c', u'\u201d', u'\u2018',
  u'\u2019', u'\u201a', u'\u2122', u'\ufb01', u'\ufb02', u'\u0141', u'\u0152', u'\u0160',
  u'\u0178', u'\u017d', u'\u0131', u'\u0142', u'\u0153', u'\u0161', u'\u017e', u'\u0000',
  u'\u20ac', u'\u00a1', u'\u00a2', u'\u00a3', u'\u00a4', u'\u00a5', u'\u00a6', u'\u00a7',
  u'\u00a8', u'\u00a9', u'\u00aa', u'\u00ab', u'\u00ac', u'\u0000', u'\u00ae', u'\u00af',
  u'\u00b0', u'\u00b1', u'\u00b2', u'\u00b3', u'\u00b4', u'\u00b5', u'\u00b6', u'\u00b7',
  u'\u00b8', u'\u00b9', u'\u00ba', u'\u00bb', u'\u00bc', u'\u00bd', u'\u00be', u'\u00bf',
  u'\u00c0', u'\u00c1', u'\u00c2', u'\u00c3', u'\u00c4', u'\u00c5', u'\u00c6', u'\u00c7',
  u'\u00c8', u'\u00c9', u'\u00ca', u'\u00cb', u'\u00cc', u'\u00cd', u'\u00ce', u'\u00cf',
  u'\u00d0', u'\u00d1', u'\u00d2', u'\u00d3', u'\u00d4', u'\u00d5', u'\u00d6', u'\u00d7',
  u'\u00d8', u'\u00d9', u'\u00da', u'\u00db', u'\u00dc', u'\u00dd', u'\u00de', u'\u00df',
  u'\u00e0', u'\u00e1', u'\u00e2', u'\u00e3', u'\u00e4', u'\u00e5', u'\u00e6', u'\u00e7',
  u'\u00e8', u'\u00e9', u'\u00ea', u'\u00eb', u'\u00ec', u'\u00ed', u'\u00ee', u'\u00ef',
  u'\u00f0', u'\u00f1', u'\u00f2', u'\u00f3', u'\u00f4', u'\u00f5', u'\u00f6', u'\u00f7',
  u'\u00f8', u'\u00f9', u'\u00fa', u'\u00fb', u'\u00fc', u'\u00fd', u'\u00fe', u'\u00ff'
 )
 assert len(_pdfDocEncoding) == 256
 # Reverse lookup table (unicode character -> byte value), derived from
 # _pdfDocEncoding and used by encode_pdfdocencoding.
 _pdfDocEncoding_rev = {}
 for i in xrange(256):
    char = _pdfDocEncoding[i]
    if char == u"\u0000":
        # u"\u0000" is the table's placeholder for byte values that
        # decode_pdfdocencoding rejects; it gets no reverse entry.
        continue
    # A character listed twice would make the reverse mapping ambiguous.
    assert char not in _pdfDocEncoding_rev
    _pdfDocEncoding_rev[char] = i
--- a/src/libprs500/ebooks/pyPdf/pdf.py
+++ b/src/libprs500/ebooks/pyPdf/pdf.py
@ -1,6 +1,8 @@
 # vim: sw=4:expandtab:foldmethod=marker
 #
 # Copyright (c) 2006, Mathieu Fenniak
 # Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
 #
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@ -34,7 +36,7 @@ be able to split and merge PDF files by page, and that's about all it can do.
 It may be a solid base for future PDF file work in Python.
 """
 __author__ = "Mathieu Fenniak"
-__author_email__ = "mfenniak@pobox.com"
+__author_email__ = "biziqe@mathieu.fenniak.net"
 import struct
 try:
@ -44,6 +46,7 @@ except ImportError:
 import filters
 import utils
 import warnings
 from generic import *
 from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
 from sets import ImmutableSet
@ -68,7 +71,7 @@ class PdfFileWriter(object):
        # info object
        info = DictionaryObject()
        info.update({
-                NameObject("/Producer"): StringObject("Python PDF Library - http://pybrary.net/pyPdf/")
+                NameObject("/Producer"): createStringObject(u"Python PDF Library - http://pybrary.net/pyPdf/")
                })
        self._info = self._addObject(info)
@ -128,10 +131,10 @@ class PdfFileWriter(object):
            keylen = 40 / 8
        # permit everything:
        P = -1
-        O = StringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
+        O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
        ID_1 = md5.new(repr(time.time())).digest()
        ID_2 = md5.new(repr(random.random())).digest()
-        self._ID = ArrayObject((StringObject(ID_1), StringObject(ID_2)))
+        self._ID = ArrayObject((ByteStringObject(ID_1), ByteStringObject(ID_2)))
        if rev == 2:
            U, key = _alg34(user_pwd, O, P, ID_1)
        else:
@ -143,8 +146,8 @@ class PdfFileWriter(object):
        if V == 2:
            encrypt[NameObject("/Length")] = NumberObject(keylen * 8)
        encrypt[NameObject("/R")] = NumberObject(rev)
-        encrypt[NameObject("/O")] = StringObject(O)
+        encrypt[NameObject("/O")] = ByteStringObject(O)
-        encrypt[NameObject("/U")] = StringObject(U)
+        encrypt[NameObject("/U")] = ByteStringObject(U)
        encrypt[NameObject("/P")] = NumberObject(P)
        self._encrypt = self._addObject(encrypt)
        self._encrypt_key = key
@ -212,8 +215,6 @@ class PdfFileWriter(object):
            for key, value in data.items():
                origvalue = value
                value = self._sweepIndirectReferences(externMap, value)
                if value == None:
                    print objects, value, origvalue
                if isinstance(value, StreamObject):
                    # a dictionary value is a stream.  streams must be indirect
                    # objects, so we need to change this value.
@ -271,6 +272,7 @@ class PdfFileWriter(object):
 class PdfFileReader(object):
    def __init__(self, stream):
        self.flattenedPages = None
        self.pageNumbers = {}
        self.resolvedObjects = {}
        self.read(stream)
        self.stream = stream
@ -329,6 +331,144 @@ class PdfFileReader(object):
            self._flatten()
        return self.flattenedPages[pageNumber]
    ##
    # Read-only property that accesses the 
    # {@link #PdfFileReader.getNamedDestinations 
    # getNamedDestinations} function.
    # <p>
    # Stability: Added in v1.10, will exist for all future v1.x releases.
    namedDestinations = property(lambda self:
                                  self.getNamedDestinations(), None, None)
    ##
    # Retrieves the named destinations present in the document.
    # <p>
    # Stability: Added in v1.10, will exist for all future v1.x releases.
    # @return Returns a dict which maps names to {@link #Destination
    # destinations}.
    def getNamedDestinations(self, tree = None, map = None):
        # Callers normally pass no arguments.  "tree" and "map" carry the
        # current name-tree node and the shared result dict through the
        # recursive calls made below.
        #
        # _buildDestination looks pages up in self.pageNumbers, which
        # _flatten fills in, so make sure the page tree has been flattened.
        if self.flattenedPages is None:
            self._flatten()
        get = self.safeGetObject
        if map is None:
            # Top-level call: start a fresh result and locate the root of the
            # destinations tree from the document catalog.
            map = {}
            catalog = get(self.trailer["/Root"])
            # get the name tree
            if catalog.has_key("/Dests"):
                tree = get(catalog["/Dests"])
            elif catalog.has_key("/Names"):
                names = get(catalog['/Names'])
                if names.has_key("/Dests"):
                    tree = get(names['/Dests'])
        if tree is None:
            return map
        if tree.has_key("/Kids"):
            # recurse down the tree; every level adds to the same dict
            for kid in get(tree["/Kids"]):
                self.getNamedDestinations(get(kid), map)
        if tree.has_key("/Names"):
            names = get(tree["/Names"])
            # /Names is read as a flat [key, value, key, value, ...] array.
            # Stop before a dangling final key so that a truncated array is
            # skipped instead of raising IndexError on names[i+1].
            for i in range(0, len(names) - 1, 2):
                key = get(names[i])
                val = get(names[i+1])
                # The value may be a dictionary wrapping the destination
                # under its /D entry.
                if isinstance(val, DictionaryObject) and val.has_key('/D'):
                    val = get(val['/D'])
                dest = self._buildDestination(val, key)
                if dest is not None:
                    map[key] = dest
        return map
    ##
    # Read-only property that accesses the {@link #PdfFileReader.getOutlines
    # getOutlines} function.
    # <p>
    # Stability: Added in v1.10, will exist for all future v1.x releases.
    outlines = property(lambda self: self.getOutlines(), None, None)
    ##
    # Retrieves the document outline present in the document.
    # <p>
    # Stability: Added in v1.10, will exist for all future v1.x releases.
    # @return Returns a nested list of {@link #Destination destinations}.
    def getOutlines(self, node = None, outlines = None):
        # Callers normally pass no arguments.  "node" and "outlines" carry
        # the current outline item and the list being filled through the
        # recursive calls made for sub-outlines.
        if self.flattenedPages == None:
            self._flatten()
        resolve = self.safeGetObject
        if outlines == None:
            # Top-level call: find the first outline item via the catalog and
            # cache the named destinations that _buildOutline looks up.
            outlines = []
            root = resolve(self.trailer["/Root"])
            if root.has_key("/Outlines"):
                outlineDict = resolve(root["/Outlines"])
                if outlineDict.has_key("/First"):
                    node = resolve(outlineDict["/First"])
            self._namedDests = self.getNamedDestinations()
        if node == None:
            return outlines
        # Walk the sibling chain through /Next until it runs out.
        while True:
            entry = self._buildOutline(node)
            if entry:
                outlines.append(entry)
            # Children are collected into their own list, appended directly
            # after the parent entry.
            if node.has_key("/First"):
                children = []
                self.getOutlines(resolve(node["/First"]), children)
                if children:
                    outlines.append(children)
            if not node.has_key("/Next"):
                return outlines
            node = resolve(node["/Next"])
    def _buildDestination(self, array, title):
        # Builds a Destination from an array whose first element is an
        # indirect reference to a page.  Returns None when the value is not
        # such an array or the page is not in self.pageNumbers.
        if not isinstance(array, ArrayObject):
            return None
        if len(array) < 2 or not isinstance(array[0], IndirectObject):
            return None
        pageRef = array[0]
        # Same (generation, idnum) key layout that _flatten stores.
        pageKey = (pageRef.generation, pageRef.idnum)
        if not self.pageNumbers.has_key(pageKey):
            return None
        # The remaining array elements are passed through as extra arguments.
        return Destination(title, self.pageNumbers[pageKey], *array[1:])
    def _buildOutline(self, node):
        dest, title, outline = None, None, None
        if node.has_key("/A") and node.has_key("/Title"):
            # Action, section 8.5 (only type GoTo supported)
            title  = self.safeGetObject(node["/Title"])
            action = self.safeGetObject(node["/A"])
            if action["/S"] == "/GoTo":
                dest = self.safeGetObject(action["/D"])
        elif node.has_key("/Dest") and node.has_key("/Title"):
            # Destination, section 8.2.1
            title = self.safeGetObject(node["/Title"])
            dest  = self.safeGetObject(node["/Dest"])
        # if destination found, then create outline
        if dest:
            if isinstance(dest, ArrayObject):
                outline = self._buildDestination(dest, title)
            elif isinstance(dest, str) and self._namedDests.has_key(dest):
                outline = self._namedDests[dest]
                outline.title = title
        return outline
    ##
    # Read-only property that emulates a list based upon the {@link
    # #PdfFileReader.getNumPages getNumPages} and {@link #PdfFileReader.getPage
@ -349,14 +489,16 @@ class PdfFileReader(object):
            self.flattenedPages = []
            catalog = self.getObject(self.trailer["/Root"])
            pages = self.getObject(catalog["/Pages"])
        indirectReference = None
        if isinstance(pages, IndirectObject):
            indirectReference = pages
            pages = self.getObject(pages)
        t = pages["/Type"]
        if t == "/Pages":
            for attr in inheritablePageAttributes:
                if pages.has_key(attr):
                    inherit[attr] = pages[attr]
-            for page in pages["/Kids"]:
+            for page in self.safeGetObject(pages["/Kids"]):
                self._flatten(page, inherit)
        elif t == "/Page":
            for attr,value in inherit.items():
@ -364,8 +506,11 @@ class PdfFileReader(object):
                # parent's value:
                if not pages.has_key(attr):
                    pages[attr] = value
-            pageObj = PageObject(self)
+            pageObj = PageObject(self, indirectReference)
            pageObj.update(pages)
            if indirectReference:
                key = (indirectReference.generation, indirectReference.idnum)
                self.pageNumbers[key] = len(self.flattenedPages)
            self.flattenedPages.append(pageObj)
    def safeGetObject(self, obj):
@ -425,8 +570,8 @@ class PdfFileReader(object):
        return retval
    def _decryptObject(self, obj, key):
-        if isinstance(obj, StringObject):
+        if isinstance(obj, ByteStringObject) or isinstance(obj, TextStringObject):
-            obj = StringObject(utils.RC4_encrypt(key, obj))
+            obj = createStringObject(utils.RC4_encrypt(key, obj.original_bytes))
        elif isinstance(obj, StreamObject):
            obj._data = utils.RC4_encrypt(key, obj._data)
        elif isinstance(obj, DictionaryObject):
@ -438,6 +583,11 @@ class PdfFileReader(object):
        return obj
    def readObjectHeader(self, stream):
        # Should never be necessary to read out whitespace, since the
        # cross-reference table should put us in the right spot to read the
        # object header.  In reality... some files have stupid cross reference
        # tables that are off by whitespace bytes.
        readNonWhitespace(stream); stream.seek(-1, 1)
        idnum = readUntilWhitespace(stream)
        generation = readUntilWhitespace(stream)
        obj = stream.read(3)
@ -456,13 +606,15 @@ class PdfFileReader(object):
        line = ''
        while not line:
            line = self.readNextEndLine(stream)
-        assert line[:5] == "%%EOF"
+        if line[:5] != "%%EOF":
            raise utils.PdfReadError, "EOF marker not found"
        # find startxref entry - the location of the xref table
        line = self.readNextEndLine(stream)
        startxref = int(line)
        line = self.readNextEndLine(stream)
-        assert line[:9] == "startxref"
+        if line[:9] != "startxref":
            raise utils.PdfReadError, "startxref not found"
        # read all cross reference tables and their trailers
        self.xref = {}
@ -475,7 +627,8 @@ class PdfFileReader(object):
            if x == "x":
                # standard cross-reference table
                ref = stream.read(4)
-                assert ref[:3] == "ref"
+                if ref[:3] != "ref":
                    raise utils.PdfReadError, "xref table read error"
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                while 1:
@ -661,7 +814,7 @@ class PdfFileReader(object):
    def _authenticateUserPassword(self, password):
        encrypt = self.safeGetObject(self.trailer['/Encrypt'])
        rev = self.safeGetObject(encrypt['/R'])
-        owner_entry = self.safeGetObject(encrypt['/O'])
+        owner_entry = self.safeGetObject(encrypt['/O']).original_bytes
        p_entry = self.safeGetObject(encrypt['/P'])
        id_entry = self.safeGetObject(self.trailer['/ID'])
        id1_entry = self.safeGetObject(id_entry[0])
@ -672,7 +825,7 @@ class PdfFileReader(object):
                    self.safeGetObject(encrypt["/Length"]) / 8, owner_entry,
                    p_entry, id1_entry,
                    self.safeGetObject(encrypt.get("/EncryptMetadata", False)))
-        real_U = self.safeGetObject(encrypt['/U'])
+        real_U = self.safeGetObject(encrypt['/U']).original_bytes
        return U == real_U, key
    def getIsEncrypted(self):
@ -721,9 +874,10 @@ def createRectangleAccessor(name, fallback):
 # will be created by accessing the {@link #PdfFileReader.getPage getPage}
 # function of the {@link #PdfFileReader PdfFileReader} class.
 class PageObject(DictionaryObject):
-    def __init__(self, pdf):
+    def __init__(self, pdf, indirectReference = None):
        DictionaryObject.__init__(self)
        self.pdf = pdf
        self.indirectReference = indirectReference
    ##
    # Rotates a page clockwise by increments of 90 degrees.
@ -856,26 +1010,35 @@ class PageObject(DictionaryObject):
    # <p>
    # Stability: Added in v1.7, will exist for all future v1.x releases.  May
    # be overhauled to provide more ordered text in the future.
-    # @return a string object
+    # @return a unicode string object
    def extractText(self):
-        text = ""
+        text = u""
        content = self["/Contents"].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, self.pdf)
        # Note: we check all strings are TextStringObjects.  ByteStringObjects
        # are strings where the byte->string encoding was unknown, so adding
        # them to the text here would be gibberish.
        for operands,operator in content.operations:
            if operator == "Tj":
-                text += operands[0]
+                _text = operands[0]
                if isinstance(_text, TextStringObject):
                    text += _text
            elif operator == "T*":
                text += "\n"
            elif operator == "'":
                text += "\n"
                _text = operands[0]
                if isinstance(_text, TextStringObject):
                    text += operands[0]
-            elif operator == "\"":
+            elif operator == '"':
                _text = operands[2]
                if isinstance(_text, TextStringObject):
                    text += "\n"
-                text += operands[2]
+                    text += _text
            elif operator == "TJ":
                for i in operands[0]:
-                    if isinstance(i, StringObject):
+                    if isinstance(i, TextStringObject):
                        text += i
        return text
@ -946,7 +1109,7 @@ class ContentStream(DecodedStreamObject):
            if peek == '':
                break
            stream.seek(-1, 1)
-            if peek.isalpha() or peek == "'" or peek == "\"":
+            if peek.isalpha() or peek == "'" or peek == '"':
                operator = readUntilWhitespace(stream, maxchars=2)
                if operator == "BI":
                    # begin inline image - a completely different parsing
@ -1021,43 +1184,139 @@ class ContentStream(DecodedStreamObject):
 ##
 # A class representing the basic document metadata provided in a PDF File.
 # <p>
 # As of pyPdf v1.10, all text properties of the document metadata have two
 # properties, eg. author and author_raw.  The non-raw property will always
 # return a TextStringObject, making it ideal for a case where the metadata is
 # being displayed.  The raw property can sometimes return a ByteStringObject,
 # if pyPdf was unable to decode the string's text encoding; this requires
 # additional safety in the caller and therefore is not as commonly accessed.
 class DocumentInformation(DictionaryObject):
    # Adds no state of its own; only initialises the underlying
    # DictionaryObject.
    def __init__(self):
        DictionaryObject.__init__(self)
    # Returns the value stored under key only when it is a TextStringObject;
    # any other value (including a missing key, for which get() yields the
    # None default) results in None.
    def getText(self, key):
        retval = self.get(key, None)
        if isinstance(retval, TextStringObject):
            return retval
        return None
    ##
    # Read-only property accessing the document's title.  Added in v1.6, will
-    # exist for all future v1.x releases.
+    # exist for all future v1.x releases.  Modified in v1.10 to always return a
-    # @return A string, or None if the title is not provided.
+    # unicode string (TextStringObject).
-    title = property(lambda self: self.get("/Title", None), None, None)
+    # @return A unicode string, or None if the title is not provided.
    title = property(lambda self: self.getText("/Title"))
    # The *_raw properties return whatever get() finds under the key, without
    # the TextStringObject check performed by getText().
    title_raw = property(lambda self: self.get("/Title"))
    ##
    # Read-only property accessing the document's author.  Added in v1.6, will
-    # exist for all future v1.x releases.
+    # exist for all future v1.x releases.  Modified in v1.10 to always return a
-    # @return A string, or None if the author is not provided.
+    # unicode string (TextStringObject).
-    author = property(lambda self: self.get("/Author", None), None, None)
+    # @return A unicode string, or None if the author is not provided.
    author = property(lambda self: self.getText("/Author"))
    author_raw = property(lambda self: self.get("/Author"))
    ##
    # Read-only property accessing the subject of the document.  Added in v1.6,
-    # will exist for all future v1.x releases.
+    # will exist for all future v1.x releases.  Modified in v1.10 to always
-    # @return A string, or None if the subject is not provided.
+    # return a unicode string (TextStringObject).
-    subject = property(lambda self: self.get("/Subject", None), None, None)
+    # @return A unicode string, or None if the subject is not provided.
    subject = property(lambda self: self.getText("/Subject"))
    subject_raw = property(lambda self: self.get("/Subject"))
    ##
    # Read-only property accessing the document's creator.  If the document was
    # converted to PDF from another format, the name of the application (for
    # example, OpenOffice) that created the original document from which it was
    # converted.  Added in v1.6, will exist for all future v1.x releases.
-    # @return A string, or None if the creator is not provided.
+    # Modified in v1.10 to always return a unicode string (TextStringObject).
-    creator = property(lambda self: self.get("/Creator", None), None, None)
+    # @return A unicode string, or None if the creator is not provided.
    creator = property(lambda self: self.getText("/Creator"))
    creator_raw = property(lambda self: self.get("/Creator"))
    ##
    # Read-only property accessing the document's producer.  If the document
    # was converted to PDF from another format, the name of the application
    # (for example, OSX Quartz) that converted it to PDF.  Added in v1.6, will
-    # exist for all future v1.x releases.
+    # exist for all future v1.x releases.  Modified in v1.10 to always return a
-    # @return A string, or None if the producer is not provided.
+    # unicode string (TextStringObject).
-    producer = property(lambda self: self.get("/Producer", None), None, None)
+    # @return A unicode string, or None if the producer is not provided.
    producer = property(lambda self: self.getText("/Producer"))
    producer_raw = property(lambda self: self.get("/Producer"))
 ##
 # A class representing a destination within a PDF file.
 # See section 8.2.1 of the PDF 1.6 reference.
 # Stability: Added in v1.10, will exist for all v1.x releases.
 class Destination(DictionaryObject):
    ##
    # Builds a destination from positional arguments:
    #   args[0]  - the title; stored (stripped) under "/Title" through the
    #              title property.
    #   args[1]  - the page, stored under "/Page".
    #   args[2]  - the destination type name, stored under "/Type".
    #   args[3:] - the type-specific coordinates; NullObject entries are
    #              stored as None.
    # Raises utils.PdfReadError when the destination type is not recognised.
    # Unpacking raises ValueError when the number of coordinates does not
    # match what the destination type requires.
    def __init__(self, *args):
        DictionaryObject.__init__(self)
        self.title = args[0]
        self["/Page"], self["/Type"] = args[1], args[2]
        # PDF null objects mean "leave this coordinate unchanged"; expose
        # them to callers as None.
        params = []
        for value in args[3:]:
            if isinstance(value, NullObject):
                params.append(None)
            else:
                params.append(value)
        # from table 8.2 of the PDF 1.6 reference.
        dest_type = self["/Type"]
        if dest_type == "/XYZ":
            self["/Left"], self["/Top"], self["/Zoom"] = params
        elif dest_type == "/FitR":
            self["/Left"], self["/Bottom"], \
                self["/Right"], self["/Top"] = params
        elif dest_type in ("/FitH", "/FitBH"):
            self["/Top"], = params
        elif dest_type in ("/FitV", "/FitBV"):
            self["/Left"], = params
        elif dest_type in ("/Fit", "/FitB"):
            # These types take no coordinates.
            pass
        else:
            raise utils.PdfReadError("Unknown Destination Type: " + dest_type)
    # Setter behind the title property; surrounding whitespace is removed
    # before the title is stored.
    def setTitle(self, title):
        self["/Title"] = title.strip()
    ##
    # Read-write property accessing the destination title.
    # @return A string.
    title = property(lambda self: self.get("/Title"), setTitle, None)
    ##
    # Read-only property accessing the destination page.
    # @return An integer.
    page = property(lambda self: self.get("/Page"), None, None)
    ##
    # Read-only property accessing the destination type.
    # @return A string.
    type = property(lambda self: self.get("/Type"), None, None)
    ##
    # Read-only property accessing the zoom factor.
    # @return A number, or None if not available.
    zoom = property(lambda self: self.get("/Zoom", None), None, None)
    ##
    # Read-only property accessing the left horizontal coordinate.
    # @return A number, or None if not available.
    left = property(lambda self: self.get("/Left", None), None, None)
    ##
    # Read-only property accessing the right horizontal coordinate.
    # @return A number, or None if not available.
    right = property(lambda self: self.get("/Right", None), None, None)
    ##
    # Read-only property accessing the top vertical coordinate.
    # @return A number, or None if not available.
    top = property(lambda self: self.get("/Top", None), None, None)
    ##
    # Read-only property accessing the bottom vertical coordinate.
    # @return A number, or None if not available.
    bottom = property(lambda self: self.get("/Bottom", None), None, None)
 def convertToInt(d, size):
@ -1078,65 +1337,150 @@ _encryption_padding = '\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56' + \
        '\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c' + \
        '\xa9\xfe\x64\x53\x69\x7a'
 # Implementation of algorithm 3.2 of the PDF standard security handler,
 # section 3.5.2 of the PDF 1.6 reference.
 def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True):
    # Computes the encryption key: an MD5 digest over the padded password,
    # the /O entry, the packed /P entry and the first file identifier
    # (re-hashed 50 times for revision 3 or greater), truncated to keylen
    # bytes.
-    import md5, struct
+    # 1. Pad or truncate the password string to exactly 32 bytes.  If the
-    m = md5.new()
+    # password string is more than 32 bytes long, use only its first 32 bytes;
    # if it is less than 32 bytes long, pad it by appending the required number
    # of additional bytes from the beginning of the padding string
    # (_encryption_padding).
    password = (password + _encryption_padding)[:32]
-    m.update(password)
+    # 2. Initialize the MD5 hash function and pass the result of step 1 as
    # input to this function.
    import md5, struct
    m = md5.new(password)
    # 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash
    # function.
    m.update(owner_entry)
    # 4. Treat the value of the /P entry as an unsigned 4-byte integer and pass
    # these bytes to the MD5 hash function, low-order byte first.
    # NOTE: '<i' packs a *signed* little-endian 32-bit integer, so p_entry is
    # expected as a signed value here; the four bytes produced are its
    # two's-complement representation.
    p_entry = struct.pack('<i', p_entry)
    m.update(p_entry)
    # 5. Pass the first element of the file's file identifier array to the MD5
    # hash function.
    m.update(id1_entry)
    # 6. (Revision 3 or greater) If document metadata is not being encrypted,
    # pass 4 bytes with the value 0xFFFFFFFF to the MD5 hash function.
    if rev >= 3 and not metadata_encrypt:
        m.update("\xff\xff\xff\xff")
    # 7. Finish the hash.
    md5_hash = m.digest()
    # 8. (Revision 3 or greater) Do the following 50 times: Take the output
    # from the previous MD5 hash and pass the first n bytes of the output as
    # input into a new MD5 hash, where n is the number of bytes of the
    # encryption key as defined by the value of the encryption dictionary's
    # /Length entry.
    if rev >= 3:
        for i in range(50):
            md5_hash = md5.new(md5_hash[:keylen]).digest()
    # 9. Set the encryption key to the first n bytes of the output from the
    # final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or
    # greater, depends on the value of the encryption dictionary's /Length
    # entry.
    return md5_hash[:keylen]
 # Implementation of algorithm 3.3 of the PDF standard security handler,
 # section 3.5.2 of the PDF 1.6 reference.
 def _alg33(owner_pwd, user_pwd, rev, keylen):
    # Computes the value of the encryption dictionary's /O entry.
    #
    # Steps 1 - 4: derive the RC4 key from the owner password.
    rc4_key = _alg33_1(owner_pwd, rev, keylen)
    # Step 5: pad or truncate the user password to 32 bytes, as in step 1 of
    # algorithm 3.2.
    padded_user_pwd = (user_pwd + _encryption_padding)[:32]
    # Step 6: RC4-encrypt the padded user password with the key from step 4.
    owner_entry = utils.RC4_encrypt(rc4_key, padded_user_pwd)
    # Revision 2 stops here; the extra rounds apply to revision 3 or greater.
    if rev < 3:
        return owner_entry
    # Step 7: 19 further RC4 passes.  The key for each pass is the step-4 key
    # with every byte XORed with the pass counter (1 to 19), and the input is
    # the output of the previous pass.
    for counter in range(1, 20):
        round_key = ''.join([chr(ord(byte) ^ counter) for byte in rc4_key])
        owner_entry = utils.RC4_encrypt(round_key, owner_entry)
    # Step 8: the output of the final pass is the /O entry.
    return owner_entry
 # Steps 1-4 of algorithm 3.3
 def _alg33_1(password, rev, keylen):
    # Returns the RC4 key: the first keylen bytes of the MD5 digest of the
    # padded password (re-hashed 50 times for revision 3 or greater).
-    import md5
+    # 1. Pad or truncate the owner password string as described in step 1 of
-    m = md5.new()
+    # algorithm 3.2.  If there is no owner password, use the user password
    # instead.
    # NOTE: the fallback to the user password is not performed in this
    # function; the caller has to pass the password that should be used.
    password = (password + _encryption_padding)[:32]
-    m.update(password)
+    # 2. Initialize the MD5 hash function and pass the result of step 1 as
    # input to this function.
    import md5
    m = md5.new(password)
    # 3. (Revision 3 or greater) Do the following 50 times: Take the output
    # from the previous MD5 hash and pass it as input into a new MD5 hash.
    md5_hash = m.digest()
    if rev >= 3:
        for i in range(50):
            md5_hash = md5.new(md5_hash).digest()
    # 4. Create an RC4 encryption key using the first n bytes of the output
    # from the final MD5 hash, where n is always 5 for revision 2 but, for
    # revision 3 or greater, depends on the value of the encryption
    # dictionary's /Length entry.
    key = md5_hash[:keylen]
    return key
 # Implementation of algorithm 3.4 of the PDF standard security handler,
 # section 3.5.2 of the PDF 1.6 reference.
 def _alg34(password, owner_entry, p_entry, id1_entry):
    # Computes the /U entry together with the encryption key it was derived
    # from; returns them as the tuple (U, key).
    #
    # Step 1: build the encryption key from the user password with algorithm
    # 3.2, called here with revision 2 and a key length of 5 bytes.
    rc4_key = _alg32(password, rev=2, keylen=5, owner_entry=owner_entry,
                     p_entry=p_entry, id1_entry=id1_entry)
    # Step 2: RC4-encrypt the 32-byte padding string with that key.
    # Step 3: the encrypted padding is the value of the /U entry.
    user_entry = utils.RC4_encrypt(rc4_key, _encryption_padding)
    return user_entry, rc4_key
 # Implementation of algorithm 3.5 of the PDF standard security handler,
 # section 3.5.2 of the PDF 1.6 reference.
 def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):
    # Computes the /U entry and returns the tuple (U, key), where key is the
    # encryption key produced by algorithm 3.2.
    # 1. Create an encryption key based on the user password string, as
    # described in Algorithm 3.2.
    # NOTE(review): metadata_encrypt is accepted by this function but is not
    # forwarded to _alg32, which therefore uses its default (True) -- confirm
    # whether it should be passed through.
    key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
    # 2. Initialize the MD5 hash function and pass the 32-byte padding string
    # shown in step 1 of Algorithm 3.2 as input to this function.
    import md5
    m = md5.new()
    m.update(_encryption_padding)
    # 3. Pass the first element of the file's file identifier array (the value
    # of the ID entry in the document's trailer dictionary; see Table 3.13 on
    # page 73) to the hash function and finish the hash.  (See implementation
    # note 25 in Appendix H.)
    m.update(id1_entry)
    md5_hash = m.digest()
-    key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
+    # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption
    # function with the encryption key from step 1.
    val = utils.RC4_encrypt(key, md5_hash)
    # 5. Do the following 19 times: Take the output from the previous
    # invocation of the RC4 function and pass it as input to a new invocation
    # of the function; use an encryption key generated by taking each byte of
    # the original encryption key (obtained in step 1) and performing an XOR
    # operation between that byte and the single-byte value of the iteration
    # counter (from 1 to 19).
    for i in range(1, 20):
        new_key = ''
        for l in range(len(key)):
            new_key += chr(ord(key[l]) ^ i)
        val = utils.RC4_encrypt(new_key, val)
    # 6. Append 16 bytes of arbitrary padding to the output from the final
    # invocation of the RC4 function and store the 32-byte result as the value
    # of the U entry in the encryption dictionary.
    # (implementer's note: I don't know what "arbitrary padding" is supposed to
    # mean, so I have used null bytes.  This seems to match a few other
    # people's implementations)
    return val + ('\x00' * 16), key
 #if __name__ == "__main__":
--- a/src/libprs500/ebooks/pyPdf/utils.py
+++ b/src/libprs500/ebooks/pyPdf/utils.py
@ -32,7 +32,7 @@
 Utility functions for PDF library.
 """
 __author__ = "Mathieu Fenniak"
-__author_email__ = "mfenniak@pobox.com"
+__author_email__ = "biziqe@mathieu.fenniak.net"
 def readUntilWhitespace(stream, maxchars=None):
    txt = ""
@ -86,6 +86,9 @@ def RC4_encrypt(key, plaintext):
        retval += chr(ord(plaintext[x]) ^ t)
    return retval
 class PdfReadError(Exception):
    # Exception type the library raises for PDF data it cannot interpret;
    # for example, generic.py's Destination raises it for an unknown
    # destination type.  Adds nothing to the base Exception class.
    pass
 if __name__ == "__main__":
    # test RC4
    out = RC4_encrypt("Key", "Plaintext")