From 4e1f851a445737575725e0c3cd7b0f34d0bb9fcb Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 16 Apr 2009 14:39:17 -0700
Subject: [PATCH] Add a timeout to the PDF metadata writer as it hangs on some
 PDF files

---
 src/calibre/ebooks/metadata/pdf.py | 48 +++++++++++++++++++-------
 src/pyPdf/generic.py               |  4 +--
 src/pyPdf/pdf.py                   | 54 +++++++++++++++---------------
 3 files changed, 64 insertions(+), 42 deletions(-)
diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py
index 80cdc82070..54d52f0b58 100644
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@@ -2,7 +2,8 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''Read meta information from PDF files'''
 
-import sys, os, StringIO
+import sys, os, cStringIO
+from threading import Thread
 
 from calibre.ebooks.metadata import MetaInformation, authors_to_string, get_parser
 from pyPdf import PdfFileReader, PdfFileWriter
@@ -29,25 +30,46 @@ def get_metadata(stream):
         print >>sys.stderr, msg.encode('utf8')
     return mi
 
+class MetadataWriter(Thread):
+
+    def __init__(self, out_pdf, buf):
+        self.out_pdf = out_pdf
+        self.buf = buf
+        Thread.__init__(self)
+        self.daemon = True
+
+    def run(self):
+        try:
+            self.out_pdf.write(self.buf)
+        except RuntimeError:
+            pass
+
 def set_metadata(stream, mi):
     stream.seek(0)
-    
+
     # Use a StringIO object for the pdf because we will want to over
     # write it later and if we are working on the stream directly it
     # could cause some issues.
-    raw = StringIO.StringIO(stream.read())
+    raw = cStringIO.StringIO(stream.read())
     orig_pdf = PdfFileReader(raw)
-    
+
     title = mi.title if mi.title else orig_pdf.documentInfo.title
     author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author
-    
+
     out_pdf = PdfFileWriter(title=title, author=author)
+    out_str = cStringIO.StringIO()
+    writer = MetadataWriter(out_pdf, out_str)
     for page in orig_pdf.pages:
         out_pdf.addPage(page)
-        
-    out_str = StringIO.StringIO()
-    out_pdf.write(out_str)
-    
+
+    writer.start()
+    writer.join(10) # Wait 10 secs for writing to complete
+    out_pdf.killed = True
+    writer.join()
+    if out_pdf.killed:
+        print 'Failed to set metadata: took too long'
+        return
+
     stream.seek(0)
     stream.truncate()
     out_str.seek(0)
@@ -59,7 +81,7 @@ def option_parser():
     p.remove_option('--category')
     p.remove_option('--comment')
     return p
-            
+
 def main(args=sys.argv):
     #p = option_parser()
     #opts, args = p.parse_args(args)
@@ -67,14 +89,14 @@ def main(args=sys.argv):
         print >>sys.stderr, _('Usage: pdf-meta file.pdf')
         print >>sys.stderr, _('No filename specified.')
         return 1
-    
+
     stream = open(os.path.abspath(os.path.expanduser(args[1])), 'r+b')
     #mi = MetaInformation(opts.title, opts.authors)
     #if mi.title or mi.authors:
     #    set_metadata(stream, mi)
     print unicode(get_metadata(stream)).encode('utf-8')
-    
+
     return 0
 
 if __name__ == '__main__':
-    sys.exit(main())
\ No newline at end of file
+    sys.exit(main())
diff --git a/src/pyPdf/generic.py b/src/pyPdf/generic.py
index fb75ef3b3f..5447ef5fbc 100644
--- a/src/pyPdf/generic.py
+++ b/src/pyPdf/generic.py
@@ -299,7 +299,7 @@ def readStringFromStream(stream):
             elif tok == "t":
                 tok = "\t"
             elif tok == "b":
-                tok == "\b"
+                tok = "\b"
             elif tok == "f":
                 tok = "\f"
             elif tok == "(":
@@ -673,7 +673,7 @@ class RectangleObject(ArrayObject):
 
     def getUpperLeft_x(self):
         return self.getLowerLeft_x()
-    
+
     def getUpperLeft_y(self):
         return self.getUpperRight_y()
 
diff --git a/src/pyPdf/pdf.py b/src/pyPdf/pdf.py
index 362879a39a..710d128ad0 100644
--- a/src/pyPdf/pdf.py
+++ b/src/pyPdf/pdf.py
@@ -39,15 +39,12 @@ __author__ = "Mathieu Fenniak"
 __author_email__ = "biziqe@mathieu.fenniak.net"
 
 import struct
-try:
-    from cStringIO import StringIO
-except ImportError:
-    from StringIO import StringIO
+from cStringIO import StringIO
 
-import filters
-import utils
-import warnings
-from generic import *
+from generic import DictionaryObject, NameObject, NumberObject, \
+createStringObject, ArrayObject, ByteStringObject, StreamObject, \
+IndirectObject, utils, readObject, TextStringObject, BooleanObject, \
+RectangleObject, DecodedStreamObject
 from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
 
 
@@ -56,6 +53,7 @@ from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirt
 # class (typically {@link #PdfFileReader PdfFileReader}).
 class PdfFileWriter(object):
     def __init__(self,title=u"Unknown",author=u"Unknown"):
+        self.killed = False
         self._header = "%PDF-1.3"
         self._objects = []  # array of indirect objects
 
@@ -162,7 +160,7 @@ class PdfFileWriter(object):
     # @param stream An object to write the file to.  The object must support
     # the write method, and the tell method, similar to a file object.
     def write(self, stream):
-        import struct, md5
+        import md5
 
         externalReferenceMap = {}
         self.stack = []
@@ -209,11 +207,13 @@ class PdfFileWriter(object):
         if hasattr(self, "_encrypt"):
             trailer[NameObject("/Encrypt")] = self._encrypt
         trailer.writeToStream(stream, None)
-        
+
         # eof
         stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))
 
     def _sweepIndirectReferences(self, externMap, data):
+        if self.killed:
+            raise RuntimeError('Writer killed')
         if isinstance(data, DictionaryObject):
             for key, value in data.items():
                 origvalue = value
@@ -356,8 +356,8 @@ class PdfFileReader(object):
         return self.flattenedPages[pageNumber]
 
     ##
-    # Read-only property that accesses the 
-    # {@link #PdfFileReader.getNamedDestinations 
+    # Read-only property that accesses the
+    # {@link #PdfFileReader.getNamedDestinations
     # getNamedDestinations} function.
     # <p>
     # Stability: Added in v1.10, will exist for all future v1.x releases.
@@ -374,7 +374,7 @@ class PdfFileReader(object):
         if retval == None:
             retval = {}
             catalog = self.trailer["/Root"]
-            
+
             # get the name tree
             if catalog.has_key("/Dests"):
                 tree = catalog["/Dests"]
@@ -382,7 +382,7 @@ class PdfFileReader(object):
                 names = catalog['/Names']
                 if names.has_key("/Dests"):
                     tree = names['/Dests']
-        
+
         if tree == None:
             return retval
 
@@ -420,17 +420,17 @@ class PdfFileReader(object):
         if outlines == None:
             outlines = []
             catalog = self.trailer["/Root"]
-            
+
             # get the outline dictionary and named destinations
             if catalog.has_key("/Outlines"):
                 lines = catalog["/Outlines"]
                 if lines.has_key("/First"):
                     node = lines["/First"]
             self._namedDests = self.getNamedDestinations()
-            
+
         if node == None:
           return outlines
-          
+
         # see if there are any more outlines
         while 1:
             outline = self._buildOutline(node)
@@ -454,10 +454,10 @@ class PdfFileReader(object):
         page, typ = array[0:2]
         array = array[2:]
         return Destination(title, page, typ, *array)
-          
+
     def _buildOutline(self, node):
         dest, title, outline = None, None, None
-        
+
         if node.has_key("/A") and node.has_key("/Title"):
             # Action, section 8.5 (only type GoTo supported)
             title  = node["/Title"]
@@ -951,7 +951,7 @@ class PageObject(DictionaryObject):
 
     def _pushPopGS(contents, pdf):
         # adds a graphics state "push" and "pop" to the beginning and end
-        # of a content stream.  This isolates it from changes such as 
+        # of a content stream.  This isolates it from changes such as
         # transformation matricies.
         stream = ContentStream(contents, pdf)
         stream.operations.insert(0, [[], "q"])
@@ -1291,7 +1291,7 @@ class Destination(DictionaryObject):
         self[NameObject("/Title")] = title
         self[NameObject("/Page")] = page
         self[NameObject("/Type")] = typ
-        
+
         # from table 8.2 of the PDF 1.6 reference.
         if typ == "/XYZ":
             (self[NameObject("/Left")], self[NameObject("/Top")],
@@ -1307,7 +1307,7 @@ class Destination(DictionaryObject):
             pass
         else:
             raise utils.PdfReadError("Unknown Destination Type: %r" % typ)
-          
+
     ##
     # Read-only property accessing the destination title.
     # @return A string.
@@ -1474,25 +1474,25 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
     # described in Algorithm 3.2.
     key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
     # 2. Initialize the MD5 hash function and pass the 32-byte padding string
-    # shown in step 1 of Algorithm 3.2 as input to this function. 
+    # shown in step 1 of Algorithm 3.2 as input to this function.
     import md5
     m = md5.new()
     m.update(_encryption_padding)
     # 3. Pass the first element of the file's file identifier array (the value
     # of the ID entry in the document's trailer dictionary; see Table 3.13 on
     # page 73) to the hash function and finish the hash.  (See implementation
-    # note 25 in Appendix H.) 
+    # note 25 in Appendix H.)
     m.update(id1_entry)
     md5_hash = m.digest()
     # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption
-    # function with the encryption key from step 1. 
+    # function with the encryption key from step 1.
     val = utils.RC4_encrypt(key, md5_hash)
     # 5. Do the following 19 times: Take the output from the previous
     # invocation of the RC4 function and pass it as input to a new invocation
     # of the function; use an encryption key generated by taking each byte of
     # the original encryption key (obtained in step 2) and performing an XOR
     # operation between that byte and the single-byte value of the iteration
-    # counter (from 1 to 19). 
+    # counter (from 1 to 19).
     for i in range(1, 20):
         new_key = ''
         for l in range(len(key)):
@@ -1500,7 +1500,7 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
         val = utils.RC4_encrypt(new_key, val)
     # 6. Append 16 bytes of arbitrary padding to the output from the final
     # invocation of the RC4 function and store the 32-byte result as the value
-    # of the U entry in the encryption dictionary. 
+    # of the U entry in the encryption dictionary.
     # (implementator note: I don't know what "arbitrary padding" is supposed to
     # mean, so I have used null bytes.  This seems to match a few other
     # people's implementations)