Add a timeout to the PDF metadata writer as it hangs on some PDF files

2025-07-08 10:44:09 -04:00 · 2009-04-16 14:39:17 -07:00 · 2009-04-16 14:39:17 -07:00 · 4e1f851a44
commit 4e1f851a44
parent cf77ec2c4a
3 changed files with 64 additions and 42 deletions
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@ -2,7 +2,8 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''Read meta information from PDF files'''

-import sys, os, StringIO
+import sys, os, cStringIO
+from threading import Thread

 from calibre.ebooks.metadata import MetaInformation, authors_to_string, get_parser
 from pyPdf import PdfFileReader, PdfFileWriter
@ -29,25 +30,46 @@ def get_metadata(stream):
        print >>sys.stderr, msg.encode('utf8')
    return mi

+class MetadataWriter(Thread):
+    # Runs out_pdf.write(buf) on a daemon thread so the caller can bound the wait with join(timeout).
+    def __init__(self, out_pdf, buf):
+        self.out_pdf = out_pdf  # writer object whose write() is invoked in run()
+        self.buf = buf  # destination passed to out_pdf.write()
+        Thread.__init__(self)
+        self.daemon = True
+
+    def run(self):
+        try:
+            self.out_pdf.write(self.buf)
+        except RuntimeError:
+            pass  # presumably the 'Writer killed' abort raised once out_pdf.killed is set -- confirm
+
 def set_metadata(stream, mi):
    stream.seek(0)
-    
+
    # Use a StringIO object for the pdf because we will want to over
    # write it later and if we are working on the stream directly it
    # could cause some issues.
-    raw = StringIO.StringIO(stream.read())
+    raw = cStringIO.StringIO(stream.read())
    orig_pdf = PdfFileReader(raw)
-    
+
    title = mi.title if mi.title else orig_pdf.documentInfo.title
    author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author
-    
+
    out_pdf = PdfFileWriter(title=title, author=author)
+    out_str = cStringIO.StringIO()
+    writer = MetadataWriter(out_pdf, out_str)
    for page in orig_pdf.pages:
        out_pdf.addPage(page)
-        
-    out_str = StringIO.StringIO()
-    out_pdf.write(out_str)
-    
+
+    writer.start()
+    writer.join(10) # Wait 10 secs for writing to complete
+    if writer.isAlive(): # Timed out: only now flag the writer as killed
+        out_pdf.killed = True # Makes the writer raise RuntimeError and stop
+        writer.join()
+        print 'Failed to set metadata: took too long'
+        return
+
    stream.seek(0)
    stream.truncate()
    out_str.seek(0)
@ -59,7 +81,7 @@ def option_parser():
    p.remove_option('--category')
    p.remove_option('--comment')
    return p
-            
+
 def main(args=sys.argv):
    #p = option_parser()
    #opts, args = p.parse_args(args)
@ -67,14 +89,14 @@ def main(args=sys.argv):
        print >>sys.stderr, _('Usage: pdf-meta file.pdf')
        print >>sys.stderr, _('No filename specified.')
        return 1
-    
+
    stream = open(os.path.abspath(os.path.expanduser(args[1])), 'r+b')
    #mi = MetaInformation(opts.title, opts.authors)
    #if mi.title or mi.authors:
    #    set_metadata(stream, mi)
    print unicode(get_metadata(stream)).encode('utf-8')
-    
+
    return 0

 if __name__ == '__main__':
-    sys.exit(main())
+    sys.exit(main())
--- a/src/pyPdf/generic.py
+++ b/src/pyPdf/generic.py
@ -299,7 +299,7 @@ def readStringFromStream(stream):
            elif tok == "t":
                tok = "\t"
            elif tok == "b":
-                tok == "\b"
+                tok = "\b"
            elif tok == "f":
                tok = "\f"
            elif tok == "(":
@ -673,7 +673,7 @@ class RectangleObject(ArrayObject):

    def getUpperLeft_x(self):
        return self.getLowerLeft_x()
-    
+
    def getUpperLeft_y(self):
        return self.getUpperRight_y()

--- a/src/pyPdf/pdf.py
+++ b/src/pyPdf/pdf.py
@ -39,15 +39,12 @@ __author__ = "Mathieu Fenniak"
 __author_email__ = "biziqe@mathieu.fenniak.net"

 import struct
-try:
-    from cStringIO import StringIO
-except ImportError:
-    from StringIO import StringIO
+from cStringIO import StringIO

-import filters
-import utils
-import warnings
-from generic import *
+from generic import DictionaryObject, NameObject, NumberObject, \
+createStringObject, ArrayObject, ByteStringObject, StreamObject, \
+IndirectObject, utils, readObject, TextStringObject, BooleanObject, \
+RectangleObject, DecodedStreamObject
 from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList


@ -56,6 +53,7 @@ from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirt
 # class (typically {@link #PdfFileReader PdfFileReader}).
 class PdfFileWriter(object):
    def __init__(self,title=u"Unknown",author=u"Unknown"):
+        self.killed = False
        self._header = "%PDF-1.3"
        self._objects = []  # array of indirect objects

@ -162,7 +160,7 @@ class PdfFileWriter(object):
    # @param stream An object to write the file to.  The object must support
    # the write method, and the tell method, similar to a file object.
    def write(self, stream):
-        import struct, md5
+        import md5

        externalReferenceMap = {}
        self.stack = []
@ -209,11 +207,13 @@ class PdfFileWriter(object):
        if hasattr(self, "_encrypt"):
            trailer[NameObject("/Encrypt")] = self._encrypt
        trailer.writeToStream(stream, None)
-        
+
        # eof
        stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))

    def _sweepIndirectReferences(self, externMap, data):
+        if self.killed:
+            raise RuntimeError('Writer killed')
        if isinstance(data, DictionaryObject):
            for key, value in data.items():
                origvalue = value
@ -356,8 +356,8 @@ class PdfFileReader(object):
        return self.flattenedPages[pageNumber]

    ##
-    # Read-only property that accesses the 
-    # {@link #PdfFileReader.getNamedDestinations 
+    # Read-only property that accesses the
+    # {@link #PdfFileReader.getNamedDestinations
    # getNamedDestinations} function.
    # <p>
    # Stability: Added in v1.10, will exist for all future v1.x releases.
@ -374,7 +374,7 @@ class PdfFileReader(object):
        if retval == None:
            retval = {}
            catalog = self.trailer["/Root"]
-            
+
            # get the name tree
            if catalog.has_key("/Dests"):
                tree = catalog["/Dests"]
@ -382,7 +382,7 @@ class PdfFileReader(object):
                names = catalog['/Names']
                if names.has_key("/Dests"):
                    tree = names['/Dests']
-        
+
        if tree == None:
            return retval

@ -420,17 +420,17 @@ class PdfFileReader(object):
        if outlines == None:
            outlines = []
            catalog = self.trailer["/Root"]
-            
+
            # get the outline dictionary and named destinations
            if catalog.has_key("/Outlines"):
                lines = catalog["/Outlines"]
                if lines.has_key("/First"):
                    node = lines["/First"]
            self._namedDests = self.getNamedDestinations()
-            
+
        if node == None:
          return outlines
-          
+
        # see if there are any more outlines
        while 1:
            outline = self._buildOutline(node)
@ -454,10 +454,10 @@ class PdfFileReader(object):
        page, typ = array[0:2]
        array = array[2:]
        return Destination(title, page, typ, *array)
-          
+
    def _buildOutline(self, node):
        dest, title, outline = None, None, None
-        
+
        if node.has_key("/A") and node.has_key("/Title"):
            # Action, section 8.5 (only type GoTo supported)
            title  = node["/Title"]
@ -951,7 +951,7 @@ class PageObject(DictionaryObject):

    def _pushPopGS(contents, pdf):
        # adds a graphics state "push" and "pop" to the beginning and end
-        # of a content stream.  This isolates it from changes such as 
+        # of a content stream.  This isolates it from changes such as
        # transformation matricies.
        stream = ContentStream(contents, pdf)
        stream.operations.insert(0, [[], "q"])
@ -1291,7 +1291,7 @@ class Destination(DictionaryObject):
        self[NameObject("/Title")] = title
        self[NameObject("/Page")] = page
        self[NameObject("/Type")] = typ
-        
+
        # from table 8.2 of the PDF 1.6 reference.
        if typ == "/XYZ":
            (self[NameObject("/Left")], self[NameObject("/Top")],
@ -1307,7 +1307,7 @@ class Destination(DictionaryObject):
            pass
        else:
            raise utils.PdfReadError("Unknown Destination Type: %r" % typ)
-          
+
    ##
    # Read-only property accessing the destination title.
    # @return A string.
@ -1474,25 +1474,25 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
    # described in Algorithm 3.2.
    key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
    # 2. Initialize the MD5 hash function and pass the 32-byte padding string
-    # shown in step 1 of Algorithm 3.2 as input to this function. 
+    # shown in step 1 of Algorithm 3.2 as input to this function.
    import md5
    m = md5.new()
    m.update(_encryption_padding)
    # 3. Pass the first element of the file's file identifier array (the value
    # of the ID entry in the document's trailer dictionary; see Table 3.13 on
    # page 73) to the hash function and finish the hash.  (See implementation
-    # note 25 in Appendix H.) 
+    # note 25 in Appendix H.)
    m.update(id1_entry)
    md5_hash = m.digest()
    # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption
-    # function with the encryption key from step 1. 
+    # function with the encryption key from step 1.
    val = utils.RC4_encrypt(key, md5_hash)
    # 5. Do the following 19 times: Take the output from the previous
    # invocation of the RC4 function and pass it as input to a new invocation
    # of the function; use an encryption key generated by taking each byte of
    # the original encryption key (obtained in step 2) and performing an XOR
    # operation between that byte and the single-byte value of the iteration
-    # counter (from 1 to 19). 
+    # counter (from 1 to 19).
    for i in range(1, 20):
        new_key = ''
        for l in range(len(key)):
@ -1500,7 +1500,7 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
        val = utils.RC4_encrypt(new_key, val)
    # 6. Append 16 bytes of arbitrary padding to the output from the final
    # invocation of the RC4 function and store the 32-byte result as the value
-    # of the U entry in the encryption dictionary. 
+    # of the U entry in the encryption dictionary.
    # (implementator note: I don't know what "arbitrary padding" is supposed to
    # mean, so I have used null bytes.  This seems to match a few other
    # people's implementations)