Add a timeout to the PDF metadata writer as it hangs on some PDF files

2025-11-25 07:45:01 -05:00 · 2009-04-16 14:39:17 -07:00 · 2009-04-16 14:39:17 -07:00 · 4e1f851a44
commit 4e1f851a44
parent cf77ec2c4a
3 changed files with 64 additions and 42 deletions
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@ -2,7 +2,8 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''Read meta information from PDF files'''

-import sys, os, StringIO
+import sys, os, cStringIO
+from threading import Thread

 from calibre.ebooks.metadata import MetaInformation, authors_to_string, get_parser
 from pyPdf import PdfFileReader, PdfFileWriter
@ -29,24 +30,45 @@ def get_metadata(stream):
        print >>sys.stderr, msg.encode('utf8')
    return mi

+class MetadataWriter(Thread):
+
+    def __init__(self, out_pdf, buf):
+        self.out_pdf = out_pdf
+        self.buf = buf
+        Thread.__init__(self)
+        self.daemon = True
+
+    def run(self):
+        try:
+            self.out_pdf.write(self.buf)
+        except RuntimeError:
+            pass
+
 def set_metadata(stream, mi):
    stream.seek(0)

    # Use a StringIO object for the pdf because we will want to over
    # write it later and if we are working on the stream directly it
    # could cause some issues.
-    raw = StringIO.StringIO(stream.read())
+    raw = cStringIO.StringIO(stream.read())
    orig_pdf = PdfFileReader(raw)

    title = mi.title if mi.title else orig_pdf.documentInfo.title
    author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author

    out_pdf = PdfFileWriter(title=title, author=author)
+    out_str = cStringIO.StringIO()
+    writer = MetadataWriter(out_pdf, out_str)
    for page in orig_pdf.pages:
        out_pdf.addPage(page)

-    out_str = StringIO.StringIO()
-    out_pdf.write(out_str)
+    writer.start()
+    writer.join(10) # Wait 10 secs for writing to complete
+    out_pdf.killed = True
+    writer.join()
+    if out_pdf.killed:
+        print 'Failed to set metadata: took too long'
+        return

    stream.seek(0)
    stream.truncate()
--- a/src/pyPdf/generic.py
+++ b/src/pyPdf/generic.py
@ -299,7 +299,7 @@ def readStringFromStream(stream):
            elif tok == "t":
                tok = "\t"
            elif tok == "b":
-                tok == "\b"
+                tok = "\b"
            elif tok == "f":
                tok = "\f"
            elif tok == "(":
--- a/src/pyPdf/pdf.py
+++ b/src/pyPdf/pdf.py
@ -39,15 +39,12 @@ __author__ = "Mathieu Fenniak"
 __author_email__ = "biziqe@mathieu.fenniak.net"

 import struct
-try:
 from cStringIO import StringIO
-except ImportError:
-    from StringIO import StringIO

-import filters
-import utils
-import warnings
-from generic import *
+from generic import DictionaryObject, NameObject, NumberObject, \
+createStringObject, ArrayObject, ByteStringObject, StreamObject, \
+IndirectObject, utils, readObject, TextStringObject, BooleanObject, \
+RectangleObject, DecodedStreamObject
 from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList


@ -56,6 +53,7 @@ from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirt
 # class (typically {@link #PdfFileReader PdfFileReader}).
 class PdfFileWriter(object):
    def __init__(self,title=u"Unknown",author=u"Unknown"):
+        self.killed = False
        self._header = "%PDF-1.3"
        self._objects = []  # array of indirect objects

@ -162,7 +160,7 @@ class PdfFileWriter(object):
    # @param stream An object to write the file to.  The object must support
    # the write method, and the tell method, similar to a file object.
    def write(self, stream):
-        import struct, md5
+        import md5

        externalReferenceMap = {}
        self.stack = []
@ -214,6 +212,8 @@ class PdfFileWriter(object):
        stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))

    def _sweepIndirectReferences(self, externMap, data):
+        if self.killed:
+            raise RuntimeError('Writer killed')
        if isinstance(data, DictionaryObject):
            for key, value in data.items():
                origvalue = value