Add a timeout to the PDF metadata writer as it hangs on some PDF files

This commit is contained in:
Kovid Goyal 2009-04-16 14:39:17 -07:00
parent cf77ec2c4a
commit 4e1f851a44
3 changed files with 64 additions and 42 deletions

View File

@ -2,7 +2,8 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Read meta information from PDF files'''
import sys, os, StringIO
import sys, os, cStringIO
from threading import Thread
from calibre.ebooks.metadata import MetaInformation, authors_to_string, get_parser
from pyPdf import PdfFileReader, PdfFileWriter
@ -29,24 +30,45 @@ def get_metadata(stream):
print >>sys.stderr, msg.encode('utf8')
return mi
class MetadataWriter(Thread):
    '''
    Daemon thread that serialises a PDF writer object into a buffer.

    The write is done on a separate thread so that the caller can bound
    how long it waits: ``set_metadata`` starts this thread, joins it with
    a timeout and then sets ``out_pdf.killed``, which makes
    ``PdfFileWriter._sweepIndirectReferences`` raise ``RuntimeError`` and
    so unwinds a write that is still in progress.

    :param out_pdf: Object providing ``write(stream)``.
    :param buf: File-like object handed to ``out_pdf.write``.

    After the thread has finished, ``completed`` is ``True`` only if
    ``out_pdf.write`` returned normally; it stays ``False`` when the
    write was aborted with a ``RuntimeError``.
    '''

    def __init__(self, out_pdf, buf):
        self.out_pdf = out_pdf
        self.buf = buf
        # Outcome flag: lets the caller tell a finished write apart from
        # an aborted one once the thread has been joined.
        self.completed = False
        Thread.__init__(self)
        # Daemon thread, so a hung write can never keep the interpreter
        # alive at exit.
        self.daemon = True

    def run(self):
        try:
            self.out_pdf.write(self.buf)
        except RuntimeError:
            # Raised by the writer once it has been flagged as killed.
            # Swallowed on purpose: this thread simply ends and
            # ``completed`` remains False.
            pass
        else:
            self.completed = True
def set_metadata(stream, mi):
stream.seek(0)
# Use a StringIO object for the PDF because we will want to overwrite
# it later, and working on the stream directly could cause some
# issues.
raw = StringIO.StringIO(stream.read())
raw = cStringIO.StringIO(stream.read())
orig_pdf = PdfFileReader(raw)
title = mi.title if mi.title else orig_pdf.documentInfo.title
author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author
out_pdf = PdfFileWriter(title=title, author=author)
out_str = cStringIO.StringIO()
writer = MetadataWriter(out_pdf, out_str)
for page in orig_pdf.pages:
out_pdf.addPage(page)
out_str = StringIO.StringIO()
out_pdf.write(out_str)
writer.start()
writer.join(10) # Wait 10 secs for writing to complete
out_pdf.killed = True
writer.join()
if out_pdf.killed:
print 'Failed to set metadata: took too long'
return
stream.seek(0)
stream.truncate()

View File

@ -299,7 +299,7 @@ def readStringFromStream(stream):
elif tok == "t":
tok = "\t"
elif tok == "b":
tok == "\b"
tok = "\b"
elif tok == "f":
tok = "\f"
elif tok == "(":

View File

@ -39,15 +39,12 @@ __author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
import struct
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
import filters
import utils
import warnings
from generic import *
from generic import DictionaryObject, NameObject, NumberObject, \
createStringObject, ArrayObject, ByteStringObject, StreamObject, \
IndirectObject, utils, readObject, TextStringObject, BooleanObject, \
RectangleObject, DecodedStreamObject
from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
@ -56,6 +53,7 @@ from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirt
# class (typically {@link #PdfFileReader PdfFileReader}).
class PdfFileWriter(object):
def __init__(self,title=u"Unknown",author=u"Unknown"):
self.killed = False
self._header = "%PDF-1.3"
self._objects = [] # array of indirect objects
@ -162,7 +160,7 @@ class PdfFileWriter(object):
# @param stream An object to write the file to. The object must support
# the write method, and the tell method, similar to a file object.
def write(self, stream):
import struct, md5
import md5
externalReferenceMap = {}
self.stack = []
@ -214,6 +212,8 @@ class PdfFileWriter(object):
stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))
def _sweepIndirectReferences(self, externMap, data):
if self.killed:
raise RuntimeError('Writer killed')
if isinstance(data, DictionaryObject):
for key, value in data.items():
origvalue = value