From 4e1f851a445737575725e0c3cd7b0f34d0bb9fcb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 16 Apr 2009 14:39:17 -0700 Subject: [PATCH] Add a timeout to the PDF metadata writer as it hangs on some PDF files --- src/calibre/ebooks/metadata/pdf.py | 48 +++++++++++++++++++------- src/pyPdf/generic.py | 4 +-- src/pyPdf/pdf.py | 54 +++++++++++++++--------------- 3 files changed, 64 insertions(+), 42 deletions(-) diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 80cdc82070..54d52f0b58 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -2,7 +2,8 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' '''Read meta information from PDF files''' -import sys, os, StringIO +import sys, os, cStringIO +from threading import Thread from calibre.ebooks.metadata import MetaInformation, authors_to_string, get_parser from pyPdf import PdfFileReader, PdfFileWriter @@ -29,25 +30,46 @@ def get_metadata(stream): print >>sys.stderr, msg.encode('utf8') return mi +class MetadataWriter(Thread): + + def __init__(self, out_pdf, buf): + self.out_pdf = out_pdf + self.buf = buf + Thread.__init__(self) + self.daemon = True + + def run(self): + try: + self.out_pdf.write(self.buf) + except RuntimeError: + pass + def set_metadata(stream, mi): stream.seek(0) - + # Use a StringIO object for the pdf because we will want to over # write it later and if we are working on the stream directly it # could cause some issues. - raw = StringIO.StringIO(stream.read()) + raw = cStringIO.StringIO(stream.read()) orig_pdf = PdfFileReader(raw) - + title = mi.title if mi.title else orig_pdf.documentInfo.title author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author - + out_pdf = PdfFileWriter(title=title, author=author) + out_str = cStringIO.StringIO() + writer = MetadataWriter(out_pdf, out_str) for page in orig_pdf.pages: out_pdf.addPage(page) - - out_str = StringIO.StringIO() - out_pdf.write(out_str) - + + writer.start() + writer.join(10) # Wait 10 secs for writing to complete + out_pdf.killed = True + writer.join() + if out_pdf.killed: + print 'Failed to set metadata: took too long' + return + stream.seek(0) stream.truncate() out_str.seek(0) @@ -59,7 +81,7 @@ def option_parser(): p.remove_option('--category') p.remove_option('--comment') return p - + def main(args=sys.argv): #p = option_parser() #opts, args = p.parse_args(args) @@ -67,14 +89,14 @@ def main(args=sys.argv): print >>sys.stderr, _('Usage: pdf-meta file.pdf') print >>sys.stderr, _('No filename specified.') return 1 - + stream = open(os.path.abspath(os.path.expanduser(args[1])), 'r+b') #mi = MetaInformation(opts.title, opts.authors) #if mi.title or mi.authors: # set_metadata(stream, mi) print unicode(get_metadata(stream)).encode('utf-8') - + return 0 if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file + sys.exit(main()) diff --git a/src/pyPdf/generic.py b/src/pyPdf/generic.py index fb75ef3b3f..5447ef5fbc 100644 --- a/src/pyPdf/generic.py +++ b/src/pyPdf/generic.py @@ -299,7 +299,7 @@ def readStringFromStream(stream): elif tok == "t": tok = "\t" elif tok == "b": - tok == "\b" + tok = "\b" elif tok == "f": tok = "\f" elif tok == "(": @@ -673,7 +673,7 @@ class RectangleObject(ArrayObject): def getUpperLeft_x(self): return self.getLowerLeft_x() - + def getUpperLeft_y(self): return self.getUpperRight_y() diff --git a/src/pyPdf/pdf.py b/src/pyPdf/pdf.py index 362879a39a..710d128ad0 100644 --- a/src/pyPdf/pdf.py +++ b/src/pyPdf/pdf.py @@ -39,15 +39,12 @@ __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" import struct -try: - from cStringIO import StringIO -except ImportError: - from StringIO import StringIO +from cStringIO import StringIO -import filters -import utils -import warnings -from generic import * +from generic import DictionaryObject, NameObject, NumberObject, \ +createStringObject, ArrayObject, ByteStringObject, StreamObject, \ +IndirectObject, utils, readObject, TextStringObject, BooleanObject, \ +RectangleObject, DecodedStreamObject from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList @@ -56,6 +53,7 @@ from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirt # class (typically {@link #PdfFileReader PdfFileReader}). class PdfFileWriter(object): def __init__(self,title=u"Unknown",author=u"Unknown"): + self.killed = False self._header = "%PDF-1.3" self._objects = [] # array of indirect objects @@ -162,7 +160,7 @@ class PdfFileWriter(object): # @param stream An object to write the file to. The object must support # the write method, and the tell method, similar to a file object. def write(self, stream): - import struct, md5 + import md5 externalReferenceMap = {} self.stack = [] @@ -209,11 +207,13 @@ class PdfFileWriter(object): if hasattr(self, "_encrypt"): trailer[NameObject("/Encrypt")] = self._encrypt trailer.writeToStream(stream, None) - + # eof stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)) def _sweepIndirectReferences(self, externMap, data): + if self.killed: + raise RuntimeError('Writer killed') if isinstance(data, DictionaryObject): for key, value in data.items(): origvalue = value @@ -356,8 +356,8 @@ class PdfFileReader(object): return self.flattenedPages[pageNumber] ## - # Read-only property that accesses the - # {@link #PdfFileReader.getNamedDestinations + # Read-only property that accesses the + # {@link #PdfFileReader.getNamedDestinations # getNamedDestinations} function. #

# Stability: Added in v1.10, will exist for all future v1.x releases. @@ -374,7 +374,7 @@ class PdfFileReader(object): if retval == None: retval = {} catalog = self.trailer["/Root"] - + # get the name tree if catalog.has_key("/Dests"): tree = catalog["/Dests"] @@ -382,7 +382,7 @@ class PdfFileReader(object): names = catalog['/Names'] if names.has_key("/Dests"): tree = names['/Dests'] - + if tree == None: return retval @@ -420,17 +420,17 @@ class PdfFileReader(object): if outlines == None: outlines = [] catalog = self.trailer["/Root"] - + # get the outline dictionary and named destinations if catalog.has_key("/Outlines"): lines = catalog["/Outlines"] if lines.has_key("/First"): node = lines["/First"] self._namedDests = self.getNamedDestinations() - + if node == None: return outlines - + # see if there are any more outlines while 1: outline = self._buildOutline(node) @@ -454,10 +454,10 @@ class PdfFileReader(object): page, typ = array[0:2] array = array[2:] return Destination(title, page, typ, *array) - + def _buildOutline(self, node): dest, title, outline = None, None, None - + if node.has_key("/A") and node.has_key("/Title"): # Action, section 8.5 (only type GoTo supported) title = node["/Title"] @@ -951,7 +951,7 @@ class PageObject(DictionaryObject): def _pushPopGS(contents, pdf): # adds a graphics state "push" and "pop" to the beginning and end - # of a content stream. This isolates it from changes such as + # of a content stream. This isolates it from changes such as # transformation matricies. stream = ContentStream(contents, pdf) stream.operations.insert(0, [[], "q"]) @@ -1291,7 +1291,7 @@ class Destination(DictionaryObject): self[NameObject("/Title")] = title self[NameObject("/Page")] = page self[NameObject("/Type")] = typ - + # from table 8.2 of the PDF 1.6 reference. if typ == "/XYZ": (self[NameObject("/Left")], self[NameObject("/Top")], @@ -1307,7 +1307,7 @@ class Destination(DictionaryObject): pass else: raise utils.PdfReadError("Unknown Destination Type: %r" % typ) - + ## # Read-only property accessing the destination title. # @return A string. @@ -1474,25 +1474,25 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr # described in Algorithm 3.2. key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) # 2. Initialize the MD5 hash function and pass the 32-byte padding string - # shown in step 1 of Algorithm 3.2 as input to this function. + # shown in step 1 of Algorithm 3.2 as input to this function. import md5 m = md5.new() m.update(_encryption_padding) # 3. Pass the first element of the file's file identifier array (the value # of the ID entry in the document's trailer dictionary; see Table 3.13 on # page 73) to the hash function and finish the hash. (See implementation - # note 25 in Appendix H.) + # note 25 in Appendix H.) m.update(id1_entry) md5_hash = m.digest() # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption - # function with the encryption key from step 1. + # function with the encryption key from step 1. val = utils.RC4_encrypt(key, md5_hash) # 5. Do the following 19 times: Take the output from the previous # invocation of the RC4 function and pass it as input to a new invocation # of the function; use an encryption key generated by taking each byte of # the original encryption key (obtained in step 2) and performing an XOR # operation between that byte and the single-byte value of the iteration - # counter (from 1 to 19). + # counter (from 1 to 19). for i in range(1, 20): new_key = '' for l in range(len(key)): @@ -1500,7 +1500,7 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr val = utils.RC4_encrypt(new_key, val) # 6. Append 16 bytes of arbitrary padding to the output from the final # invocation of the RC4 function and store the 32-byte result as the value - # of the U entry in the encryption dictionary. + # of the U entry in the encryption dictionary. # (implementator note: I don't know what "arbitrary padding" is supposed to # mean, so I have used null bytes. This seems to match a few other # people's implementations)