mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Add a timeout to the PDF metadata writer as it hangs on some PDF files
This commit is contained in:
parent
cf77ec2c4a
commit
4e1f851a44
@ -2,7 +2,8 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''Read meta information from PDF files'''
|
||||
|
||||
import sys, os, StringIO
|
||||
import sys, os, cStringIO
|
||||
from threading import Thread
|
||||
|
||||
from calibre.ebooks.metadata import MetaInformation, authors_to_string, get_parser
|
||||
from pyPdf import PdfFileReader, PdfFileWriter
|
||||
@ -29,25 +30,46 @@ def get_metadata(stream):
|
||||
print >>sys.stderr, msg.encode('utf8')
|
||||
return mi
|
||||
|
||||
class MetadataWriter(Thread):
    '''
    Daemon thread that runs ``out_pdf.write(buf)`` in the background.

    Running the write on its own thread lets the caller wait with
    ``join(timeout)`` instead of blocking for as long as the write takes.
    A ``RuntimeError`` raised by the write (``PdfFileWriter`` raises
    ``RuntimeError('Writer killed')`` once its ``killed`` flag is set) is
    treated as a request to stop and ends the thread quietly.

    After the thread has finished, ``completed`` is True only if the write
    returned normally, so callers can tell a finished write from an
    aborted one.
    '''

    def __init__(self, out_pdf, buf):
        '''
        :param out_pdf: object whose ``write(stream)`` method produces the
            output (a ``PdfFileWriter`` in ``set_metadata``).
        :param buf: stream object handed to ``out_pdf.write``.
        '''
        # Initialise the Thread machinery before touching any attributes.
        Thread.__init__(self)
        # Daemon thread: a write that never returns must not keep the
        # interpreter alive at exit.
        self.daemon = True
        self.out_pdf = out_pdf
        self.buf = buf
        # Set to True by run() only when the write returns normally.
        self.completed = False

    def run(self):
        '''Perform the write and record whether it ran to completion.'''
        try:
            self.out_pdf.write(self.buf)
        except RuntimeError:
            # Deliberately swallowed: this is how a killed writer unwinds.
            # ``completed`` stays False so the caller can detect it.
            return
        self.completed = True
|
||||
|
||||
def set_metadata(stream, mi):
|
||||
stream.seek(0)
|
||||
|
||||
|
||||
# Use a StringIO object for the pdf because we will want to over
|
||||
# write it later and if we are working on the stream directly it
|
||||
# could cause some issues.
|
||||
raw = StringIO.StringIO(stream.read())
|
||||
raw = cStringIO.StringIO(stream.read())
|
||||
orig_pdf = PdfFileReader(raw)
|
||||
|
||||
|
||||
title = mi.title if mi.title else orig_pdf.documentInfo.title
|
||||
author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author
|
||||
|
||||
|
||||
out_pdf = PdfFileWriter(title=title, author=author)
|
||||
out_str = cStringIO.StringIO()
|
||||
writer = MetadataWriter(out_pdf, out_str)
|
||||
for page in orig_pdf.pages:
|
||||
out_pdf.addPage(page)
|
||||
|
||||
out_str = StringIO.StringIO()
|
||||
out_pdf.write(out_str)
|
||||
|
||||
|
||||
writer.start()
|
||||
writer.join(10) # Wait 10 secs for writing to complete
|
||||
out_pdf.killed = True
|
||||
writer.join()
|
||||
if out_pdf.killed:
|
||||
print 'Failed to set metadata: took too long'
|
||||
return
|
||||
|
||||
stream.seek(0)
|
||||
stream.truncate()
|
||||
out_str.seek(0)
|
||||
@ -59,7 +81,7 @@ def option_parser():
|
||||
p.remove_option('--category')
|
||||
p.remove_option('--comment')
|
||||
return p
|
||||
|
||||
|
||||
def main(args=sys.argv):
|
||||
#p = option_parser()
|
||||
#opts, args = p.parse_args(args)
|
||||
@ -67,14 +89,14 @@ def main(args=sys.argv):
|
||||
print >>sys.stderr, _('Usage: pdf-meta file.pdf')
|
||||
print >>sys.stderr, _('No filename specified.')
|
||||
return 1
|
||||
|
||||
|
||||
stream = open(os.path.abspath(os.path.expanduser(args[1])), 'r+b')
|
||||
#mi = MetaInformation(opts.title, opts.authors)
|
||||
#if mi.title or mi.authors:
|
||||
# set_metadata(stream, mi)
|
||||
print unicode(get_metadata(stream)).encode('utf-8')
|
||||
|
||||
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
sys.exit(main())
|
||||
|
@ -299,7 +299,7 @@ def readStringFromStream(stream):
|
||||
elif tok == "t":
|
||||
tok = "\t"
|
||||
elif tok == "b":
|
||||
tok == "\b"
|
||||
tok = "\b"
|
||||
elif tok == "f":
|
||||
tok = "\f"
|
||||
elif tok == "(":
|
||||
@ -673,7 +673,7 @@ class RectangleObject(ArrayObject):
|
||||
|
||||
def getUpperLeft_x(self):
    # The upper-left x coordinate is read from the lower-left corner.
    left_x = self.getLowerLeft_x()
    return left_x
|
||||
|
||||
|
||||
def getUpperLeft_y(self):
    # The upper-left y coordinate is read from the upper-right corner.
    top_y = self.getUpperRight_y()
    return top_y
|
||||
|
||||
|
@ -39,15 +39,12 @@ __author__ = "Mathieu Fenniak"
|
||||
__author_email__ = "biziqe@mathieu.fenniak.net"
|
||||
|
||||
import struct
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
from cStringIO import StringIO
|
||||
|
||||
import filters
|
||||
import utils
|
||||
import warnings
|
||||
from generic import *
|
||||
from generic import DictionaryObject, NameObject, NumberObject, \
|
||||
createStringObject, ArrayObject, ByteStringObject, StreamObject, \
|
||||
IndirectObject, utils, readObject, TextStringObject, BooleanObject, \
|
||||
RectangleObject, DecodedStreamObject
|
||||
from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
|
||||
|
||||
|
||||
@ -56,6 +53,7 @@ from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirt
|
||||
# class (typically {@link #PdfFileReader PdfFileReader}).
|
||||
class PdfFileWriter(object):
|
||||
def __init__(self,title=u"Unknown",author=u"Unknown"):
|
||||
self.killed = False
|
||||
self._header = "%PDF-1.3"
|
||||
self._objects = [] # array of indirect objects
|
||||
|
||||
@ -162,7 +160,7 @@ class PdfFileWriter(object):
|
||||
# @param stream An object to write the file to. The object must support
|
||||
# the write method, and the tell method, similar to a file object.
|
||||
def write(self, stream):
|
||||
import struct, md5
|
||||
import md5
|
||||
|
||||
externalReferenceMap = {}
|
||||
self.stack = []
|
||||
@ -209,11 +207,13 @@ class PdfFileWriter(object):
|
||||
if hasattr(self, "_encrypt"):
|
||||
trailer[NameObject("/Encrypt")] = self._encrypt
|
||||
trailer.writeToStream(stream, None)
|
||||
|
||||
|
||||
# eof
|
||||
stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))
|
||||
|
||||
def _sweepIndirectReferences(self, externMap, data):
|
||||
if self.killed:
|
||||
raise RuntimeError('Writer killed')
|
||||
if isinstance(data, DictionaryObject):
|
||||
for key, value in data.items():
|
||||
origvalue = value
|
||||
@ -356,8 +356,8 @@ class PdfFileReader(object):
|
||||
return self.flattenedPages[pageNumber]
|
||||
|
||||
##
|
||||
# Read-only property that accesses the
|
||||
# {@link #PdfFileReader.getNamedDestinations
|
||||
# Read-only property that accesses the
|
||||
# {@link #PdfFileReader.getNamedDestinations
|
||||
# getNamedDestinations} function.
|
||||
# <p>
|
||||
# Stability: Added in v1.10, will exist for all future v1.x releases.
|
||||
@ -374,7 +374,7 @@ class PdfFileReader(object):
|
||||
if retval == None:
|
||||
retval = {}
|
||||
catalog = self.trailer["/Root"]
|
||||
|
||||
|
||||
# get the name tree
|
||||
if catalog.has_key("/Dests"):
|
||||
tree = catalog["/Dests"]
|
||||
@ -382,7 +382,7 @@ class PdfFileReader(object):
|
||||
names = catalog['/Names']
|
||||
if names.has_key("/Dests"):
|
||||
tree = names['/Dests']
|
||||
|
||||
|
||||
if tree == None:
|
||||
return retval
|
||||
|
||||
@ -420,17 +420,17 @@ class PdfFileReader(object):
|
||||
if outlines == None:
|
||||
outlines = []
|
||||
catalog = self.trailer["/Root"]
|
||||
|
||||
|
||||
# get the outline dictionary and named destinations
|
||||
if catalog.has_key("/Outlines"):
|
||||
lines = catalog["/Outlines"]
|
||||
if lines.has_key("/First"):
|
||||
node = lines["/First"]
|
||||
self._namedDests = self.getNamedDestinations()
|
||||
|
||||
|
||||
if node == None:
|
||||
return outlines
|
||||
|
||||
|
||||
# see if there are any more outlines
|
||||
while 1:
|
||||
outline = self._buildOutline(node)
|
||||
@ -454,10 +454,10 @@ class PdfFileReader(object):
|
||||
page, typ = array[0:2]
|
||||
array = array[2:]
|
||||
return Destination(title, page, typ, *array)
|
||||
|
||||
|
||||
def _buildOutline(self, node):
|
||||
dest, title, outline = None, None, None
|
||||
|
||||
|
||||
if node.has_key("/A") and node.has_key("/Title"):
|
||||
# Action, section 8.5 (only type GoTo supported)
|
||||
title = node["/Title"]
|
||||
@ -951,7 +951,7 @@ class PageObject(DictionaryObject):
|
||||
|
||||
def _pushPopGS(contents, pdf):
|
||||
# adds a graphics state "push" and "pop" to the beginning and end
|
||||
# of a content stream. This isolates it from changes such as
|
||||
# of a content stream. This isolates it from changes such as
|
||||
# transformation matrices.
|
||||
stream = ContentStream(contents, pdf)
|
||||
stream.operations.insert(0, [[], "q"])
|
||||
@ -1291,7 +1291,7 @@ class Destination(DictionaryObject):
|
||||
self[NameObject("/Title")] = title
|
||||
self[NameObject("/Page")] = page
|
||||
self[NameObject("/Type")] = typ
|
||||
|
||||
|
||||
# from table 8.2 of the PDF 1.6 reference.
|
||||
if typ == "/XYZ":
|
||||
(self[NameObject("/Left")], self[NameObject("/Top")],
|
||||
@ -1307,7 +1307,7 @@ class Destination(DictionaryObject):
|
||||
pass
|
||||
else:
|
||||
raise utils.PdfReadError("Unknown Destination Type: %r" % typ)
|
||||
|
||||
|
||||
##
|
||||
# Read-only property accessing the destination title.
|
||||
# @return A string.
|
||||
@ -1474,25 +1474,25 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
|
||||
# described in Algorithm 3.2.
|
||||
key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
|
||||
# 2. Initialize the MD5 hash function and pass the 32-byte padding string
|
||||
# shown in step 1 of Algorithm 3.2 as input to this function.
|
||||
# shown in step 1 of Algorithm 3.2 as input to this function.
|
||||
import md5
|
||||
m = md5.new()
|
||||
m.update(_encryption_padding)
|
||||
# 3. Pass the first element of the file's file identifier array (the value
|
||||
# of the ID entry in the document's trailer dictionary; see Table 3.13 on
|
||||
# page 73) to the hash function and finish the hash. (See implementation
|
||||
# note 25 in Appendix H.)
|
||||
# note 25 in Appendix H.)
|
||||
m.update(id1_entry)
|
||||
md5_hash = m.digest()
|
||||
# 4. Encrypt the 16-byte result of the hash, using an RC4 encryption
|
||||
# function with the encryption key from step 1.
|
||||
# function with the encryption key from step 1.
|
||||
val = utils.RC4_encrypt(key, md5_hash)
|
||||
# 5. Do the following 19 times: Take the output from the previous
|
||||
# invocation of the RC4 function and pass it as input to a new invocation
|
||||
# of the function; use an encryption key generated by taking each byte of
|
||||
# the original encryption key (obtained in step 2) and performing an XOR
|
||||
# operation between that byte and the single-byte value of the iteration
|
||||
# counter (from 1 to 19).
|
||||
# counter (from 1 to 19).
|
||||
for i in range(1, 20):
|
||||
new_key = ''
|
||||
for l in range(len(key)):
|
||||
@ -1500,7 +1500,7 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
|
||||
val = utils.RC4_encrypt(new_key, val)
|
||||
# 6. Append 16 bytes of arbitrary padding to the output from the final
|
||||
# invocation of the RC4 function and store the 32-byte result as the value
|
||||
# of the U entry in the encryption dictionary.
|
||||
# of the U entry in the encryption dictionary.
|
||||
# (implementer's note: I don't know what "arbitrary padding" is supposed to
|
||||
# mean, so I have used null bytes. This seems to match a few other
|
||||
# people's implementations)
|
||||
|
Loading…
x
Reference in New Issue
Block a user