diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py index e7dac062f4..c1ad9da7d6 100644 --- a/src/calibre/ebooks/metadata/meta.py +++ b/src/calibre/ebooks/metadata/meta.py @@ -20,6 +20,7 @@ from calibre.ebooks.lrf.meta import set_metadata as set_lrf_metadata from calibre.ebooks.metadata.epub import set_metadata as set_epub_metadata from calibre.ebooks.metadata import MetaInformation +from calibre.utils.config import prefs _METADATA_PRIORITIES = [ 'html', 'htm', 'xhtml', 'xhtm', @@ -59,7 +60,7 @@ def metadata_from_formats(formats): def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False): if stream_type: stream_type = stream_type.lower() - if stream_type in ('html', 'html', 'xhtml', 'xhtm'): + if stream_type in ('html', 'html', 'xhtml', 'xhtm', 'xml'): stream_type = 'html' if stream_type in ('mobi', 'prc'): stream_type = 'mobi' @@ -73,18 +74,20 @@ def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False): if use_libprs_metadata and getattr(opf, 'application_id', None) is not None: return opf - try: - func = eval(stream_type + '_metadata') - mi = func(stream) - except NameError: - mi = MetaInformation(None, None) + mi = MetaInformation(None, None) + if prefs['read_file_metadata']: + try: + func = eval(stream_type + '_metadata') + mi = func(stream) + except NameError: + pass name = os.path.basename(getattr(stream, 'name', '')) base = metadata_from_filename(name) if not base.authors: - base.authors = ['Unknown'] + base.authors = [_('Unknown')] if not base.title: - base.title = 'Unknown' + base.title = _('Unknown') base.smart_update(mi) if opf is not None: base.smart_update(opf) diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index bf8d50adf4..84a38d9ee4 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -5,12 +5,11 @@ __copyright__ = '2008, Kovid Goyal ' import sys, os from calibre.ebooks.metadata import MetaInformation -from 
calibre.ebooks.pyPdf import PdfFileReader +from pyPdf import PdfFileReader def get_metadata(stream): """ Return metadata as a L{MetaInfo} object """ - title = 'Unknown' - mi = MetaInformation(title, ['Unknown']) + mi = MetaInformation(_('Unknown'), [_('Unknown')]) stream.seek(0) try: info = PdfFileReader(stream).getDocumentInfo() diff --git a/src/calibre/gui2/dialogs/config.py b/src/calibre/gui2/dialogs/config.py index 8fa791fd44..7cb03c0f91 100644 --- a/src/calibre/gui2/dialogs/config.py +++ b/src/calibre/gui2/dialogs/config.py @@ -80,6 +80,7 @@ class ConfigDialog(QDialog, Ui_Dialog): self.language.addItem(item[1], QVariant(item[0])) self.output_format.setCurrentIndex(0 if prefs['output_format'] == 'LRF' else 1) + self.pdf_metadata.setChecked(prefs['read_file_metadata']) @@ -113,6 +114,7 @@ class ConfigDialog(QDialog, Ui_Dialog): config['confirm_delete'] = bool(self.confirm_delete.isChecked()) pattern = self.filename_pattern.commit() prefs['filename_pattern'] = pattern + prefs['read_file_metadata'] = bool(self.pdf_metadata.isChecked()) config['save_to_disk_single_format'] = BOOK_EXTENSIONS[self.single_format.currentIndex()] config['cover_flow_queue_length'] = self.cover_browse.value() prefs['language'] = str(self.language.itemData(self.language.currentIndex()).toString()) diff --git a/src/calibre/gui2/dialogs/config.ui b/src/calibre/gui2/dialogs/config.ui index bc5f634af4..76eb59f334 100644 --- a/src/calibre/gui2/dialogs/config.ui +++ b/src/calibre/gui2/dialogs/config.ui @@ -7,7 +7,7 @@ 0 0 709 - 723 + 750 @@ -158,6 +158,19 @@ + + + + If you disable this setting, metadatas is guessed from the filename instead. This can be configured in the Advanced section. 
+ + + Read &metadata from files + + + true + + + diff --git a/src/calibre/utils/config.py b/src/calibre/utils/config.py index 8aa5ac7991..cd5944d3c4 100644 --- a/src/calibre/utils/config.py +++ b/src/calibre/utils/config.py @@ -524,6 +524,8 @@ def _prefs(): help=_('The language in which to display the user interface')) c.add_opt('output_format', default='LRF', help=_('The default output format for ebook conversions.')) + c.add_opt('read_file_metadata', default=True, + help=_('Read metadata from files')) c.add_opt('migrated', default=False, help='For Internal use. Don\'t modify.') return c diff --git a/src/calibre/ebooks/pyPdf/__init__.py b/src/pyPdf/__init__.py similarity index 100% rename from src/calibre/ebooks/pyPdf/__init__.py rename to src/pyPdf/__init__.py diff --git a/src/calibre/ebooks/pyPdf/filters.py b/src/pyPdf/filters.py similarity index 84% rename from src/calibre/ebooks/pyPdf/filters.py rename to src/pyPdf/filters.py index 581cd52111..7fe10fb481 100644 --- a/src/calibre/ebooks/pyPdf/filters.py +++ b/src/pyPdf/filters.py @@ -34,6 +34,11 @@ Implementation of stream filters for PDF. 
__author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" +from utils import PdfReadError +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO try: import zlib @@ -100,32 +105,33 @@ class FlateDecode(object): # predictor 1 == no predictor if predictor != 1: columns = decodeParms["/Columns"] - if predictor >= 10: - newdata = "" + # PNG prediction: + if predictor >= 10 and predictor <= 15: + output = StringIO() # PNG prediction can vary from row to row rowlength = columns + 1 assert len(data) % rowlength == 0 - prev_rowdata = "\x00"*rowlength - for row in range(len(data) / rowlength): - rowdata = list(data[(row*rowlength):((row+1)*rowlength)]) - filterByte = ord(rowdata[0]) + prev_rowdata = (0,) * rowlength + for row in xrange(len(data) / rowlength): + rowdata = [ord(x) for x in data[(row*rowlength):((row+1)*rowlength)]] + filterByte = rowdata[0] if filterByte == 0: pass elif filterByte == 1: for i in range(2, rowlength): - rowdata[i] = chr((ord(rowdata[i]) + ord(rowdata[i-1])) % 256) + rowdata[i] = (rowdata[i] + rowdata[i-1]) % 256 elif filterByte == 2: for i in range(1, rowlength): - rowdata[i] = chr((ord(rowdata[i]) + ord(prev_rowdata[i])) % 256) + rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256 else: # unsupported PNG filter - assert False + raise PdfReadError("Unsupported PNG filter %r" % filterByte) prev_rowdata = rowdata - newdata += ''.join(rowdata[1:]) - data = newdata + output.write(''.join([chr(x) for x in rowdata[1:]])) + data = output.getvalue() else: # unsupported predictor - assert False + raise PdfReadError("Unsupported flatedecode predictor %r" % predictor) return data decode = staticmethod(decode) @@ -220,9 +226,15 @@ def decodeStreamData(stream): data = ASCIIHexDecode.decode(data) elif filterType == "/ASCII85Decode": data = ASCII85Decode.decode(data) + elif filterType == "/Crypt": + decodeParams = stream.get("/DecodeParams", {}) + if "/Name" not in decodeParams and "/Type" not in 
decodeParams: + pass + else: + raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet") else: # unsupported filter - assert False + raise NotImplementedError("unsupported filter %s" % filterType) return data if __name__ == "__main__": @@ -237,3 +249,4 @@ if __name__ == "__main__": """ ascii85_originalText="Man is distinguished, not only by his reason, but by this singular passion from other animals, which is a lust of the mind, that by a perseverance of delight in the continued and indefatigable generation of knowledge, exceeds the short vehemence of any carnal pleasure." assert ASCII85Decode.decode(ascii85Test) == ascii85_originalText + diff --git a/src/calibre/ebooks/pyPdf/generic.py b/src/pyPdf/generic.py similarity index 90% rename from src/calibre/ebooks/pyPdf/generic.py rename to src/pyPdf/generic.py index 69a9ad7b5e..fb75ef3b3f 100644 --- a/src/calibre/ebooks/pyPdf/generic.py +++ b/src/pyPdf/generic.py @@ -203,6 +203,10 @@ class IndirectObject(PdfObject): class FloatObject(decimal.Decimal, PdfObject): + def __new__(cls, value="0", context=None): + return decimal.Decimal.__new__(cls, str(value), context) + def __repr__(self): + return str(self) def writeToStream(self, stream, encryption_key): stream.write(str(self)) @@ -419,8 +423,73 @@ class NameObject(str, PdfObject): class DictionaryObject(dict, PdfObject): - def __init__(self): - pass + + def __init__(self, *args, **kwargs): + if len(args) == 0: + self.update(kwargs) + elif len(args) == 1: + arr = args[0] + # If we're passed a list/tuple, make a dict out of it + if not hasattr(arr, "iteritems"): + newarr = {} + for k, v in arr: + newarr[k] = v + arr = newarr + self.update(arr) + else: + raise TypeError("dict expected at most 1 argument, got 3") + + def update(self, arr): + # note, a ValueError halfway through copying values + # will leave half the values in this dict. 
+ for k, v in arr.iteritems(): + self.__setitem__(k, v) + + def raw_get(self, key): + return dict.__getitem__(self, key) + + def __setitem__(self, key, value): + if not isinstance(key, PdfObject): + raise ValueError("key must be PdfObject") + if not isinstance(value, PdfObject): + raise ValueError("value must be PdfObject") + return dict.__setitem__(self, key, value) + + def setdefault(self, key, value=None): + if not isinstance(key, PdfObject): + raise ValueError("key must be PdfObject") + if not isinstance(value, PdfObject): + raise ValueError("value must be PdfObject") + return dict.setdefault(self, key, value) + + def __getitem__(self, key): + return dict.__getitem__(self, key).getObject() + + ## + # Retrieves XMP (Extensible Metadata Platform) data relevant to the + # this object, if available. + #

+ # Stability: Added in v1.12, will exist for all future v1.x releases. + # @return Returns a {@link #xmp.XmpInformation XmlInformation} instance + # that can be used to access XMP metadata from the document. Can also + # return None if no metadata was found on the document root. + def getXmpMetadata(self): + metadata = self.get("/Metadata", None) + if metadata == None: + return None + metadata = metadata.getObject() + import xmp + if not isinstance(metadata, xmp.XmpInformation): + metadata = xmp.XmpInformation(metadata) + self[NameObject("/Metadata")] = metadata + return metadata + + ## + # Read-only property that accesses the {@link + # #DictionaryObject.getXmpData getXmpData} function. + #

+ # Stability: Added in v1.12, will exist for all future v1.x releases. + xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None) def writeToStream(self, stream, encryption_key): stream.write("<<\n") @@ -563,7 +632,7 @@ class EncodedStreamObject(StreamObject): return self.decodedSelf.getData() else: # create decoded object - decoded = StreamObject() + decoded = DecodedStreamObject() decoded._data = filters.decodeStreamData(self) for key, value in self.items(): if not key in ("/Length", "/Filter", "/DecodeParms"): @@ -583,8 +652,8 @@ class RectangleObject(ArrayObject): ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr]) def ensureIsNumber(self, value): - if not isinstance(value, NumberObject): - value = NumberObject(value) + if not isinstance(value, (NumberObject, FloatObject)): + value = FloatObject(value) return value def __repr__(self): diff --git a/src/calibre/ebooks/pyPdf/pdf.py b/src/pyPdf/pdf.py similarity index 86% rename from src/calibre/ebooks/pyPdf/pdf.py rename to src/pyPdf/pdf.py index f64c1a6c22..ce4331b498 100644 --- a/src/calibre/ebooks/pyPdf/pdf.py +++ b/src/pyPdf/pdf.py @@ -88,7 +88,8 @@ class PdfFileWriter(object): return IndirectObject(len(self._objects), 0, self) def getObject(self, ido): - assert ido.pdf == self + if ido.pdf != self: + raise ValueError("pdf must be self") return self._objects[ido.idnum - 1] ## @@ -105,7 +106,7 @@ class PdfFileWriter(object): page = self._addObject(page) pages = self.getObject(self._pages) pages["/Kids"].append(page) - pages["/Count"] = NumberObject(pages["/Count"] + 1) + pages[NameObject("/Count")] = NumberObject(pages["/Count"] + 1) ## # Encrypt this PDF file with the PDF Standard encryption handler. 
@@ -272,7 +273,6 @@ class PdfFileWriter(object): class PdfFileReader(object): def __init__(self, stream): self.flattenedPages = None - self.pageNumbers = {} self.resolvedObjects = {} self.read(stream) self.stream = stream @@ -290,7 +290,7 @@ class PdfFileReader(object): def getDocumentInfo(self): if not self.trailer.has_key("/Info"): return None - obj = self.getObject(self.trailer['/Info']) + obj = self.trailer['/Info'] retval = DocumentInformation() retval.update(obj) return retval @@ -302,6 +302,28 @@ class PdfFileReader(object): # Stability: Added in v1.7, will exist for all future v1.x releases. documentInfo = property(lambda self: self.getDocumentInfo(), None, None) + ## + # Retrieves XMP (Extensible Metadata Platform) data from the PDF document + # root. + #

+ # Stability: Added in v1.12, will exist for all future v1.x releases. + # @return Returns a {@link #generic.XmpInformation XmlInformation} + # instance that can be used to access XMP metadata from the document. + # Can also return None if no metadata was found on the document root. + def getXmpMetadata(self): + try: + self._override_encryption = True + return self.trailer["/Root"].getXmpMetadata() + finally: + self._override_encryption = False + + ## + # Read-only property that accesses the {@link #PdfFileReader.getXmpData + # getXmpData} function. + #

+ # Stability: Added in v1.12, will exist for all future v1.x releases. + xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None) + ## # Calculates the number of pages in this PDF file. #

@@ -346,43 +368,39 @@ class PdfFileReader(object): # Stability: Added in v1.10, will exist for all future v1.x releases. # @return Returns a dict which maps names to {@link #Destination # destinations}. - def getNamedDestinations(self, tree = None, map = None): - if self.flattenedPages == None: - self._flatten() - - get = self.safeGetObject - if map == None: - map = {} - catalog = get(self.trailer["/Root"]) + def getNamedDestinations(self, tree=None, retval=None): + if retval == None: + retval = {} + catalog = self.trailer["/Root"] # get the name tree if catalog.has_key("/Dests"): - tree = get(catalog["/Dests"]) + tree = catalog["/Dests"] elif catalog.has_key("/Names"): - names = get(catalog['/Names']) + names = catalog['/Names'] if names.has_key("/Dests"): - tree = get(names['/Dests']) + tree = names['/Dests'] if tree == None: - return map + return retval if tree.has_key("/Kids"): # recurse down the tree - for kid in get(tree["/Kids"]): - self.getNamedDestinations(get(kid), map) + for kid in tree["/Kids"]: + self.getNamedDestinations(kid.getObject(), retval) if tree.has_key("/Names"): - names = get(tree["/Names"]) + names = tree["/Names"] for i in range(0, len(names), 2): - key = get(names[i]) - val = get(names[i+1]) + key = names[i].getObject() + val = names[i+1].getObject() if isinstance(val, DictionaryObject) and val.has_key('/D'): - val = get(val['/D']) - dest = self._buildDestination(val, key) + val = val['/D'] + dest = self._buildDestination(key, val) if dest != None: - map[key] = dest + retval[key] = dest - return map + return retval ## # Read-only property that accesses the {@link #PdfFileReader.getOutlines @@ -396,20 +414,16 @@ class PdfFileReader(object): #

# Stability: Added in v1.10, will exist for all future v1.x releases. # @return Returns a nested list of {@link #Destination destinations}. - def getOutlines(self, node = None, outlines = None): - if self.flattenedPages == None: - self._flatten() - - get = self.safeGetObject + def getOutlines(self, node=None, outlines=None): if outlines == None: outlines = [] - catalog = get(self.trailer["/Root"]) + catalog = self.trailer["/Root"] # get the outline dictionary and named destinations if catalog.has_key("/Outlines"): - lines = get(catalog["/Outlines"]) + lines = catalog["/Outlines"] if lines.has_key("/First"): - node = get(lines["/First"]) + node = lines["/First"] self._namedDests = self.getNamedDestinations() if node == None: @@ -424,49 +438,44 @@ class PdfFileReader(object): # check for sub-outlines if node.has_key("/First"): subOutlines = [] - self.getOutlines(get(node["/First"]), subOutlines) + self.getOutlines(node["/First"], subOutlines) if subOutlines: outlines.append(subOutlines) if not node.has_key("/Next"): break - node = get(node["/Next"]) + node = node["/Next"] return outlines - def _buildDestination(self, array, title): - if not (isinstance(array, ArrayObject) and len(array) >= 2 and \ - isinstance(array[0], IndirectObject)): - return None - - pageKey = (array[0].generation, array[0].idnum) - if not self.pageNumbers.has_key(pageKey): - return None - - pageNum = self.pageNumbers[pageKey] - return Destination(*([title, pageNum]+array[1:])) + def _buildDestination(self, title, array): + page, typ = array[0:2] + array = array[2:] + return Destination(title, page, typ, *array) def _buildOutline(self, node): dest, title, outline = None, None, None if node.has_key("/A") and node.has_key("/Title"): # Action, section 8.5 (only type GoTo supported) - title = self.safeGetObject(node["/Title"]) - action = self.safeGetObject(node["/A"]) + title = node["/Title"] + action = node["/A"] if action["/S"] == "/GoTo": - dest = self.safeGetObject(action["/D"]) + dest = 
action["/D"] elif node.has_key("/Dest") and node.has_key("/Title"): # Destination, section 8.2.1 - title = self.safeGetObject(node["/Title"]) - dest = self.safeGetObject(node["/Dest"]) + title = node["/Title"] + dest = node["/Dest"] # if destination found, then create outline if dest: if isinstance(dest, ArrayObject): - outline = self._buildDestination(dest, title) - elif isinstance(dest, str) and self._namedDests.has_key(dest): + outline = self._buildDestination(title, dest) + elif isinstance(dest, unicode) and self._namedDests.has_key(dest): outline = self._namedDests[dest] - outline.title = title + outline[NameObject("/Title")] = title + else: + raise utils.PdfReadError("Unexpected destination %r" % dest) return outline ## @@ -478,7 +487,7 @@ class PdfFileReader(object): pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage), None, None) - def _flatten(self, pages = None, inherit = None): + def _flatten(self, pages=None, inherit=None): inheritablePageAttributes = ( NameObject("/Resources"), NameObject("/MediaBox"), NameObject("/CropBox"), NameObject("/Rotate") @@ -487,37 +496,25 @@ class PdfFileReader(object): inherit = dict() if pages == None: self.flattenedPages = [] - catalog = self.getObject(self.trailer["/Root"]) - pages = self.getObject(catalog["/Pages"]) - indirectReference = None - if isinstance(pages, IndirectObject): - indirectReference = pages - pages = self.getObject(pages) + catalog = self.trailer["/Root"].getObject() + pages = catalog["/Pages"].getObject() t = pages["/Type"] if t == "/Pages": for attr in inheritablePageAttributes: if pages.has_key(attr): inherit[attr] = pages[attr] - for page in self.safeGetObject(pages["/Kids"]): - self._flatten(page, inherit) + for page in pages["/Kids"]: + self._flatten(page.getObject(), inherit) elif t == "/Page": for attr,value in inherit.items(): # if the page has it's own value, it does not inherit the # parent's value: if not pages.has_key(attr): pages[attr] = value - 
pageObj = PageObject(self, indirectReference) + pageObj = PageObject(self) pageObj.update(pages) - if indirectReference: - key = (indirectReference.generation, indirectReference.idnum) - self.pageNumbers[key] = len(self.flattenedPages) self.flattenedPages.append(pageObj) - def safeGetObject(self, obj): - if isinstance(obj, IndirectObject): - return self.safeGetObject(self.getObject(obj)) - return obj - def getObject(self, indirectReference): retval = self.resolvedObjects.get(indirectReference.generation, {}).get(indirectReference.idnum, None) if retval != None: @@ -527,7 +524,7 @@ class PdfFileReader(object): # indirect reference to object in object stream # read the entire object stream into memory stmnum,idx = self.xref_objStm[indirectReference.idnum] - objStm = self.getObject(IndirectObject(stmnum, 0, self)) + objStm = IndirectObject(stmnum, 0, self).getObject() assert objStm['/Type'] == '/ObjStm' assert idx < objStm['/N'] streamData = StringIO(objStm.getData()) @@ -619,7 +616,7 @@ class PdfFileReader(object): # read all cross reference tables and their trailers self.xref = {} self.xref_objStm = {} - self.trailer = {} + self.trailer = DictionaryObject() while 1: # load the xref table stream.seek(startxref, 0) @@ -641,6 +638,16 @@ class PdfFileReader(object): cnt = 0 while cnt < size: line = stream.read(20) + # It's very clear in section 3.4.3 of the PDF spec + # that all cross-reference table lines are a fixed + # 20 bytes. However... some malformed PDF files + # use a single character EOL without a preceeding + # space. Detect that case, and seek the stream + # back one character. 
(0-9 means we've bled into + # the next xref entry, t means we've bled into the + # text "trailer"): + if line[-1] in "0123456789t": + stream.seek(-1, 1) offset, generation = line[:16].split(" ") offset, generation = int(offset), int(generation) if not self.xref.has_key(generation): @@ -669,8 +676,8 @@ class PdfFileReader(object): for key, value in newTrailer.items(): if not self.trailer.has_key(key): self.trailer[key] = value - if newTrailer.has_key(NameObject("/Prev")): - startxref = newTrailer[NameObject("/Prev")] + if newTrailer.has_key("/Prev"): + startxref = newTrailer["/Prev"] else: break elif x.isdigit(): @@ -681,43 +688,46 @@ class PdfFileReader(object): assert xrefstream["/Type"] == "/XRef" self.cacheIndirectObject(generation, idnum, xrefstream) streamData = StringIO(xrefstream.getData()) - num, size = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) + idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) entrySizes = xrefstream.get("/W") - cnt = 0 - while cnt < size: - for i in range(len(entrySizes)): - d = streamData.read(entrySizes[i]) - di = convertToInt(d, entrySizes[i]) - if i == 0: - xref_type = di - elif i == 1: - if xref_type == 0: - next_free_object = di - elif xref_type == 1: - byte_offset = di - elif xref_type == 2: - objstr_num = di - elif i == 2: - if xref_type == 0: - next_generation = di - elif xref_type == 1: - generation = di - elif xref_type == 2: - obstr_idx = di - if xref_type == 0: - pass - elif xref_type == 1: - if not self.xref.has_key(generation): - self.xref[generation] = {} - self.xref[generation][num] = byte_offset - elif xref_type == 2: - self.xref_objStm[num] = [objstr_num, obstr_idx] - cnt += 1 - num += 1 + for num, size in self._pairs(idx_pairs): + cnt = 0 + while cnt < size: + for i in range(len(entrySizes)): + d = streamData.read(entrySizes[i]) + di = convertToInt(d, entrySizes[i]) + if i == 0: + xref_type = di + elif i == 1: + if xref_type == 0: + next_free_object = di + elif xref_type == 1: + 
byte_offset = di + elif xref_type == 2: + objstr_num = di + elif i == 2: + if xref_type == 0: + next_generation = di + elif xref_type == 1: + generation = di + elif xref_type == 2: + obstr_idx = di + if xref_type == 0: + pass + elif xref_type == 1: + if not self.xref.has_key(generation): + self.xref[generation] = {} + if not num in self.xref[generation]: + self.xref[generation][num] = byte_offset + elif xref_type == 2: + if not num in self.xref_objStm: + self.xref_objStm[num] = [objstr_num, obstr_idx] + cnt += 1 + num += 1 trailerKeys = "/Root", "/Encrypt", "/Info", "/ID" for key in trailerKeys: if xrefstream.has_key(key) and not self.trailer.has_key(key): - self.trailer[NameObject(key)] = xrefstream[key] + self.trailer[NameObject(key)] = xrefstream.raw_get(key) if xrefstream.has_key("/Prev"): startxref = xrefstream["/Prev"] else: @@ -737,6 +747,14 @@ class PdfFileReader(object): assert False break + def _pairs(self, array): + i = 0 + while True: + yield array[i], array[i+1] + i += 2 + if (i+1) >= len(array): + break + def readNextEndLine(self, stream): line = "" while True: @@ -778,7 +796,7 @@ class PdfFileReader(object): self._override_encryption = False def _decrypt(self, password): - encrypt = self.safeGetObject(self.trailer['/Encrypt']) + encrypt = self.trailer['/Encrypt'].getObject() if encrypt['/Filter'] != '/Standard': raise NotImplementedError, "only Standard PDF encryption handler is available" if not (encrypt['/V'] in (1, 2)): @@ -788,13 +806,13 @@ class PdfFileReader(object): self._decryption_key = key return 1 else: - rev = self.safeGetObject(encrypt['/R']) + rev = encrypt['/R'].getObject() if rev == 2: keylen = 5 else: - keylen = self.safeGetObject(encrypt['/Length']) / 8 + keylen = encrypt['/Length'].getObject() / 8 key = _alg33_1(password, rev, keylen) - real_O = self.safeGetObject(encrypt["/O"]) + real_O = encrypt["/O"].getObject() if rev == 2: userpass = utils.RC4_encrypt(key, real_O) else: @@ -812,20 +830,20 @@ class PdfFileReader(object): return 
0 def _authenticateUserPassword(self, password): - encrypt = self.safeGetObject(self.trailer['/Encrypt']) - rev = self.safeGetObject(encrypt['/R']) - owner_entry = self.safeGetObject(encrypt['/O']).original_bytes - p_entry = self.safeGetObject(encrypt['/P']) - id_entry = self.safeGetObject(self.trailer['/ID']) - id1_entry = self.safeGetObject(id_entry[0]) + encrypt = self.trailer['/Encrypt'].getObject() + rev = encrypt['/R'].getObject() + owner_entry = encrypt['/O'].getObject().original_bytes + p_entry = encrypt['/P'].getObject() + id_entry = self.trailer['/ID'].getObject() + id1_entry = id_entry[0].getObject() if rev == 2: U, key = _alg34(password, owner_entry, p_entry, id1_entry) elif rev >= 3: U, key = _alg35(password, rev, - self.safeGetObject(encrypt["/Length"]) / 8, owner_entry, + encrypt["/Length"].getObject() / 8, owner_entry, p_entry, id1_entry, - self.safeGetObject(encrypt.get("/EncryptMetadata", False))) - real_U = self.safeGetObject(encrypt['/U']).original_bytes + encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject()) + real_U = encrypt['/U'].getObject().original_bytes return U == real_U, key def getIsEncrypted(self): @@ -874,10 +892,9 @@ def createRectangleAccessor(name, fallback): # will be created by accessing the {@link #PdfFileReader.getPage getPage} # function of the {@link #PdfFileReader PdfFileReader} class. class PageObject(DictionaryObject): - def __init__(self, pdf, indirectReference = None): + def __init__(self, pdf): DictionaryObject.__init__(self) self.pdf = pdf - self.indirectReference = indirectReference ## # Rotates a page clockwise by increments of 90 degrees. @@ -1058,7 +1075,7 @@ class PageObject(DictionaryObject): # implementation-defined manner. Default value: same as MediaBox. #

# Stability: Added in v1.4, will exist for all future v1.x releases. - cropBox = createRectangleAccessor("/CropBox", ("/CropBox",)) + cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",)) ## # A rectangle (RectangleObject), expressed in default user space units, @@ -1110,7 +1127,15 @@ class ContentStream(DecodedStreamObject): break stream.seek(-1, 1) if peek.isalpha() or peek == "'" or peek == '"': - operator = readUntilWhitespace(stream, maxchars=2) + operator = "" + while True: + tok = stream.read(1) + if tok.isspace() or tok in NameObject.delimiterCharacters: + stream.seek(-1, 1) + break + elif tok == '': + break + operator += tok if operator == "BI": # begin inline image - a completely different parsing # mechanism is required, of course... thanks buddy... @@ -1120,6 +1145,14 @@ class ContentStream(DecodedStreamObject): else: self.operations.append((operands, operator)) operands = [] + elif peek == '%': + # If we encounter a comment in the content stream, we have to + # handle it here. Typically, readObject will handle + # encountering a comment -- but readObject assumes that + # following the comment must be the object we're trying to + # read. In this case, it could be an operator instead. + while peek not in ('\r', '\n'): + peek = stream.read(1) else: operands.append(readObject(stream, None)) @@ -1251,86 +1284,74 @@ class DocumentInformation(DictionaryObject): # See section 8.2.1 of the PDF 1.6 reference. # Stability: Added in v1.10, will exist for all v1.x releases. class Destination(DictionaryObject): - def __init__(self, *args): + def __init__(self, title, page, typ, *args): DictionaryObject.__init__(self) - self.title = args[0] - self["/Page"], self["/Type"] = args[1], args[2] + self[NameObject("/Title")] = title + self[NameObject("/Page")] = page + self[NameObject("/Type")] = typ # from table 8.2 of the PDF 1.6 reference. 
- mapNull = lambda x: {True: None, False: x}[isinstance(x, NullObject)] - params = map(mapNull, args[3:]) - type = self["/Type"] - - if type == "/XYZ": - self["/Left"], self["/Top"], self["/Zoom"] = params - elif type == "/FitR": - self["/Left"], self["/Bottom"], \ - self["/Right"], self["/Top"] = params - elif type in ["/FitH", "FitBH"]: - self["/Top"], = params - elif type in ["/FitV", "FitBV"]: - self["/Left"], = params - elif type in ["/Fit", "FitB"]: + if typ == "/XYZ": + (self[NameObject("/Left")], self[NameObject("/Top")], + self[NameObject("/Zoom")]) = args + elif typ == "/FitR": + (self[NameObject("/Left")], self[NameObject("/Bottom")], + self[NameObject("/Right")], self[NameObject("/Top")]) = args + elif typ in ["/FitH", "FitBH"]: + self[NameObject("/Top")], = args + elif typ in ["/FitV", "FitBV"]: + self[NameObject("/Left")], = args + elif typ in ["/Fit", "FitB"]: pass else: - raise utils.PdfReadError, "Unknown Destination Type: " + type + raise utils.PdfReadError("Unknown Destination Type: %r" % typ) - def setTitle(self, title): - self["/Title"] = title.strip() - ## - # Read-write property accessing the destination title. + # Read-only property accessing the destination title. # @return A string. - title = property(lambda self: self.get("/Title"), setTitle, None) + title = property(lambda self: self.get("/Title")) ## # Read-only property accessing the destination page. # @return An integer. - page = property(lambda self: self.get("/Page"), None, None) + page = property(lambda self: self.get("/Page")) ## # Read-only property accessing the destination type. # @return A string. - type = property(lambda self: self.get("/Type"), None, None) + typ = property(lambda self: self.get("/Type")) ## # Read-only property accessing the zoom factor. # @return A number, or None if not available. 
- zoom = property(lambda self: self.get("/Zoom", None), None, None) + zoom = property(lambda self: self.get("/Zoom", None)) ## # Read-only property accessing the left horizontal coordinate. # @return A number, or None if not available. - left = property(lambda self: self.get("/Left", None), None, None) + left = property(lambda self: self.get("/Left", None)) ## # Read-only property accessing the right horizontal coordinate. # @return A number, or None if not available. - right = property(lambda self: self.get("/Right", None), None, None) + right = property(lambda self: self.get("/Right", None)) ## # Read-only property accessing the top vertical coordinate. # @return A number, or None if not available. - top = property(lambda self: self.get("/Top", None), None, None) + top = property(lambda self: self.get("/Top", None)) ## # Read-only property accessing the bottom vertical coordinate. # @return A number, or None if not available. - bottom = property(lambda self: self.get("/Bottom", None), None, None) - + bottom = property(lambda self: self.get("/Bottom", None)) def convertToInt(d, size): - if size <= 4: - d = "\x00\x00\x00\x00" + d - d = d[-4:] - return struct.unpack(">l", d)[0] - elif size <= 8: - d = "\x00\x00\x00\x00\x00\x00\x00\x00" + d - d = d[-8:] - return struct.unpack(">q", d)[0] - else: - # size too big - assert False + if size > 8: + raise utils.PdfReadError("invalid size in convertToInt") + d = "\x00\x00\x00\x00\x00\x00\x00\x00" + d + d = d[-8:] + return struct.unpack(">q", d)[0] # ref: pdf1.8 spec section 3.5.2 algorithm 3.2 _encryption_padding = '\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56' + \ diff --git a/src/calibre/ebooks/pyPdf/utils.py b/src/pyPdf/utils.py similarity index 90% rename from src/calibre/ebooks/pyPdf/utils.py rename to src/pyPdf/utils.py index 860a42e669..dd0a3d002a 100644 --- a/src/calibre/ebooks/pyPdf/utils.py +++ b/src/pyPdf/utils.py @@ -34,6 +34,19 @@ Utility functions for PDF library. 
__author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" +#ENABLE_PSYCO = False +#if ENABLE_PSYCO: +# try: +# import psyco +# except ImportError: +# ENABLE_PSYCO = False +# +#if not ENABLE_PSYCO: +# class psyco: +# def proxy(func): +# return func +# proxy = staticmethod(proxy) + def readUntilWhitespace(stream, maxchars=None): txt = "" while True: diff --git a/src/pyPdf/xmp.py b/src/pyPdf/xmp.py new file mode 100644 index 0000000000..b070df9093 --- /dev/null +++ b/src/pyPdf/xmp.py @@ -0,0 +1,355 @@ +import re +import datetime +import decimal +from generic import PdfObject +from xml.dom import getDOMImplementation +from xml.dom.minidom import parseString + +RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" +DC_NAMESPACE = "http://purl.org/dc/elements/1.1/" +XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/" +PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/" +XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/" + +# What is the PDFX namespace, you might ask? I might ask that too. It's +# a completely undocumented namespace used to place "custom metadata" +# properties, which are arbitrary metadata properties with no semantic or +# documented meaning. Elements in the namespace are key/value-style storage, +# where the element name is the key and the content is the value. The keys +# are transformed into valid XML identifiers by substituting an invalid +# identifier character with \u2182 followed by the unicode hex ID of the +# original character. A key like "my car" is therefore "my\u21820020car". +# +# \u2182, in case you're wondering, is the unicode character +# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for +# escaping characters. +# +# Intentional users of the pdfx namespace should be shot on sight. A +# custom data schema and sensical XML elements could be used instead, as is +# suggested by Adobe's own documentation on XMP (under "Extensibility of +# Schemas"). 
+#
+# Information presented here on the /pdfx/ schema is a result of limited
+# reverse engineering, and does not constitute a full specification.
+PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"
+
+iso8601 = re.compile("""
+        (?P<year>[0-9]{4})
+        (-
+            (?P<month>[0-9]{2})
+            (-
+                (?P<day>[0-9]+)
+                (T
+                    (?P<hour>[0-9]{2}):
+                    (?P<minute>[0-9]{2})
+                    (:(?P<second>[0-9]{2}(.[0-9]+)?))?
+                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
+                )?
+            )?
+        )?
+        """, re.VERBOSE)
+
+##
+# An object that represents Adobe XMP metadata.
+class XmpInformation(PdfObject):
+
+    def __init__(self, stream):
+        self.stream = stream
+        docRoot = parseString(self.stream.getData())
+        self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0]
+        self.cache = {}
+
+    def writeToStream(self, stream, encryption_key):
+        self.stream.writeToStream(stream, encryption_key)
+
+    def getElement(self, aboutUri, namespace, name):
+        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
+            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
+                attr = desc.getAttributeNodeNS(namespace, name)
+                if attr != None:
+                    yield attr
+                for element in desc.getElementsByTagNameNS(namespace, name):
+                    yield element
+
+    def getNodesInNamespace(self, aboutUri, namespace):
+        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
+            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
+                for i in range(desc.attributes.length):
+                    attr = desc.attributes.item(i)
+                    if attr.namespaceURI == namespace:
+                        yield attr
+                for child in desc.childNodes:
+                    if child.namespaceURI == namespace:
+                        yield child
+
+    def _getText(self, element):
+        text = ""
+        for child in element.childNodes:
+            if child.nodeType == child.TEXT_NODE:
+                text += child.data
+        return text
+
+    def _converter_string(value):
+        return value
+
+    def _converter_date(value):
+        m = iso8601.match(value)
+        year = int(m.group("year"))
+        month = int(m.group("month") or "1")
+        day = int(m.group("day") or "1")
+        hour = int(m.group("hour") or "0")
+        minute = int(m.group("minute") or "0")
+        second = decimal.Decimal(m.group("second") or "0")
+        seconds = second.to_integral(decimal.ROUND_FLOOR)
+        milliseconds = (second - seconds) * 1000000  # despite the name: microseconds
+        tzd = m.group("tzd") or "Z"
+        dt = datetime.datetime(year, month, day, hour, minute, seconds, milliseconds)
+        if tzd != "Z":
+            tzd_hours, tzd_minutes = [int(x) for x in tzd.split(":")]
+            tzd_hours *= -1  # negate the offset so that adding it yields UTC
+            if tzd_hours < 0:
+                tzd_minutes *= -1
+            dt = dt + datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
+        return dt
+    _test_converter_date = staticmethod(_converter_date)
+
+    def _getter_bag(namespace, name, converter):
+        def get(self):
+            cached = self.cache.get(namespace, {}).get(name)
+            if cached:
+                return cached
+            retval = []
+            for element in self.getElement("", namespace, name):
+                bags = element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag")
+                if len(bags):
+                    for bag in bags:
+                        for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
+                            value = self._getText(item)
+                            value = converter(value)
+                            retval.append(value)
+            ns_cache = self.cache.setdefault(namespace, {})
+            ns_cache[name] = retval
+            return retval
+        return get
+
+    def _getter_seq(namespace, name, converter):
+        def get(self):
+            cached = self.cache.get(namespace, {}).get(name)
+            if cached:
+                return cached
+            retval = []
+            for element in self.getElement("", namespace, name):
+                seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
+                if len(seqs):
+                    for seq in seqs:
+                        for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
+                            value = self._getText(item)
+                            value = converter(value)
+                            retval.append(value)
+                else:
+                    value = converter(self._getText(element))
+                    retval.append(value)
+            ns_cache = self.cache.setdefault(namespace, {})
+            ns_cache[name] = retval
+            return retval
+        return get
+
+    def _getter_langalt(namespace, name, converter):
+        def get(self):
+            cached = self.cache.get(namespace, {}).get(name)
+            if cached:
+                return cached
+            retval = {}
+            for element in self.getElement("", namespace, name):
+                alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
+                if len(alts):
+                    for alt in alts:
+                        for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
+                            value = self._getText(item)
+                            value = converter(value)
+                            retval[item.getAttribute("xml:lang")] = value
+                else:
+                    retval["x-default"] = converter(self._getText(element))
+            ns_cache = self.cache.setdefault(namespace, {})
+            ns_cache[name] = retval
+            return retval
+        return get
+
+    def _getter_single(namespace, name, converter):
+        def get(self):
+            cached = self.cache.get(namespace, {}).get(name)
+            if cached:
+                return cached
+            value = None
+            for element in self.getElement("", namespace, name):
+                if element.nodeType == element.ATTRIBUTE_NODE:
+                    value = element.nodeValue
+                else:
+                    value = self._getText(element)
+                break
+            if value != None:
+                value = converter(value)
+            ns_cache = self.cache.setdefault(namespace, {})
+            ns_cache[name] = value
+            return value
+        return get
+
+    ##
+    # Contributors to the resource (other than the authors). An unsorted
+    # array of names.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))
+
+    ##
+    # Text describing the extent or scope of the resource.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))
+
+    ##
+    # A sorted array of names of the authors of the resource, listed in order
+    # of precedence.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))
+
+    ##
+    # A sorted array of dates (datetime.datetime instances) of significance to
+    # the resource. The dates and times are in UTC.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
+
+    ##
+    # A language-keyed dictionary of textual descriptions of the content of the
+    # resource.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))
+
+    ##
+    # The mime-type of the resource.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))
+
+    ##
+    # Unique identifier of the resource.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))
+
+    ##
+    # An unordered array specifying the languages used in the resource.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))
+
+    ##
+    # An unordered array of publisher names.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))
+
+    ##
+    # An unordered array of text descriptions of relationships to other
+    # documents.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))
+
+    ##
+    # A language-keyed dictionary of textual descriptions of the rights the
+    # user has to this resource.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))
+
+    ##
+    # Unique identifier of the work from which this resource was derived.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))
+
+    ##
+    # An unordered array of descriptive phrases or keywords that specify the
+    # topic of the content of the resource.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))
+
+    ##
+    # A language-keyed dictionary of the title of the resource.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))
+
+    ##
+    # An unordered array of textual descriptions of the document type.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))
+
+    ##
+    # An unformatted text string representing document keywords.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))
+
+    ##
+    # The PDF file version, for example 1.0, 1.3.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))
+
+    ##
+    # The name of the tool that created the PDF document.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))
+
+    ##
+    # The date and time the resource was originally created. The date and
+    # time are returned as a UTC datetime.datetime object.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))
+
+    ##
+    # The date and time the resource was last modified. The date and time
+    # are returned as a UTC datetime.datetime object.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))
+
+    ##
+    # The date and time that any metadata for this resource was last
+    # changed. The date and time are returned as a UTC datetime.datetime
+    # object.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))
+
+    ##
+    # The name of the first known tool used to create the resource.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))
+
+    ##
+    # The common identifier for all versions and renditions of this resource.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))
+
+    ##
+    # An identifier for a specific incarnation of a document, updated each
+    # time a file is saved.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))
+
+    def custom_properties(self):
+        if not hasattr(self, "_custom_properties"):
+            self._custom_properties = {}
+            for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
+                key = node.localName
+                while True:
+                    # see documentation about PDFX_NAMESPACE earlier in file
+                    idx = key.find(u"\u2182")
+                    if idx == -1:
+                        break
+                    key = key[:idx] + chr(int(key[idx+1:idx+5], base=16)) + key[idx+5:]
+                if node.nodeType == node.ATTRIBUTE_NODE:
+                    value = node.nodeValue
+                else:
+                    value = self._getText(node)
+                self._custom_properties[key] = value
+        return self._custom_properties
+
+    ##
+    # Retrieves custom metadata properties defined in the undocumented pdfx
+    # metadata schema.
+    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
+    # @return Returns a dictionary of key/value items for custom metadata
+    # properties.
+    custom_properties = property(custom_properties)
+
+