PDF metadata: Workaround for PDF files with corrupted XMP metadata packets, generated by Nitro PDF. See #1541981 (Private bug)

This commit is contained in:
Kovid Goyal 2016-02-05 21:48:17 +05:30
parent c585ad5893
commit 1b148eb370
2 changed files with 5 additions and 3 deletions

View File

@@ -13,8 +13,6 @@ from calibre.ebooks.metadata import (
MetaInformation, string_to_authors, check_isbn, check_doi)
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
#_isbn_pat = re.compile(r'ISBN[: ]*([-0-9Xx]+)')
def get_tools():
from calibre.ebooks.pdf.pdftohtml import PDFTOHTML
base = os.path.dirname(PDFTOHTML)

View File

@@ -221,6 +221,10 @@ def metadata_from_xmp_packet(raw_bytes):
root = parse_xmp_packet(raw_bytes)
mi = Metadata(_('Unknown'))
title = first_alt('//dc:title', root)
if title.startswith(r'\376\377'):
# corrupted XMP packet generated by Nitro PDF. See
# https://bugs.launchpad.net/calibre/+bug/1541981
raise ValueError('Corrupted XMP metadata packet detected, probably generated by Nitro PDF')
if title:
mi.title = title
authors = multiple_sequences('//dc:creator', root)
@@ -341,7 +345,7 @@ def consolidate_metadata(info_mi, info):
# We'll use the xmp tags/authors but fallback to the info ones if the
# xmp does not have tags/authors. smart_update() should have taken care of
# the rest
info_mi.authors, info_mi.tags = xmp_mi.authors or info_mi.authors, xmp_mi.tags or info_mi.tags
info_mi.authors, info_mi.tags = (info_authors if xmp_mi.is_null('authors') else xmp_mi.authors), xmp_mi.tags or info_tags
return info_mi
def nsmap(*args):