PDF metadata: Workaround for PDF files with corrupted XMP metadata packets, generated by Nitro PDF. See #1541981 (Private bug)

2025-07-09 03:04:10 -04:00 · 2016-02-05 21:48:17 +05:30 · 2016-02-05 21:48:17 +05:30 · 1b148eb370
commit 1b148eb370
parent c585ad5893
2 changed files with 5 additions and 3 deletions
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@@ -13,8 +13,6 @@ from calibre.ebooks.metadata import (
    MetaInformation, string_to_authors, check_isbn, check_doi)
 from calibre.utils.ipc.simple_worker import fork_job, WorkerError

-#_isbn_pat = re.compile(r'ISBN[: ]*([-0-9Xx]+)')
-
 def get_tools():
    from calibre.ebooks.pdf.pdftohtml import PDFTOHTML
    base = os.path.dirname(PDFTOHTML)
--- a/src/calibre/ebooks/metadata/xmp.py
+++ b/src/calibre/ebooks/metadata/xmp.py
@@ -221,6 +221,10 @@ def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
+    if title.startswith(r'\376\377'):
+        # corrupted XMP packet generated by Nitro PDF. See
+        # https://bugs.launchpad.net/calibre/+bug/1541981
+        raise ValueError('Corrupted XMP metadata packet detected, probably generated by Nitro PDF')
    if title:
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
@@ -341,7 +345,7 @@ def consolidate_metadata(info_mi, info):
        # We'll use the xmp tags/authors but fallback to the info ones if the
        # xmp does not have tags/authors. smart_update() should have taken care of
        # the rest
-        info_mi.authors, info_mi.tags = xmp_mi.authors or info_mi.authors, xmp_mi.tags or info_mi.tags
+        info_mi.authors, info_mi.tags = (info_authors if xmp_mi.is_null('authors') else xmp_mi.authors), xmp_mi.tags or info_tags
    return info_mi

 def nsmap(*args):