PDF metadata: When both Info and XMP metadata are present, resolve which one to use based on modification timestamps

2025-07-31 14:33:54 -04:00 · 2014-02-18 10:52:52 +05:30 · 2014-02-18 10:52:52 +05:30 · 9d7a1f8417
commit 9d7a1f8417
parent 0ff835c4e6
2 changed files with 29 additions and 5 deletions
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@@ -134,7 +134,7 @@ def get_metadata(stream, cover=True):
    if 'xmp_metadata' in info:
        from calibre.ebooks.metadata.xmp import consolidate_metadata
-        mi = consolidate_metadata(mi, info['xmp_metadata'])
+        mi = consolidate_metadata(mi, info)
    # Look for recognizable identifiers in the info dict, if they were not
    # found in the XMP metadata
--- a/src/calibre/ebooks/metadata/xmp.py
+++ b/src/calibre/ebooks/metadata/xmp.py
@@ -226,6 +226,12 @@ def metadata_from_xmp_packet(raw_bytes):
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    md = first_simple('//xmp:MetadataDate', root)
    if md:
        try:
            mi.metadata_date = parse_date(md)
        except:
            pass
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
@@ -289,15 +295,33 @@ def metadata_from_xmp_packet(raw_bytes):
    return mi
-def consolidate_metadata(info_mi, xmp_packet):
+def consolidate_metadata(info_mi, info):
-    ' When both the PDF Info dict and XMP metadata are present, prefer the xmp metadata '
+    ''' When both the PDF Info dict and XMP metadata are present, prefer the xmp
    metadata unless the Info ModDate is newer than the XMP MetadataDate. This
    is the algorithm recommended by the PDF spec. '''
    try:
-        xmp_mi = metadata_from_xmp_packet(xmp_packet)
+        xmp_mi = metadata_from_xmp_packet(info['xmp_metadata'])
    except:
        import traceback
        traceback.print_exc()
        return info_mi
    info_title, info_authors, info_tags = info_mi.title or _('Unknown'), list(info_mi.authors or ()), list(info_mi.tags or ())
    info_mi.smart_update(xmp_mi, replace_metadata=True)
    prefer_info = False
    if 'ModDate' in info and hasattr(xmp_mi, 'metadata_date'):
        try:
            info_date = parse_date(info['ModDate'])
        except:
            pass
        else:
            prefer_info = info_date > xmp_mi.metadata_date
    if prefer_info:
        info_mi.title, info_mi.authors, info_mi.tags = info_title, info_authors, info_tags
    else:
-        info_mi.smart_update(xmp_mi, replace_metadata=True)
+        # We'll use the xmp tags/authors but fallback to the info ones if the
        # xmp does not have tags/authors. smart_update() should have taken care of
        # the rest
        info_mi.authors, info_mi.tags = xmp_mi.authors or info_mi.authors, xmp_mi.tags or info_mi.tags
    return info_mi
 def nsmap(*args):