Fix reading metadata from PDF broken by poppler >= 0.47.0

2025-07-08 02:34:06 -04:00 · 2016-09-08 21:05:32 +05:30 · 2016-09-08 21:05:32 +05:30 · 59e8070ef9
commit 59e8070ef9
parent 8f655cd610
1 changed file with 19 additions and 8 deletions
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@@ -33,22 +33,17 @@ def read_info(outputdir, get_cover):
    ans = {}

    try:
-        raw = subprocess.check_output([pdfinfo, '-meta', '-enc', 'UTF-8', 'src.pdf'])
+        raw = subprocess.check_output([pdfinfo, '-enc', 'UTF-8', 'src.pdf'])
    except subprocess.CalledProcessError as e:
        prints('pdfinfo errored out with return code: %d'%e.returncode)
        return None
-    # The XMP metadata could be in an encoding other than UTF-8, so split it
-    # out before trying to decode raw
-    parts = re.split(br'^Metadata:', raw, 1, flags=re.MULTILINE)
-    if len(parts) > 1:
-        raw, ans['xmp_metadata'] = parts
    try:
-        raw = raw.decode('utf-8')
+        info_raw = raw.decode('utf-8')
    except UnicodeDecodeError:
        prints('pdfinfo returned no UTF-8 data')
        return None

-    for line in raw.splitlines():
+    for line in info_raw.splitlines():
        if u':' not in line:
            continue
        field, val = line.partition(u':')[::2]
@@ -56,6 +51,22 @@ def read_info(outputdir, get_cover):
        if field and val:
            ans[field] = val.strip()

+    # Now read XMP metadata
+    # Versions of poppler before 0.47.0 printed both the Info dict and the XMP
+    # metadata packet in a single pdfinfo run. That behavior changed in
+    # https://cgit.freedesktop.org/poppler/poppler/commit/?id=c91483aceb1b640771f572cb3df9ad707e5cad0d
+    # so pdfinfo is run a second time with -meta to fetch the XMP packet.
+    try:
+        raw = subprocess.check_output([pdfinfo, '-meta', 'src.pdf']).strip()
+    except subprocess.CalledProcessError as e:
+        prints('pdfinfo errored out with return code: %d'%e.returncode)
+
+    parts = re.split(br'^Metadata:', raw, 1, flags=re.MULTILINE)
+    if len(parts) > 1:
+        raw, ans['xmp_metadata'] = parts
+    elif raw:
+        ans['xmp_metadata'] = raw
+
    if get_cover:
        try:
            subprocess.check_call([pdftoppm, '-singlefile', '-jpeg', '-cropbox',