When converting AZW4 files to PDF, skip the conversion and simply unwrap the PDF file already embedded inside the AZW4 file.

This almost always gives better results, since otherwise we would be doing a PDF-to-PDF conversion.
2025-07-09 03:04:10 -04:00 · 2015-10-27 07:00:38 +05:30 · 2015-10-27 07:00:38 +05:30 · 826dc25ffd
commit 826dc25ffd
parent de641e723a
2 changed files with 23 additions and 7 deletions
--- a/src/calibre/ebooks/azw4/reader.py
+++ b/src/calibre/ebooks/azw4/reader.py
@ -16,6 +16,15 @@ import re

 from calibre.ebooks.pdb.formatreader import FormatReader

+def unwrap(stream, output_path):
+    # Locate the PDF embedded in the AZW4 stream and save it to output_path.
+    found = re.search(br'%PDF.+%%EOF', stream.read(), flags=re.DOTALL)
+    if found is None:
+        raise ValueError('No embedded PDF found in AZW4 file')
+    with open(output_path, 'wb') as out:
+        out.write(found.group())
+
+
 class Reader(FormatReader):

    def __init__(self, header, stream, log, options):
@ -30,15 +39,13 @@ class Reader(FormatReader):
        self.stream.seek(0)
        raw_data = self.stream.read()
        data = ''
-        mo = re.search(r'(?ums)%PDF.*%%EOF.', raw_data)
+        mo = re.search(br'%PDF.+%%EOF', raw_data, flags=re.DOTALL)
        if mo:
            data = mo.group()

        pdf_n = os.path.join(os.getcwdu(), 'tmp.pdf')
-        pdf = open(pdf_n, 'wb')
-        pdf.write(data)
-        pdf.close()
-    
+        with open(pdf_n, 'wb') as pdf:
+            pdf.write(data)
        from calibre.customize.ui import plugin_for_input_format

        pdf_plugin = plugin_for_input_format('pdf')
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -1030,6 +1030,15 @@ OptionRecommendation(name='search_replace',

        if hasattr(self.opts, 'lrf') and self.output_plugin.file_type == 'lrf':
            self.opts.lrf = True
+        if self.input_fmt == 'azw4' and self.output_plugin.file_type == 'pdf':
+            self.ui_reporter(0.01, 'AZW4 files are simply wrappers around PDF files.'
+                             ' Skipping the conversion and unwrapping the embedded PDF instead')
+            from calibre.ebooks.azw4.reader import unwrap
+            unwrap(stream, self.output)
+            self.ui_reporter(1.)
+            self.log(self.output_fmt.upper(), 'output written to', self.output)
+            self.flush()
+            return

        self.ui_reporter(0.01, _('Converting input to HTML...'))
        ir = CompositeProgressReporter(0.01, 0.34, self.ui_reporter)