From 826dc25ffde9ad468e15e08a34b1a433efd65e34 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 27 Oct 2015 07:00:38 +0530 Subject: [PATCH] When converting AZW4 files to PDF, skip the conversion and simply unwrap the PDF file already embedded inside the AZW4 file. This almost always gives better results, since otherwise we are doing a PDF to PDF conversion. --- src/calibre/ebooks/azw4/reader.py | 21 ++++++++++++++------- src/calibre/ebooks/conversion/plumber.py | 9 +++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/azw4/reader.py b/src/calibre/ebooks/azw4/reader.py index 5acb86b3fc..60eaaef20e 100644 --- a/src/calibre/ebooks/azw4/reader.py +++ b/src/calibre/ebooks/azw4/reader.py @@ -16,6 +16,15 @@ import re from calibre.ebooks.pdb.formatreader import FormatReader +def unwrap(stream, output_path): + raw_data = stream.read() + m = re.search(br'%PDF.+%%EOF', raw_data, flags=re.DOTALL) + if m is None: + raise ValueError('No embedded PDF found in AZW4 file') + with open(output_path, 'wb') as f: + f.write(m.group()) + + class Reader(FormatReader): def __init__(self, header, stream, log, options): @@ -30,17 +39,15 @@ class Reader(FormatReader): self.stream.seek(0) raw_data = self.stream.read() data = '' - mo = re.search(r'(?ums)%PDF.*%%EOF.', raw_data) + mo = re.search(br'%PDF.+%%EOF', raw_data, flags=re.DOTALL) if mo: data = mo.group() - + pdf_n = os.path.join(os.getcwdu(), 'tmp.pdf') - pdf = open(pdf_n, 'wb') - pdf.write(data) - pdf.close() - + with open(pdf_n, 'wb') as pdf: + pdf.write(data) from calibre.customize.ui import plugin_for_input_format - + pdf_plugin = plugin_for_input_format('pdf') for opt in pdf_plugin.options: if not hasattr(self.options, opt.option.name): diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 4c73aa8272..e027fdbcf9 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -1030,6 +1030,15 @@ OptionRecommendation(name='search_replace', if hasattr(self.opts, 'lrf') and self.output_plugin.file_type == 'lrf': self.opts.lrf = True + if self.input_fmt == 'azw4' and self.output_plugin.file_type == 'pdf': + self.ui_reporter(0.01, 'AZW4 files are simply wrappers around PDF files.' + ' Skipping the conversion and unwrapping the embedded PDF instead') + from calibre.ebooks.azw4.reader import unwrap + unwrap(stream, self.output) + self.ui_reporter(1.) + self.log(self.output_fmt.upper(), 'output written to', self.output) + self.flush() + return self.ui_reporter(0.01, _('Converting input to HTML...')) ir = CompositeProgressReporter(0.01, 0.34, self.ui_reporter)