When converting AZW4 files to PDF, skip the conversion and simply unwrap the PDF file already embedded inside the AZW4 file.

This almost always gives better results, since otherwise we are doing a
PDF to PDF conversion.
This commit is contained in:
Kovid Goyal 2015-10-27 07:00:38 +05:30
parent de641e723a
commit 826dc25ffd
2 changed files with 23 additions and 7 deletions

View File

@ -16,6 +16,15 @@ import re
from calibre.ebooks.pdb.formatreader import FormatReader
def unwrap(stream, output_path):
    """Extract the PDF embedded in an AZW4 file and write it to disk.

    Everything readable from ``stream`` is loaded into memory and searched
    for a PDF payload, which is then written unchanged to ``output_path``.

    :param stream: Binary file-like object; it is read from its current
        position to the end.
    :param output_path: Path of the file to create (or overwrite) with the
        extracted PDF bytes.
    :raises ValueError: If no ``%PDF`` ... ``%%EOF`` span is found.
    """
    raw_data = stream.read()
    # The match is greedy and DOTALL lets ``.`` cross newlines, so the span
    # runs from the first ``%PDF`` marker through the *last* ``%%EOF`` marker.
    # A PDF may contain several ``%%EOF`` markers; stopping at the first one
    # would truncate such a file.
    m = re.search(br'%PDF.+%%EOF', raw_data, flags=re.DOTALL)
    if m is None:
        raise ValueError('No embedded PDF found in AZW4 file')
    # Only open the output once a payload is known to exist, so a failed
    # unwrap does not leave an empty file behind.
    with open(output_path, 'wb') as f:
        f.write(m.group())
class Reader(FormatReader):
def __init__(self, header, stream, log, options):
@ -30,15 +39,13 @@ class Reader(FormatReader):
self.stream.seek(0)
raw_data = self.stream.read()
data = ''
mo = re.search(r'(?ums)%PDF.*%%EOF.', raw_data)
mo = re.search(br'%PDF.+%%EOF', raw_data, flags=re.DOTALL)
if mo:
data = mo.group()
pdf_n = os.path.join(os.getcwdu(), 'tmp.pdf')
pdf = open(pdf_n, 'wb')
pdf.write(data)
pdf.close()
with open(pdf_n, 'wb') as pdf:
pdf.write(data)
from calibre.customize.ui import plugin_for_input_format
pdf_plugin = plugin_for_input_format('pdf')

View File

@ -1030,6 +1030,15 @@ OptionRecommendation(name='search_replace',
if hasattr(self.opts, 'lrf') and self.output_plugin.file_type == 'lrf':
self.opts.lrf = True
if self.input_fmt == 'azw4' and self.output_plugin.file_type == 'pdf':
self.ui_reporter(0.01, 'AZW4 files are simply wrappers around PDF files.'
' Skipping the conversion and unwrapping the embedded PDF instead')
from calibre.ebooks.azw4.reader import unwrap
unwrap(stream, self.output)
self.ui_reporter(1.)
self.log(self.output_fmt.upper(), 'output written to', self.output)
self.flush()
return
self.ui_reporter(0.01, _('Converting input to HTML...'))
ir = CompositeProgressReporter(0.01, 0.34, self.ui_reporter)