diff --git a/src/calibre/ebooks/conversion/plugins/pdf_input.py b/src/calibre/ebooks/conversion/plugins/pdf_input.py index 08cb2d0154..3bbe33207d 100644 --- a/src/calibre/ebooks/conversion/plugins/pdf_input.py +++ b/src/calibre/ebooks/conversion/plugins/pdf_input.py @@ -27,19 +27,16 @@ class PDFInput(InputFormatPlugin): ]) def convert_new(self, stream, accelerators): - from calibre.constants import plugins - pdfreflow, pdfreflow_err = plugins['pdfreflow'] - - from calibre.ebooks.pdf.reflow import PDFDocument + from calibre.ebooks.pdf.pdftohtml import pdftohtml from calibre.utils.cleantext import clean_ascii_chars - if pdfreflow_err: - raise RuntimeError('Failed to load pdfreflow: ' + pdfreflow_err) - pdfreflow.reflow(stream.read(), 1, -1) - xml = clean_ascii_chars(open(u'index.xml', 'rb').read()) + from calibre.ebooks.pdf.reflow import PDFDocument + + pdftohtml(os.getcwdu(), stream.name, self.opts.no_images, as_xml=True) + with open(u'index.xml', 'rb') as f: + xml = clean_ascii_chars(f.read()) PDFDocument(xml, self.opts, self.log) return os.path.join(os.getcwdu(), u'metadata.opf') - def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.metadata.opf2 import OPFCreator diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 7c5e852b24..ca950b84b2 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -24,7 +24,7 @@ if iswindows and hasattr(sys, 'frozen'): if (islinux or isbsd) and getattr(sys, 'frozen', False): PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml') -def pdftohtml(output_dir, pdf_path, no_images): +def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): ''' Convert the pdf into html using the pdftohtml app. This will write the html as index.html into output_dir. 
@@ -32,7 +32,7 @@ def pdftohtml(output_dir, pdf_path, no_images): ''' pdfsrc = os.path.join(output_dir, u'src.pdf') - index = os.path.join(output_dir, u'index.html') + index = os.path.join(output_dir, u'index.'+('xml' if as_xml else 'html')) with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest: shutil.copyfileobj(src, dest) @@ -58,6 +58,8 @@ def pdftohtml(output_dir, pdf_path, no_images): cmd.remove(b'-nodrm') if no_images: cmd.append(b'-i') + if as_xml: + cmd.append('-xml') logf = PersistentTemporaryFile(u'pdftohtml_log') try: @@ -94,15 +96,16 @@ def pdftohtml(output_dir, pdf_path, no_images): if not os.path.exists(index) or os.stat(index).st_size < 100: raise DRMError() - with open(index, 'r+b') as i: - raw = i.read() - raw = flip_images(raw) - raw = '\n' + raw - i.seek(0) - i.truncate() - # versions of pdftohtml >= 0.20 output self closing <br> tags, this - # breaks the pdf heuristics regexps, so replace them - i.write(raw.replace(b'<br/>', b'<br>')) + if not as_xml: + with open(index, 'r+b') as i: + raw = i.read() + raw = flip_images(raw) + raw = '\n' + raw + i.seek(0) + i.truncate() + # versions of pdftohtml >= 0.20 output self closing <br> tags, this + # breaks the pdf heuristics regexps, so replace them + i.write(raw.replace(b'<br/>', b'<br>')) def flip_image(img, flip): from calibre.utils.magick import Image