mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Switch new pdf engine to use the xml output produced by pdftohtml
This commit is contained in:
parent
7882df7fa3
commit
b2ccb3160d
@ -27,19 +27,16 @@ class PDFInput(InputFormatPlugin):
|
|||||||
])
|
])
|
||||||
|
|
||||||
def convert_new(self, stream, accelerators):
|
def convert_new(self, stream, accelerators):
|
||||||
from calibre.constants import plugins
|
from calibre.ebooks.pdf.pdftohtml import pdftohtml
|
||||||
pdfreflow, pdfreflow_err = plugins['pdfreflow']
|
|
||||||
|
|
||||||
from calibre.ebooks.pdf.reflow import PDFDocument
|
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
if pdfreflow_err:
|
from calibre.ebooks.pdf.reflow import PDFDocument
|
||||||
raise RuntimeError('Failed to load pdfreflow: ' + pdfreflow_err)
|
|
||||||
pdfreflow.reflow(stream.read(), 1, -1)
|
pdftohtml(os.getcwdu(), stream.name, self.opts.no_images, as_xml=True)
|
||||||
xml = clean_ascii_chars(open(u'index.xml', 'rb').read())
|
with open(u'index.xml', 'rb') as f:
|
||||||
|
xml = clean_ascii_chars(f.read())
|
||||||
PDFDocument(xml, self.opts, self.log)
|
PDFDocument(xml, self.opts, self.log)
|
||||||
return os.path.join(os.getcwdu(), u'metadata.opf')
|
return os.path.join(os.getcwdu(), u'metadata.opf')
|
||||||
|
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
accelerators):
|
accelerators):
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
|
@ -24,7 +24,7 @@ if iswindows and hasattr(sys, 'frozen'):
|
|||||||
if (islinux or isbsd) and getattr(sys, 'frozen', False):
|
if (islinux or isbsd) and getattr(sys, 'frozen', False):
|
||||||
PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
|
PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
|
||||||
|
|
||||||
def pdftohtml(output_dir, pdf_path, no_images):
|
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
|
||||||
'''
|
'''
|
||||||
Convert the pdf into html using the pdftohtml app.
|
Convert the pdf into html using the pdftohtml app.
|
||||||
This will write the html as index.html into output_dir.
|
This will write the html as index.html into output_dir.
|
||||||
@ -32,7 +32,7 @@ def pdftohtml(output_dir, pdf_path, no_images):
|
|||||||
'''
|
'''
|
||||||
|
|
||||||
pdfsrc = os.path.join(output_dir, u'src.pdf')
|
pdfsrc = os.path.join(output_dir, u'src.pdf')
|
||||||
index = os.path.join(output_dir, u'index.html')
|
index = os.path.join(output_dir, u'index.'+('xml' if as_xml else 'html'))
|
||||||
|
|
||||||
with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
|
with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
|
||||||
shutil.copyfileobj(src, dest)
|
shutil.copyfileobj(src, dest)
|
||||||
@ -58,6 +58,8 @@ def pdftohtml(output_dir, pdf_path, no_images):
|
|||||||
cmd.remove(b'-nodrm')
|
cmd.remove(b'-nodrm')
|
||||||
if no_images:
|
if no_images:
|
||||||
cmd.append(b'-i')
|
cmd.append(b'-i')
|
||||||
|
if as_xml:
|
||||||
|
cmd.append('-xml')
|
||||||
|
|
||||||
logf = PersistentTemporaryFile(u'pdftohtml_log')
|
logf = PersistentTemporaryFile(u'pdftohtml_log')
|
||||||
try:
|
try:
|
||||||
@ -94,15 +96,16 @@ def pdftohtml(output_dir, pdf_path, no_images):
|
|||||||
if not os.path.exists(index) or os.stat(index).st_size < 100:
|
if not os.path.exists(index) or os.stat(index).st_size < 100:
|
||||||
raise DRMError()
|
raise DRMError()
|
||||||
|
|
||||||
with open(index, 'r+b') as i:
|
if not as_xml:
|
||||||
raw = i.read()
|
with open(index, 'r+b') as i:
|
||||||
raw = flip_images(raw)
|
raw = i.read()
|
||||||
raw = '<!-- created by calibre\'s pdftohtml -->\n' + raw
|
raw = flip_images(raw)
|
||||||
i.seek(0)
|
raw = '<!-- created by calibre\'s pdftohtml -->\n' + raw
|
||||||
i.truncate()
|
i.seek(0)
|
||||||
# versions of pdftohtml >= 0.20 output self closing <br> tags, this
|
i.truncate()
|
||||||
# breaks the pdf heuristics regexps, so replace them
|
# versions of pdftohtml >= 0.20 output self closing <br> tags, this
|
||||||
i.write(raw.replace(b'<br/>', b'<br>'))
|
# breaks the pdf heuristics regexps, so replace them
|
||||||
|
i.write(raw.replace(b'<br/>', b'<br>'))
|
||||||
|
|
||||||
def flip_image(img, flip):
|
def flip_image(img, flip):
|
||||||
from calibre.utils.magick import Image
|
from calibre.utils.magick import Image
|
||||||
|
Loading…
x
Reference in New Issue
Block a user