PDF Input: image extraction, option to disable image extraction.

This commit is contained in:
John Schember 2009-06-20 09:27:02 -04:00
parent efab7fdcdb
commit 219c92036d
2 changed files with 55 additions and 39 deletions

View File

@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
import os import os
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.pdf.pdftohtml import pdftohtml from calibre.ebooks.pdf.pdftohtml import pdftohtml
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
@ -17,17 +17,33 @@ class PDFInput(InputFormatPlugin):
description = 'Convert PDF files to HTML' description = 'Convert PDF files to HTML'
file_types = set(['pdf']) file_types = set(['pdf'])
options = set([
OptionRecommendation(name='no_images', recommended_value=False,
help=_('Do not extract images from the document')),
])
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
html = pdftohtml(stream.name) # The main html file will be named index.html
pdftohtml(os.getcwd(), stream.name, options.no_images)
with open('index.html', 'wb') as index:
index.write(html)
from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.meta import get_metadata
mi = get_metadata(stream, 'pdf') mi = get_metadata(stream, 'pdf')
opf = OPFCreator(os.getcwd(), mi) opf = OPFCreator(os.getcwd(), mi)
opf.create_manifest([('index.html', None)])
manifest = [('index.html', None)]
images = os.listdir(os.getcwd())
images.remove('index.html')
for i in images:
# Remove the - from the file name because it causes problems.
# The reference to the image with the - will be changed to not
# include it later in the conversion process.
new_i = i.replace('-', '')
os.rename(i, new_i)
manifest.append((new_i, None))
opf.create_manifest(manifest)
opf.create_spine(['index.html']) opf.create_spine(['index.html'])
with open('metadata.opf', 'wb') as opffile: with open('metadata.opf', 'wb') as opffile:
opf.render(opffile) opf.render(opffile)

View File

@ -14,7 +14,6 @@ from functools import partial
from calibre.ebooks import ConversionError, DRMError from calibre.ebooks import ConversionError, DRMError
from calibre import isosx, iswindows, islinux from calibre import isosx, iswindows, islinux
from calibre import CurrentDir from calibre import CurrentDir
from calibre.ptempfile import TemporaryDirectory
PDFTOHTML = 'pdftohtml' PDFTOHTML = 'pdftohtml'
popen = subprocess.Popen popen = subprocess.Popen
@ -26,10 +25,11 @@ if iswindows and hasattr(sys, 'frozen'):
if islinux and getattr(sys, 'frozen_path', False): if islinux and getattr(sys, 'frozen_path', False):
PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml') PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml')
def pdftohtml(pdf_path): def pdftohtml(output_dir, pdf_path, no_images):
''' '''
Convert the pdf into html using the pdftohtml app. Convert the pdf into html using the pdftohtml app.
@return: The HTML as a unicode string. This will write the html as index.html into output_dir.
It will also write all extracted images to the output_dir.
''' '''
if isinstance(pdf_path, unicode): if isinstance(pdf_path, unicode):
@ -37,41 +37,41 @@ def pdftohtml(pdf_path):
if not os.access(pdf_path, os.R_OK): if not os.access(pdf_path, os.R_OK):
raise ConversionError('Cannot read from ' + pdf_path) raise ConversionError('Cannot read from ' + pdf_path)
with TemporaryDirectory('_pdftohtml') as tdir: with CurrentDir(output_dir):
index = os.path.join(tdir, 'index.html') index = os.path.join(os.getcwd(), 'index.html')
# This is necessary as pdftohtml doesn't always (linux) respect absolute paths # This is necessary as pdftohtml doesn't always (linux) respect absolute paths
pdf_path = os.path.abspath(pdf_path) pdf_path = os.path.abspath(pdf_path)
cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-i', '-q', pdf_path, os.path.basename(index)) cmd = [PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', '-q', pdf_path, os.path.basename(index)]
cwd = os.getcwd() if no_images:
cmd.append('-i')
with CurrentDir(tdir): try:
p = popen(cmd, stderr=subprocess.PIPE)
except OSError, err:
if err.errno == 2:
raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'))
else:
raise
while True:
try: try:
p = popen(cmd, stderr=subprocess.PIPE) ret = p.wait()
except OSError, err: break
if err.errno == 2: except OSError, e:
raise ConversionError(_('Could not find pdftohtml, check it is in your PATH')) if e.errno == errno.EINTR:
continue
else: else:
raise raise
while True: if ret != 0:
try: err = p.stderr.read()
ret = p.wait() raise ConversionError(err)
break if not os.path.exists(index) or os.stat(index).st_size < 100:
except OSError, e: raise DRMError()
if e.errno == errno.EINTR:
continue
else:
raise
if ret != 0: with open(index, 'rb+wb') as i:
err = p.stderr.read() raw = i.read()
raise ConversionError(err) raw = '<!-- created by calibre\'s pdftohtml -->\n' + raw
if not os.path.exists(index) or os.stat(index).st_size < 100: i.seek(0)
raise DRMError() i.truncate()
i.write(raw)
with open(index, 'rb') as i:
raw = i.read()
if not '<br' in raw[:4000]:
raise ConversionError(os.path.basename(pdf_path) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'))
return '<!-- created by calibre\'s pdftohtml -->\n' + raw