PDF Input: image extraction, option to disable image extraction.

This commit is contained in:
John Schember 2009-06-20 09:27:02 -04:00
parent efab7fdcdb
commit 219c92036d
2 changed files with 55 additions and 39 deletions

View File

@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.pdf.pdftohtml import pdftohtml
from calibre.ebooks.metadata.opf2 import OPFCreator
@ -17,17 +17,33 @@ class PDFInput(InputFormatPlugin):
description = 'Convert PDF files to HTML'
file_types = set(['pdf'])
options = set([
OptionRecommendation(name='no_images', recommended_value=False,
help=_('Do not extract images from the document')),
])
def convert(self, stream, options, file_ext, log,
accelerators):
html = pdftohtml(stream.name)
with open('index.html', 'wb') as index:
index.write(html)
# The main html file will be named index.html
pdftohtml(os.getcwd(), stream.name, options.no_images)
from calibre.ebooks.metadata.meta import get_metadata
mi = get_metadata(stream, 'pdf')
opf = OPFCreator(os.getcwd(), mi)
opf.create_manifest([('index.html', None)])
manifest = [('index.html', None)]
images = os.listdir(os.getcwd())
images.remove('index.html')
for i in images:
# Remove the - from the file name because it causes problems.
# The reference to the image with the - will be changed to not
# include it later in the conversion process.
new_i = i.replace('-', '')
os.rename(i, new_i)
manifest.append((new_i, None))
opf.create_manifest(manifest)
opf.create_spine(['index.html'])
with open('metadata.opf', 'wb') as opffile:
opf.render(opffile)

View File

@ -14,7 +14,6 @@ from functools import partial
from calibre.ebooks import ConversionError, DRMError
from calibre import isosx, iswindows, islinux
from calibre import CurrentDir
from calibre.ptempfile import TemporaryDirectory
PDFTOHTML = 'pdftohtml'
popen = subprocess.Popen
@ -26,10 +25,11 @@ if iswindows and hasattr(sys, 'frozen'):
if islinux and getattr(sys, 'frozen_path', False):
PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml')
def pdftohtml(pdf_path):
def pdftohtml(output_dir, pdf_path, no_images):
'''
Convert the pdf into html using the pdftohtml app.
@return: The HTML as a unicode string.
This will write the html as index.html into output_dir.
It will also write all extracted images to the output_dir.
'''
if isinstance(pdf_path, unicode):
@ -37,14 +37,14 @@ def pdftohtml(pdf_path):
if not os.access(pdf_path, os.R_OK):
raise ConversionError('Cannot read from ' + pdf_path)
with TemporaryDirectory('_pdftohtml') as tdir:
index = os.path.join(tdir, 'index.html')
with CurrentDir(output_dir):
index = os.path.join(os.getcwd(), 'index.html')
# This is necessary as pdftohtml doesn't always (linux) respect absolute paths
pdf_path = os.path.abspath(pdf_path)
cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-i', '-q', pdf_path, os.path.basename(index))
cwd = os.getcwd()
cmd = [PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', '-q', pdf_path, os.path.basename(index)]
if no_images:
cmd.append('-i')
with CurrentDir(tdir):
try:
p = popen(cmd, stderr=subprocess.PIPE)
except OSError, err:
@ -69,9 +69,9 @@ def pdftohtml(pdf_path):
if not os.path.exists(index) or os.stat(index).st_size < 100:
raise DRMError()
with open(index, 'rb') as i:
with open(index, 'rb+wb') as i:
raw = i.read()
if not '<br' in raw[:4000]:
raise ConversionError(os.path.basename(pdf_path) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'))
return '<!-- created by calibre\'s pdftohtml -->\n' + raw
raw = '<!-- created by calibre\'s pdftohtml -->\n' + raw
i.seek(0)
i.truncate()
i.write(raw)