PDF Input: image extraction, option to disable image extraction.

2025-12-17 02:25:02 -05:00 · 2009-06-20 09:27:02 -04:00 · 2009-06-20 09:27:02 -04:00 · 219c92036d
commit 219c92036d
parent efab7fdcdb
2 changed files with 55 additions and 39 deletions
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
 import os
-from calibre.customize.conversion import InputFormatPlugin
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.pdf.pdftohtml import pdftohtml
 from calibre.ebooks.metadata.opf2 import OPFCreator
@ -17,17 +17,33 @@ class PDFInput(InputFormatPlugin):
    description = 'Convert PDF files to HTML'
    file_types  = set(['pdf'])
    options = set([
        OptionRecommendation(name='no_images', recommended_value=False,
            help=_('Do not extract images from the document')),
    ])
    def convert(self, stream, options, file_ext, log,
                accelerators):
-        html = pdftohtml(stream.name)
+        # The main html file will be named index.html
-
+        pdftohtml(os.getcwd(), stream.name, options.no_images)
        with open('index.html', 'wb') as index:
            index.write(html)
        from calibre.ebooks.metadata.meta import get_metadata
        mi = get_metadata(stream, 'pdf')
        opf = OPFCreator(os.getcwd(), mi)
-        opf.create_manifest([('index.html', None)])
+
        manifest = [('index.html', None)]
        images = os.listdir(os.getcwd())
        images.remove('index.html')
        for i in images:
            # Remove the - from the file name because it causes problems.
            # The reference to the image with the - will be changed to not
            # include it later in the conversion process.
            new_i = i.replace('-', '')
            os.rename(i, new_i)
            manifest.append((new_i, None))
        opf.create_manifest(manifest)
        opf.create_spine(['index.html'])
        with open('metadata.opf', 'wb') as opffile:
            opf.render(opffile)
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@ -14,7 +14,6 @@ from functools import partial
 from calibre.ebooks import ConversionError, DRMError
 from calibre import isosx, iswindows, islinux
 from calibre import CurrentDir
 from calibre.ptempfile import TemporaryDirectory
 PDFTOHTML = 'pdftohtml'
 popen = subprocess.Popen
@ -26,10 +25,11 @@ if iswindows and hasattr(sys, 'frozen'):
 if islinux and getattr(sys, 'frozen_path', False):
    PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml')
-def pdftohtml(pdf_path):
+def pdftohtml(output_dir, pdf_path, no_images):
    '''
    Convert the pdf into html using the pdftohtml app.
-    @return: The HTML as a unicode string.
+    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir
    '''
    if isinstance(pdf_path, unicode):
@ -37,41 +37,41 @@ def pdftohtml(pdf_path):
    if not os.access(pdf_path, os.R_OK):
        raise ConversionError('Cannot read from ' + pdf_path)
-    with TemporaryDirectory('_pdftohtml') as tdir:
+    with CurrentDir(output_dir):
-        index = os.path.join(tdir, 'index.html')
+        index = os.path.join(os.getcwd(), 'index.html')
        # This is necessary as pdftohtml doesn't always (linux) respect absolute paths
        pdf_path = os.path.abspath(pdf_path)
-        cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-i', '-q', pdf_path, os.path.basename(index))
+        cmd = [PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', '-q', pdf_path, os.path.basename(index)]
-        cwd = os.getcwd()
+        if no_images:
            cmd.append('-i')
-        with CurrentDir(tdir):
+        try:
            p = popen(cmd, stderr=subprocess.PIPE)
        except OSError, err:
            if err.errno == 2:
                raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise
        while True:
            try:
-                p = popen(cmd, stderr=subprocess.PIPE)
+                ret = p.wait()
-            except OSError, err:
+                break
-                if err.errno == 2:
+            except OSError, e:
-                    raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'))
+                if e.errno == errno.EINTR:
                    continue
                else:
                    raise
-            while True:
+        if ret != 0:
-                try:
+            err = p.stderr.read()
-                    ret = p.wait()
+            raise ConversionError(err)
-                    break
+        if not os.path.exists(index) or os.stat(index).st_size < 100:
-                except OSError, e:
+            raise DRMError()
                    if e.errno == errno.EINTR:
                        continue
                    else:
                        raise
-            if ret != 0:
+        with open(index, 'rb+wb') as i:
-                err = p.stderr.read()
+            raw = i.read()
-                raise ConversionError(err)
+            raw = '<!-- created by calibre\'s pdftohtml -->\n' + raw
-            if not os.path.exists(index) or os.stat(index).st_size < 100:
+            i.seek(0)
-                raise DRMError()
+            i.truncate()
-
+            i.write(raw)
            with open(index, 'rb') as i:
                raw = i.read()
            if not '<br' in raw[:4000]:
                raise ConversionError(os.path.basename(pdf_path) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'))
            return '<!-- created by calibre\'s pdftohtml -->\n' + raw