IGN:...

2025-07-07 18:24:30 -04:00 · 2009-04-02 20:30:47 -07:00 · 2009-04-02 20:30:47 -07:00 · aac75238c6
commit aac75238c6
parent ca0fb160f2 754923ce07
7 changed files with 143 additions and 9 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -265,13 +265,14 @@ class MOBIMetadataWriter(MetadataWriterPlugin):

 from calibre.ebooks.epub.input import EPUBInput
 from calibre.ebooks.mobi.input import MOBIInput
+from calibre.ebooks.pdf.input import PDFInput
 from calibre.ebooks.txt.input import TXTInput
 from calibre.ebooks.oeb.output import OEBOutput
 from calibre.ebooks.txt.output import TXTOutput
 from calibre.ebooks.pdf.output import PDFOutput
 from calibre.customize.profiles import input_profiles, output_profiles

-plugins = [HTML2ZIP, EPUBInput, MOBIInput, TXTInput, OEBOutput, TXTOutput, PDFOutput]
+plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, TXTInput, OEBOutput, TXTOutput, PDFOutput]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
                                        x.__name__.endswith('MetadataReader')]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
--- a/src/calibre/customize/ui.py
+++ b/src/calibre/customize/ui.py
@ -254,6 +254,14 @@ def plugin_for_input_format(fmt):
        if fmt.lower() in plugin.file_types:
            return plugin

+def available_input_formats():
+    '''
+    Collect the file types handled by the enabled input format plugins.
+
+    @return: A list holding every entry of file_types for each plugin
+    yielded by input_format_plugins() that is_disabled() does not flag.
+    '''
+    found = []
+    for plugin in input_format_plugins():
+        if is_disabled(plugin):
+            continue
+        found.extend(plugin.file_types)
+    return found
+
 def output_format_plugins():
    for plugin in _initialized_plugins:
        if isinstance(plugin, OutputFormatPlugin):
@ -264,6 +272,12 @@ def plugin_for_output_format(fmt):
        if fmt.lower() == plugin.file_type:
            return plugin

+def available_output_formats():
+    '''
+    Collect the file type written by each enabled output format plugin.
+
+    @return: A list with the file_type of every plugin yielded by
+    output_format_plugins() that is_disabled() does not flag.
+    '''
+    return [plugin.file_type for plugin in output_format_plugins()
+            if not is_disabled(plugin)]

 def disable_plugin(plugin_or_name):
    x = getattr(plugin_or_name, 'name', plugin_or_name)
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -173,6 +173,9 @@ def xml2str(root, pretty_print=False):
    return etree.tostring(root, encoding='utf-8', xml_declaration=True,
                          pretty_print=pretty_print)

+def xml2unicode(root, pretty_print=False):
+    '''
+    Serialize an lxml element tree to a unicode string.
+
+    @param root: The element to serialize.
+    @param pretty_print: Passed through to etree.tostring().
+    @return: The serialized markup as a unicode object (no XML declaration).
+    '''
+    # Without an explicit encoding etree.tostring() returns an ASCII byte
+    # string; encoding=unicode is what makes lxml return real unicode.
+    return etree.tostring(root, encoding=unicode, pretty_print=pretty_print)
+
 ASCII_CHARS   = set(chr(x) for x in xrange(128))
 UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
 URL_SAFE      = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
@ -722,6 +725,14 @@ class Manifest(object):
                return data.encode('utf-8')
            return str(data)
            
+        def __unicode__(self):
+            '''
+            Return this item's data as unicode: unicode data is returned
+            as is, an lxml element is serialized with xml2unicode() and
+            anything else is passed through unicode().
+            '''
+            content = self.data
+            if isinstance(content, unicode):
+                return content
+            if isinstance(content, etree._Element):
+                return xml2unicode(content, pretty_print=self.oeb.pretty_print)
+            return unicode(content)
+
        def __eq__(self, other):
            return id(self) == id(other)

--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+from __future__ import with_statement
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import InputFormatPlugin
+from calibre.ebooks.pdf.pdftohtml import pdftohtml
+from calibre.ebooks.metadata.opf import OPFCreator
+from calibre.ebooks.metadata import MetaInformation
+#from calibre.ebooks.metadata.meta import metadata_from_formats
+
+class PDFInput(InputFormatPlugin):
+    '''
+    Input plugin that converts a PDF to HTML via pdftohtml() and writes an
+    OPF file describing the result.
+    '''
+
+    name        = 'PDF Input'
+    author      = 'John Schember'
+    description = 'Convert PDF files to HTML'
+    file_types  = set(['pdf'])
+
+    def convert(self, stream, options, file_ext, log,
+                accelerators):
+        '''
+        Convert the PDF at stream.name, writing index.html and
+        metadata.opf into the current working directory.
+
+        @return: The path of metadata.opf joined onto the current
+        working directory.
+        '''
+        # Run the conversion first so that no index.html is created when
+        # pdftohtml() raises.
+        markup = pdftohtml(stream.name)
+        with open('index.html', 'wb') as html_file:
+            html_file.write(markup)
+
+        # Metadata is not read from the PDF here; placeholder 'Unknown'
+        # values are used instead.
+        placeholder = MetaInformation(_('Unknown'), _('Unknown'))
+        creator = OPFCreator(os.getcwd(), placeholder)
+        creator.create_manifest([('index.html', None)])
+        creator.create_spine(['index.html'])
+        with open('metadata.opf', 'wb') as opf_file:
+            creator.render(opf_file)
+
+        return os.path.join(os.getcwd(), 'metadata.opf')
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+from __future__ import with_statement
+
+__license__ = 'GPL 3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> \
+                 2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import errno, os, sys, subprocess
+from functools import partial
+
+from calibre.ebooks import ConversionError, DRMError
+from calibre import isosx, iswindows, islinux
+from calibre import CurrentDir
+from calibre.ptempfile import TemporaryDirectory
+
+# Command used to run pdftohtml and the Popen callable used to launch it.
+# The defaults below are overridden when the platform flags and the
+# matching sys attributes are present.
+PDFTOHTML = 'pdftohtml'
+popen = subprocess.Popen
+if isosx and hasattr(sys, 'frameworks_dir'):
+    PDFTOHTML = os.path.join(sys.frameworks_dir, PDFTOHTML)
+if iswindows and hasattr(sys, 'frozen'):
+    # The creation flag keeps an ugly console window from popping up.
+    # NOTE(review): 0x08 is DETACHED_PROCESS in the Win32 headers
+    # (CREATE_NO_WINDOW is 0x08000000) -- confirm which one is intended.
+    popen = partial(subprocess.Popen, creationflags=0x08)
+    PDFTOHTML = os.path.join(os.path.dirname(sys.executable), 'pdftohtml.exe')
+if islinux and getattr(sys, 'frozen_path', False):
+    PDFTOHTML = os.path.join(sys.frozen_path, 'pdftohtml')
+
+def pdftohtml(pdf_path):
+    '''
+    Convert the pdf into html using the pdftohtml app.
+
+    @param pdf_path: Path of the PDF file (str or unicode).
+    @return: The bytes of the HTML file written by pdftohtml (it is run
+    with -enc UTF-8), prefixed with a comment line marking the output as
+    created by calibre. The data is not decoded to unicode.
+    @raise ConversionError: If the PDF is not readable, pdftohtml cannot be
+    found, pdftohtml exits with a non-zero status, or no '<br' occurs in
+    the first 4000 bytes of the output (treated as an image based PDF).
+    @raise DRMError: If pdftohtml produced no output file, or one smaller
+    than 100 bytes.
+    '''
+
+    if isinstance(pdf_path, unicode):
+        pdf_path = pdf_path.encode(sys.getfilesystemencoding())
+    if not os.access(pdf_path, os.R_OK):
+        raise ConversionError('Cannot read from ' + pdf_path)
+
+    with TemporaryDirectory('_pdftohtml') as tdir:
+        index = os.path.join(tdir, 'index.html')
+        errpath = os.path.join(tdir, 'pdftohtml.stderr')
+        # This is necessary as pdftohtml doesn't always (linux) respect absolute paths
+        pdf_path = os.path.abspath(pdf_path)
+        cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-i', '-q', pdf_path, os.path.basename(index))
+
+        with CurrentDir(tdir):
+            # stderr goes to a file instead of a pipe: wait() on a child
+            # whose stderr is a pipe can deadlock once the pipe buffer fills.
+            with open(errpath, 'w+b') as errlog:
+                try:
+                    p = popen(cmd, stderr=errlog)
+                except OSError, err:
+                    if err.errno == errno.ENOENT:
+                        raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'), True)
+                    else:
+                        raise
+
+                # Retry the wait when it is interrupted by a signal.
+                while True:
+                    try:
+                        ret = p.wait()
+                        break
+                    except OSError, e:
+                        if e.errno == errno.EINTR:
+                            continue
+                        else:
+                            raise
+
+                if ret != 0:
+                    errlog.seek(0)
+                    raise ConversionError(errlog.read())
+
+            if not os.path.exists(index) or os.stat(index).st_size < 100:
+                raise DRMError()
+
+            with open(index, 'rb') as i:
+                raw = i.read()
+            if '<br' not in raw[:4000]:
+                raise ConversionError(os.path.basename(pdf_path) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True)
+
+            return '<!-- created by calibre\'s pdftohtml -->\n' + raw
--- a/src/calibre/ebooks/txt/output.py
+++ b/src/calibre/ebooks/txt/output.py
@ -51,7 +51,7 @@ class TXTOutput(OutputFormatPlugin):
        
        out_stream.seek(0)
        out_stream.truncate()
-        out_stream.write(txt)
+        out_stream.write(txt.encode('utf-8'))
        
        if close:
            out_stream.close()
--- a/src/calibre/ebooks/txt/writer.py
+++ b/src/calibre/ebooks/txt/writer.py
@ -102,12 +102,7 @@ class TxtWriter(object):
        text = text.replace('\f+', ' ')
    
        # Single line paragraph.
-        r = re.compile('.\n.')
-        while True:
-            mo = r.search(text)
-            if mo == None:
-                break
-            text = '%s %s' % (text[:mo.start()+1], text[mo.end()-1:])
+        text = re.sub('(?<=.)\n(?=.)', ' ', text)
        
        # Remove multiple spaces.
        text = re.sub('[  ]+', ' ', text)