def available_input_formats():
    '''
    Collect the file types handled by the enabled input format plugins.

    Every plugin yielded by input_format_plugins() that is not reported as
    disabled by is_disabled() contributes each entry of its ``file_types``.
    Entries are not de-duplicated.

    @return: A list of file type strings.
    '''
    return [file_type
            for plugin in input_format_plugins()
            if not is_disabled(plugin)
            for file_type in plugin.file_types]


def available_output_formats():
    '''
    Collect the file type produced by each enabled output format plugin.

    Every plugin yielded by output_format_plugins() that is not reported as
    disabled by is_disabled() contributes its single ``file_type``.

    @return: A list of file type strings.
    '''
    return [plugin.file_type
            for plugin in output_format_plugins()
            if not is_disabled(plugin)]
def xml2unicode(root, pretty_print=False):
    '''
    Serialize an lxml element to a unicode string.

    @param root: The lxml element to serialize.
    @param pretty_print: Passed through to etree.tostring to request
                         indented output.
    @return: The serialized markup as a unicode string, without an XML
             declaration.
    '''
    # Without an explicit encoding etree.tostring() returns an ASCII byte
    # string in which non-ASCII characters are replaced by numeric character
    # references. Passing the ``unicode`` type as the encoding makes lxml
    # return real unicode text, which is what callers of this helper expect.
    return etree.tostring(root, encoding=unicode, pretty_print=pretty_print)
index.write(html) + + #mi = metadata_from_formats([stream.name]) + mi = MetaInformation(_('Unknown'), _('Unknown')) + opf = OPFCreator(os.getcwd(), mi) + opf.create_manifest([('index.html', None)]) + opf.create_spine(['index.html']) + with open('metadata.opf', 'wb') as opffile: + opf.render(opffile) + + return os.path.join(os.getcwd(), 'metadata.opf') diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py new file mode 100644 index 0000000000..27cdb3f691 --- /dev/null +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL 3' +__copyright__ = '2008, Kovid Goyal \ + 2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import errno, os, sys, subprocess +from functools import partial + +from calibre.ebooks import ConversionError, DRMError +from calibre import isosx, iswindows, islinux +from calibre import CurrentDir +from calibre.ptempfile import TemporaryDirectory + +PDFTOHTML = 'pdftohtml' +popen = subprocess.Popen +if isosx and hasattr(sys, 'frameworks_dir'): + PDFTOHTML = os.path.join(getattr(sys, 'frameworks_dir'), PDFTOHTML) +if iswindows and hasattr(sys, 'frozen'): + PDFTOHTML = os.path.join(os.path.dirname(sys.executable), 'pdftohtml.exe') + popen = partial(subprocess.Popen, creationflags=0x08) # CREATE_NO_WINDOW=0x08 so that no ugly console is popped up +if islinux and getattr(sys, 'frozen_path', False): + PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml') + +def pdftohtml(pdf_path): + ''' + Convert the pdf into html using the pdftohtml app. + @return: The HTML as a unicode string. 
+ ''' + + if isinstance(pdf_path, unicode): + pdf_path = pdf_path.encode(sys.getfilesystemencoding()) + if not os.access(pdf_path, os.R_OK): + raise ConversionError, 'Cannot read from ' + pdf_path + + with TemporaryDirectory('_pdftohtml') as tdir: + index = os.path.join(tdir, 'index.html') + # This is necessary as pdftohtml doesn't always (linux) respect absolute paths + pdf_path = os.path.abspath(pdf_path) + cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-i', '-q', pdf_path, os.path.basename(index)) + cwd = os.getcwd() + + with CurrentDir(tdir): + try: + p = popen(cmd, stderr=subprocess.PIPE) + except OSError, err: + if err.errno == 2: + raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'), True) + else: + raise + + while True: + try: + ret = p.wait() + break + except OSError, e: + if e.errno == errno.EINTR: + continue + else: + raise + + if ret != 0: + err = p.stderr.read() + raise ConversionError, err + if not os.path.exists(index) or os.stat(index).st_size < 100: + raise DRMError() + + with open(index, 'rb') as i: + raw = i.read() + if not '\n' + raw diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index c1e48d98fd..2d1ef98662 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -51,7 +51,7 @@ class TXTOutput(OutputFormatPlugin): out_stream.seek(0) out_stream.truncate() - out_stream.write(txt) + out_stream.write(txt.encode('utf-8')) if close: out_stream.close() diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py index efd3ec0a2f..0f84c32804 100644 --- a/src/calibre/ebooks/txt/writer.py +++ b/src/calibre/ebooks/txt/writer.py @@ -102,12 +102,7 @@ class TxtWriter(object): text = text.replace('\f+', ' ') # Single line paragraph. 
- r = re.compile('.\n.') - while True: - mo = r.search(text) - if mo == None: - break - text = '%s %s' % (text[:mo.start()+1], text[mo.end()-1:]) + text = re.sub('(?<=.)\n(?=.)', ' ', text) # Remove multiple spaces. text = re.sub('[ ]+', ' ', text)