From 54e7822128eedc2ee2950f4a4cbe4af18e8c7a2d Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 2 Apr 2009 18:40:42 -0400 Subject: [PATCH 1/4] PDF input and txt output tweaks --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/oeb/base.py | 11 +++++ src/calibre/ebooks/pdf/input.py | 38 +++++++++++++++ src/calibre/ebooks/pdf/pdftohtml.py | 75 +++++++++++++++++++++++++++++ src/calibre/ebooks/txt/output.py | 2 +- src/calibre/ebooks/txt/writer.py | 7 +-- 6 files changed, 128 insertions(+), 8 deletions(-) create mode 100644 src/calibre/ebooks/pdf/input.py create mode 100644 src/calibre/ebooks/pdf/pdftohtml.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index ab6d772121..30f423fce3 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -265,13 +265,14 @@ class MOBIMetadataWriter(MetadataWriterPlugin): from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.mobi.input import MOBIInput +from calibre.ebooks.pdf.input import PDFInput from calibre.ebooks.txt.input import TXTInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.pdf.output import PDFOutput from calibre.customize.profiles import input_profiles, output_profiles -plugins = [HTML2ZIP, EPUBInput, MOBIInput, TXTInput, OEBOutput, TXTOutput, PDFOutput] +plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, TXTInput, OEBOutput, TXTOutput, PDFOutput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index e96de5112f..7d489ec3ae 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -173,6 +173,9 @@ def xml2str(root, pretty_print=False): return etree.tostring(root, encoding='utf-8', xml_declaration=True, pretty_print=pretty_print) +def xml2unicode(root, pretty_print=False): + return etree.tostring(root, pretty_print=pretty_print) + ASCII_CHARS = set(chr(x) for x in xrange(128)) UNIBYTE_CHARS = set(chr(x) for x in xrange(256)) URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' @@ -721,6 +724,14 @@ class Manifest(object): if isinstance(data, unicode): return data.encode('utf-8') return str(data) + + def __unicode__(self): + data = self.data + if isinstance(data, etree._Element): + return xml2unicode(data, pretty_print=self.oeb.pretty_print) + if isinstance(data, unicode): + return data + return unicode(data) def __eq__(self, other): return id(self) == id(other) diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py new file mode 100644 index 0000000000..060b9f5367 --- /dev/null +++ b/src/calibre/ebooks/pdf/input.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import InputFormatPlugin +from calibre.ebooks.pdf.pdftohtml import pdftohtml +from calibre.ebooks.metadata.opf import OPFCreator +from calibre.ebooks.metadata import MetaInformation +#from calibre.ebooks.metadata.meta import metadata_from_formats + +class PDFInput(InputFormatPlugin): + + name = 'PDF Input' + author = 'John Schember' + description = 'Convert PDF files to HTML' + file_types = set(['pdf']) + + def convert(self, stream, options, file_ext, log, + accelerators): + html = pdftohtml(stream.name) + + with open('index.html', 'wb') as index: + index.write(html.encode('utf-8')) + + #mi = metadata_from_formats([stream.name]) + mi = MetaInformation(_('Unknown'), _('Unknown')) + opf = OPFCreator(os.getcwd(), mi) + opf.create_manifest([('index.html', None)]) + opf.create_spine(['index.html']) + with open('metadata.opf', 'wb') as opffile: + opf.render(opffile) + + return os.path.join(os.getcwd(), 'metadata.opf') diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py new file mode 100644 index 0000000000..275cfadb08 --- /dev/null +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL 3' +__copyright__ = '2008, Kovid Goyal \ + 2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import errno, os, sys, subprocess +from functools import partial + +from calibre.ebooks import ConversionError, DRMError +from calibre import isosx, iswindows, islinux +from calibre import CurrentDir +from calibre.ptempfile import TemporaryDirectory + +PDFTOHTML = 'pdftohtml' +popen = subprocess.Popen +if isosx and hasattr(sys, 'frameworks_dir'): + PDFTOHTML = os.path.join(getattr(sys, 'frameworks_dir'), PDFTOHTML) +if iswindows and hasattr(sys, 'frozen'): + PDFTOHTML = os.path.join(os.path.dirname(sys.executable), 'pdftohtml.exe') + popen = partial(subprocess.Popen, creationflags=0x08) # CREATE_NO_WINDOW=0x08 so that no ugly console is popped up +if islinux and getattr(sys, 'frozen_path', False): + PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml') + +def pdftohtml(pdf_path): + ''' + Convert the pdf into html using the pdftohtml app. + @return: The HTML as a unicode string. + ''' + + if isinstance(pdf_path, unicode): + pdf_path = pdf_path.encode(sys.getfilesystemencoding()) + if not os.access(pdf_path, os.R_OK): + raise ConversionError, 'Cannot read from ' + pdf_path + + with TemporaryDirectory('_pdftohtml') as tdir: + index = os.path.join(tdir, 'index.html') + # This is neccessary as pdftohtml doesn't always (linux) respect absolute paths + pdf_path = os.path.abspath(pdf_path) + cmd = (PDFTOHTML, '-noframes', '-p', '-nomerge', pdf_path, os.path.basename(index)) + cwd = os.getcwd() + + with CurrentDir(tdir): + try: + p = popen(cmd, stderr=subprocess.PIPE) + except OSError, err: + if err.errno == 2: + raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'), True) + else: + raise + + while True: + try: + ret = p.wait() + break + except OSError, e: + if e.errno == errno.EINTR: + continue + else: + raise + + if ret != 0: + err = p.stderr.read() + raise ConversionError, err + if not os.path.exists(index) or os.stat(index).st_size < 100: + raise DRMError() + + with open(index, 'rb') as i: + raw = i.read().decode('latin-1') + if not '\n' + raw diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index c1e48d98fd..2d1ef98662 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -51,7 +51,7 @@ class TXTOutput(OutputFormatPlugin): out_stream.seek(0) out_stream.truncate() - out_stream.write(txt) + out_stream.write(txt.encode('utf-8')) if close: out_stream.close() diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py index efd3ec0a2f..0f84c32804 100644 --- a/src/calibre/ebooks/txt/writer.py +++ b/src/calibre/ebooks/txt/writer.py @@ -102,12 +102,7 @@ class TxtWriter(object): text = text.replace('\f+', ' ') # Single line paragraph. - r = re.compile('.\n.') - while True: - mo = r.search(text) - if mo == None: - break - text = '%s %s' % (text[:mo.start()+1], text[mo.end()-1:]) + text = re.sub('(?<=.)\n(?=.)', ' ', text) # Remove multiple spaces. text = re.sub('[ ]+', ' ', text) From 044d1d65fbe8726ac31aef74116fa411c60a044e Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 2 Apr 2009 20:12:45 -0400 Subject: [PATCH 2/4] Get avaliable input/output file ext --- src/calibre/customize/ui.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index d8b7ebf6d8..ee5dc03713 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -254,16 +254,31 @@ def plugin_for_input_format(fmt): if fmt.lower() in plugin.file_types: return plugin +def available_input_formats(): + formats = [] + for plugin in input_format_plugins(): + if not is_disabled(plugin): + for format in plugin.file_types: + formats.append(format) + return formats + def output_format_plugins(): for plugin in _initialized_plugins: if isinstance(plugin, OutputFormatPlugin): - yield plugin + yield plugin.file_type def plugin_for_output_format(fmt): for plugin in output_format_plugins(): if fmt.lower() == plugin.file_type: return plugin - + +def available_output_formats(): + formats = [] + for plugin in _initialized_plugins: + if isinstance(plugin, OutputFormatPlugin): + if not is_disabled(plugin): + formats.append(plugin.file_type) + return formats def disable_plugin(plugin_or_name): x = getattr(plugin_or_name, 'name', plugin_or_name) From a9a74acbdec1f843e91cecd29135a6a3827bd08b Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 2 Apr 2009 20:33:14 -0400 Subject: [PATCH 3/4] tweaks --- src/calibre/customize/ui.py | 9 ++++----- src/calibre/ebooks/pdf/pdftohtml.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index ee5dc03713..af85ca523d 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -265,7 +265,7 @@ def available_input_formats(): def output_format_plugins(): for plugin in _initialized_plugins: if isinstance(plugin, OutputFormatPlugin): - yield plugin.file_type + yield plugin def plugin_for_output_format(fmt): for plugin in output_format_plugins(): @@ -274,10 +274,9 @@ def plugin_for_output_format(fmt): def available_output_formats(): formats = [] - for plugin in _initialized_plugins: - if isinstance(plugin, OutputFormatPlugin): - if not is_disabled(plugin): - formats.append(plugin.file_type) + for plugin in output_format_plugins(): + if not is_disabled(plugin): + formats.append(plugin.file_type) return formats def disable_plugin(plugin_or_name): diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 275cfadb08..168923ad1a 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -39,7 +39,7 @@ def pdftohtml(pdf_path): index = os.path.join(tdir, 'index.html') # This is neccessary as pdftohtml doesn't always (linux) respect absolute paths pdf_path = os.path.abspath(pdf_path) - cmd = (PDFTOHTML, '-noframes', '-p', '-nomerge', pdf_path, os.path.basename(index)) + cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-i', '-q', pdf_path, os.path.basename(index)) cwd = os.getcwd() with CurrentDir(tdir): From 754923ce07cbd268039b70bb9c8563f217b17730 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 2 Apr 2009 20:44:48 -0400 Subject: [PATCH 4/4] pdf input to txt output giving correct output --- src/calibre/ebooks/pdf/input.py | 2 +- src/calibre/ebooks/pdf/pdftohtml.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 060b9f5367..6f55b71dd5 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -25,7 +25,7 @@ class PDFInput(InputFormatPlugin): html = pdftohtml(stream.name) with open('index.html', 'wb') as index: - index.write(html.encode('utf-8')) + index.write(html) #mi = metadata_from_formats([stream.name]) mi = MetaInformation(_('Unknown'), _('Unknown')) diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 168923ad1a..27cdb3f691 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -68,8 +68,8 @@ def pdftohtml(pdf_path): raise DRMError() with open(index, 'rb') as i: - raw = i.read().decode('latin-1') + raw = i.read() if not '\n' + raw + return '\n' + raw