This commit is contained in:
Kovid Goyal 2009-04-02 20:30:47 -07:00
commit aac75238c6
7 changed files with 143 additions and 9 deletions

View File

@ -265,13 +265,14 @@ class MOBIMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.mobi.input import MOBIInput
from calibre.ebooks.pdf.input import PDFInput
from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.pdf.output import PDFOutput
from calibre.customize.profiles import input_profiles, output_profiles
plugins = [HTML2ZIP, EPUBInput, MOBIInput, TXTInput, OEBOutput, TXTOutput, PDFOutput]
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, TXTInput, OEBOutput, TXTOutput, PDFOutput]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \

View File

@ -254,6 +254,14 @@ def plugin_for_input_format(fmt):
if fmt.lower() in plugin.file_types:
return plugin
def available_input_formats():
    '''
    Collect the file types handled by the enabled input format plugins.

    Returns a list built by concatenating the `file_types` of every plugin
    yielded by input_format_plugins() that is not disabled. No
    de-duplication is performed.
    '''
    supported = []
    for candidate in input_format_plugins():
        if is_disabled(candidate):
            continue
        supported.extend(candidate.file_types)
    return supported
def output_format_plugins():
for plugin in _initialized_plugins:
if isinstance(plugin, OutputFormatPlugin):
@ -264,6 +272,12 @@ def plugin_for_output_format(fmt):
if fmt.lower() == plugin.file_type:
return plugin
def available_output_formats():
    '''
    Return a list with the `file_type` of every plugin yielded by
    output_format_plugins() that is not disabled.
    '''
    return [candidate.file_type for candidate in output_format_plugins()
            if not is_disabled(candidate)]
def disable_plugin(plugin_or_name):
x = getattr(plugin_or_name, 'name', plugin_or_name)

View File

@ -173,6 +173,9 @@ def xml2str(root, pretty_print=False):
return etree.tostring(root, encoding='utf-8', xml_declaration=True,
pretty_print=pretty_print)
def xml2unicode(root, pretty_print=False):
    '''
    Serialize the element tree `root` to a unicode string.

    :param root: The element (tree) to serialize.
    :param pretty_print: Passed through to tostring() to request indented
                         output.
    :return: The serialized markup as a unicode object, without an XML
             declaration.

    Assumes `etree` is lxml.etree. Passing ``encoding=unicode`` is what makes
    lxml return a unicode object; with no encoding tostring() returns an
    ASCII byte string in which every non-ASCII character is replaced by a
    numeric character reference.
    '''
    return etree.tostring(root, encoding=unicode, pretty_print=pretty_print)
ASCII_CHARS = set(chr(x) for x in xrange(128))
UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
@ -722,6 +725,14 @@ class Manifest(object):
return data.encode('utf-8')
return str(data)
def __unicode__(self):
    '''
    Return self.data as unicode.

    A parsed XML element is serialized with xml2unicode() (honouring
    self.oeb.pretty_print), unicode data is returned unchanged and any
    other value is converted with unicode().
    '''
    content = self.data
    if isinstance(content, etree._Element):
        return xml2unicode(content, pretty_print=self.oeb.pretty_print)
    return content if isinstance(content, unicode) else unicode(content)
def __eq__(self, other):
return id(self) == id(other)

View File

@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdf.pdftohtml import pdftohtml
from calibre.ebooks.metadata.opf import OPFCreator
from calibre.ebooks.metadata import MetaInformation
#from calibre.ebooks.metadata.meta import metadata_from_formats
class PDFInput(InputFormatPlugin):
    '''
    Input format plugin for PDF files. The PDF is converted to a single HTML
    file and described by a minimal OPF.
    '''

    name = 'PDF Input'
    author = 'John Schember'
    description = 'Convert PDF files to HTML'
    file_types = set(['pdf'])

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Convert the PDF at `stream.name` with pdftohtml() and write the
        result to index.html in the current working directory, together
        with a metadata.opf whose manifest and spine contain only that file.

        :return: Path to the generated metadata.opf, joined onto the current
                 working directory.
        '''
        markup = pdftohtml(stream.name)
        with open('index.html', 'wb') as html_file:
            html_file.write(markup)

        # TODO: presumably the real metadata should eventually be read from
        # the PDF, e.g. with metadata_from_formats([stream.name]); for now
        # placeholder title/author values are used.
        metadata = MetaInformation(_('Unknown'), _('Unknown'))

        base_dir = os.getcwd()
        creator = OPFCreator(base_dir, metadata)
        creator.create_manifest([('index.html', None)])
        creator.create_spine(['index.html'])
        with open('metadata.opf', 'wb') as opf_file:
            creator.render(opf_file)

        return os.path.join(base_dir, 'metadata.opf')

View File

@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> \
2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import errno, os, sys, subprocess
from functools import partial
from calibre.ebooks import ConversionError, DRMError
from calibre import isosx, iswindows, islinux
from calibre import CurrentDir
from calibre.ptempfile import TemporaryDirectory
# Name/path of the pdftohtml executable and the callable used to spawn it.
# The defaults rely on PATH lookup and plain subprocess.Popen; the checks
# below override them when the matching sys attributes are present.
PDFTOHTML = 'pdftohtml'
popen = subprocess.Popen

if isosx and hasattr(sys, 'frameworks_dir'):
    PDFTOHTML = os.path.join(sys.frameworks_dir, PDFTOHTML)

if iswindows and hasattr(sys, 'frozen'):
    PDFTOHTML = os.path.join(os.path.dirname(sys.executable), 'pdftohtml.exe')
    # 0x08 is the Win32 DETACHED_PROCESS creation flag (CREATE_NO_WINDOW is
    # 0x08000000). It is passed so that no ugly console is popped up.
    popen = partial(subprocess.Popen, creationflags=0x08)

if islinux and getattr(sys, 'frozen_path', False):
    PDFTOHTML = os.path.join(sys.frozen_path, 'pdftohtml')
def pdftohtml(pdf_path):
'''
Convert the pdf into html using the pdftohtml app.
@return: The HTML as a unicode string.
'''
if isinstance(pdf_path, unicode):
pdf_path = pdf_path.encode(sys.getfilesystemencoding())
if not os.access(pdf_path, os.R_OK):
raise ConversionError, 'Cannot read from ' + pdf_path
with TemporaryDirectory('_pdftohtml') as tdir:
index = os.path.join(tdir, 'index.html')
# This is neccessary as pdftohtml doesn't always (linux) respect absolute paths
pdf_path = os.path.abspath(pdf_path)
cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-i', '-q', pdf_path, os.path.basename(index))
cwd = os.getcwd()
with CurrentDir(tdir):
try:
p = popen(cmd, stderr=subprocess.PIPE)
except OSError, err:
if err.errno == 2:
raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'), True)
else:
raise
while True:
try:
ret = p.wait()
break
except OSError, e:
if e.errno == errno.EINTR:
continue
else:
raise
if ret != 0:
err = p.stderr.read()
raise ConversionError, err
if not os.path.exists(index) or os.stat(index).st_size < 100:
raise DRMError()
with open(index, 'rb') as i:
raw = i.read()
if not '<br' in raw[:4000]:
raise ConversionError(os.path.basename(pdf_path) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True)
return '<!-- created by calibre\'s pdftohtml -->\n' + raw

View File

@ -51,7 +51,7 @@ class TXTOutput(OutputFormatPlugin):
out_stream.seek(0)
out_stream.truncate()
out_stream.write(txt)
out_stream.write(txt.encode('utf-8'))
if close:
out_stream.close()

View File

@ -102,12 +102,7 @@ class TxtWriter(object):
text = text.replace('\f+', ' ')
# Single line paragraph.
r = re.compile('.\n.')
while True:
mo = r.search(text)
if mo == None:
break
text = '%s %s' % (text[:mo.start()+1], text[mo.end()-1:])
text = re.sub('(?<=.)\n(?=.)', ' ', text)
# Remove multiple spaces.
text = re.sub('[ ]+', ' ', text)