mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
IGN:...
This commit is contained in:
commit
aac75238c6
@ -265,13 +265,14 @@ class MOBIMetadataWriter(MetadataWriterPlugin):
|
||||
|
||||
from calibre.ebooks.epub.input import EPUBInput
|
||||
from calibre.ebooks.mobi.input import MOBIInput
|
||||
from calibre.ebooks.pdf.input import PDFInput
|
||||
from calibre.ebooks.txt.input import TXTInput
|
||||
from calibre.ebooks.oeb.output import OEBOutput
|
||||
from calibre.ebooks.txt.output import TXTOutput
|
||||
from calibre.ebooks.pdf.output import PDFOutput
|
||||
from calibre.customize.profiles import input_profiles, output_profiles
|
||||
|
||||
plugins = [HTML2ZIP, EPUBInput, MOBIInput, TXTInput, OEBOutput, TXTOutput, PDFOutput]
|
||||
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, TXTInput, OEBOutput, TXTOutput, PDFOutput]
|
||||
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
||||
x.__name__.endswith('MetadataReader')]
|
||||
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
||||
|
@ -254,6 +254,14 @@ def plugin_for_input_format(fmt):
|
||||
if fmt.lower() in plugin.file_types:
|
||||
return plugin
|
||||
|
||||
def available_input_formats():
    '''
    Return a list of the file types handled by the input format plugins
    that are not disabled.

    Types are gathered plugin by plugin and are not de-duplicated, so a
    type offered by several enabled plugins is listed once per plugin.
    '''
    enabled = (p for p in input_format_plugins() if not is_disabled(p))
    collected = []
    for plugin in enabled:
        collected.extend(plugin.file_types)
    return collected
|
||||
|
||||
def output_format_plugins():
|
||||
for plugin in _initialized_plugins:
|
||||
if isinstance(plugin, OutputFormatPlugin):
|
||||
@ -264,6 +272,12 @@ def plugin_for_output_format(fmt):
|
||||
if fmt.lower() == plugin.file_type:
|
||||
return plugin
|
||||
|
||||
def available_output_formats():
    '''
    Return a list with the file type of every output format plugin that
    is not disabled.
    '''
    return [plugin.file_type for plugin in output_format_plugins()
            if not is_disabled(plugin)]
|
||||
|
||||
def disable_plugin(plugin_or_name):
|
||||
x = getattr(plugin_or_name, 'name', plugin_or_name)
|
||||
|
@ -173,6 +173,9 @@ def xml2str(root, pretty_print=False):
|
||||
return etree.tostring(root, encoding='utf-8', xml_declaration=True,
|
||||
pretty_print=pretty_print)
|
||||
|
||||
def xml2unicode(root, pretty_print=False):
    '''
    Serialize `root` with etree.tostring(), passing `pretty_print` through.

    No encoding and no XML declaration are requested here.

    NOTE(review): despite the name, no encoding is passed to tostring(), so
    the result is presumably a byte string rather than a unicode object --
    confirm whether encoding=unicode was intended.
    '''
    serialized = etree.tostring(root, pretty_print=pretty_print)
    return serialized
|
||||
|
||||
ASCII_CHARS = set(chr(x) for x in xrange(128))
|
||||
UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
|
||||
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||
@ -722,6 +725,14 @@ class Manifest(object):
|
||||
return data.encode('utf-8')
|
||||
return str(data)
|
||||
|
||||
def __unicode__(self):
    '''
    Return a text rendering of self.data.

    unicode data is returned unchanged, an etree element is serialized
    with xml2unicode (honouring self.oeb.pretty_print), and anything else
    is passed to unicode().
    '''
    content = self.data
    if isinstance(content, unicode):
        return content
    if not isinstance(content, etree._Element):
        return unicode(content)
    return xml2unicode(content, pretty_print=self.oeb.pretty_print)
|
||||
|
||||
def __eq__(self, other):
    '''Two objects compare equal only when they are the very same object.'''
    return self is other
|
||||
|
||||
|
38
src/calibre/ebooks/pdf/input.py
Normal file
38
src/calibre/ebooks/pdf/input.py
Normal file
@ -0,0 +1,38 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import with_statement
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.pdf.pdftohtml import pdftohtml
|
||||
from calibre.ebooks.metadata.opf import OPFCreator
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
#from calibre.ebooks.metadata.meta import metadata_from_formats
|
||||
|
||||
class PDFInput(InputFormatPlugin):
    '''
    Input plugin for PDF files: the PDF is run through pdftohtml and the
    result is written, together with an OPF, to the current working
    directory.
    '''

    name = 'PDF Input'
    author = 'John Schember'
    description = 'Convert PDF files to HTML'
    file_types = set(['pdf'])

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Convert the PDF found at stream.name.

        Writes index.html and metadata.opf into the current working
        directory and returns the path of metadata.opf joined onto that
        directory. options, file_ext, log and accelerators are not used
        by this implementation.
        '''
        # Run the conversion first so that no index.html is created when
        # pdftohtml raises.
        markup = pdftohtml(stream.name)
        with open('index.html', 'wb') as html_file:
            html_file.write(markup)

        # The metadata is a placeholder: title and author are both the
        # translated string 'Unknown'; nothing is read from the PDF itself.
        metadata = MetaInformation(_('Unknown'), _('Unknown'))
        creator = OPFCreator(os.getcwd(), metadata)
        creator.create_manifest([('index.html', None)])
        creator.create_spine(['index.html'])
        with open('metadata.opf', 'wb') as opf_file:
            creator.render(opf_file)

        return os.path.join(os.getcwd(), 'metadata.opf')
|
75
src/calibre/ebooks/pdf/pdftohtml.py
Normal file
75
src/calibre/ebooks/pdf/pdftohtml.py
Normal file
@ -0,0 +1,75 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import with_statement
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> \
|
||||
2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import errno, os, sys, subprocess
|
||||
from functools import partial
|
||||
|
||||
from calibre.ebooks import ConversionError, DRMError
|
||||
from calibre import isosx, iswindows, islinux
|
||||
from calibre import CurrentDir
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
|
||||
# PDFTOHTML is the name (or full path) of the pdftohtml executable and
# popen is the callable used to spawn it. Both defaults are overridden
# below when the platform flag and the matching sys attribute are present.
PDFTOHTML = 'pdftohtml'
popen = subprocess.Popen

if isosx and hasattr(sys, 'frameworks_dir'):
    PDFTOHTML = os.path.join(sys.frameworks_dir, PDFTOHTML)

if iswindows and hasattr(sys, 'frozen'):
    PDFTOHTML = os.path.join(os.path.dirname(sys.executable), 'pdftohtml.exe')
    # creationflags=0x08 is used so that no ugly console is popped up.
    # NOTE(review): in the Win32 API 0x08 is DETACHED_PROCESS;
    # CREATE_NO_WINDOW is 0x08000000 -- confirm which one is intended.
    popen = partial(subprocess.Popen, creationflags=0x08)

if islinux and getattr(sys, 'frozen_path', False):
    PDFTOHTML = os.path.join(sys.frozen_path, 'pdftohtml')
|
||||
|
||||
def _retry_on_eintr(func):
    '''
    Call func() and return its result, calling it again whenever the call
    is interrupted by a signal (errno EINTR).
    '''
    while True:
        try:
            return func()
        except EnvironmentError as e:
            if e.errno != errno.EINTR:
                raise


def pdftohtml(pdf_path):
    '''
    Convert the pdf into html using the pdftohtml app.

    pdftohtml is run inside a temporary directory and asked for UTF-8
    output in a single file (no frames).

    @param pdf_path: Path to the PDF file. A unicode path is encoded with
                     the filesystem encoding before use.
    @return: The generated HTML as the raw bytes read from pdftohtml's
             output file, prefixed with a one line HTML comment.
    @raise ConversionError: If the PDF cannot be read, pdftohtml cannot be
             found, pdftohtml exits with a non-zero status (the message is
             its stderr output), or no '<br' occurs in the first 4000
             bytes of the output (treated as an image based PDF).
    @raise DRMError: If pdftohtml produced no output file, or one smaller
             than 100 bytes.
    '''

    if isinstance(pdf_path, unicode):
        pdf_path = pdf_path.encode(sys.getfilesystemencoding())
    if not os.access(pdf_path, os.R_OK):
        raise ConversionError('Cannot read from ' + pdf_path)

    with TemporaryDirectory('_pdftohtml') as tdir:
        index = os.path.join(tdir, 'index.html')
        # This is necessary as pdftohtml doesn't always (linux) respect absolute paths
        pdf_path = os.path.abspath(pdf_path)
        cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-i', '-q', pdf_path, os.path.basename(index))

        with CurrentDir(tdir):
            try:
                p = popen(cmd, stderr=subprocess.PIPE)
            except OSError as err:
                if err.errno == errno.ENOENT:
                    raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'), True)
                raise

            # Drain stderr *before* waiting: calling wait() while the child
            # still has an unread PIPE can deadlock once the child has
            # written enough to fill the pipe buffer.
            errors = _retry_on_eintr(p.stderr.read)
            ret = _retry_on_eintr(p.wait)

            if ret != 0:
                raise ConversionError(errors)
            if not os.path.exists(index) or os.stat(index).st_size < 100:
                raise DRMError()

            with open(index, 'rb') as i:
                raw = i.read()
            if '<br' not in raw[:4000]:
                raise ConversionError(os.path.basename(pdf_path) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True)

    return '<!-- created by calibre\'s pdftohtml -->\n' + raw
|
@ -51,7 +51,7 @@ class TXTOutput(OutputFormatPlugin):
|
||||
|
||||
out_stream.seek(0)
|
||||
out_stream.truncate()
|
||||
out_stream.write(txt)
|
||||
out_stream.write(txt.encode('utf-8'))
|
||||
|
||||
if close:
|
||||
out_stream.close()
|
||||
|
@ -102,12 +102,7 @@ class TxtWriter(object):
|
||||
text = text.replace('\f+', ' ')
|
||||
|
||||
# Single line paragraph.
|
||||
r = re.compile('.\n.')
|
||||
while True:
|
||||
mo = r.search(text)
|
||||
if mo == None:
|
||||
break
|
||||
text = '%s %s' % (text[:mo.start()+1], text[mo.end()-1:])
|
||||
text = re.sub('(?<=.)\n(?=.)', ' ', text)
|
||||
|
||||
# Remove multiple spaces.
|
||||
text = re.sub('[ ]+', ' ', text)
|
||||
|
Loading…
x
Reference in New Issue
Block a user