mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
IGN:...
This commit is contained in:
commit
aac75238c6
@ -265,13 +265,14 @@ class MOBIMetadataWriter(MetadataWriterPlugin):
|
|||||||
|
|
||||||
from calibre.ebooks.epub.input import EPUBInput
|
from calibre.ebooks.epub.input import EPUBInput
|
||||||
from calibre.ebooks.mobi.input import MOBIInput
|
from calibre.ebooks.mobi.input import MOBIInput
|
||||||
|
from calibre.ebooks.pdf.input import PDFInput
|
||||||
from calibre.ebooks.txt.input import TXTInput
|
from calibre.ebooks.txt.input import TXTInput
|
||||||
from calibre.ebooks.oeb.output import OEBOutput
|
from calibre.ebooks.oeb.output import OEBOutput
|
||||||
from calibre.ebooks.txt.output import TXTOutput
|
from calibre.ebooks.txt.output import TXTOutput
|
||||||
from calibre.ebooks.pdf.output import PDFOutput
|
from calibre.ebooks.pdf.output import PDFOutput
|
||||||
from calibre.customize.profiles import input_profiles, output_profiles
|
from calibre.customize.profiles import input_profiles, output_profiles
|
||||||
|
|
||||||
plugins = [HTML2ZIP, EPUBInput, MOBIInput, TXTInput, OEBOutput, TXTOutput, PDFOutput]
|
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, TXTInput, OEBOutput, TXTOutput, PDFOutput]
|
||||||
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
||||||
x.__name__.endswith('MetadataReader')]
|
x.__name__.endswith('MetadataReader')]
|
||||||
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
||||||
|
@ -254,6 +254,14 @@ def plugin_for_input_format(fmt):
|
|||||||
if fmt.lower() in plugin.file_types:
|
if fmt.lower() in plugin.file_types:
|
||||||
return plugin
|
return plugin
|
||||||
|
|
||||||
|
def available_input_formats():
    '''
    Return a list of the file types handled by the input format plugins
    that are not disabled.

    A file type is listed once per enabled plugin that declares it.
    '''
    found = []
    for plugin in input_format_plugins():
        if is_disabled(plugin):
            continue
        found.extend(plugin.file_types)
    return found
|
||||||
|
|
||||||
def output_format_plugins():
|
def output_format_plugins():
|
||||||
for plugin in _initialized_plugins:
|
for plugin in _initialized_plugins:
|
||||||
if isinstance(plugin, OutputFormatPlugin):
|
if isinstance(plugin, OutputFormatPlugin):
|
||||||
@ -263,7 +271,13 @@ def plugin_for_output_format(fmt):
|
|||||||
for plugin in output_format_plugins():
|
for plugin in output_format_plugins():
|
||||||
if fmt.lower() == plugin.file_type:
|
if fmt.lower() == plugin.file_type:
|
||||||
return plugin
|
return plugin
|
||||||
|
|
||||||
|
def available_output_formats():
    '''
    Return a list with the ``file_type`` of every output format plugin
    that is not disabled.
    '''
    return [plugin.file_type for plugin in output_format_plugins()
            if not is_disabled(plugin)]
|
||||||
|
|
||||||
def disable_plugin(plugin_or_name):
|
def disable_plugin(plugin_or_name):
|
||||||
x = getattr(plugin_or_name, 'name', plugin_or_name)
|
x = getattr(plugin_or_name, 'name', plugin_or_name)
|
||||||
|
@ -173,6 +173,9 @@ def xml2str(root, pretty_print=False):
|
|||||||
return etree.tostring(root, encoding='utf-8', xml_declaration=True,
|
return etree.tostring(root, encoding='utf-8', xml_declaration=True,
|
||||||
pretty_print=pretty_print)
|
pretty_print=pretty_print)
|
||||||
|
|
||||||
|
def xml2unicode(root, pretty_print=False):
    '''
    Serialize the lxml element `root` to a unicode string.

    :param root: The element (tree) to serialize.
    :param pretty_print: If True, ask lxml to indent the output.
    :return: The serialized markup as a unicode object, without an XML
             declaration.
    '''
    # Without an explicit encoding, etree.tostring() produces an ASCII
    # *byte* string in which non-ASCII characters are written as numeric
    # character references. Passing the unicode type as the encoding makes
    # lxml return real unicode text instead.
    return etree.tostring(root, encoding=unicode, pretty_print=pretty_print)
|
||||||
|
|
||||||
ASCII_CHARS = set(chr(x) for x in xrange(128))
|
ASCII_CHARS = set(chr(x) for x in xrange(128))
|
||||||
UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
|
UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
|
||||||
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||||
@ -721,6 +724,14 @@ class Manifest(object):
|
|||||||
if isinstance(data, unicode):
|
if isinstance(data, unicode):
|
||||||
return data.encode('utf-8')
|
return data.encode('utf-8')
|
||||||
return str(data)
|
return str(data)
|
||||||
|
|
||||||
|
def __unicode__(self):
    '''
    Return this item's data as text.

    Unicode data is returned unchanged, parsed XML trees are serialized
    with xml2unicode() (honouring the pretty_print setting of the owning
    book), and anything else is passed through unicode().
    '''
    content = self.data
    if isinstance(content, unicode):
        return content
    if isinstance(content, etree._Element):
        return xml2unicode(content, pretty_print=self.oeb.pretty_print)
    return unicode(content)
|
||||||
|
|
||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
return id(self) == id(other)
|
return id(self) == id(other)
|
||||||
|
38
src/calibre/ebooks/pdf/input.py
Normal file
38
src/calibre/ebooks/pdf/input.py
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
from __future__ import with_statement
|
||||||
|
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
|
from calibre.ebooks.pdf.pdftohtml import pdftohtml
|
||||||
|
from calibre.ebooks.metadata.opf import OPFCreator
|
||||||
|
from calibre.ebooks.metadata import MetaInformation
|
||||||
|
#from calibre.ebooks.metadata.meta import metadata_from_formats
|
||||||
|
|
||||||
|
class PDFInput(InputFormatPlugin):
    '''
    Input plugin that converts a PDF to a single HTML file (via the
    pdftohtml helper) and writes an OPF describing it.
    '''

    name        = 'PDF Input'
    author      = 'John Schember'
    description = 'Convert PDF files to HTML'
    file_types  = set(['pdf'])

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Convert the PDF behind `stream` and return the absolute path to
        the generated ``metadata.opf``.

        ``index.html`` and ``metadata.opf`` are both written to the
        current working directory. Only ``stream.name`` is used; the
        remaining arguments are not read by this implementation.
        '''
        markup = pdftohtml(stream.name)

        with open('index.html', 'wb') as html_file:
            html_file.write(markup)

        # Resolve the working directory once; everything below is
        # written relative to it.
        base_dir = os.getcwd()

        # No metadata is read from the PDF here; placeholder title and
        # author are used.
        mi = MetaInformation(_('Unknown'), _('Unknown'))
        opf = OPFCreator(base_dir, mi)
        opf.create_manifest([('index.html', None)])
        opf.create_spine(['index.html'])
        with open('metadata.opf', 'wb') as opf_file:
            opf.render(opf_file)

        return os.path.join(base_dir, 'metadata.opf')
|
75
src/calibre/ebooks/pdf/pdftohtml.py
Normal file
75
src/calibre/ebooks/pdf/pdftohtml.py
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
from __future__ import with_statement
|
||||||
|
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> \
|
||||||
|
2009, John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import errno, os, sys, subprocess
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
from calibre.ebooks import ConversionError, DRMError
|
||||||
|
from calibre import isosx, iswindows, islinux
|
||||||
|
from calibre import CurrentDir
|
||||||
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
|
|
||||||
|
# Default: rely on pdftohtml being found on the PATH and launch it with a
# plain subprocess.Popen. The platform-specific branches below point at
# the copy shipped inside a frozen/bundled build instead.
PDFTOHTML = 'pdftohtml'
popen = subprocess.Popen

if isosx and hasattr(sys, 'frameworks_dir'):
    PDFTOHTML = os.path.join(sys.frameworks_dir, PDFTOHTML)

if iswindows and hasattr(sys, 'frozen'):
    PDFTOHTML = os.path.join(os.path.dirname(sys.executable), 'pdftohtml.exe')
    # Start the child without a console so that no ugly console window is
    # popped up.
    # NOTE(review): the original comment called 0x08 CREATE_NO_WINDOW; in
    # the Win32 process creation flags 0x08 is DETACHED_PROCESS
    # (CREATE_NO_WINDOW is 0x08000000). The value is left unchanged.
    popen = partial(subprocess.Popen, creationflags=0x08)

if islinux and getattr(sys, 'frozen_path', False):
    PDFTOHTML = os.path.join(sys.frozen_path, 'pdftohtml')
|
||||||
|
|
||||||
|
def pdftohtml(pdf_path):
    '''
    Convert the pdf into html using the pdftohtml app.

    @param pdf_path: Path to the PDF file. Unicode paths are encoded with
                     the filesystem encoding before use.
    @return: The HTML exactly as written by pdftohtml (a byte string; the
             tool is invoked with ``-enc UTF-8``), prefixed with a marker
             comment.
    @raise ConversionError: If the PDF cannot be read, pdftohtml cannot be
                            found, pdftohtml exits with a non-zero status,
                            or the output contains no ``<br`` in its first
                            4000 bytes (treated as an image based PDF).
    @raise DRMError: If pdftohtml produced no output or less than 100
                     bytes of it.
    '''

    if isinstance(pdf_path, unicode):
        pdf_path = pdf_path.encode(sys.getfilesystemencoding())
    if not os.access(pdf_path, os.R_OK):
        raise ConversionError('Cannot read from ' + pdf_path)

    with TemporaryDirectory('_pdftohtml') as tdir:
        index = os.path.join(tdir, 'index.html')
        # This is necessary as pdftohtml doesn't always (linux) respect
        # absolute paths: resolve the input now, before changing directory,
        # and hand the tool only the basename of the output file.
        pdf_path = os.path.abspath(pdf_path)
        cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
               '-i', '-q', pdf_path, os.path.basename(index))
        # stderr is sent to a file rather than a pipe. Waiting on a child
        # whose stderr is an unread pipe can deadlock once the pipe buffer
        # fills up.
        errlog = os.path.join(tdir, 'pdftohtml-stderr.log')

        with CurrentDir(tdir):
            with open(errlog, 'wb') as errfile:
                try:
                    p = popen(cmd, stderr=errfile)
                except OSError as err:
                    if err.errno == errno.ENOENT:
                        raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'), True)
                    raise

                # wait() can be interrupted by a signal; retry in that case.
                while True:
                    try:
                        ret = p.wait()
                        break
                    except OSError as e:
                        if e.errno != errno.EINTR:
                            raise

            if ret != 0:
                with open(errlog, 'rb') as errfile:
                    raise ConversionError(errfile.read())
            # Missing or near-empty output is reported as DRM.
            if not os.path.exists(index) or os.stat(index).st_size < 100:
                raise DRMError()

            with open(index, 'rb') as i:
                raw = i.read()
            if '<br' not in raw[:4000]:
                raise ConversionError(os.path.basename(pdf_path) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True)

    return '<!-- created by calibre\'s pdftohtml -->\n' + raw
|
@ -51,7 +51,7 @@ class TXTOutput(OutputFormatPlugin):
|
|||||||
|
|
||||||
out_stream.seek(0)
|
out_stream.seek(0)
|
||||||
out_stream.truncate()
|
out_stream.truncate()
|
||||||
out_stream.write(txt)
|
out_stream.write(txt.encode('utf-8'))
|
||||||
|
|
||||||
if close:
|
if close:
|
||||||
out_stream.close()
|
out_stream.close()
|
||||||
|
@ -102,12 +102,7 @@ class TxtWriter(object):
|
|||||||
text = text.replace('\f+', ' ')
|
text = text.replace('\f+', ' ')
|
||||||
|
|
||||||
# Single line paragraph.
|
# Single line paragraph.
|
||||||
r = re.compile('.\n.')
|
text = re.sub('(?<=.)\n(?=.)', ' ', text)
|
||||||
while True:
|
|
||||||
mo = r.search(text)
|
|
||||||
if mo == None:
|
|
||||||
break
|
|
||||||
text = '%s %s' % (text[:mo.start()+1], text[mo.end()-1:])
|
|
||||||
|
|
||||||
# Remove multiple spaces.
|
# Remove multiple spaces.
|
||||||
text = re.sub('[ ]+', ' ', text)
|
text = re.sub('[ ]+', ' ', text)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user