mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Implemented any2epub
This commit is contained in:
parent
f0d9bded08
commit
896182b201
@ -317,7 +317,8 @@ def main():
|
||||
'mechanize', 'ClientForm', 'usbobserver',
|
||||
'genshi', 'calibre.web.feeds.recipes.*',
|
||||
'calibre.ebooks.lrf.any.*', 'calibre.ebooks.lrf.feeds.*',
|
||||
'keyword', 'codeop', 'pydoc', 'readline'],
|
||||
'keyword', 'codeop', 'pydoc', 'readline',
|
||||
'BeautifulSoup'],
|
||||
'packages' : ['PIL', 'Authorization', 'lxml'],
|
||||
'excludes' : ['IPython'],
|
||||
'plist' : { 'CFBundleGetInfoString' : '''calibre, an E-book management application.'''
|
||||
|
@ -152,7 +152,7 @@ def main(args=sys.argv):
|
||||
'win32process', 'win32api', 'msvcrt',
|
||||
'win32event', 'calibre.ebooks.lrf.any.*',
|
||||
'calibre.ebooks.lrf.feeds.*',
|
||||
'genshi',
|
||||
'genshi', 'BeautifulSoup',
|
||||
'path', 'pydoc', 'IPython.Extensions.*',
|
||||
'calibre.web.feeds.recipes.*',
|
||||
'PyQt4.QtWebKit', 'PyQt4.QtNetwork',
|
||||
|
@ -317,6 +317,11 @@ class LoggingInterface:
|
||||
def log_exception(self, msg, *args):
|
||||
self.___log(self.__logger.exception, msg, args, {})
|
||||
|
||||
def walk(dir):
    ''' Yield the full path of every file found by os.walk(dir). '''
    for dirpath, dirnames, filenames in os.walk(dir):
        for name in filenames:
            yield os.path.join(dirpath, name)
|
||||
|
||||
def strftime(fmt, t=time.localtime()):
|
||||
''' A version of strtime that returns unicode strings. '''
|
||||
|
@ -44,6 +44,7 @@ def config(defaults=None):
|
||||
c.add_opt('output', ['-o', '--output'], default=None,
|
||||
help=_('The output EPUB file. If not specified, it is derived from the input file name.'))
|
||||
|
||||
|
||||
structure = c.add_group('structure detection', _('Control auto-detection of document structure.'))
|
||||
structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]",
|
||||
help=_('''\
|
||||
@ -74,6 +75,16 @@ to auto-generate a Table of Contents.
|
||||
toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False,
|
||||
help=_("Don't add auto-detected chapters to the Table of Contents."))
|
||||
|
||||
layout = c.add_group('page layout', _('Control page layout'))
|
||||
layout('margin_top', ['--margin-top'], default=5.0,
|
||||
help=_('Set the top margin in pts. Default is %default'))
|
||||
layout('margin_bottom', ['--margin-bottom'], default=5.0,
|
||||
help=_('Set the bottom margin in pts. Default is %default'))
|
||||
layout('margin_left', ['--margin-left'], default=5.0,
|
||||
help=_('Set the left margin in pts. Default is %default'))
|
||||
layout('margin_right', ['--margin-right'], default=5.0,
|
||||
help=_('Set the right margin in pts. Default is %default'))
|
||||
|
||||
c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
|
||||
help=_('Print generated OPF file to stdout'))
|
||||
c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',
|
||||
|
154
src/calibre/ebooks/epub/from_any.py
Normal file
154
src/calibre/ebooks/epub/from_any.py
Normal file
@ -0,0 +1,154 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Convert any ebook format to epub.
|
||||
'''
|
||||
|
||||
import sys, os, re
|
||||
from contextlib import nested
|
||||
|
||||
from calibre import extract, walk
|
||||
from calibre.ebooks.epub import config as common_config
|
||||
from calibre.ebooks.epub.from_html import convert as html2epub
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.opf import OPFCreator
|
||||
|
||||
def lit2opf(path, tdir, opts):
|
||||
from calibre.ebooks.lit.reader import LitReader
|
||||
print 'Exploding LIT file:', path
|
||||
reader = LitReader(path)
|
||||
reader.extract_content(tdir, False)
|
||||
for f in walk(tdir):
|
||||
if f.lower().endswith('.opf'):
|
||||
return f
|
||||
|
||||
def mobi2opf(path, tdir, opts):
|
||||
from calibre.ebooks.mobi.reader import MobiReader
|
||||
print 'Exploding MOBI file:', path
|
||||
reader = MobiReader(path)
|
||||
reader.extract_content(tdir)
|
||||
files = list(walk(tdir))
|
||||
for f in files:
|
||||
if f.lower().endswith('.opf'):
|
||||
return f
|
||||
html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}', re.IGNORECASE)
|
||||
hf = [f for f in files if html_pat.match(os.path.splitext(f)[1]) is not None]
|
||||
mi = MetaInformation(os.path.splitext(os.path.basename(path))[0], [_('Unknown')])
|
||||
opf = OPFCreator(tdir, mi)
|
||||
opf.create_manifest([(hf[0], None)])
|
||||
opf.create_spine([hf[0]])
|
||||
ans = os.path.join(tdir, 'metadata.opf')
|
||||
opf.render(open(ans, 'wb'))
|
||||
return ans
|
||||
|
||||
def fb22opf(path, tdir, opts):
|
||||
from calibre.ebooks.lrf.fb2.convert_from import to_html
|
||||
print 'Converting FB2 to HTML...'
|
||||
return to_html(path, tdir)
|
||||
|
||||
def rtf2opf(path, tdir, opts):
    '''
    Run the RTF converter's ``generate_html`` on `path` with `tdir` as the
    output directory and return the path ``<tdir>/metadata.opf``.

    `opts` is not used.
    '''
    from calibre.ebooks.lrf.rtf.convert_from import generate_html as rtf_to_html
    rtf_to_html(path, tdir)
    opf_path = os.path.join(tdir, 'metadata.opf')
    return opf_path
|
||||
|
||||
def txt2opf(path, tdir, opts):
    '''
    Run the TXT converter's ``generate_html`` on `path` (passing
    ``opts.encoding``) with `tdir` as the output directory and return the
    path ``<tdir>/metadata.opf``.
    '''
    from calibre.ebooks.lrf.txt.convert_from import generate_html as txt_to_html
    txt_to_html(path, opts.encoding, tdir)
    opf_path = os.path.join(tdir, 'metadata.opf')
    return opf_path
|
||||
|
||||
def pdf2opf(path, tdir, opts):
    '''
    Run the PDF converter's ``generate_html`` on `path` with `tdir` as the
    output directory and return the path ``<tdir>/metadata.opf``.

    `opts` is not used.
    '''
    from calibre.ebooks.lrf.pdf.convert_from import generate_html as pdf_to_html
    pdf_to_html(path, tdir)
    opf_path = os.path.join(tdir, 'metadata.opf')
    return opf_path
|
||||
|
||||
# Maps a lower case input file extension to the function that turns a file of
# that type into an OPF. Every function is called as func(path, tdir, opts).
MAP = dict(
    lit=lit2opf,
    mobi=mobi2opf,
    prc=mobi2opf,
    fb2=fb22opf,
    rtf=rtf2opf,
    txt=txt2opf,
    pdf=pdf2opf
)
|
||||
|
||||
|
||||
def unarchive(path, tdir):
    '''
    Extract the archive at `path` into the directory `tdir` and pick the file
    inside it that should be converted.

    Selection order:

        1. The first file whose extension is ``opf`` or one of the keys of
           ``MAP``. TXT and RTF files smaller than 2048 bytes are ignored
           (presumably they are README style files rather than books).
        2. An (X)HTML file whose base name is ``toc``, then one named
           ``index``.
        3. The largest (X)HTML file.

    :param path: Path to the archive.
    :param tdir: Directory into which the archive is extracted.
    :return: The tuple ``(path_to_file, extension)``; the extension is lower
             case and has no leading dot.
    :raises ValueError: If no candidate file is found in the archive.
    '''
    extract(path, tdir)
    files = list(walk(tdir))

    for ext in ['opf'] + list(MAP.keys()):
        for f in files:
            if f.lower().endswith('.'+ext):
                if ext in ['txt', 'rtf'] and os.stat(f).st_size < 2048:
                    continue
                return f, ext
    html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE)
    html_files = [f for f in files if html_pat.search(f) is not None]
    if not html_files:
        raise ValueError(_('Could not find an ebook inside the archive'))
    # Smallest first, so that the last entry is the largest file
    html_files = [(f, os.stat(f).st_size) for f in html_files]
    html_files.sort(key=lambda entry: entry[1])
    html_files = [entry[0] for entry in html_files]
    for q in ('toc', 'index'):
        for f in html_files:
            # walk() yields full paths, so only the base name can equal q
            if os.path.splitext(os.path.basename(f))[0].lower() == q:
                return f, os.path.splitext(f)[1].lower()[1:]
    return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:]
|
||||
|
||||
def any2epub(opts, path, notification=None):
|
||||
ext = os.path.splitext(path)[1]
|
||||
if not ext:
|
||||
raise ValueError('Unknown file type: '+path)
|
||||
ext = ext.lower()[1:]
|
||||
|
||||
if opts.output is None:
|
||||
opts.output = os.path.splitext(os.path.basename(path))[0]+'.epub'
|
||||
|
||||
with nested(TemporaryDirectory('_any2epub1'), TemporaryDirectory('_any2epub2')) as (tdir1, tdir2):
|
||||
if ext in ['rar', 'zip']:
|
||||
path, ext = unarchive(path, tdir1)
|
||||
print 'Found %s file in archive'%(ext.upper())
|
||||
|
||||
if ext in MAP.keys():
|
||||
path = MAP[ext](path, tdir2, opts)
|
||||
ext = 'opf'
|
||||
|
||||
|
||||
if re.match(r'((x){0,1}htm(l){0,1})|opf', ext) is None:
|
||||
raise ValueError('Conversion from %s is not supported'%ext.upper())
|
||||
|
||||
print 'Creating EPUB file...'
|
||||
html2epub(path, opts, notification=notification)
|
||||
|
||||
def config(defaults=None):
    '''
    Return the result of ``calibre.ebooks.epub.config`` (imported in this
    module as ``common_config``) called with `defaults`.
    '''
    conf = common_config(defaults=defaults)
    return conf
|
||||
|
||||
|
||||
def formats():
    '''
    Return a list of input file extensions: ``html``, ``rar`` and ``zip``
    followed by the keys of ``MAP``.
    '''
    supported = ['html', 'rar', 'zip']
    supported.extend(MAP.keys())
    return supported
|
||||
|
||||
def option_parser():
    '''
    Return the command line option parser for any2epub.

    The usage message lists the supported input formats as a comma separated
    list rather than as the repr of a Python list.
    '''
    usage = _('''\
%%prog [options] filename

Convert any of a large number of ebook formats to an epub file. Supported formats are: %s
''')%', '.join(formats())
    return config().option_parser(usage=usage)
|
||||
|
||||
def main(args=sys.argv):
|
||||
parser = option_parser()
|
||||
opts, args = parser.parse_args(args)
|
||||
if len(args) < 2:
|
||||
parser.print_help()
|
||||
print 'No input file specified.'
|
||||
return 1
|
||||
any2epub(opts, args[1])
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
@ -29,7 +29,6 @@ def option_parser():
|
||||
def convert(opts, recipe_arg, notification=None):
|
||||
opts.lrf = False
|
||||
opts.epub = True
|
||||
opts.chapter_mark = 'none'
|
||||
if opts.debug:
|
||||
opts.verbose = 2
|
||||
parser = option_parser()
|
||||
@ -40,6 +39,7 @@ def convert(opts, recipe_arg, notification=None):
|
||||
recipe_opts = c.parse_string(recipe.html2epub_options)
|
||||
c.smart_update(recipe_opts, opts)
|
||||
opts = recipe_opts
|
||||
opts.chapter_mark = 'none'
|
||||
opf = glob.glob(os.path.join(tdir, '*.opf'))
|
||||
if not opf:
|
||||
raise Exception('Downloading of recipe: %s failed'%recipe_arg)
|
||||
|
@ -4,7 +4,12 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
import os, sys, re, shutil, cStringIO
|
||||
|
||||
from lxml.etree import XPath
|
||||
try:
|
||||
from PIL import Image as PILImage
|
||||
except ImportError:
|
||||
import Image as PILImage
|
||||
|
||||
from calibre.ebooks.html import Processor, get_text, merge_metadata, get_filelist,\
|
||||
opf_traverse, create_metadata, rebase_toc
|
||||
@ -106,8 +111,8 @@ def convert(htmlfile, opts, notification=None):
|
||||
cover_src = opts.cover
|
||||
|
||||
if cover_src is not None:
|
||||
cover_dest = os.path.join(tdir, 'content', 'resources', '_cover_'+os.path.splitext(cover_src)[1])
|
||||
shutil.copyfile(cover_src, cover_dest)
|
||||
cover_dest = os.path.join(tdir, 'content', 'resources', '_cover_.jpg')
|
||||
PILImage.open(cover_src).convert('RGB').save(cover_dest)
|
||||
mi.cover = cover_dest
|
||||
resources.append(cover_dest)
|
||||
|
||||
|
@ -23,6 +23,7 @@ from calibre.utils.config import Config, StringConfig
|
||||
from calibre.ebooks.metadata.opf import OPFReader, OPFCreator
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
from calibre.ebooks.metadata.opf2 import OPF
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
|
||||
@ -280,7 +281,7 @@ class PreProcessor(object):
|
||||
return re.search('<H2[^><]*id=BookTitle', raw) is not None
|
||||
|
||||
def is_pdftohtml(self, src):
|
||||
return src.startswith('<!-- created by calibre\'s pdftohtml -->')
|
||||
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
||||
|
||||
def preprocess(self, html):
|
||||
if self.is_baen(html):
|
||||
@ -335,6 +336,7 @@ class Parser(PreProcessor, LoggingInterface):
|
||||
pretty_print=self.opts.pretty_print,
|
||||
include_meta_content_type=True)
|
||||
ans = re.compile(r'<html>', re.IGNORECASE).sub('<html xmlns="http://www.w3.org/1999/xhtml">', ans)
|
||||
ans = re.compile(r'<head[^<>]*?>', re.IGNORECASE).sub('<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n', ans)
|
||||
f.write(ans)
|
||||
return f.name
|
||||
|
||||
@ -360,6 +362,8 @@ class Parser(PreProcessor, LoggingInterface):
|
||||
body = self.root.xpath('//body')
|
||||
if body:
|
||||
self.body = body[0]
|
||||
for a in self.root.xpath('//a[@name]'):
|
||||
a.set('id', a.get('name'))
|
||||
|
||||
def debug_tree(self, name):
|
||||
'''
|
||||
@ -540,16 +544,20 @@ class Processor(Parser):
|
||||
css.append('#%s { %s }'%(id, setting))
|
||||
|
||||
for elem in self.root.xpath('//*[@style]'):
|
||||
if 'id' not in elem.keys():
|
||||
id = get_id(elem, counter)
|
||||
counter += 1
|
||||
id = get_id(elem, counter)
|
||||
counter += 1
|
||||
css.append('#%s {%s}'%(id, elem.get('style')))
|
||||
elem.attrib.pop('style')
|
||||
|
||||
self.raw_css = '\n\n'.join(css)
|
||||
self.css = unicode(self.raw_css)
|
||||
self.do_layout()
|
||||
# TODO: Figure out what to do about CSS imports from linked stylesheets
|
||||
|
||||
def do_layout(self):
    '''
    Append the page layout rules to ``self.css``.

    All four body margins are set to 0pt and the values of the
    ``margin_top``, ``margin_bottom``, ``margin_left`` and ``margin_right``
    options (in pts) are applied through an ``@page`` rule.
    '''
    # The property name is margin-bottom; the former misspelling
    # "margin-botton" is not a CSS property, so the bottom margin was lost.
    self.css += '\nbody {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt}\n'
    self.css += '@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}\n'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right)
|
||||
|
||||
def config(defaults=None, config_name='html',
|
||||
desc=_('Options to control the traversal of HTML')):
|
||||
if defaults is None:
|
||||
@ -575,6 +583,8 @@ def config(defaults=None, config_name='html',
|
||||
help=_('Set the title. Default is to autodetect.'))
|
||||
metadata('authors', ['-a', '--authors'], default=_('Unknown'),
|
||||
help=_('The author(s) of the ebook, as a comma separated list.'))
|
||||
metadata('from_opf', ['--metadata-from'], default=None,
|
||||
help=_('Load metadata from the specified OPF file'))
|
||||
|
||||
debug = c.add_group('debug', _('Options useful for debugging'))
|
||||
debug('verbose', ['-v', '--verbose'], default=0, action='count',
|
||||
@ -648,7 +658,12 @@ def merge_metadata(htmlfile, opf, opts):
|
||||
if opf:
|
||||
mi = MetaInformation(opf)
|
||||
else:
|
||||
mi = get_metadata(open(htmlfile, 'rb'), 'html')
|
||||
try:
|
||||
mi = get_metadata(open(htmlfile, 'rb'), 'html')
|
||||
except:
|
||||
mi = MetaInformation(None, None)
|
||||
if opts.from_opf is not None and os.access(opts.from_opf, os.R_OK):
|
||||
mi.smart_update(OPF(open(opts.from_opf, 'rb'), os.path.abspath(os.path.dirname(opts.from_opf))))
|
||||
if opts.title:
|
||||
mi.title = opts.title
|
||||
if opts.authors != _('Unknown'):
|
||||
|
@ -1,16 +1,22 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
|
||||
"""
|
||||
Convert .fb2 files to .lrf
|
||||
"""
|
||||
import os, sys, tempfile, shutil, logging
|
||||
import os, sys, shutil, logging
|
||||
from base64 import b64decode
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.lrf import option_parser as lrf_option_parser
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
|
||||
from calibre import setup_cli_handlers, __appname__
|
||||
from calibre import setup_cli_handlers
|
||||
from calibre.resources import fb2_xsl
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from calibre.ebooks.metadata.opf import OPFCreator
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
|
||||
|
||||
def option_parser():
|
||||
parser = lrf_option_parser(
|
||||
@ -31,28 +37,41 @@ def extract_embedded_content(doc):
|
||||
data = b64decode(elem.text.strip())
|
||||
open(fname, 'wb').write(data)
|
||||
|
||||
def generate_html(fb2file, encoding, logger):
|
||||
from lxml import etree
|
||||
tdir = tempfile.mkdtemp(prefix=__appname__+'_fb2_')
|
||||
cwd = os.getcwdu()
|
||||
os.chdir(tdir)
|
||||
def to_html(fb2file, tdir):
|
||||
cwd = os.getcwd()
|
||||
try:
|
||||
logger.info('Parsing XML...')
|
||||
os.chdir(tdir)
|
||||
print 'Parsing XML...'
|
||||
parser = etree.XMLParser(recover=True, no_network=True)
|
||||
doc = etree.parse(fb2file, parser)
|
||||
extract_embedded_content(doc)
|
||||
logger.info('Converting XML to HTML...')
|
||||
print 'Converting XML to HTML...'
|
||||
styledoc = etree.fromstring(fb2_xsl)
|
||||
|
||||
transform = etree.XSLT(styledoc)
|
||||
result = transform(doc)
|
||||
html = os.path.join(tdir, 'index.html')
|
||||
f = open(html, 'wb')
|
||||
f.write(transform.tostring(result))
|
||||
f.close()
|
||||
open('index.html', 'wb').write(transform.tostring(result))
|
||||
try:
|
||||
mi = get_metadata(open(fb2file, 'rb'))
|
||||
except:
|
||||
mi = MetaInformation(None, None)
|
||||
if not mi.title:
|
||||
mi.title = os.path.splitext(os.path.basename(fb2file))[0]
|
||||
if not mi.authors:
|
||||
mi.authors = [_('Unknown')]
|
||||
opf = OPFCreator(tdir, mi)
|
||||
opf.create_manifest([('index.html', None)])
|
||||
opf.create_spine(['index.html'])
|
||||
opf.render(open('metadata.opf', 'wb'))
|
||||
return os.path.join(tdir, 'metadata.opf')
|
||||
finally:
|
||||
os.chdir(cwd)
|
||||
return html
|
||||
|
||||
|
||||
def generate_html(fb2file, encoding, logger):
    '''
    Convert `fb2file` inside a new persistent temporary directory by calling
    ``to_html`` and return the path ``index.html`` in that directory.

    `encoding` and `logger` are not used.
    '''
    workdir = PersistentTemporaryDirectory('_fb22lrf')
    to_html(fb2file, workdir)
    index = os.path.join(workdir, 'index.html')
    return index
|
||||
|
||||
def process_file(path, options, logger=None):
|
||||
if logger is None:
|
||||
|
@ -9,6 +9,9 @@ from calibre.ebooks import ConversionError
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from calibre.ebooks.lrf import option_parser as lrf_option_parser
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.opf import OPFCreator
|
||||
from calibre.ebooks.metadata.pdf import get_metadata
|
||||
|
||||
PDFTOHTML = 'pdftohtml'
|
||||
popen = subprocess.Popen
|
||||
@ -20,7 +23,7 @@ if iswindows and hasattr(sys, 'frozen'):
|
||||
if islinux and getattr(sys, 'frozen_path', False):
|
||||
PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml')
|
||||
|
||||
def generate_html(pathtopdf, logger):
|
||||
def generate_html(pathtopdf, tdir):
|
||||
'''
|
||||
Convert the pdf into html.
|
||||
@return: Path to a temporary file containing the HTML.
|
||||
@ -29,10 +32,10 @@ def generate_html(pathtopdf, logger):
|
||||
pathtopdf = pathtopdf.encode(sys.getfilesystemencoding())
|
||||
if not os.access(pathtopdf, os.R_OK):
|
||||
raise ConversionError, 'Cannot read from ' + pathtopdf
|
||||
tdir = PersistentTemporaryDirectory('pdftohtml')
|
||||
index = os.path.join(tdir, 'index.html')
|
||||
# This is necessary as pdftohtml doesn't always (linux) respect absolute paths
|
||||
cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', pathtopdf, os.path.basename(index))
|
||||
pathtopdf = os.path.abspath(pathtopdf)
|
||||
cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', pathtopdf, os.path.basename(index))
|
||||
cwd = os.getcwd()
|
||||
|
||||
try:
|
||||
@ -44,16 +47,30 @@ def generate_html(pathtopdf, logger):
|
||||
raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'), True)
|
||||
else:
|
||||
raise
|
||||
logger.info(p.stdout.read())
|
||||
print p.stdout.read()
|
||||
ret = p.wait()
|
||||
if ret != 0:
|
||||
err = p.stderr.read()
|
||||
raise ConversionError, err
|
||||
if not os.path.exists(index) or os.stat(index).st_size < 100:
|
||||
raise ConversionError(os.path.basename(pathtopdf) + _(' does not allow copying of text.'), True)
|
||||
raw = open(index).read(4000)
|
||||
if not '<br' in raw:
|
||||
|
||||
raw = open(index, 'rb').read()
|
||||
open(index, 'wb').write('<!-- created by calibre\'s pdftohtml -->\n'+raw)
|
||||
if not '<br' in raw[:4000]:
|
||||
raise ConversionError(os.path.basename(pathtopdf) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True)
|
||||
try:
|
||||
mi = get_metadata(open(pathtopdf, 'rb'))
|
||||
except:
|
||||
mi = MetaInformation(None, None)
|
||||
if not mi.title:
|
||||
mi.title = os.path.splitext(os.path.basename(pathtopdf))[0]
|
||||
if not mi.authors:
|
||||
mi.authors = [_('Unknown')]
|
||||
opf = OPFCreator(tdir, mi)
|
||||
opf.create_manifest([('index.html', None)])
|
||||
opf.create_spine(['index.html'])
|
||||
opf.render(open('metadata.opf', 'wb'))
|
||||
finally:
|
||||
os.chdir(cwd)
|
||||
return index
|
||||
@ -72,7 +89,8 @@ def process_file(path, options, logger=None):
|
||||
logger = logging.getLogger('pdf2lrf')
|
||||
setup_cli_handlers(logger, level)
|
||||
pdf = os.path.abspath(os.path.expanduser(path))
|
||||
htmlfile = generate_html(pdf, logger)
|
||||
tdir = PersistentTemporaryDirectory('_pdf2lrf')
|
||||
htmlfile = generate_html(pdf, tdir)
|
||||
if not options.output:
|
||||
ext = '.lrs' if options.lrs else '.lrf'
|
||||
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
|
||||
|
@ -1,17 +1,20 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
import os, sys, tempfile, shutil, logging, glob
|
||||
import os, sys, shutil, logging, glob
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.lrf import option_parser as lrf_option_parser
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
|
||||
from calibre import setup_cli_handlers, __appname__
|
||||
from calibre import setup_cli_handlers
|
||||
from calibre.libwand import convert, WandException
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
from calibre.ebooks.lrf.rtf.xsl import xhtml
|
||||
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.opf import OPFCreator
|
||||
|
||||
def option_parser():
|
||||
parser = lrf_option_parser(
|
||||
@ -44,8 +47,8 @@ def process_file(path, options, logger=None):
|
||||
f = open(rtf, 'rb')
|
||||
mi = get_metadata(f, 'rtf')
|
||||
f.close()
|
||||
html = generate_html(rtf, logger)
|
||||
tdir = os.path.dirname(html)
|
||||
tdir = PersistentTemporaryDirectory('_rtf2lrf')
|
||||
html = generate_html(rtf, tdir)
|
||||
cwd = os.getcwdu()
|
||||
try:
|
||||
if not options.output:
|
||||
@ -83,12 +86,12 @@ def main(args=sys.argv, logger=None):
|
||||
return 0
|
||||
|
||||
|
||||
def generate_xml(rtfpath):
|
||||
def generate_xml(rtfpath, tdir):
|
||||
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
|
||||
tdir = tempfile.mkdtemp(prefix=__appname__+'_')
|
||||
ofile = os.path.join(tdir, 'index.xml')
|
||||
cwd = os.getcwdu()
|
||||
os.chdir(tdir)
|
||||
rtfpath = os.path.abspath(rtfpath)
|
||||
try:
|
||||
parser = ParseRtf(
|
||||
in_file = rtfpath,
|
||||
@ -134,26 +137,27 @@ def generate_xml(rtfpath):
|
||||
return ofile
|
||||
|
||||
|
||||
def generate_html(rtfpath, logger):
|
||||
logger.info('Converting RTF to XML...')
|
||||
def generate_html(rtfpath, tdir):
|
||||
print 'Converting RTF to XML...'
|
||||
rtfpath = os.path.abspath(rtfpath)
|
||||
try:
|
||||
xml = generate_xml(rtfpath)
|
||||
xml = generate_xml(rtfpath, tdir)
|
||||
except RtfInvalidCodeException:
|
||||
raise Exception(_('This RTF file has a feature calibre does not support. Convert it to HTML and then convert it.'))
|
||||
tdir = os.path.dirname(xml)
|
||||
cwd = os.getcwdu()
|
||||
os.chdir(tdir)
|
||||
try:
|
||||
logger.info('Parsing XML...')
|
||||
print 'Parsing XML...'
|
||||
parser = etree.XMLParser(recover=True, no_network=True)
|
||||
try:
|
||||
doc = etree.parse(xml, parser)
|
||||
except:
|
||||
raise
|
||||
logger.info('Parsing failed. Trying to clean up XML...')
|
||||
print 'Parsing failed. Trying to clean up XML...'
|
||||
soup = BeautifulStoneSoup(open(xml, 'rb').read())
|
||||
doc = etree.fromstring(str(soup))
|
||||
logger.info('Converting XML to HTML...')
|
||||
print 'Converting XML to HTML...'
|
||||
styledoc = etree.fromstring(xhtml)
|
||||
|
||||
transform = etree.XSLT(styledoc)
|
||||
@ -161,8 +165,22 @@ def generate_html(rtfpath, logger):
|
||||
tdir = os.path.dirname(xml)
|
||||
html = os.path.join(tdir, 'index.html')
|
||||
f = open(html, 'wb')
|
||||
f.write(transform.tostring(result))
|
||||
res = transform.tostring(result)
|
||||
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
|
||||
f.write(res)
|
||||
f.close()
|
||||
try:
|
||||
mi = get_metadata(open(rtfpath, 'rb'))
|
||||
except:
|
||||
mi = MetaInformation(None, None)
|
||||
if not mi.title:
|
||||
mi.title = os.path.splitext(os.path.basename(rtfpath))[0]
|
||||
if not mi.authors:
|
||||
mi.authors = [_('Unknown')]
|
||||
opf = OPFCreator(tdir, mi)
|
||||
opf.create_manifest([('index.html', None)])
|
||||
opf.create_spine(['index.html'])
|
||||
opf.render(open('metadata.opf', 'wb'))
|
||||
finally:
|
||||
os.chdir(cwd)
|
||||
return html
|
||||
|
@ -5,12 +5,14 @@ Convert .txt files to .lrf
|
||||
"""
|
||||
import os, sys, codecs, logging
|
||||
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from calibre.ebooks.lrf import option_parser as lrf_option_parser
|
||||
from calibre.ebooks import ConversionError
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
|
||||
from calibre.ebooks.markdown import markdown
|
||||
from calibre import setup_cli_handlers
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.opf import OPFCreator
|
||||
|
||||
def option_parser():
|
||||
parser = lrf_option_parser(
|
||||
@ -23,7 +25,7 @@ _('''%prog [options] mybook.txt
|
||||
return parser
|
||||
|
||||
|
||||
def generate_html(txtfile, encoding, logger):
|
||||
def generate_html(txtfile, encoding, tdir):
|
||||
'''
|
||||
Convert txtfile to html and return a PersistentTemporaryFile object pointing
|
||||
to the file with the HTML.
|
||||
@ -44,15 +46,19 @@ def generate_html(txtfile, encoding, logger):
|
||||
else:
|
||||
txt = codecs.open(txtfile, 'rb', enc).read()
|
||||
|
||||
logger.info('Converting text to HTML...')
|
||||
print 'Converting text to HTML...'
|
||||
md = markdown.Markdown(
|
||||
extensions=['footnotes', 'tables', 'toc'],
|
||||
safe_mode=False,
|
||||
)
|
||||
html = md.convert(txt)
|
||||
p = PersistentTemporaryFile('.html', dir=os.path.dirname(txtfile))
|
||||
p.close()
|
||||
codecs.open(p.name, 'wb', 'utf8').write(html)
|
||||
html = '<html><body>'+md.convert(txt)+'</body></html>'
|
||||
p = os.path.join(tdir, 'index.html')
|
||||
open(p, 'wb').write(html.encode('utf-8'))
|
||||
mi = MetaInformation(os.path.splitext(os.path.basename(txtfile))[0], [_('Unknown')])
|
||||
opf = OPFCreator(tdir, mi)
|
||||
opf.create_manifest([(os.path.join(tdir, 'index.html'), None)])
|
||||
opf.create_spine([os.path.join(tdir, 'index.html')])
|
||||
opf.render(open(os.path.join(tdir, 'metadata.opf'), 'wb'))
|
||||
return p
|
||||
|
||||
def process_file(path, options, logger=None):
|
||||
@ -63,7 +69,8 @@ def process_file(path, options, logger=None):
|
||||
txt = os.path.abspath(os.path.expanduser(path))
|
||||
if not hasattr(options, 'debug_html_generation'):
|
||||
options.debug_html_generation = False
|
||||
htmlfile = generate_html(txt, options.encoding, logger)
|
||||
tdir = PersistentTemporaryDirectory('_txt2lrf')
|
||||
htmlfile = generate_html(txt, options.encoding, tdir)
|
||||
options.encoding = 'utf-8'
|
||||
if not options.debug_html_generation:
|
||||
options.force_page_break = 'h2'
|
||||
@ -73,9 +80,9 @@ def process_file(path, options, logger=None):
|
||||
options.output = os.path.abspath(os.path.expanduser(options.output))
|
||||
if not options.title:
|
||||
options.title = os.path.splitext(os.path.basename(path))[0]
|
||||
html_process_file(htmlfile.name, options, logger)
|
||||
html_process_file(htmlfile, options, logger)
|
||||
else:
|
||||
print open(htmlfile.name, 'rb').read()
|
||||
print open(htmlfile, 'rb').read()
|
||||
|
||||
def main(args=sys.argv, logger=None):
|
||||
parser = option_parser()
|
||||
|
@ -200,10 +200,10 @@ class MetaInformation(object):
|
||||
Merge the information in C{mi} into self. In case of conflicts, the information
|
||||
in C{mi} takes precedence, unless the information in mi is NULL.
|
||||
'''
|
||||
if mi.title and mi.title.lower() != 'unknown':
|
||||
if mi.title and mi.title != _('Unknown'):
|
||||
self.title = mi.title
|
||||
|
||||
if mi.authors and mi.authors[0].lower() != 'unknown':
|
||||
if mi.authors and mi.authors[0] != _('Unknown'):
|
||||
self.authors = mi.authors
|
||||
|
||||
for attr in ('author_sort', 'title_sort', 'comments', 'category',
|
||||
|
@ -12,7 +12,7 @@ try:
|
||||
except ImportError:
|
||||
import Image as PILImage
|
||||
|
||||
from calibre import __appname__
|
||||
from calibre import __appname__, entity_to_unicode
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
|
||||
from calibre.ebooks.mobi import MobiError
|
||||
from calibre.ebooks.mobi.huffcdic import HuffReader
|
||||
@ -263,17 +263,19 @@ class MobiReader(object):
|
||||
if ref.type.lower() == 'toc':
|
||||
toc = ref.href()
|
||||
if toc:
|
||||
index = self.processed_html.find('<a name="%s"'%toc.partition('#')[-1])
|
||||
index = self.processed_html.find('<a id="%s" name="%s"'%(toc.partition('#')[-1], toc.partition('#')[-1]))
|
||||
tocobj = None
|
||||
ent_pat = re.compile(r'&(\S+?);')
|
||||
if index > -1:
|
||||
raw = '<html><body>'+self.processed_html[index:]
|
||||
soup = BeautifulSoup(raw)
|
||||
tocobj = TOC()
|
||||
for a in soup.findAll('a', href=True):
|
||||
try:
|
||||
text = ''.join(a.findAll(text=True)).strip()
|
||||
text = u''.join(a.findAll(text=True)).strip()
|
||||
except:
|
||||
text = ''
|
||||
text = ent_pat.sub(entity_to_unicode, text)
|
||||
tocobj.add_item(toc.partition('#')[0], a['href'][1:], text)
|
||||
if tocobj is not None:
|
||||
opf.set_toc(tocobj)
|
||||
@ -353,7 +355,7 @@ class MobiReader(object):
|
||||
r = self.mobi_html.find('>', end)
|
||||
if r > -1 and r < l: # Move out of tag
|
||||
end = r+1
|
||||
self.processed_html += self.mobi_html[pos:end] + '<a name="filepos%d"></a>'%oend
|
||||
self.processed_html += self.mobi_html[pos:end] + '<a id="filepos%d" name="filepos%d"></a>'%(oend, oend)
|
||||
pos = end
|
||||
|
||||
self.processed_html += self.mobi_html[pos:]
|
||||
|
@ -43,6 +43,7 @@ entry_points = {
|
||||
'fb22lrf = calibre.ebooks.lrf.fb2.convert_from:main',
|
||||
'fb2-meta = calibre.ebooks.metadata.fb2:main',
|
||||
'any2lrf = calibre.ebooks.lrf.any.convert_from:main',
|
||||
'any2epub = calibre.ebooks.epub.from_any:main',
|
||||
'lrf2lrs = calibre.ebooks.lrf.lrfparser:main',
|
||||
'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main',
|
||||
'pdfreflow = calibre.ebooks.lrf.pdf.reflow:main',
|
||||
@ -175,7 +176,9 @@ def setup_completion(fatal_errors):
|
||||
from calibre.ebooks.epub.from_html import option_parser as html2epub
|
||||
from calibre.ebooks.html import option_parser as html2oeb
|
||||
from calibre.ebooks.epub.from_feeds import option_parser as feeds2epub
|
||||
|
||||
from calibre.ebooks.epub.from_any import option_parser as any2epub
|
||||
any_formats = ['epub', 'htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip',
|
||||
'txt', 'lit', 'rtf', 'pdf', 'prc', 'mobi', 'fb2']
|
||||
f = open_file('/etc/bash_completion.d/libprs500')
|
||||
f.close()
|
||||
os.remove(f.name)
|
||||
@ -193,9 +196,8 @@ def setup_completion(fatal_errors):
|
||||
f.write(opts_and_exts('mobi2lrf', htmlop, ['mobi', 'prc']))
|
||||
f.write(opts_and_exts('fb22lrf', htmlop, ['fb2']))
|
||||
f.write(opts_and_exts('pdf2lrf', htmlop, ['pdf']))
|
||||
f.write(opts_and_exts('any2lrf', htmlop,
|
||||
['epub', 'htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip',
|
||||
'txt', 'lit', 'rtf', 'pdf', 'prc', 'mobi', 'fb2']))
|
||||
f.write(opts_and_exts('any2lrf', htmlop, any_formats))
|
||||
f.write(opts_and_exts('any2lrf', any2epub, any_formats))
|
||||
f.write(opts_and_exts('lrf2lrs', lrf2lrsop, ['lrf']))
|
||||
f.write(opts_and_exts('lrf-meta', metaop, ['lrf']))
|
||||
f.write(opts_and_exts('rtf-meta', metaop, ['rtf']))
|
||||
|
@ -178,6 +178,12 @@ class Option(object):
|
||||
def __eq__(self, other):
|
||||
return self.name == getattr(other, 'name', other)
|
||||
|
||||
def __repr__(self):
|
||||
return 'Option: '+self.name
|
||||
|
||||
def __str__(self):
|
||||
return repr(self)
|
||||
|
||||
class OptionValues(object):
|
||||
|
||||
def copy(self):
|
||||
|
Loading…
x
Reference in New Issue
Block a user