Implemented any2epub

This commit is contained in:
Kovid Goyal 2008-09-18 21:48:08 -07:00
parent f0d9bded08
commit 896182b201
16 changed files with 332 additions and 69 deletions

View File

@ -317,7 +317,8 @@ def main():
'mechanize', 'ClientForm', 'usbobserver',
'genshi', 'calibre.web.feeds.recipes.*',
'calibre.ebooks.lrf.any.*', 'calibre.ebooks.lrf.feeds.*',
'keyword', 'codeop', 'pydoc', 'readline'],
'keyword', 'codeop', 'pydoc', 'readline',
'BeautifulSoup'],
'packages' : ['PIL', 'Authorization', 'lxml'],
'excludes' : ['IPython'],
'plist' : { 'CFBundleGetInfoString' : '''calibre, an E-book management application.'''

View File

@ -152,7 +152,7 @@ def main(args=sys.argv):
'win32process', 'win32api', 'msvcrt',
'win32event', 'calibre.ebooks.lrf.any.*',
'calibre.ebooks.lrf.feeds.*',
'genshi',
'genshi', 'BeautifulSoup',
'path', 'pydoc', 'IPython.Extensions.*',
'calibre.web.feeds.recipes.*',
'PyQt4.QtWebKit', 'PyQt4.QtNetwork',

View File

@ -317,6 +317,11 @@ class LoggingInterface:
def log_exception(self, msg, *args):
self.___log(self.__logger.exception, msg, args, {})
def walk(dir):
    '''
    Generator yielding the path of every file found below ``dir``.

    A flat interface to :func:`os.walk`: each directory visited contributes
    one path per file it contains, built by joining the directory path with
    the file name. Directories themselves are never yielded.
    '''
    for dirpath, dirnames, filenames in os.walk(dir):
        for name in filenames:
            yield os.path.join(dirpath, name)
def strftime(fmt, t=time.localtime()):
''' A version of strftime that returns unicode strings. '''

View File

@ -44,6 +44,7 @@ def config(defaults=None):
c.add_opt('output', ['-o', '--output'], default=None,
help=_('The output EPUB file. If not specified, it is derived from the input file name.'))
structure = c.add_group('structure detection', _('Control auto-detection of document structure.'))
structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]",
help=_('''\
@ -74,6 +75,16 @@ to auto-generate a Table of Contents.
toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False,
help=_("Don't add auto-detected chapters to the Table of Contents."))
layout = c.add_group('page layout', _('Control page layout'))
layout('margin_top', ['--margin-top'], default=5.0,
help=_('Set the top margin in pts. Default is %default'))
layout('margin_bottom', ['--margin-bottom'], default=5.0,
help=_('Set the bottom margin in pts. Default is %default'))
layout('margin_left', ['--margin-left'], default=5.0,
help=_('Set the left margin in pts. Default is %default'))
layout('margin_right', ['--margin-right'], default=5.0,
help=_('Set the right margin in pts. Default is %default'))
c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
help=_('Print generated OPF file to stdout'))
c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',

View File

@ -0,0 +1,154 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Convert any ebook format to epub.
'''
import sys, os, re
from contextlib import nested
from calibre import extract, walk
from calibre.ebooks.epub import config as common_config
from calibre.ebooks.epub.from_html import convert as html2epub
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator
def lit2opf(path, tdir, opts):
from calibre.ebooks.lit.reader import LitReader
print 'Exploding LIT file:', path
reader = LitReader(path)
reader.extract_content(tdir, False)
for f in walk(tdir):
if f.lower().endswith('.opf'):
return f
def mobi2opf(path, tdir, opts):
from calibre.ebooks.mobi.reader import MobiReader
print 'Exploding MOBI file:', path
reader = MobiReader(path)
reader.extract_content(tdir)
files = list(walk(tdir))
for f in files:
if f.lower().endswith('.opf'):
return f
html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}', re.IGNORECASE)
hf = [f for f in files if html_pat.match(os.path.splitext(f)[1]) is not None]
mi = MetaInformation(os.path.splitext(os.path.basename(path))[0], [_('Unknown')])
opf = OPFCreator(tdir, mi)
opf.create_manifest([(hf[0], None)])
opf.create_spine([hf[0]])
ans = os.path.join(tdir, 'metadata.opf')
opf.render(open(ans, 'wb'))
return ans
def fb22opf(path, tdir, opts):
from calibre.ebooks.lrf.fb2.convert_from import to_html
print 'Converting FB2 to HTML...'
return to_html(path, tdir)
def rtf2opf(path, tdir, opts):
    '''
    Run the RTF converter's ``generate_html`` on ``path`` with ``tdir`` as
    the output directory and return the path ``tdir/metadata.opf``.

    The converter is expected to have written that OPF file; this function
    does not check for it. ``opts`` is not used here.
    '''
    from calibre.ebooks.lrf.rtf.convert_from import generate_html as rtf_to_html
    rtf_to_html(path, tdir)
    opf_path = os.path.join(tdir, 'metadata.opf')
    return opf_path
def txt2opf(path, tdir, opts):
    '''
    Run the text converter's ``generate_html`` on ``path`` using
    ``opts.encoding`` as the input encoding and ``tdir`` as the output
    directory, then return the path ``tdir/metadata.opf``.

    The converter is expected to have written that OPF file; this function
    does not check for it.
    '''
    from calibre.ebooks.lrf.txt.convert_from import generate_html as txt_to_html
    txt_to_html(path, opts.encoding, tdir)
    opf_path = os.path.join(tdir, 'metadata.opf')
    return opf_path
def pdf2opf(path, tdir, opts):
    '''
    Run the PDF converter's ``generate_html`` on ``path`` with ``tdir`` as
    the output directory and return the path ``tdir/metadata.opf``.

    The converter is expected to have written that OPF file; this function
    does not check for it. ``opts`` is not used here.
    '''
    from calibre.ebooks.lrf.pdf.convert_from import generate_html as pdf_to_html
    pdf_to_html(path, tdir)
    opf_path = os.path.join(tdir, 'metadata.opf')
    return opf_path
# Dispatch table mapping a lower-case file extension (without the dot) to the
# converter that handles it. any2epub() calls each converter as
# converter(path, tdir, opts) and passes the returned path on to html2epub as
# an OPF file. unarchive() also iterates the keys of this table when looking
# for a convertible file inside an archive.
MAP = {
'lit' : lit2opf,
'mobi' : mobi2opf,
'prc' : mobi2opf,
'fb2' : fb22opf,
'rtf' : rtf2opf,
'txt' : txt2opf,
'pdf' : pdf2opf,
}
def unarchive(path, tdir):
    '''
    Extract the archive at ``path`` into ``tdir`` and locate the ebook in it.

    Selection order:

      1. The first file with an ``.opf`` extension, then the first file whose
         extension has a converter in ``MAP``. ``.txt`` and ``.rtf`` files
         smaller than 2048 bytes are ignored.
      2. Among (x)htm(l) files, one named ``toc`` and then one named
         ``index`` (smaller files are tried first).
      3. The largest (x)htm(l) file.

    :return: A tuple ``(path_to_file, extension)`` where ``extension`` is
             lower case and has no leading dot.
    :raises ValueError: if no candidate file is found in the archive.
    '''
    extract(path, tdir)
    files = list(walk(tdir))

    for ext in ['opf'] + list(MAP.keys()):
        suffix = '.'+ext
        for f in files:
            if f.lower().endswith(suffix):
                # Skip tiny text files; presumably these are notes/readmes
                # shipped with the archive rather than the book itself.
                if ext in ['txt', 'rtf'] and os.stat(f).st_size < 2048:
                    continue
                return f, ext

    html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE)
    html_files = [f for f in files if html_pat.search(f) is not None]
    if not html_files:
        raise ValueError(_('Could not find an ebook inside the archive'))

    # Sort by size, smallest first, so that the largest file ends up last
    html_files.sort(key=lambda f: os.stat(f).st_size)

    for q in ('toc', 'index'):
        for f in html_files:
            # f is a full path below tdir, so only its file name (without
            # extension) can meaningfully be compared against q.
            stem = os.path.splitext(os.path.basename(f))[0]
            if stem.lower() == q:
                return f, os.path.splitext(f)[1].lower()[1:]
    return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:]
def any2epub(opts, path, notification=None):
ext = os.path.splitext(path)[1]
if not ext:
raise ValueError('Unknown file type: '+path)
ext = ext.lower()[1:]
if opts.output is None:
opts.output = os.path.splitext(os.path.basename(path))[0]+'.epub'
with nested(TemporaryDirectory('_any2epub1'), TemporaryDirectory('_any2epub2')) as (tdir1, tdir2):
if ext in ['rar', 'zip']:
path, ext = unarchive(path, tdir1)
print 'Found %s file in archive'%(ext.upper())
if ext in MAP.keys():
path = MAP[ext](path, tdir2, opts)
ext = 'opf'
if re.match(r'((x){0,1}htm(l){0,1})|opf', ext) is None:
raise ValueError('Conversion from %s is not supported'%ext.upper())
print 'Creating EPUB file...'
html2epub(path, opts, notification=notification)
def config(defaults=None):
    '''
    Return the configuration object for any2epub, which is the common EPUB
    configuration created with the given ``defaults``.
    '''
    cfg = common_config(defaults=defaults)
    return cfg
def formats():
    '''
    Return a list of input formats: ``html``, ``rar`` and ``zip`` followed by
    every extension that has a converter in ``MAP``.
    '''
    supported = ['html', 'rar', 'zip']
    supported.extend(MAP.keys())
    return supported
def option_parser():
    '''
    Return the command line option parser for any2epub, built from
    :func:`config` with a usage message that lists the supported formats.
    '''
    usage = _('''\
%%prog [options] filename
Convert any of a large number of ebook formats to an epub file. Supported formats are: %s
''')
    # Interpolate a readable, comma separated list. Interpolating the list
    # object itself would print its Python repr (brackets and quotes) in the
    # help text.
    return config().option_parser(usage=usage % ', '.join(formats()))
def main(args=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(args)
if len(args) < 2:
parser.print_help()
print 'No input file specified.'
return 1
any2epub(opts, args[1])
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -29,7 +29,6 @@ def option_parser():
def convert(opts, recipe_arg, notification=None):
opts.lrf = False
opts.epub = True
opts.chapter_mark = 'none'
if opts.debug:
opts.verbose = 2
parser = option_parser()
@ -40,6 +39,7 @@ def convert(opts, recipe_arg, notification=None):
recipe_opts = c.parse_string(recipe.html2epub_options)
c.smart_update(recipe_opts, opts)
opts = recipe_opts
opts.chapter_mark = 'none'
opf = glob.glob(os.path.join(tdir, '*.opf'))
if not opf:
raise Exception('Downloading of recipe: %s failed'%recipe_arg)

View File

@ -4,7 +4,12 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
import os, sys, re, shutil, cStringIO
from lxml.etree import XPath
try:
from PIL import Image as PILImage
except ImportError:
import Image as PILImage
from calibre.ebooks.html import Processor, get_text, merge_metadata, get_filelist,\
opf_traverse, create_metadata, rebase_toc
@ -106,8 +111,8 @@ def convert(htmlfile, opts, notification=None):
cover_src = opts.cover
if cover_src is not None:
cover_dest = os.path.join(tdir, 'content', 'resources', '_cover_'+os.path.splitext(cover_src)[1])
shutil.copyfile(cover_src, cover_dest)
cover_dest = os.path.join(tdir, 'content', 'resources', '_cover_.jpg')
PILImage.open(cover_src).convert('RGB').save(cover_dest)
mi.cover = cover_dest
resources.append(cover_dest)

View File

@ -23,6 +23,7 @@ from calibre.utils.config import Config, StringConfig
from calibre.ebooks.metadata.opf import OPFReader, OPFCreator
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
from calibre.utils.zipfile import ZipFile
@ -280,7 +281,7 @@ class PreProcessor(object):
return re.search('<H2[^><]*id=BookTitle', raw) is not None
def is_pdftohtml(self, src):
return src.startswith('<!-- created by calibre\'s pdftohtml -->')
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
def preprocess(self, html):
if self.is_baen(html):
@ -335,6 +336,7 @@ class Parser(PreProcessor, LoggingInterface):
pretty_print=self.opts.pretty_print,
include_meta_content_type=True)
ans = re.compile(r'<html>', re.IGNORECASE).sub('<html xmlns="http://www.w3.org/1999/xhtml">', ans)
ans = re.compile(r'<head[^<>]*?>', re.IGNORECASE).sub('<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n', ans)
f.write(ans)
return f.name
@ -360,6 +362,8 @@ class Parser(PreProcessor, LoggingInterface):
body = self.root.xpath('//body')
if body:
self.body = body[0]
for a in self.root.xpath('//a[@name]'):
a.set('id', a.get('name'))
def debug_tree(self, name):
'''
@ -540,7 +544,6 @@ class Processor(Parser):
css.append('#%s { %s }'%(id, setting))
for elem in self.root.xpath('//*[@style]'):
if 'id' not in elem.keys():
id = get_id(elem, counter)
counter += 1
css.append('#%s {%s}'%(id, elem.get('style')))
@ -548,8 +551,13 @@ class Processor(Parser):
self.raw_css = '\n\n'.join(css)
self.css = unicode(self.raw_css)
self.do_layout()
# TODO: Figure out what to do about CSS imports from linked stylesheets
def do_layout(self):
    '''
    Append the page layout rules to ``self.css``.

    Two rules are added: one that zeroes all four margins of ``body`` and an
    ``@page`` rule whose margins (in pts) come from ``self.opts.margin_top``,
    ``margin_bottom``, ``margin_left`` and ``margin_right``.
    '''
    # The property is margin-bottom; the misspelling "margin-botton" is an
    # unknown property and leaves the bottom margin unset.
    self.css += '\nbody {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt}\n'
    self.css += '@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}\n'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right)
def config(defaults=None, config_name='html',
desc=_('Options to control the traversal of HTML')):
if defaults is None:
@ -575,6 +583,8 @@ def config(defaults=None, config_name='html',
help=_('Set the title. Default is to autodetect.'))
metadata('authors', ['-a', '--authors'], default=_('Unknown'),
help=_('The author(s) of the ebook, as a comma separated list.'))
metadata('from_opf', ['--metadata-from'], default=None,
help=_('Load metadata from the specified OPF file'))
debug = c.add_group('debug', _('Options useful for debugging'))
debug('verbose', ['-v', '--verbose'], default=0, action='count',
@ -648,7 +658,12 @@ def merge_metadata(htmlfile, opf, opts):
if opf:
mi = MetaInformation(opf)
else:
try:
mi = get_metadata(open(htmlfile, 'rb'), 'html')
except:
mi = MetaInformation(None, None)
if opts.from_opf is not None and os.access(opts.from_opf, os.R_OK):
mi.smart_update(OPF(open(opts.from_opf, 'rb'), os.path.abspath(os.path.dirname(opts.from_opf))))
if opts.title:
mi.title = opts.title
if opts.authors != _('Unknown'):

View File

@ -1,16 +1,22 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
"""
Convert .fb2 files to .lrf
"""
import os, sys, tempfile, shutil, logging
import os, sys, shutil, logging
from base64 import b64decode
from lxml import etree
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre import setup_cli_handlers, __appname__
from calibre import setup_cli_handlers
from calibre.resources import fb2_xsl
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.metadata.opf import OPFCreator
from calibre.ebooks.metadata import MetaInformation
def option_parser():
parser = lrf_option_parser(
@ -31,28 +37,41 @@ def extract_embedded_content(doc):
data = b64decode(elem.text.strip())
open(fname, 'wb').write(data)
def generate_html(fb2file, encoding, logger):
from lxml import etree
tdir = tempfile.mkdtemp(prefix=__appname__+'_fb2_')
cwd = os.getcwdu()
os.chdir(tdir)
def to_html(fb2file, tdir):
cwd = os.getcwd()
try:
logger.info('Parsing XML...')
os.chdir(tdir)
print 'Parsing XML...'
parser = etree.XMLParser(recover=True, no_network=True)
doc = etree.parse(fb2file, parser)
extract_embedded_content(doc)
logger.info('Converting XML to HTML...')
print 'Converting XML to HTML...'
styledoc = etree.fromstring(fb2_xsl)
transform = etree.XSLT(styledoc)
result = transform(doc)
html = os.path.join(tdir, 'index.html')
f = open(html, 'wb')
f.write(transform.tostring(result))
f.close()
open('index.html', 'wb').write(transform.tostring(result))
try:
mi = get_metadata(open(fb2file, 'rb'))
except:
mi = MetaInformation(None, None)
if not mi.title:
mi.title = os.path.splitext(os.path.basename(fb2file))[0]
if not mi.authors:
mi.authors = [_('Unknown')]
opf = OPFCreator(tdir, mi)
opf.create_manifest([('index.html', None)])
opf.create_spine(['index.html'])
opf.render(open('metadata.opf', 'wb'))
return os.path.join(tdir, 'metadata.opf')
finally:
os.chdir(cwd)
return html
def generate_html(fb2file, encoding, logger):
    '''
    Convert ``fb2file`` inside a new persistent temporary directory by
    calling :func:`to_html`, and return the path ``index.html`` within that
    directory.

    ``encoding`` and ``logger`` are accepted but not used.
    '''
    workdir = PersistentTemporaryDirectory('_fb22lrf')
    to_html(fb2file, workdir)
    html_path = os.path.join(workdir, 'index.html')
    return html_path
def process_file(path, options, logger=None):
if logger is None:

View File

@ -9,6 +9,9 @@ from calibre.ebooks import ConversionError
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator
from calibre.ebooks.metadata.pdf import get_metadata
PDFTOHTML = 'pdftohtml'
popen = subprocess.Popen
@ -20,7 +23,7 @@ if iswindows and hasattr(sys, 'frozen'):
if islinux and getattr(sys, 'frozen_path', False):
PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml')
def generate_html(pathtopdf, logger):
def generate_html(pathtopdf, tdir):
'''
Convert the pdf into html.
@return: Path to a temporary file containing the HTML.
@ -29,10 +32,10 @@ def generate_html(pathtopdf, logger):
pathtopdf = pathtopdf.encode(sys.getfilesystemencoding())
if not os.access(pathtopdf, os.R_OK):
raise ConversionError, 'Cannot read from ' + pathtopdf
tdir = PersistentTemporaryDirectory('pdftohtml')
index = os.path.join(tdir, 'index.html')
# This is necessary as pdftohtml doesn't always (linux) respect absolute paths
cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', pathtopdf, os.path.basename(index))
pathtopdf = os.path.abspath(pathtopdf)
cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', pathtopdf, os.path.basename(index))
cwd = os.getcwd()
try:
@ -44,16 +47,30 @@ def generate_html(pathtopdf, logger):
raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'), True)
else:
raise
logger.info(p.stdout.read())
print p.stdout.read()
ret = p.wait()
if ret != 0:
err = p.stderr.read()
raise ConversionError, err
if not os.path.exists(index) or os.stat(index).st_size < 100:
raise ConversionError(os.path.basename(pathtopdf) + _(' does not allow copying of text.'), True)
raw = open(index).read(4000)
if not '<br' in raw:
raw = open(index, 'rb').read()
open(index, 'wb').write('<!-- created by calibre\'s pdftohtml -->\n'+raw)
if not '<br' in raw[:4000]:
raise ConversionError(os.path.basename(pathtopdf) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True)
try:
mi = get_metadata(open(pathtopdf, 'rb'))
except:
mi = MetaInformation(None, None)
if not mi.title:
mi.title = os.path.splitext(os.path.basename(pathtopdf))[0]
if not mi.authors:
mi.authors = [_('Unknown')]
opf = OPFCreator(tdir, mi)
opf.create_manifest([('index.html', None)])
opf.create_spine(['index.html'])
opf.render(open('metadata.opf', 'wb'))
finally:
os.chdir(cwd)
return index
@ -72,7 +89,8 @@ def process_file(path, options, logger=None):
logger = logging.getLogger('pdf2lrf')
setup_cli_handlers(logger, level)
pdf = os.path.abspath(os.path.expanduser(path))
htmlfile = generate_html(pdf, logger)
tdir = PersistentTemporaryDirectory('_pdf2lrf')
htmlfile = generate_html(pdf, tdir)
if not options.output:
ext = '.lrs' if options.lrs else '.lrf'
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)

View File

@ -1,17 +1,20 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, sys, tempfile, shutil, logging, glob
import os, sys, shutil, logging, glob
from lxml import etree
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre import setup_cli_handlers, __appname__
from calibre import setup_cli_handlers
from calibre.libwand import convert, WandException
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
from calibre.ebooks.lrf.rtf.xsl import xhtml
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator
def option_parser():
parser = lrf_option_parser(
@ -44,8 +47,8 @@ def process_file(path, options, logger=None):
f = open(rtf, 'rb')
mi = get_metadata(f, 'rtf')
f.close()
html = generate_html(rtf, logger)
tdir = os.path.dirname(html)
tdir = PersistentTemporaryDirectory('_rtf2lrf')
html = generate_html(rtf, tdir)
cwd = os.getcwdu()
try:
if not options.output:
@ -83,12 +86,12 @@ def main(args=sys.argv, logger=None):
return 0
def generate_xml(rtfpath):
def generate_xml(rtfpath, tdir):
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
tdir = tempfile.mkdtemp(prefix=__appname__+'_')
ofile = os.path.join(tdir, 'index.xml')
cwd = os.getcwdu()
os.chdir(tdir)
rtfpath = os.path.abspath(rtfpath)
try:
parser = ParseRtf(
in_file = rtfpath,
@ -134,26 +137,27 @@ def generate_xml(rtfpath):
return ofile
def generate_html(rtfpath, logger):
logger.info('Converting RTF to XML...')
def generate_html(rtfpath, tdir):
print 'Converting RTF to XML...'
rtfpath = os.path.abspath(rtfpath)
try:
xml = generate_xml(rtfpath)
xml = generate_xml(rtfpath, tdir)
except RtfInvalidCodeException:
raise Exception(_('This RTF file has a feature calibre does not support. Convert it to HTML and then convert it.'))
tdir = os.path.dirname(xml)
cwd = os.getcwdu()
os.chdir(tdir)
try:
logger.info('Parsing XML...')
print 'Parsing XML...'
parser = etree.XMLParser(recover=True, no_network=True)
try:
doc = etree.parse(xml, parser)
except:
raise
logger.info('Parsing failed. Trying to clean up XML...')
print 'Parsing failed. Trying to clean up XML...'
soup = BeautifulStoneSoup(open(xml, 'rb').read())
doc = etree.fromstring(str(soup))
logger.info('Converting XML to HTML...')
print 'Converting XML to HTML...'
styledoc = etree.fromstring(xhtml)
transform = etree.XSLT(styledoc)
@ -161,8 +165,22 @@ def generate_html(rtfpath, logger):
tdir = os.path.dirname(xml)
html = os.path.join(tdir, 'index.html')
f = open(html, 'wb')
f.write(transform.tostring(result))
res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
f.write(res)
f.close()
try:
mi = get_metadata(open(rtfpath, 'rb'))
except:
mi = MetaInformation(None, None)
if not mi.title:
mi.title = os.path.splitext(os.path.basename(rtfpath))[0]
if not mi.authors:
mi.authors = [_('Unknown')]
opf = OPFCreator(tdir, mi)
opf.create_manifest([('index.html', None)])
opf.create_spine(['index.html'])
opf.render(open('metadata.opf', 'wb'))
finally:
os.chdir(cwd)
return html

View File

@ -5,12 +5,14 @@ Convert .txt files to .lrf
"""
import os, sys, codecs, logging
from calibre.ptempfile import PersistentTemporaryFile
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks import ConversionError
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre.ebooks.markdown import markdown
from calibre import setup_cli_handlers
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator
def option_parser():
parser = lrf_option_parser(
@ -23,7 +25,7 @@ _('''%prog [options] mybook.txt
return parser
def generate_html(txtfile, encoding, logger):
def generate_html(txtfile, encoding, tdir):
'''
Convert txtfile to html and return the path to the generated HTML file
(index.html inside tdir).
@ -44,15 +46,19 @@ def generate_html(txtfile, encoding, logger):
else:
txt = codecs.open(txtfile, 'rb', enc).read()
logger.info('Converting text to HTML...')
print 'Converting text to HTML...'
md = markdown.Markdown(
extensions=['footnotes', 'tables', 'toc'],
safe_mode=False,
)
html = md.convert(txt)
p = PersistentTemporaryFile('.html', dir=os.path.dirname(txtfile))
p.close()
codecs.open(p.name, 'wb', 'utf8').write(html)
html = '<html><body>'+md.convert(txt)+'</body></html>'
p = os.path.join(tdir, 'index.html')
open(p, 'wb').write(html.encode('utf-8'))
mi = MetaInformation(os.path.splitext(os.path.basename(txtfile))[0], [_('Unknown')])
opf = OPFCreator(tdir, mi)
opf.create_manifest([(os.path.join(tdir, 'index.html'), None)])
opf.create_spine([os.path.join(tdir, 'index.html')])
opf.render(open(os.path.join(tdir, 'metadata.opf'), 'wb'))
return p
def process_file(path, options, logger=None):
@ -63,7 +69,8 @@ def process_file(path, options, logger=None):
txt = os.path.abspath(os.path.expanduser(path))
if not hasattr(options, 'debug_html_generation'):
options.debug_html_generation = False
htmlfile = generate_html(txt, options.encoding, logger)
tdir = PersistentTemporaryDirectory('_txt2lrf')
htmlfile = generate_html(txt, options.encoding, tdir)
options.encoding = 'utf-8'
if not options.debug_html_generation:
options.force_page_break = 'h2'
@ -73,9 +80,9 @@ def process_file(path, options, logger=None):
options.output = os.path.abspath(os.path.expanduser(options.output))
if not options.title:
options.title = os.path.splitext(os.path.basename(path))[0]
html_process_file(htmlfile.name, options, logger)
html_process_file(htmlfile, options, logger)
else:
print open(htmlfile.name, 'rb').read()
print open(htmlfile, 'rb').read()
def main(args=sys.argv, logger=None):
parser = option_parser()

View File

@ -200,10 +200,10 @@ class MetaInformation(object):
Merge the information in C{mi} into self. In case of conflicts, the information
in C{mi} takes precedence, unless the information in mi is NULL.
'''
if mi.title and mi.title.lower() != 'unknown':
if mi.title and mi.title != _('Unknown'):
self.title = mi.title
if mi.authors and mi.authors[0].lower() != 'unknown':
if mi.authors and mi.authors[0] != _('Unknown'):
self.authors = mi.authors
for attr in ('author_sort', 'title_sort', 'comments', 'category',

View File

@ -12,7 +12,7 @@ try:
except ImportError:
import Image as PILImage
from calibre import __appname__
from calibre import __appname__, entity_to_unicode
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.huffcdic import HuffReader
@ -263,17 +263,19 @@ class MobiReader(object):
if ref.type.lower() == 'toc':
toc = ref.href()
if toc:
index = self.processed_html.find('<a name="%s"'%toc.partition('#')[-1])
index = self.processed_html.find('<a id="%s" name="%s"'%(toc.partition('#')[-1], toc.partition('#')[-1]))
tocobj = None
ent_pat = re.compile(r'&(\S+?);')
if index > -1:
raw = '<html><body>'+self.processed_html[index:]
soup = BeautifulSoup(raw)
tocobj = TOC()
for a in soup.findAll('a', href=True):
try:
text = ''.join(a.findAll(text=True)).strip()
text = u''.join(a.findAll(text=True)).strip()
except:
text = ''
text = ent_pat.sub(entity_to_unicode, text)
tocobj.add_item(toc.partition('#')[0], a['href'][1:], text)
if tocobj is not None:
opf.set_toc(tocobj)
@ -353,7 +355,7 @@ class MobiReader(object):
r = self.mobi_html.find('>', end)
if r > -1 and r < l: # Move out of tag
end = r+1
self.processed_html += self.mobi_html[pos:end] + '<a name="filepos%d"></a>'%oend
self.processed_html += self.mobi_html[pos:end] + '<a id="filepos%d" name="filepos%d"></a>'%(oend, oend)
pos = end
self.processed_html += self.mobi_html[pos:]

View File

@ -43,6 +43,7 @@ entry_points = {
'fb22lrf = calibre.ebooks.lrf.fb2.convert_from:main',
'fb2-meta = calibre.ebooks.metadata.fb2:main',
'any2lrf = calibre.ebooks.lrf.any.convert_from:main',
'any2epub = calibre.ebooks.epub.from_any:main',
'lrf2lrs = calibre.ebooks.lrf.lrfparser:main',
'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main',
'pdfreflow = calibre.ebooks.lrf.pdf.reflow:main',
@ -175,7 +176,9 @@ def setup_completion(fatal_errors):
from calibre.ebooks.epub.from_html import option_parser as html2epub
from calibre.ebooks.html import option_parser as html2oeb
from calibre.ebooks.epub.from_feeds import option_parser as feeds2epub
from calibre.ebooks.epub.from_any import option_parser as any2epub
any_formats = ['epub', 'htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip',
'txt', 'lit', 'rtf', 'pdf', 'prc', 'mobi', 'fb2']
f = open_file('/etc/bash_completion.d/libprs500')
f.close()
os.remove(f.name)
@ -193,9 +196,8 @@ def setup_completion(fatal_errors):
f.write(opts_and_exts('mobi2lrf', htmlop, ['mobi', 'prc']))
f.write(opts_and_exts('fb22lrf', htmlop, ['fb2']))
f.write(opts_and_exts('pdf2lrf', htmlop, ['pdf']))
f.write(opts_and_exts('any2lrf', htmlop,
['epub', 'htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip',
'txt', 'lit', 'rtf', 'pdf', 'prc', 'mobi', 'fb2']))
f.write(opts_and_exts('any2lrf', htmlop, any_formats))
f.write(opts_and_exts('any2lrf', any2epub, any_formats))
f.write(opts_and_exts('lrf2lrs', lrf2lrsop, ['lrf']))
f.write(opts_and_exts('lrf-meta', metaop, ['lrf']))
f.write(opts_and_exts('rtf-meta', metaop, ['rtf']))

View File

@ -178,6 +178,12 @@ class Option(object):
def __eq__(self, other):
return self.name == getattr(other, 'name', other)
def __repr__(self):
return 'Option: '+self.name
def __str__(self):
return repr(self)
class OptionValues(object):
def copy(self):