Implemented any2epub

This commit is contained in:
Kovid Goyal 2008-09-18 21:48:08 -07:00
parent f0d9bded08
commit 896182b201
16 changed files with 332 additions and 69 deletions

View File

@ -317,7 +317,8 @@ def main():
'mechanize', 'ClientForm', 'usbobserver', 'mechanize', 'ClientForm', 'usbobserver',
'genshi', 'calibre.web.feeds.recipes.*', 'genshi', 'calibre.web.feeds.recipes.*',
'calibre.ebooks.lrf.any.*', 'calibre.ebooks.lrf.feeds.*', 'calibre.ebooks.lrf.any.*', 'calibre.ebooks.lrf.feeds.*',
'keyword', 'codeop', 'pydoc', 'readline'], 'keyword', 'codeop', 'pydoc', 'readline',
'BeautifulSoup'],
'packages' : ['PIL', 'Authorization', 'lxml'], 'packages' : ['PIL', 'Authorization', 'lxml'],
'excludes' : ['IPython'], 'excludes' : ['IPython'],
'plist' : { 'CFBundleGetInfoString' : '''calibre, an E-book management application.''' 'plist' : { 'CFBundleGetInfoString' : '''calibre, an E-book management application.'''

View File

@ -152,7 +152,7 @@ def main(args=sys.argv):
'win32process', 'win32api', 'msvcrt', 'win32process', 'win32api', 'msvcrt',
'win32event', 'calibre.ebooks.lrf.any.*', 'win32event', 'calibre.ebooks.lrf.any.*',
'calibre.ebooks.lrf.feeds.*', 'calibre.ebooks.lrf.feeds.*',
'genshi', 'genshi', 'BeautifulSoup',
'path', 'pydoc', 'IPython.Extensions.*', 'path', 'pydoc', 'IPython.Extensions.*',
'calibre.web.feeds.recipes.*', 'calibre.web.feeds.recipes.*',
'PyQt4.QtWebKit', 'PyQt4.QtNetwork', 'PyQt4.QtWebKit', 'PyQt4.QtNetwork',

View File

@ -317,6 +317,11 @@ class LoggingInterface:
def log_exception(self, msg, *args): def log_exception(self, msg, *args):
self.___log(self.__logger.exception, msg, args, {}) self.___log(self.__logger.exception, msg, args, {})
def walk(dir):
    '''
    Recursively yield the path of every file found below `dir`.

    Each yielded path is the containing directory (as reported by os.walk)
    joined with the file name. Directories themselves are not yielded.
    '''
    for dirpath, dirnames, filenames in os.walk(dir):
        for name in filenames:
            yield os.path.join(dirpath, name)
def strftime(fmt, t=time.localtime()): def strftime(fmt, t=time.localtime()):
''' A version of strtime that returns unicode strings. ''' ''' A version of strtime that returns unicode strings. '''

View File

@ -44,6 +44,7 @@ def config(defaults=None):
c.add_opt('output', ['-o', '--output'], default=None, c.add_opt('output', ['-o', '--output'], default=None,
help=_('The output EPUB file. If not specified, it is derived from the input file name.')) help=_('The output EPUB file. If not specified, it is derived from the input file name.'))
structure = c.add_group('structure detection', _('Control auto-detection of document structure.')) structure = c.add_group('structure detection', _('Control auto-detection of document structure.'))
structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]", structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]",
help=_('''\ help=_('''\
@ -74,6 +75,16 @@ to auto-generate a Table of Contents.
toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False, toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False,
help=_("Don't add auto-detected chapters to the Table of Contents.")) help=_("Don't add auto-detected chapters to the Table of Contents."))
layout = c.add_group('page layout', _('Control page layout'))
layout('margin_top', ['--margin-top'], default=5.0,
help=_('Set the top margin in pts. Default is %default'))
layout('margin_bottom', ['--margin-bottom'], default=5.0,
help=_('Set the bottom margin in pts. Default is %default'))
layout('margin_left', ['--margin-left'], default=5.0,
help=_('Set the left margin in pts. Default is %default'))
layout('margin_right', ['--margin-right'], default=5.0,
help=_('Set the right margin in pts. Default is %default'))
c.add_opt('show_opf', ['--show-opf'], default=False, group='debug', c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
help=_('Print generated OPF file to stdout')) help=_('Print generated OPF file to stdout'))
c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug', c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',

View File

@ -0,0 +1,154 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Convert any ebook format to epub.
'''
import sys, os, re
from contextlib import nested
from calibre import extract, walk
from calibre.ebooks.epub import config as common_config
from calibre.ebooks.epub.from_html import convert as html2epub
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator
def lit2opf(path, tdir, opts):
from calibre.ebooks.lit.reader import LitReader
print 'Exploding LIT file:', path
reader = LitReader(path)
reader.extract_content(tdir, False)
for f in walk(tdir):
if f.lower().endswith('.opf'):
return f
def mobi2opf(path, tdir, opts):
from calibre.ebooks.mobi.reader import MobiReader
print 'Exploding MOBI file:', path
reader = MobiReader(path)
reader.extract_content(tdir)
files = list(walk(tdir))
for f in files:
if f.lower().endswith('.opf'):
return f
html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}', re.IGNORECASE)
hf = [f for f in files if html_pat.match(os.path.splitext(f)[1]) is not None]
mi = MetaInformation(os.path.splitext(os.path.basename(path))[0], [_('Unknown')])
opf = OPFCreator(tdir, mi)
opf.create_manifest([(hf[0], None)])
opf.create_spine([hf[0]])
ans = os.path.join(tdir, 'metadata.opf')
opf.render(open(ans, 'wb'))
return ans
def fb22opf(path, tdir, opts):
from calibre.ebooks.lrf.fb2.convert_from import to_html
print 'Converting FB2 to HTML...'
return to_html(path, tdir)
def rtf2opf(path, tdir, opts):
    '''
    Run the RTF to HTML converter on `path` with `tdir` as the working
    directory and return the path `tdir`/metadata.opf. `opts` is not used.
    '''
    from calibre.ebooks.lrf.rtf.convert_from import generate_html as rtf_to_html
    rtf_to_html(path, tdir)
    opf_path = os.path.join(tdir, 'metadata.opf')
    return opf_path
def txt2opf(path, tdir, opts):
    '''
    Run the text to HTML converter on `path`, passing it `opts.encoding` and
    the working directory `tdir`, and return the path `tdir`/metadata.opf.
    '''
    from calibre.ebooks.lrf.txt.convert_from import generate_html as txt_to_html
    txt_to_html(path, opts.encoding, tdir)
    opf_path = os.path.join(tdir, 'metadata.opf')
    return opf_path
def pdf2opf(path, tdir, opts):
    '''
    Run the PDF to HTML converter on `path` with `tdir` as the working
    directory and return the path `tdir`/metadata.opf. `opts` is not used.
    '''
    from calibre.ebooks.lrf.pdf.convert_from import generate_html as pdf_to_html
    pdf_to_html(path, tdir)
    opf_path = os.path.join(tdir, 'metadata.opf')
    return opf_path
# Maps a lower-case file extension (without the leading dot) to the function
# that converts a file of that type. Each function is called as
# func(path, tdir, opts) and its return value is then treated as the path of
# an OPF file. Both 'mobi' and 'prc' are handled by mobi2opf.
MAP = {
'lit' : lit2opf,
'mobi' : mobi2opf,
'prc' : mobi2opf,
'fb2' : fb22opf,
'rtf' : rtf2opf,
'txt' : txt2opf,
'pdf' : pdf2opf,
}
def unarchive(path, tdir):
    '''
    Extract the archive at `path` into the directory `tdir` and locate the
    ebook inside it.

    The search order is:

      1. The first file with an `opf` extension, then the first file whose
         extension is a key of MAP. `txt` and `rtf` files smaller than 2048
         bytes are skipped.
      2. An HTML file whose base name (ignoring case and extension) is `toc`,
         then one whose base name is `index`.
      3. The largest HTML file.

    :return: A tuple `(path to file, lower-case extension without the dot)`.
    :raises ValueError: If no candidate file is found in the archive.
    '''
    extract(path, tdir)
    files = list(walk(tdir))

    for ext in ['opf'] + list(MAP.keys()):
        for f in files:
            if f.lower().endswith('.'+ext):
                if ext in ['txt', 'rtf'] and os.stat(f).st_size < 2048:
                    continue
                return f, ext

    html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE)
    html_files = [f for f in files if html_pat.search(f) is not None]
    if not html_files:
        raise ValueError(_('Could not find an ebook inside the archive'))

    # Smallest first, so that the largest file ends up last
    html_files.sort(key=lambda f: os.stat(f).st_size)

    for q in ('toc', 'index'):
        for f in html_files:
            # walk() yields full paths, so the comparison has to be made
            # against the base name; the full path never equals 'toc'/'index'.
            if os.path.splitext(os.path.basename(f))[0].lower() == q:
                return f, os.path.splitext(f)[1].lower()[1:]

    largest = html_files[-1]
    return largest, os.path.splitext(largest)[1].lower()[1:]
def any2epub(opts, path, notification=None):
ext = os.path.splitext(path)[1]
if not ext:
raise ValueError('Unknown file type: '+path)
ext = ext.lower()[1:]
if opts.output is None:
opts.output = os.path.splitext(os.path.basename(path))[0]+'.epub'
with nested(TemporaryDirectory('_any2epub1'), TemporaryDirectory('_any2epub2')) as (tdir1, tdir2):
if ext in ['rar', 'zip']:
path, ext = unarchive(path, tdir1)
print 'Found %s file in archive'%(ext.upper())
if ext in MAP.keys():
path = MAP[ext](path, tdir2, opts)
ext = 'opf'
if re.match(r'((x){0,1}htm(l){0,1})|opf', ext) is None:
raise ValueError('Conversion from %s is not supported'%ext.upper())
print 'Creating EPUB file...'
html2epub(path, opts, notification=notification)
def config(defaults=None):
    '''
    Return the configuration object for any2epub, which is the common EPUB
    configuration created with the given `defaults`.
    '''
    conf = common_config(defaults=defaults)
    return conf
def formats():
    '''
    Return a list of input file types: html, rar and zip followed by every
    key of MAP.
    '''
    supported = ['html', 'rar', 'zip']
    supported.extend(MAP.keys())
    return supported
def option_parser():
    '''
    Return the command line option parser for any2epub. Its usage message
    lists the supported input formats as a comma separated list.
    '''
    # Join the formats explicitly: interpolating the list itself would show
    # its Python repr (brackets and quotes) in the usage message.
    supported = ', '.join(formats())
    return config().option_parser(usage=_('''\
%%prog [options] filename
Convert any of a large number of ebook formats to an epub file. Supported formats are: %s
''')%supported
    )
def main(args=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(args)
if len(args) < 2:
parser.print_help()
print 'No input file specified.'
return 1
any2epub(opts, args[1])
return 0
# When run as a script, use the return value of main() as the exit status.
if __name__ == '__main__':
    raise SystemExit(main())

View File

@ -29,7 +29,6 @@ def option_parser():
def convert(opts, recipe_arg, notification=None): def convert(opts, recipe_arg, notification=None):
opts.lrf = False opts.lrf = False
opts.epub = True opts.epub = True
opts.chapter_mark = 'none'
if opts.debug: if opts.debug:
opts.verbose = 2 opts.verbose = 2
parser = option_parser() parser = option_parser()
@ -40,6 +39,7 @@ def convert(opts, recipe_arg, notification=None):
recipe_opts = c.parse_string(recipe.html2epub_options) recipe_opts = c.parse_string(recipe.html2epub_options)
c.smart_update(recipe_opts, opts) c.smart_update(recipe_opts, opts)
opts = recipe_opts opts = recipe_opts
opts.chapter_mark = 'none'
opf = glob.glob(os.path.join(tdir, '*.opf')) opf = glob.glob(os.path.join(tdir, '*.opf'))
if not opf: if not opf:
raise Exception('Downloading of recipe: %s failed'%recipe_arg) raise Exception('Downloading of recipe: %s failed'%recipe_arg)

View File

@ -4,7 +4,12 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, sys, re, shutil, cStringIO import os, sys, re, shutil, cStringIO
from lxml.etree import XPath from lxml.etree import XPath
try:
from PIL import Image as PILImage
except ImportError:
import Image as PILImage
from calibre.ebooks.html import Processor, get_text, merge_metadata, get_filelist,\ from calibre.ebooks.html import Processor, get_text, merge_metadata, get_filelist,\
opf_traverse, create_metadata, rebase_toc opf_traverse, create_metadata, rebase_toc
@ -106,8 +111,8 @@ def convert(htmlfile, opts, notification=None):
cover_src = opts.cover cover_src = opts.cover
if cover_src is not None: if cover_src is not None:
cover_dest = os.path.join(tdir, 'content', 'resources', '_cover_'+os.path.splitext(cover_src)[1]) cover_dest = os.path.join(tdir, 'content', 'resources', '_cover_.jpg')
shutil.copyfile(cover_src, cover_dest) PILImage.open(cover_src).convert('RGB').save(cover_dest)
mi.cover = cover_dest mi.cover = cover_dest
resources.append(cover_dest) resources.append(cover_dest)

View File

@ -23,6 +23,7 @@ from calibre.utils.config import Config, StringConfig
from calibre.ebooks.metadata.opf import OPFReader, OPFCreator from calibre.ebooks.metadata.opf import OPFReader, OPFCreator
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
@ -280,7 +281,7 @@ class PreProcessor(object):
return re.search('<H2[^><]*id=BookTitle', raw) is not None return re.search('<H2[^><]*id=BookTitle', raw) is not None
def is_pdftohtml(self, src): def is_pdftohtml(self, src):
return src.startswith('<!-- created by calibre\'s pdftohtml -->') return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
def preprocess(self, html): def preprocess(self, html):
if self.is_baen(html): if self.is_baen(html):
@ -335,6 +336,7 @@ class Parser(PreProcessor, LoggingInterface):
pretty_print=self.opts.pretty_print, pretty_print=self.opts.pretty_print,
include_meta_content_type=True) include_meta_content_type=True)
ans = re.compile(r'<html>', re.IGNORECASE).sub('<html xmlns="http://www.w3.org/1999/xhtml">', ans) ans = re.compile(r'<html>', re.IGNORECASE).sub('<html xmlns="http://www.w3.org/1999/xhtml">', ans)
ans = re.compile(r'<head[^<>]*?>', re.IGNORECASE).sub('<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n', ans)
f.write(ans) f.write(ans)
return f.name return f.name
@ -360,6 +362,8 @@ class Parser(PreProcessor, LoggingInterface):
body = self.root.xpath('//body') body = self.root.xpath('//body')
if body: if body:
self.body = body[0] self.body = body[0]
for a in self.root.xpath('//a[@name]'):
a.set('id', a.get('name'))
def debug_tree(self, name): def debug_tree(self, name):
''' '''
@ -540,15 +544,19 @@ class Processor(Parser):
css.append('#%s { %s }'%(id, setting)) css.append('#%s { %s }'%(id, setting))
for elem in self.root.xpath('//*[@style]'): for elem in self.root.xpath('//*[@style]'):
if 'id' not in elem.keys(): id = get_id(elem, counter)
id = get_id(elem, counter) counter += 1
counter += 1
css.append('#%s {%s}'%(id, elem.get('style'))) css.append('#%s {%s}'%(id, elem.get('style')))
elem.attrib.pop('style') elem.attrib.pop('style')
self.raw_css = '\n\n'.join(css) self.raw_css = '\n\n'.join(css)
self.css = unicode(self.raw_css) self.css = unicode(self.raw_css)
# TODO: Figure out what to do about CSS imports from linked stylesheets self.do_layout()
# TODO: Figure out what to do about CSS imports from linked stylesheets
def do_layout(self):
    '''
    Append page layout rules to `self.css`.

    All four body margins are set to zero and the page margins are set, in
    pts, from `self.opts.margin_top`, `self.opts.margin_bottom`,
    `self.opts.margin_left` and `self.opts.margin_right`.
    '''
    # The property is spelled margin-bottom; a misspelled property is not
    # applied, which would leave the bottom margin unset.
    self.css += '\nbody {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt}\n'
    self.css += '@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}\n'%(
        self.opts.margin_top, self.opts.margin_bottom,
        self.opts.margin_left, self.opts.margin_right)
def config(defaults=None, config_name='html', def config(defaults=None, config_name='html',
desc=_('Options to control the traversal of HTML')): desc=_('Options to control the traversal of HTML')):
@ -575,6 +583,8 @@ def config(defaults=None, config_name='html',
help=_('Set the title. Default is to autodetect.')) help=_('Set the title. Default is to autodetect.'))
metadata('authors', ['-a', '--authors'], default=_('Unknown'), metadata('authors', ['-a', '--authors'], default=_('Unknown'),
help=_('The author(s) of the ebook, as a comma separated list.')) help=_('The author(s) of the ebook, as a comma separated list.'))
metadata('from_opf', ['--metadata-from'], default=None,
help=_('Load metadata from the specified OPF file'))
debug = c.add_group('debug', _('Options useful for debugging')) debug = c.add_group('debug', _('Options useful for debugging'))
debug('verbose', ['-v', '--verbose'], default=0, action='count', debug('verbose', ['-v', '--verbose'], default=0, action='count',
@ -648,7 +658,12 @@ def merge_metadata(htmlfile, opf, opts):
if opf: if opf:
mi = MetaInformation(opf) mi = MetaInformation(opf)
else: else:
mi = get_metadata(open(htmlfile, 'rb'), 'html') try:
mi = get_metadata(open(htmlfile, 'rb'), 'html')
except:
mi = MetaInformation(None, None)
if opts.from_opf is not None and os.access(opts.from_opf, os.R_OK):
mi.smart_update(OPF(open(opts.from_opf, 'rb'), os.path.abspath(os.path.dirname(opts.from_opf))))
if opts.title: if opts.title:
mi.title = opts.title mi.title = opts.title
if opts.authors != _('Unknown'): if opts.authors != _('Unknown'):

View File

@ -1,16 +1,22 @@
from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>' __copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
""" """
Convert .fb2 files to .lrf Convert .fb2 files to .lrf
""" """
import os, sys, tempfile, shutil, logging import os, sys, shutil, logging
from base64 import b64decode from base64 import b64decode
from lxml import etree
from calibre.ebooks.lrf import option_parser as lrf_option_parser from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre import setup_cli_handlers, __appname__ from calibre import setup_cli_handlers
from calibre.resources import fb2_xsl from calibre.resources import fb2_xsl
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.metadata.opf import OPFCreator
from calibre.ebooks.metadata import MetaInformation
def option_parser(): def option_parser():
parser = lrf_option_parser( parser = lrf_option_parser(
@ -31,29 +37,42 @@ def extract_embedded_content(doc):
data = b64decode(elem.text.strip()) data = b64decode(elem.text.strip())
open(fname, 'wb').write(data) open(fname, 'wb').write(data)
def generate_html(fb2file, encoding, logger): def to_html(fb2file, tdir):
from lxml import etree cwd = os.getcwd()
tdir = tempfile.mkdtemp(prefix=__appname__+'_fb2_')
cwd = os.getcwdu()
os.chdir(tdir)
try: try:
logger.info('Parsing XML...') os.chdir(tdir)
print 'Parsing XML...'
parser = etree.XMLParser(recover=True, no_network=True) parser = etree.XMLParser(recover=True, no_network=True)
doc = etree.parse(fb2file, parser) doc = etree.parse(fb2file, parser)
extract_embedded_content(doc) extract_embedded_content(doc)
logger.info('Converting XML to HTML...') print 'Converting XML to HTML...'
styledoc = etree.fromstring(fb2_xsl) styledoc = etree.fromstring(fb2_xsl)
transform = etree.XSLT(styledoc) transform = etree.XSLT(styledoc)
result = transform(doc) result = transform(doc)
html = os.path.join(tdir, 'index.html') open('index.html', 'wb').write(transform.tostring(result))
f = open(html, 'wb') try:
f.write(transform.tostring(result)) mi = get_metadata(open(fb2file, 'rb'))
f.close() except:
mi = MetaInformation(None, None)
if not mi.title:
mi.title = os.path.splitext(os.path.basename(fb2file))[0]
if not mi.authors:
mi.authors = [_('Unknown')]
opf = OPFCreator(tdir, mi)
opf.create_manifest([('index.html', None)])
opf.create_spine(['index.html'])
opf.render(open('metadata.opf', 'wb'))
return os.path.join(tdir, 'metadata.opf')
finally: finally:
os.chdir(cwd) os.chdir(cwd)
return html
def generate_html(fb2file, encoding, logger):
    '''
    Convert `fb2file` with to_html inside a new persistent temporary
    directory and return the path `index.html` in that directory.

    `encoding` and `logger` are accepted but not used.
    '''
    workdir = PersistentTemporaryDirectory('_fb22lrf')
    to_html(fb2file, workdir)
    index = os.path.join(workdir, 'index.html')
    return index
def process_file(path, options, logger=None): def process_file(path, options, logger=None):
if logger is None: if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO level = logging.DEBUG if options.verbose else logging.INFO

View File

@ -9,6 +9,9 @@ from calibre.ebooks import ConversionError
from calibre.ptempfile import PersistentTemporaryDirectory from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.lrf import option_parser as lrf_option_parser from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator
from calibre.ebooks.metadata.pdf import get_metadata
PDFTOHTML = 'pdftohtml' PDFTOHTML = 'pdftohtml'
popen = subprocess.Popen popen = subprocess.Popen
@ -20,7 +23,7 @@ if iswindows and hasattr(sys, 'frozen'):
if islinux and getattr(sys, 'frozen_path', False): if islinux and getattr(sys, 'frozen_path', False):
PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml') PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml')
def generate_html(pathtopdf, logger): def generate_html(pathtopdf, tdir):
''' '''
Convert the pdf into html. Convert the pdf into html.
@return: Path to a temporary file containing the HTML. @return: Path to a temporary file containing the HTML.
@ -29,10 +32,10 @@ def generate_html(pathtopdf, logger):
pathtopdf = pathtopdf.encode(sys.getfilesystemencoding()) pathtopdf = pathtopdf.encode(sys.getfilesystemencoding())
if not os.access(pathtopdf, os.R_OK): if not os.access(pathtopdf, os.R_OK):
raise ConversionError, 'Cannot read from ' + pathtopdf raise ConversionError, 'Cannot read from ' + pathtopdf
tdir = PersistentTemporaryDirectory('pdftohtml')
index = os.path.join(tdir, 'index.html') index = os.path.join(tdir, 'index.html')
# This is neccessary as pdftohtml doesn't always (linux) respect absolute paths # This is neccessary as pdftohtml doesn't always (linux) respect absolute paths
cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', pathtopdf, os.path.basename(index)) pathtopdf = os.path.abspath(pathtopdf)
cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', pathtopdf, os.path.basename(index))
cwd = os.getcwd() cwd = os.getcwd()
try: try:
@ -44,16 +47,30 @@ def generate_html(pathtopdf, logger):
raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'), True) raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'), True)
else: else:
raise raise
logger.info(p.stdout.read()) print p.stdout.read()
ret = p.wait() ret = p.wait()
if ret != 0: if ret != 0:
err = p.stderr.read() err = p.stderr.read()
raise ConversionError, err raise ConversionError, err
if not os.path.exists(index) or os.stat(index).st_size < 100: if not os.path.exists(index) or os.stat(index).st_size < 100:
raise ConversionError(os.path.basename(pathtopdf) + _(' does not allow copying of text.'), True) raise ConversionError(os.path.basename(pathtopdf) + _(' does not allow copying of text.'), True)
raw = open(index).read(4000)
if not '<br' in raw: raw = open(index, 'rb').read()
open(index, 'wb').write('<!-- created by calibre\'s pdftohtml -->\n'+raw)
if not '<br' in raw[:4000]:
raise ConversionError(os.path.basename(pathtopdf) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True) raise ConversionError(os.path.basename(pathtopdf) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True)
try:
mi = get_metadata(open(pathtopdf, 'rb'))
except:
mi = MetaInformation(None, None)
if not mi.title:
mi.title = os.path.splitext(os.path.basename(pathtopdf))[0]
if not mi.authors:
mi.authors = [_('Unknown')]
opf = OPFCreator(tdir, mi)
opf.create_manifest([('index.html', None)])
opf.create_spine(['index.html'])
opf.render(open('metadata.opf', 'wb'))
finally: finally:
os.chdir(cwd) os.chdir(cwd)
return index return index
@ -72,7 +89,8 @@ def process_file(path, options, logger=None):
logger = logging.getLogger('pdf2lrf') logger = logging.getLogger('pdf2lrf')
setup_cli_handlers(logger, level) setup_cli_handlers(logger, level)
pdf = os.path.abspath(os.path.expanduser(path)) pdf = os.path.abspath(os.path.expanduser(path))
htmlfile = generate_html(pdf, logger) tdir = PersistentTemporaryDirectory('_pdf2lrf')
htmlfile = generate_html(pdf, tdir)
if not options.output: if not options.output:
ext = '.lrs' if options.lrs else '.lrf' ext = '.lrs' if options.lrs else '.lrf'
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)

View File

@ -1,17 +1,20 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, sys, tempfile, shutil, logging, glob import os, sys, shutil, logging, glob
from lxml import etree from lxml import etree
from calibre.ebooks.lrf import option_parser as lrf_option_parser from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre import setup_cli_handlers, __appname__ from calibre import setup_cli_handlers
from calibre.libwand import convert, WandException from calibre.libwand import convert, WandException
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
from calibre.ebooks.lrf.rtf.xsl import xhtml from calibre.ebooks.lrf.rtf.xsl import xhtml
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator
def option_parser(): def option_parser():
parser = lrf_option_parser( parser = lrf_option_parser(
@ -44,8 +47,8 @@ def process_file(path, options, logger=None):
f = open(rtf, 'rb') f = open(rtf, 'rb')
mi = get_metadata(f, 'rtf') mi = get_metadata(f, 'rtf')
f.close() f.close()
html = generate_html(rtf, logger) tdir = PersistentTemporaryDirectory('_rtf2lrf')
tdir = os.path.dirname(html) html = generate_html(rtf, tdir)
cwd = os.getcwdu() cwd = os.getcwdu()
try: try:
if not options.output: if not options.output:
@ -83,12 +86,12 @@ def main(args=sys.argv, logger=None):
return 0 return 0
def generate_xml(rtfpath): def generate_xml(rtfpath, tdir):
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
tdir = tempfile.mkdtemp(prefix=__appname__+'_')
ofile = os.path.join(tdir, 'index.xml') ofile = os.path.join(tdir, 'index.xml')
cwd = os.getcwdu() cwd = os.getcwdu()
os.chdir(tdir) os.chdir(tdir)
rtfpath = os.path.abspath(rtfpath)
try: try:
parser = ParseRtf( parser = ParseRtf(
in_file = rtfpath, in_file = rtfpath,
@ -134,26 +137,27 @@ def generate_xml(rtfpath):
return ofile return ofile
def generate_html(rtfpath, logger): def generate_html(rtfpath, tdir):
logger.info('Converting RTF to XML...') print 'Converting RTF to XML...'
rtfpath = os.path.abspath(rtfpath)
try: try:
xml = generate_xml(rtfpath) xml = generate_xml(rtfpath, tdir)
except RtfInvalidCodeException: except RtfInvalidCodeException:
raise Exception(_('This RTF file has a feature calibre does not support. Convert it to HTML and then convert it.')) raise Exception(_('This RTF file has a feature calibre does not support. Convert it to HTML and then convert it.'))
tdir = os.path.dirname(xml) tdir = os.path.dirname(xml)
cwd = os.getcwdu() cwd = os.getcwdu()
os.chdir(tdir) os.chdir(tdir)
try: try:
logger.info('Parsing XML...') print 'Parsing XML...'
parser = etree.XMLParser(recover=True, no_network=True) parser = etree.XMLParser(recover=True, no_network=True)
try: try:
doc = etree.parse(xml, parser) doc = etree.parse(xml, parser)
except: except:
raise raise
logger.info('Parsing failed. Trying to clean up XML...') print 'Parsing failed. Trying to clean up XML...'
soup = BeautifulStoneSoup(open(xml, 'rb').read()) soup = BeautifulStoneSoup(open(xml, 'rb').read())
doc = etree.fromstring(str(soup)) doc = etree.fromstring(str(soup))
logger.info('Converting XML to HTML...') print 'Converting XML to HTML...'
styledoc = etree.fromstring(xhtml) styledoc = etree.fromstring(xhtml)
transform = etree.XSLT(styledoc) transform = etree.XSLT(styledoc)
@ -161,8 +165,22 @@ def generate_html(rtfpath, logger):
tdir = os.path.dirname(xml) tdir = os.path.dirname(xml)
html = os.path.join(tdir, 'index.html') html = os.path.join(tdir, 'index.html')
f = open(html, 'wb') f = open(html, 'wb')
f.write(transform.tostring(result)) res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
f.write(res)
f.close() f.close()
try:
mi = get_metadata(open(rtfpath, 'rb'))
except:
mi = MetaInformation(None, None)
if not mi.title:
mi.title = os.path.splitext(os.path.basename(rtfpath))[0]
if not mi.authors:
mi.authors = [_('Unknown')]
opf = OPFCreator(tdir, mi)
opf.create_manifest([('index.html', None)])
opf.create_spine(['index.html'])
opf.render(open('metadata.opf', 'wb'))
finally: finally:
os.chdir(cwd) os.chdir(cwd)
return html return html

View File

@ -5,12 +5,14 @@ Convert .txt files to .lrf
""" """
import os, sys, codecs, logging import os, sys, codecs, logging
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.lrf import option_parser as lrf_option_parser from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks import ConversionError from calibre.ebooks import ConversionError
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre.ebooks.markdown import markdown from calibre.ebooks.markdown import markdown
from calibre import setup_cli_handlers from calibre import setup_cli_handlers
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator
def option_parser(): def option_parser():
parser = lrf_option_parser( parser = lrf_option_parser(
@ -23,7 +25,7 @@ _('''%prog [options] mybook.txt
return parser return parser
def generate_html(txtfile, encoding, logger): def generate_html(txtfile, encoding, tdir):
''' '''
Convert txtfile to html and return a PersistentTemporaryFile object pointing Convert txtfile to html and return a PersistentTemporaryFile object pointing
to the file with the HTML. to the file with the HTML.
@ -44,15 +46,19 @@ def generate_html(txtfile, encoding, logger):
else: else:
txt = codecs.open(txtfile, 'rb', enc).read() txt = codecs.open(txtfile, 'rb', enc).read()
logger.info('Converting text to HTML...') print 'Converting text to HTML...'
md = markdown.Markdown( md = markdown.Markdown(
extensions=['footnotes', 'tables', 'toc'], extensions=['footnotes', 'tables', 'toc'],
safe_mode=False, safe_mode=False,
) )
html = md.convert(txt) html = '<html><body>'+md.convert(txt)+'</body></html>'
p = PersistentTemporaryFile('.html', dir=os.path.dirname(txtfile)) p = os.path.join(tdir, 'index.html')
p.close() open(p, 'wb').write(html.encode('utf-8'))
codecs.open(p.name, 'wb', 'utf8').write(html) mi = MetaInformation(os.path.splitext(os.path.basename(txtfile))[0], [_('Unknown')])
opf = OPFCreator(tdir, mi)
opf.create_manifest([(os.path.join(tdir, 'index.html'), None)])
opf.create_spine([os.path.join(tdir, 'index.html')])
opf.render(open(os.path.join(tdir, 'metadata.opf'), 'wb'))
return p return p
def process_file(path, options, logger=None): def process_file(path, options, logger=None):
@ -63,7 +69,8 @@ def process_file(path, options, logger=None):
txt = os.path.abspath(os.path.expanduser(path)) txt = os.path.abspath(os.path.expanduser(path))
if not hasattr(options, 'debug_html_generation'): if not hasattr(options, 'debug_html_generation'):
options.debug_html_generation = False options.debug_html_generation = False
htmlfile = generate_html(txt, options.encoding, logger) tdir = PersistentTemporaryDirectory('_txt2lrf')
htmlfile = generate_html(txt, options.encoding, tdir)
options.encoding = 'utf-8' options.encoding = 'utf-8'
if not options.debug_html_generation: if not options.debug_html_generation:
options.force_page_break = 'h2' options.force_page_break = 'h2'
@ -73,9 +80,9 @@ def process_file(path, options, logger=None):
options.output = os.path.abspath(os.path.expanduser(options.output)) options.output = os.path.abspath(os.path.expanduser(options.output))
if not options.title: if not options.title:
options.title = os.path.splitext(os.path.basename(path))[0] options.title = os.path.splitext(os.path.basename(path))[0]
html_process_file(htmlfile.name, options, logger) html_process_file(htmlfile, options, logger)
else: else:
print open(htmlfile.name, 'rb').read() print open(htmlfile, 'rb').read()
def main(args=sys.argv, logger=None): def main(args=sys.argv, logger=None):
parser = option_parser() parser = option_parser()

View File

@ -200,10 +200,10 @@ class MetaInformation(object):
Merge the information in C{mi} into self. In case of conflicts, the information Merge the information in C{mi} into self. In case of conflicts, the information
in C{mi} takes precedence, unless the information in mi is NULL. in C{mi} takes precedence, unless the information in mi is NULL.
''' '''
if mi.title and mi.title.lower() != 'unknown': if mi.title and mi.title != _('Unknown'):
self.title = mi.title self.title = mi.title
if mi.authors and mi.authors[0].lower() != 'unknown': if mi.authors and mi.authors[0] != _('Unknown'):
self.authors = mi.authors self.authors = mi.authors
for attr in ('author_sort', 'title_sort', 'comments', 'category', for attr in ('author_sort', 'title_sort', 'comments', 'category',

View File

@ -12,7 +12,7 @@ try:
except ImportError: except ImportError:
import Image as PILImage import Image as PILImage
from calibre import __appname__ from calibre import __appname__, entity_to_unicode
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.huffcdic import HuffReader from calibre.ebooks.mobi.huffcdic import HuffReader
@ -263,17 +263,19 @@ class MobiReader(object):
if ref.type.lower() == 'toc': if ref.type.lower() == 'toc':
toc = ref.href() toc = ref.href()
if toc: if toc:
index = self.processed_html.find('<a name="%s"'%toc.partition('#')[-1]) index = self.processed_html.find('<a id="%s" name="%s"'%(toc.partition('#')[-1], toc.partition('#')[-1]))
tocobj = None tocobj = None
ent_pat = re.compile(r'&(\S+?);')
if index > -1: if index > -1:
raw = '<html><body>'+self.processed_html[index:] raw = '<html><body>'+self.processed_html[index:]
soup = BeautifulSoup(raw) soup = BeautifulSoup(raw)
tocobj = TOC() tocobj = TOC()
for a in soup.findAll('a', href=True): for a in soup.findAll('a', href=True):
try: try:
text = ''.join(a.findAll(text=True)).strip() text = u''.join(a.findAll(text=True)).strip()
except: except:
text = '' text = ''
text = ent_pat.sub(entity_to_unicode, text)
tocobj.add_item(toc.partition('#')[0], a['href'][1:], text) tocobj.add_item(toc.partition('#')[0], a['href'][1:], text)
if tocobj is not None: if tocobj is not None:
opf.set_toc(tocobj) opf.set_toc(tocobj)
@ -353,7 +355,7 @@ class MobiReader(object):
r = self.mobi_html.find('>', end) r = self.mobi_html.find('>', end)
if r > -1 and r < l: # Move out of tag if r > -1 and r < l: # Move out of tag
end = r+1 end = r+1
self.processed_html += self.mobi_html[pos:end] + '<a name="filepos%d"></a>'%oend self.processed_html += self.mobi_html[pos:end] + '<a id="filepos%d" name="filepos%d"></a>'%(oend, oend)
pos = end pos = end
self.processed_html += self.mobi_html[pos:] self.processed_html += self.mobi_html[pos:]

View File

@ -43,6 +43,7 @@ entry_points = {
'fb22lrf = calibre.ebooks.lrf.fb2.convert_from:main', 'fb22lrf = calibre.ebooks.lrf.fb2.convert_from:main',
'fb2-meta = calibre.ebooks.metadata.fb2:main', 'fb2-meta = calibre.ebooks.metadata.fb2:main',
'any2lrf = calibre.ebooks.lrf.any.convert_from:main', 'any2lrf = calibre.ebooks.lrf.any.convert_from:main',
'any2epub = calibre.ebooks.epub.from_any:main',
'lrf2lrs = calibre.ebooks.lrf.lrfparser:main', 'lrf2lrs = calibre.ebooks.lrf.lrfparser:main',
'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main', 'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main',
'pdfreflow = calibre.ebooks.lrf.pdf.reflow:main', 'pdfreflow = calibre.ebooks.lrf.pdf.reflow:main',
@ -174,8 +175,10 @@ def setup_completion(fatal_errors):
from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
from calibre.ebooks.epub.from_html import option_parser as html2epub from calibre.ebooks.epub.from_html import option_parser as html2epub
from calibre.ebooks.html import option_parser as html2oeb from calibre.ebooks.html import option_parser as html2oeb
from calibre.ebooks.epub.from_feeds import option_parser as feeds2epub from calibre.ebooks.epub.from_feeds import option_parser as feeds2epub
from calibre.ebooks.epub.from_any import option_parser as any2epub
any_formats = ['epub', 'htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip',
'txt', 'lit', 'rtf', 'pdf', 'prc', 'mobi', 'fb2']
f = open_file('/etc/bash_completion.d/libprs500') f = open_file('/etc/bash_completion.d/libprs500')
f.close() f.close()
os.remove(f.name) os.remove(f.name)
@ -193,9 +196,8 @@ def setup_completion(fatal_errors):
f.write(opts_and_exts('mobi2lrf', htmlop, ['mobi', 'prc'])) f.write(opts_and_exts('mobi2lrf', htmlop, ['mobi', 'prc']))
f.write(opts_and_exts('fb22lrf', htmlop, ['fb2'])) f.write(opts_and_exts('fb22lrf', htmlop, ['fb2']))
f.write(opts_and_exts('pdf2lrf', htmlop, ['pdf'])) f.write(opts_and_exts('pdf2lrf', htmlop, ['pdf']))
f.write(opts_and_exts('any2lrf', htmlop, f.write(opts_and_exts('any2lrf', htmlop, any_formats))
['epub', 'htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip', f.write(opts_and_exts('any2lrf', any2epub, any_formats))
'txt', 'lit', 'rtf', 'pdf', 'prc', 'mobi', 'fb2']))
f.write(opts_and_exts('lrf2lrs', lrf2lrsop, ['lrf'])) f.write(opts_and_exts('lrf2lrs', lrf2lrsop, ['lrf']))
f.write(opts_and_exts('lrf-meta', metaop, ['lrf'])) f.write(opts_and_exts('lrf-meta', metaop, ['lrf']))
f.write(opts_and_exts('rtf-meta', metaop, ['rtf'])) f.write(opts_and_exts('rtf-meta', metaop, ['rtf']))

View File

@ -177,6 +177,12 @@ class Option(object):
def __eq__(self, other): def __eq__(self, other):
return self.name == getattr(other, 'name', other) return self.name == getattr(other, 'name', other)
def __repr__(self):
return 'Option: '+self.name
def __str__(self):
return repr(self)
class OptionValues(object): class OptionValues(object):