Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Ported structure detection code and added plugin for FB2 input.
parent 02cfaac014
commit 1770f7bf74
@@ -281,6 +281,7 @@ from calibre.ebooks.mobi.input import MOBIInput
from calibre.ebooks.pdf.input import PDFInput
from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.lit.input import LITInput
from calibre.ebooks.fb2.input import FB2Input
from calibre.ebooks.html.input import HTMLInput
from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.txt.output import TXTOutput
@@ -288,7 +289,8 @@ from calibre.ebooks.pdf.output import PDFOutput
from calibre.customize.profiles import input_profiles, output_profiles

plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
           TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput]
           TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput,
           FB2Input]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
        x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
@@ -119,6 +119,24 @@ def add_pipeline_options(parser, plumber):
                  ]
                  ),

              'STRUCTURE DETECTION' : (
                  _('Control auto-detection of document structure.'),
                  [
                      'dont_split_on_page_breaks', 'chapter', 'chapter_mark',
                  ]
                  ),

              'TABLE OF CONTENTS' : (
                  _('Control the automatic generation of a Table of Contents. By '
                    'default, if the source file has a Table of Contents, it will '
                    'be used in preference to the automatically generated one.'),
                  [
                      'level1_toc', 'level2_toc', 'level3_toc',
                      'toc_threshold', 'max_toc_links', 'no_chapters_in_toc',
                      'use_auto_toc',
                  ]
                  ),

              'METADATA' : (_('Options to set metadata in the output'),
                  plumber.metadata_option_names,
                  ),
@@ -130,7 +148,8 @@ def add_pipeline_options(parser, plumber):

              }

    group_order = ['', 'LOOK AND FEEL', 'METADATA', 'DEBUG']
    group_order = ['', 'LOOK AND FEEL', 'STRUCTURE DETECTION',
                   'TABLE OF CONTENTS', 'METADATA', 'DEBUG']

    for group in group_order:
        desc, options = groups[group]
@@ -163,6 +182,10 @@ def main(args=sys.argv):
    add_pipeline_options(parser, plumber)

    opts = parser.parse_args(args)[0]
    y = lambda q : os.path.abspath(os.path.expanduser(q))
    for x in ('read_metadata_from_opf', 'cover'):
        if getattr(opts, x, None) is not None:
            setattr(opts, x, y(getattr(opts, x)))
    recommendations = [(n.dest, getattr(opts, n.dest),
                        OptionRecommendation.HIGH) \
                            for n in parser.options_iter()
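Note on the grouping above: the groups dict maps a group title to a description and a list of option names, and group_order controls the order in which the groups are attached to the parser. A minimal standalone sketch of the same pattern using only the standard-library optparse module (the group title, description and option names mirror this hunk; the wiring itself is illustrative, not calibre's add_pipeline_options):

# Illustrative only: a named option group in plain optparse.
from optparse import OptionParser, OptionGroup

parser = OptionParser()
group = OptionGroup(parser, 'STRUCTURE DETECTION',
        'Control auto-detection of document structure.')
group.add_option('--chapter', dest='chapter')
group.add_option('--chapter-mark', dest='chapter_mark', default='pagebreak',
        choices=['pagebreak', 'rule', 'both', 'none'])
parser.add_option_group(group)

opts, args = parser.parse_args(['--chapter-mark', 'rule'])
print opts.chapter_mark   # -> rule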
@@ -121,6 +121,88 @@ OptionRecommendation(name='dont_split_on_page_breaks',
        )
    ),

OptionRecommendation(name='level1_toc',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('XPath expression that specifies all tags that '
            'should be added to the Table of Contents at level one. If '
            'this is specified, it takes precedence over other forms '
            'of auto-detection.'
        )
    ),

OptionRecommendation(name='level2_toc',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('XPath expression that specifies all tags that should be '
            'added to the Table of Contents at level two. Each entry is added '
            'under the previous level one entry.'
        )
    ),

OptionRecommendation(name='level3_toc',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('XPath expression that specifies all tags that should be '
            'added to the Table of Contents at level three. Each entry '
            'is added under the previous level two entry.'
        )
    ),

OptionRecommendation(name='use_auto_toc',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Normally, if the source file already has a Table of '
            'Contents, it is used in preference to the auto-generated one. '
            'With this option, the auto-generated one is always used.'
        )
    ),

OptionRecommendation(name='no_chapters_in_toc',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_("Don't add auto-detected chapters to the Table of "
            'Contents.'
        )
    ),

OptionRecommendation(name='toc_threshold',
        recommended_value=6, level=OptionRecommendation.LOW,
        help=_(
            'If fewer than this number of chapters is detected, then links '
            'are added to the Table of Contents. Default: %default')
    ),

OptionRecommendation(name='max_toc_links',
        recommended_value=50, level=OptionRecommendation.LOW,
        help=_('Maximum number of links to insert into the TOC. Set to 0 '
            'to disable. Default is: %default. Links are only added to the '
            'TOC if less than the threshold number of chapters were detected.'
        )
    ),

OptionRecommendation(name='chapter',
        recommended_value="//*[((name()='h1' or name()='h2') and "
            "re:test(., 'chapter|book|section|part', 'i')) or @class "
            "= 'chapter']", level=OptionRecommendation.LOW,
        help=_('An XPath expression to detect chapter titles. The default '
            'is to consider <h1> or <h2> tags that contain the words '
            '"chapter","book","section" or "part" as chapter titles as '
            'well as any tags that have class="chapter". The expression '
            'used must evaluate to a list of elements. To disable chapter '
            'detection, use the expression "/". See the XPath Tutorial '
            'in the calibre User Manual for further help on using this '
            'feature.'
        )
    ),

OptionRecommendation(name='chapter_mark',
        recommended_value='pagebreak', level=OptionRecommendation.LOW,
        choices=['pagebreak', 'rule', 'both', 'none'],
        help=_('Specify how to mark detected chapters. A value of '
            '"pagebreak" will insert page breaks before chapters. '
            'A value of "rule" will insert a line before chapters. '
            'A value of "none" will disable chapter marking and a '
            'value of "both" will use both page breaks and lines '
            'to mark chapters.')
    ),


OptionRecommendation(name='read_metadata_from_opf',
        recommended_value=None, level=OptionRecommendation.LOW,
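Note on the default value of the 'chapter' recommendation above: re:test() comes from the EXSLT regular-expressions extension, which lxml supports once the corresponding namespace is registered (the same namespace this commit adds to XPNSMAP as RE_NS further down). A self-contained sketch of evaluating that default expression with plain lxml:

# Standalone illustration: run the default 'chapter' XPath against a small
# HTML fragment; only the EXSLT regular-expressions namespace is needed.
from lxml import etree

EXPR = ("//*[((name()='h1' or name()='h2') and "
        "re:test(., 'chapter|book|section|part', 'i')) or @class "
        "= 'chapter']")
RE_NS = 'http://exslt.org/regular-expressions'

root = etree.HTML('''
    <body>
      <h1>Chapter One</h1>
      <h2>Epilogue</h2>
      <div class="chapter">Afterword</div>
    </body>''')
detect = etree.XPath(EXPR, namespaces={'re': RE_NS})
for elem in detect(root):
    print elem.tag   # h1, div -- the h2 does not match the word list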
@@ -130,6 +212,7 @@ OptionRecommendation(name='read_metadata_from_opf',
        'file.')
    ),

OptionRecommendation(name='title',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Set the title.')),
@@ -237,6 +320,7 @@ OptionRecommendation(name='language',
            rec = self.get_option_by_name(name)
            if rec is not None and rec.level <= level:
                rec.recommended_value = val
                rec.level = level

    def merge_ui_recommendations(self, recommendations):
        '''
@@ -248,6 +332,7 @@ OptionRecommendation(name='language',
            rec = self.get_option_by_name(name)
            if rec is not None and rec.level <= level and rec.level < rec.HIGH:
                rec.recommended_value = val
                rec.level = level

    def read_user_metadata(self):
        '''
@@ -332,6 +417,9 @@ OptionRecommendation(name='language',
        self.opts.source = self.opts.input_profile
        self.opts.dest = self.opts.output_profile

        from calibre.ebooks.oeb.transforms.structure import DetectStructure
        DetectStructure()(self.oeb, self.opts)

        from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener
        fbase = self.opts.base_font_size
        if fbase == 0:
@@ -364,6 +452,8 @@ OptionRecommendation(name='language',
        trimmer = ManifestTrimmer()
        trimmer(self.oeb, self.opts)

        self.oeb.toc.rationalize_play_orders()

        self.log.info('Creating %s...'%self.output_plugin.name)
        self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
                self.opts, self.log)
@@ -384,4 +474,3 @@ def create_oebbook(log, path_or_stream, opts, reader=None):

    reader()(oeb, path_or_stream)
    return oeb
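Note: as the hunk above shows, pipeline transforms such as DetectStructure, CSSFlattener and ManifestTrimmer are invoked as plain callables taking the OEB book and the merged options. A minimal sketch of that convention (the class below is illustrative only, not calibre code):

# Illustrative transform skeleton: __call__ receives the OEB book and the
# merged options and mutates the book in place.
class ExampleTransform(object):

    def __call__(self, oeb, opts):
        oeb.log('Running example transform...')
        for item in oeb.spine:
            # inspect or rewrite item.data (an lxml tree) here
            pass

# invoked the same way as DetectStructure in the hunk above:
#   ExampleTransform()(self.oeb, self.opts)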
@@ -15,88 +15,15 @@ from calibre.ebooks import DRMError
from calibre.ebooks.epub import config as common_config
from calibre.ebooks.epub.from_html import convert as html2epub, find_html_index
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
from calibre.utils.zipfile import ZipFile
from calibre.customize.ui import run_plugins_on_preprocess

def lit2opf(path, tdir, opts):
    from calibre.ebooks.lit.reader import LitReader
    print 'Exploding LIT file:', path
    reader = LitReader(path)
    reader.extract_content(tdir, False)
    opf = None
    for opf in walk(tdir):
        if opf.lower().endswith('.opf'):
            break
    if not opf.endswith('.opf'):
        opf = None
    if opf is not None: # Check for url-quoted filenames
        _opf = OPF(opf, os.path.dirname(opf))
        replacements = []
        for item in _opf.itermanifest():
            href = item.get('href', '')
            path = os.path.join(os.path.dirname(opf), *(href.split('/')))
            if not os.path.exists(path) and os.path.exists(path.replace('&', '%26')):
                npath = path
                path = path.replace('&', '%26')
                replacements.append((path, npath))
        if replacements:
            print 'Fixing quoted filenames...'
            for path, npath in replacements:
                if os.path.exists(path):
                    os.rename(path, npath)
            for f in walk(tdir):
                with open(f, 'r+b') as f:
                    raw = f.read()
                    for path, npath in replacements:
                        raw = raw.replace(os.path.basename(path), os.path.basename(npath))
                    f.seek(0)
                    f.truncate()
                    f.write(raw)
    return opf

def mobi2opf(path, tdir, opts):
    from calibre.ebooks.mobi.reader import MobiReader
    print 'Exploding MOBI file:', path.encode('utf-8') if isinstance(path, unicode) else path
    reader = MobiReader(path)
    reader.extract_content(tdir)
    files = list(walk(tdir))
    opts.encoding = 'utf-8'
    for f in files:
        if f.lower().endswith('.opf'):
            return f
    html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}', re.IGNORECASE)
    hf = [f for f in files if html_pat.match(os.path.splitext(f)[1]) is not None]
    mi = MetaInformation(os.path.splitext(os.path.basename(path))[0], [_('Unknown')])
    opf = OPFCreator(tdir, mi)
    opf.create_manifest([(hf[0], None)])
    opf.create_spine([hf[0]])
    ans = os.path.join(tdir, 'metadata.opf')
    opf.render(open(ans, 'wb'))
    return ans

def fb22opf(path, tdir, opts):
    from calibre.ebooks.lrf.fb2.convert_from import to_html
    print 'Converting FB2 to HTML...'
    return to_html(path, tdir)

def rtf2opf(path, tdir, opts):
    from calibre.ebooks.lrf.rtf.convert_from import generate_html
    generate_html(path, tdir)
    return os.path.join(tdir, 'metadata.opf')

def txt2opf(path, tdir, opts):
    from calibre.ebooks.lrf.txt.convert_from import generate_html
    generate_html(path, opts.encoding, tdir)
    return os.path.join(tdir, 'metadata.opf')

def pdf2opf(path, tdir, opts):
    from calibre.ebooks.lrf.pdf.convert_from import generate_html
    generate_html(path, tdir)
    opts.dont_split_on_page_breaks = True
    return os.path.join(tdir, 'metadata.opf')

def epub2opf(path, tdir, opts):
    zf = ZipFile(path)
    zf.extractall(tdir)
@@ -110,35 +37,23 @@ def epub2opf(path, tdir, opts):
    if opf and os.path.exists(encfile):
        if not process_encryption(encfile, opf):
            raise DRMError(os.path.basename(path))

    if opf is None:
        raise ValueError('%s is not a valid EPUB file'%path)
    return opf

def odt2epub(path, tdir, opts):
    from calibre.ebooks.odt.to_oeb import Extract
    opts.encoding = 'utf-8'
    return Extract()(path, tdir)

MAP = {
    'lit' : lit2opf,
    'mobi' : mobi2opf,
    'prc' : mobi2opf,
    'azw' : mobi2opf,
    'fb2' : fb22opf,
    'rtf' : rtf2opf,
    'txt' : txt2opf,
    'pdf' : pdf2opf,
    'epub' : epub2opf,
    'odt' : odt2epub,
    }
SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf',
SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf',
                  'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub']

def unarchive(path, tdir):
    extract(path, tdir)
    files = list(walk(tdir))

    for ext in ['opf'] + list(MAP.keys()):
        for f in files:
            if f.lower().endswith('.'+ext):
@@ -147,32 +62,32 @@ def unarchive(path, tdir):
                return f, ext
    return find_html_index(files)

def any2epub(opts, path, notification=None, create_epub=True,
def any2epub(opts, path, notification=None, create_epub=True,
             oeb_cover=False, extract_to=None):
    path = run_plugins_on_preprocess(path)
    ext = os.path.splitext(path)[1]
    if not ext:
        raise ValueError('Unknown file type: '+path)
    ext = ext.lower()[1:]

    if opts.output is None:
        opts.output = os.path.splitext(os.path.basename(path))[0]+'.epub'

    with nested(TemporaryDirectory('_any2epub1'), TemporaryDirectory('_any2epub2')) as (tdir1, tdir2):
        if ext in ['rar', 'zip', 'oebzip']:
            path, ext = unarchive(path, tdir1)
            print 'Found %s file in archive'%(ext.upper())

        if ext in MAP.keys():
            path = MAP[ext](path, tdir2, opts)
            ext = 'opf'

        if re.match(r'((x){0,1}htm(l){0,1})|opf', ext) is None:
            raise ValueError('Conversion from %s is not supported'%ext.upper())

        print 'Creating EPUB file...'
        html2epub(path, opts, notification=notification,
        html2epub(path, opts, notification=notification,
                  create_epub=create_epub, oeb_cover=oeb_cover,
                  extract_to=extract_to)
src/calibre/ebooks/fb2/input.py (new file, 74 lines)
@@ -0,0 +1,74 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
"""
Convert .fb2 files to .lrf
"""
import os
from base64 import b64decode
from lxml import etree

from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre import guess_type

FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'

class FB2Input(InputFormatPlugin):

    name = 'FB2 Input'
    author = 'Anatoly Shipitsin'
    description = 'Convert FB2 files to HTML'
    file_types = set(['fb2'])

    recommendations = set([
        ('level1_toc', '//h:h1', OptionRecommendation.MED),
        ('level2_toc', '//h:h2', OptionRecommendation.MED),
        ('level3_toc', '//h:h3', OptionRecommendation.MED),
        ])

    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.resources import fb2_xsl
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.oeb.base import XLINK_NS
        NAMESPACES = {'f':FB2NS, 'l':XLINK_NS}

        log.debug('Parsing XML...')
        parser = etree.XMLParser(recover=True, no_network=True)
        doc = etree.parse(stream, parser)
        self.extract_embedded_content(doc)
        log.debug('Converting XML to HTML...')
        styledoc = etree.fromstring(fb2_xsl)

        transform = etree.XSLT(styledoc)
        result = transform(doc)
        open('index.xhtml', 'wb').write(transform.tostring(result))
        stream.seek(0)
        mi = get_metadata(stream, 'fb2')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        opf = OPFCreator(os.getcwdu(), mi)
        entries = [(f, guess_type(f)[0]) for f in os.listdir('.')]
        opf.create_manifest(entries)
        opf.create_spine(['index.xhtml'])

        for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
            href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
            if href is not None:
                if href.startswith('#'):
                    href = href[1:]
                opf.guide.set_cover(os.path.abspath(href))

        opf.render(open('metadata.opf', 'wb'))
        return os.path.join(os.getcwd(), 'metadata.opf')

    def extract_embedded_content(self, doc):
        for elem in doc.xpath('./*'):
            if 'binary' in elem.tag and elem.attrib.has_key('id'):
                fname = elem.attrib['id']
                data = b64decode(elem.text.strip())
                open(fname, 'wb').write(data)
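Note on FB2Input.convert() above: the heavy lifting is a single XSLT transform of the FB2 document into HTML using calibre's fb2_xsl resource. A self-contained sketch of the same parse/transform/serialize pattern, with a toy stylesheet standing in for fb2_xsl (assumption: the real stylesheet is far more complete):

# Illustration only: convert a minimal FB2 document to HTML with lxml,
# mirroring the parser/XSLT steps used by FB2Input.convert().
from lxml import etree

XSL = '''<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:fb="http://www.gribuser.ru/xml/fictionbook/2.0">
  <xsl:template match="/">
    <html><body>
      <h1><xsl:value-of select="//fb:book-title"/></h1>
    </body></html>
  </xsl:template>
</xsl:stylesheet>'''

FB2 = '''<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0">
  <description><title-info><book-title>Example</book-title></title-info></description>
</FictionBook>'''

parser = etree.XMLParser(recover=True, no_network=True)
doc = etree.fromstring(FB2, parser)
transform = etree.XSLT(etree.fromstring(XSL))
result = transform(doc)
print etree.tostring(result)   # <html><body><h1>Example</h1></body></html>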
@@ -1,125 +0,0 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
"""
Convert .fb2 files to .lrf
"""
import os, sys, shutil, logging
from base64 import b64decode
from lxml import etree

from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre import setup_cli_handlers
from calibre.resources import fb2_xsl
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.metadata.opf import OPFCreator
from calibre.ebooks.metadata import MetaInformation


def option_parser():
    parser = lrf_option_parser(
        _('''%prog [options] mybook.fb2


%prog converts mybook.fb2 to mybook.lrf'''))
    parser.add_option('--debug-html-generation', action='store_true', default=False,
                      dest='debug_html_generation', help=_('Print generated HTML to stdout and quit.'))
    parser.add_option('--keep-intermediate-files', action='store_true', default=False,
                      help=_('Keep generated HTML files after completing conversion to LRF.'))
    return parser

def extract_embedded_content(doc):
    for elem in doc.xpath('./*'):
        if 'binary' in elem.tag and elem.attrib.has_key('id'):
            fname = elem.attrib['id']
            data = b64decode(elem.text.strip())
            open(fname, 'wb').write(data)

def to_html(fb2file, tdir):
    fb2file = os.path.abspath(fb2file)
    cwd = os.getcwd()
    try:
        os.chdir(tdir)
        print 'Parsing XML...'
        parser = etree.XMLParser(recover=True, no_network=True)
        doc = etree.parse(fb2file, parser)
        extract_embedded_content(doc)
        print 'Converting XML to HTML...'
        styledoc = etree.fromstring(fb2_xsl)

        transform = etree.XSLT(styledoc)
        result = transform(doc)
        open('index.html', 'wb').write(transform.tostring(result))
        try:
            mi = get_metadata(open(fb2file, 'rb'), 'fb2')
        except:
            mi = MetaInformation(None, None)
        if not mi.title:
            mi.title = os.path.splitext(os.path.basename(fb2file))[0]
        if not mi.authors:
            mi.authors = [_('Unknown')]
        opf = OPFCreator(tdir, mi)
        opf.create_manifest([('index.html', None)])
        opf.create_spine(['index.html'])
        opf.render(open('metadata.opf', 'wb'))
        return os.path.join(tdir, 'metadata.opf')
    finally:
        os.chdir(cwd)


def generate_html(fb2file, encoding, logger):
    tdir = PersistentTemporaryDirectory('_fb22lrf')
    to_html(fb2file, tdir)
    return os.path.join(tdir, 'index.html')

def process_file(path, options, logger=None):
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('fb22lrf')
        setup_cli_handlers(logger, level)
    fb2 = os.path.abspath(os.path.expanduser(path))
    f = open(fb2, 'rb')
    mi = get_metadata(f, 'fb2')
    f.close()
    htmlfile = generate_html(fb2, options.encoding, logger)
    tdir = os.path.dirname(htmlfile)
    cwd = os.getcwdu()
    try:
        if not options.output:
            ext = '.lrs' if options.lrs else '.lrf'
            options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
        options.output = os.path.abspath(os.path.expanduser(options.output))
        if not mi.title:
            mi.title = os.path.splitext(os.path.basename(fb2))[0]
        if (not options.title or options.title == _('Unknown')):
            options.title = mi.title
        if (not options.author or options.author == _('Unknown')) and mi.authors:
            options.author = mi.authors.pop()
        if (not options.category or options.category == _('Unknown')) and mi.category:
            options.category = mi.category
        if (not options.freetext or options.freetext == _('Unknown')) and mi.comments:
            options.freetext = mi.comments
        os.chdir(tdir)
        html_process_file(htmlfile, options, logger)
    finally:
        os.chdir(cwd)
        if getattr(options, 'keep_intermediate_files', False):
            logger.debug('Intermediate files in '+ tdir)
        else:
            shutil.rmtree(tdir)

def main(args=sys.argv, logger=None):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        print
        print 'No fb2 file specified'
        return 1
    process_file(args[1], options, logger)
    return 0

if __name__ == '__main__':
    sys.exit(main())
@@ -41,10 +41,12 @@ NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/'
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
CALIBRE_NS = 'http://calibre.kovidgoyal.net/2009/metadata'
RE_NS = 'http://exslt.org/regular-expressions'

XPNSMAP = {'h' : XHTML_NS, 'o1' : OPF1_NS, 'o2' : OPF2_NS,
           'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS,
           'xsi': XSI_NS, 'dt' : DCTERMS_NS, 'ncx': NCX_NS,
           'svg': SVG_NS, 'xl' : XLINK_NS}
           'svg': SVG_NS, 'xl' : XLINK_NS, 're': RE_NS}
OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS}
OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
              'xsi': XSI_NS, 'calibre': CALIBRE_NS}
@@ -1256,16 +1258,21 @@ class TOC(object):
    :attr:`klass`: Optional semantic class referenced by this node.
    :attr:`id`: Option unique identifier for this node.
    """
    def __init__(self, title=None, href=None, klass=None, id=None):
    def __init__(self, title=None, href=None, klass=None, id=None,
            play_order=None):
        self.title = title
        self.href = urlnormalize(href) if href else href
        self.klass = klass
        self.id = id
        self.nodes = []
        self.play_order = 0
        if play_order is None:
            play_order = self.next_play_order()
        self.play_order = play_order

    def add(self, title, href, klass=None, id=None):
    def add(self, title, href, klass=None, id=None, play_order=0):
        """Create and return a new sub-node of this node."""
        node = TOC(title, href, klass, id)
        node = TOC(title, href, klass, id, play_order)
        self.nodes.append(node)
        return node

@@ -1276,6 +1283,18 @@ class TOC(object):
            for node in child.iter():
                yield node

    def count(self):
        return len(list(self.iter())) - 1

    def next_play_order(self):
        return max([x.play_order for x in self.iter()])+1

    def has_href(self, href):
        for x in self.iter():
            if x.href == href:
                return True
        return False

    def iterdescendants(self):
        """Iterate over all descendant nodes in depth-first order."""
        for child in self.nodes:
@@ -1309,6 +1328,10 @@ class TOC(object):
        except ValueError:
            return 1

    def __str__(self):
        return 'TOC: %s --> %s'%(self.title, self.href)


    def to_opf1(self, tour):
        for node in self.nodes:
            element(tour, 'site', attrib={
@@ -1319,7 +1342,7 @@ class TOC(object):
    def to_ncx(self, parent):
        for node in self.nodes:
            id = node.id or unicode(uuid.uuid4())
            attrib = {'id': id, 'playOrder': '0'}
            attrib = {'id': id, 'playOrder': str(node.play_order)}
            if node.klass:
                attrib['class'] = node.klass
            point = element(parent, NCX('navPoint'), attrib=attrib)
@@ -1329,6 +1352,34 @@ class TOC(object):
            node.to_ncx(point)
        return parent

    def rationalize_play_orders(self):
        '''
        Ensure that all nodes with the same play_order have the same href and
        with different play_orders have different hrefs.
        '''
        def po_node(n):
            for x in self.iter():
                if x is n:
                    return
                if x.play_order == n.play_order:
                    return x

        def href_node(n):
            for x in self.iter():
                if x is n:
                    return
                if x.href == n.href:
                    return x

        for x in self.iter():
            y = po_node(x)
            if y is not None:
                if x.href != y.href:
                    x.play_order = getattr(href_node(x), 'play_order',
                            self.next_play_order())
            y = href_node(x)
            if y is not None:
                x.play_order = y.play_order

class PageList(object):
    """Collection of named "pages" to mapped positions within an OEB data model
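Note on the new play_order handling above: next_play_order() hands out the next unused number and rationalize_play_orders() enforces that nodes sharing a play_order also share an href. A short sketch of how the new TOC API reads (based only on the signatures in this hunk; untested against this exact revision):

# Sketch only: exercising the TOC additions from this hunk.
from calibre.ebooks.oeb.base import TOC

root = TOC()                                   # play_order assigned automatically
one = root.add('Chapter One', 'ch1.xhtml',
        play_order=root.next_play_order())
two = root.add('Chapter Two', 'ch2.xhtml',
        play_order=root.next_play_order())
two.play_order = one.play_order                # create a conflicting play_order
root.rationalize_play_orders()                 # equal play_order now implies equal href
print one.play_order, two.play_order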
@@ -118,6 +118,7 @@ class EbookIterator(object):
            print 'Loaded embedded font:', repr(family)

    def __enter__(self):
        self.delete_on_exit = []
        self._tdir = TemporaryDirectory('_ebook_iter')
        self.base = self._tdir.__enter__()
        from calibre.ebooks.conversion.plumber import Plumber
@@ -137,9 +138,11 @@ class EbookIterator(object):

        cover = self.opf.cover
        if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf') and cover:
            cfile = os.path.join(os.path.dirname(self.spine[0]), 'calibre_ei_cover.html')
            cfile = os.path.join(os.path.dirname(self.spine[0]),
                    'calibre_iterator_cover.html')
            open(cfile, 'wb').write(TITLEPAGE%cover)
            self.spine[0:0] = [SpineItem(cfile)]
            self.delete_on_exit.append(cfile)

        if self.opf.path_to_html_toc is not None and \
                self.opf.path_to_html_toc not in self.spine:
@@ -221,3 +224,6 @@ class EbookIterator(object):

    def __exit__(self, *args):
        self._tdir.__exit__(*args)
        for x in self.delete_on_exit:
            if os.path.exists(x):
                os.remove(x)
@@ -343,7 +343,8 @@ class OEBReader(object):
                continue
            id = child.get('id')
            klass = child.get('class')
            node = toc.add(title, href, id=id, klass=klass)
            po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
            node = toc.add(title, href, id=id, klass=klass, play_order=po)
            self._toc_from_navpoint(item, node, child)

    def _toc_from_ncx(self, item):
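Note: the playOrder read here is the same attribute that TOC.to_ncx() now writes from each node's play_order. Purely for illustration, this is roughly the navPoint shape involved, built with lxml (the id and src values below are made up):

# Illustration only: an NCX navPoint carrying a playOrder attribute.
from lxml import etree

NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/'
point = etree.Element('{%s}navPoint' % NCX_NS, nsmap={None: NCX_NS},
        attrib={'id': 'toc-1', 'playOrder': '1'})
label = etree.SubElement(point, '{%s}navLabel' % NCX_NS)
etree.SubElement(label, '{%s}text' % NCX_NS).text = 'Chapter One'
etree.SubElement(point, '{%s}content' % NCX_NS,
        attrib={'src': 'index.xhtml#calibre_toc_1'})
print etree.tostring(point, pretty_print=True)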
@@ -15,12 +15,10 @@ from lxml.etree import XPath as _XPath
from lxml import etree
from lxml.cssselect import CSSSelector

from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP, urldefrag, \
    rewrite_links
from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \
    urldefrag, rewrite_links
from calibre.ebooks.epub import tostring, rules

NAMESPACES = dict(XPNSMAP)
NAMESPACES['re'] = 'http://exslt.org/regular-expressions'

XPath = functools.partial(_XPath, namespaces=NAMESPACES)
src/calibre/ebooks/oeb/transforms/structure.py (new file, 151 lines)
@@ -0,0 +1,151 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from lxml import etree
from urlparse import urlparse

from calibre.ebooks.oeb.base import XPNSMAP, TOC
XPath = lambda x: etree.XPath(x, namespaces=XPNSMAP)

class DetectStructure(object):

    def __call__(self, oeb, opts):
        self.log = oeb.log
        self.oeb = oeb
        self.opts = opts
        self.log('Detecting structure...')

        self.detect_chapters()
        if self.oeb.auto_generated_toc or opts.use_auto_toc:
            orig_toc = self.oeb.toc
            self.oeb.toc = TOC()
            self.create_level_based_toc()
            if self.oeb.toc.count() < 1:
                if not opts.no_chapters_in_toc and self.detected_chapters:
                    self.create_toc_from_chapters()
                if self.oeb.toc.count() < opts.toc_threshold:
                    self.create_toc_from_links()
            if self.oeb.toc.count() < 2 and orig_toc.count() > 2:
                self.oeb.toc = orig_toc
            else:
                self.oeb.auto_generated_toc = True
                self.log('Auto generated TOC with %d entries.' %
                        self.oeb.toc.count())


    def detect_chapters(self):
        self.detected_chapters = []
        if self.opts.chapter:
            chapter_xpath = XPath(self.opts.chapter)
            for item in self.oeb.spine:
                for x in chapter_xpath(item.data):
                    self.detected_chapters.append((item, x))

            chapter_mark = self.opts.chapter_mark
            page_break_before = 'display: block; page-break-before: always'
            page_break_after = 'display: block; page-break-after: always'
            for item, elem in self.detected_chapters:
                text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
                self.log('\tDetected chapter:', text[:50])
                if chapter_mark == 'none':
                    continue
                elif chapter_mark == 'rule':
                    mark = etree.Element('hr')
                elif chapter_mark == 'pagebreak':
                    mark = etree.Element('div', style=page_break_after)
                else: # chapter_mark == 'both':
                    mark = etree.Element('hr', style=page_break_before)
                elem.addprevious(mark)

    def create_level_based_toc(self):
        if self.opts.level1_toc is None:
            return
        for item in self.oeb.spine:
            self.add_leveled_toc_items(item)

    def create_toc_from_chapters(self):
        counter = self.oeb.toc.next_play_order()
        for item, elem in self.detected_chapters:
            text, href = self.elem_to_link(item, elem, counter)
            self.oeb.toc.add(text, href, play_order=counter)
            counter += 1

    def create_toc_from_links(self):
        for item in self.oeb.spine:
            for a in item.data.xpath('//h:a[@href]'):
                href = a.get('href')
                purl = urlparse(href)
                if not purl[0] or purl[0] == 'file':
                    href, frag = purl.path, purl.fragment
                    href = item.abshref(href)
                    if frag:
                        href = '#'.join((href, frag))
                    if not self.oeb.toc.has_href(href):
                        text = u' '.join([t.strip() for t in \
                                a.xpath('descendant::text()')])
                        text = text[:100].strip()
                        if not self.oeb.toc.has_text(text):
                            self.oeb.toc.add(text, href,
                                    play_order=self.oeb.toc.next_play_order())


    def elem_to_link(self, item, elem, counter):
        text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
        text = text[:100].strip()
        id = elem.get('id', 'calibre_toc_%d'%counter)
        elem.set('id', id)
        href = '#'.join((item.href, id))
        return text, href


    def add_leveled_toc_items(self, item):
        level1 = XPath(self.opts.level1_toc)(item.data)
        level1_order = []

        counter = 1
        if level1:
            added = {}
            for elem in level1:
                text, _href = self.elem_to_link(item, elem, counter)
                counter += 1
                if text:
                    node = self.oeb.toc.add(text, _href,
                            play_order=self.oeb.toc.next_play_order())
                    level1_order.append(node)
                    added[elem] = node
                    #node.add(_('Top'), _href)
            if self.opts.level2_toc is not None:
                added2 = {}
                level2 = list(XPath(self.opts.level2_toc)(item.data))
                for elem in level2:
                    level1 = None
                    for item in item.data.iterdescendants():
                        if item in added.keys():
                            level1 = added[item]
                        elif item == elem and level1 is not None:
                            text, _href = self.elem_to_link(item, elem, counter)
                            counter += 1
                            if text:
                                added2[elem] = level1.add(text, _href,
                                        play_order=self.oeb.toc.next_play_order())
                if self.opts.level3_toc is not None:
                    level3 = list(XPath(self.opts.level3_toc)(item.data))
                    for elem in level3:
                        level2 = None
                        for item in item.data.iterdescendants():
                            if item in added2.keys():
                                level2 = added2[item]
                            elif item == elem and level2 is not None:
                                text, _href = \
                                    self.elem_to_link(item, elem, counter)
                                counter += 1
                                if text:
                                    level2.add(text, _href,
                                            play_order=self.oeb.toc.next_play_order())
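Note on add_leveled_toc_items() above: the level1_toc/level2_toc/level3_toc expressions are evaluated with the 'h' prefix bound to the XHTML namespace via XPNSMAP, so a recommendation like FB2Input's '//h:h1' selects XHTML headings. A standalone sketch with plain lxml (only the standard XHTML namespace URI is assumed):

# Standalone illustration: evaluate a level1_toc expression such as '//h:h1'.
from lxml import etree

XHTML_NS = 'http://www.w3.org/1999/xhtml'
doc = etree.fromstring(
    '<html xmlns="%s"><body>'
    '<h1>Part I</h1><h2>Chapter 1</h2><h1>Part II</h1>'
    '</body></html>' % XHTML_NS)
level1 = etree.XPath('//h:h1', namespaces={'h': XHTML_NS})
print [e.text for e in level1(doc)]   # ['Part I', 'Part II']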
@@ -27,10 +27,6 @@ entry_points = {
        'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main',
        'isbndb = calibre.ebooks.metadata.isbndb:main',
        'librarything = calibre.ebooks.metadata.library_thing:main',
        'comic2lrf = calibre.ebooks.lrf.comic.convert_from:main',
        'comic2epub = calibre.ebooks.epub.from_comic:main',
        'comic2mobi = calibre.ebooks.mobi.from_comic:main',
        'comic2pdf = calibre.ebooks.pdf.from_comic:main',
        'calibre-debug = calibre.debug:main',
        'calibredb = calibre.library.cli:main',
        'calibre-fontconfig = calibre.utils.fontconfig:main',
@@ -151,8 +147,6 @@ def setup_completion(fatal_errors):
    from calibre.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop
    from calibre.web.feeds.main import option_parser as feeds2disk
    from calibre.web.feeds.recipes import titles as feed_titles
    from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
    from calibre.ebooks.epub.from_comic import option_parser as comic2epub
    from calibre.ebooks.metadata.fetch import option_parser as fem_op
    from calibre.gui2.main import option_parser as guiop
    from calibre.utils.smtp import option_parser as smtp_op
@@ -181,10 +175,6 @@ def setup_completion(fatal_errors):
        f.write(opts_and_exts('ebook-meta', metaop, list(meta_filetypes())))
        f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf']))
        f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf']))
        f.write(opts_and_exts('comic2lrf', comicop, ['cbz', 'cbr']))
        f.write(opts_and_exts('comic2epub', comic2epub, ['cbz', 'cbr']))
        f.write(opts_and_exts('comic2mobi', comic2epub, ['cbz', 'cbr']))
        f.write(opts_and_exts('comic2pdf', comic2epub, ['cbz', 'cbr']))
        f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
        f.write(opts_and_words('fetch-ebook-metadata', fem_op, []))
        f.write(opts_and_words('calibre-smtp', smtp_op, []))
@@ -139,7 +139,7 @@ class resources(OptionlessCommand):
    RESOURCES = dict(
        opf_template = 'ebooks/metadata/opf.xml',
        ncx_template = 'ebooks/metadata/ncx.xml',
        fb2_xsl = 'ebooks/lrf/fb2/fb2.xsl',
        fb2_xsl = 'ebooks/fb2/fb2.xsl',
        metadata_sqlite = 'library/metadata_sqlite.sql',
        jquery = 'gui2/viewer/jquery.js',
        jquery_scrollTo = 'gui2/viewer/jquery_scrollTo.js',