Fix #7936 (FB2: Sectionize TOC entries)

This commit is contained in:
Kovid Goyal 2010-12-18 18:57:06 -07:00
commit c9fe094afa
4 changed files with 113 additions and 43 deletions

View File

@ -27,13 +27,10 @@ class FB2MLizer(object):
'''
Todo: * Include more FB2 specific tags in the conversion.
* Handle a tags.
* Figure out some way to turn oeb_book.toc items into <section><title>
<p> to allow for readers to generate toc from the document.
'''
def __init__(self, log):
    '''
    Set up a converter instance.

    log is kept on the instance as self.log; the image href mapping starts
    out empty and reset_state() is called to initialise the remaining
    per-conversion state.
    '''
    self.image_hrefs = {}
    self.log = log
    self.reset_state()
def reset_state(self):
@ -43,17 +40,25 @@ class FB2MLizer(object):
# in different directories. FB2 images are all in a flat layout so we rename all images
# into a sequential numbering system to ensure there are no collisions between image names.
self.image_hrefs = {}
# Maps each TOC href to 'page' (entry points at the whole file) or to a dict of {element id: TOC level}
self.toc = {}
# Used to see whether a new <section> needs to be opened
self.section_level = 0
def extract_content(self, oeb_book, opts):
    '''
    Convert oeb_book to FB2 markup using the conversion options in opts.

    Both arguments are stored on the instance, the per-conversion state is
    reset, and when opts.sectionize is 'toc' the book's TOC is flattened
    with create_flat_toc() before the spine is converted.

    Returns the result of fb2mlize_spine().
    '''
    self.log.info('Converting XHTML to FB2 markup...')
    self.oeb_book, self.opts = oeb_book, opts
    self.reset_state()

    # The flattened TOC drives the <section>/<title> markup that lets
    # readers generate a table of contents from the document itself.
    sectionize_by_toc = self.opts.sectionize == 'toc'
    if sectionize_by_toc:
        self.create_flat_toc(self.oeb_book.toc, 1)

    return self.fb2mlize_spine()
def fb2mlize_spine(self):
self.reset_state()
output = [self.fb2_header()]
output.append(self.get_text())
output.append(self.fb2mlize_images())
@ -66,13 +71,19 @@ class FB2MLizer(object):
return u'<?xml version="1.0" encoding="UTF-8"?>' + output
def clean_text(self, text):
text = re.sub(r'(?miu)<section>\s*</section>', '', text)
text = re.sub(r'(?miu)\s+</section>', '</section>', text)
text = re.sub(r'(?miu)</section><section>', '</section>\n\n<section>', text)
text = re.sub(r'(?miu)<p>\s*</p>', '', text)
text = re.sub(r'(?miu)\s+</p>', '</p>', text)
text = re.sub(r'(?miu)</p><p>', '</p>\n\n<p>', text)
text = re.sub(r'(?miu)\s*</p>', '</p>', text)
text = re.sub(r'(?miu)</p>\s*<p>', '</p>\n\n<p>', text)
text = re.sub(r'(?miu)<title>\s*</title>', '', text)
text = re.sub(r'(?miu)\s+</title>', '</title>', text)
text = re.sub(r'(?miu)<section>\s*</section>', '', text)
text = re.sub(r'(?miu)\s*</section>', '\n</section>', text)
text = re.sub(r'(?miu)</section>\s*', '</section>\n\n', text)
text = re.sub(r'(?miu)\s*<section>', '\n<section>', text)
text = re.sub(r'(?miu)<section>\s*', '<section>\n', text)
text = re.sub(r'(?miu)</section><section>', '</section>\n\n<section>', text)
if self.opts.insert_blank_line:
text = re.sub(r'(?miu)</p>', '</p><empty-line />', text)
@ -144,12 +155,34 @@ class FB2MLizer(object):
def get_text(self):
    '''
    Build the FB2 <body> markup from every item in the book's spine.

    Sections are opened according to self.opts.sectionize:
      * 'nothing': one main <section> is opened before the first item.
      * 'files':   every spine item is wrapped in its own <section>.
      * otherwise: an item gets its own <section> only when self.toc maps
        its href to 'page'.
    self.section_level counts the sections currently open; whatever is
    still open after the last item is closed before </body>.

    Returns the markup as one string, starting with '<body>' and ending
    with '</body>'.
    '''
    text = ['<body>']

    # Create main section if there are no others to create
    if self.opts.sectionize == 'nothing':
        text.append('<section>')
        self.section_level += 1

    for item in self.oeb_book.spine:
        self.log.debug('Converting %s to FictionBook2 XML' % item.href)
        stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)

        # No unconditional per-item <section> is emitted here: such a tag
        # was neither closed nor counted in section_level, which left the
        # output with unbalanced <section> elements.

        # Start a <section> if we must sectionize each file or if the TOC references this page
        page_section_open = False
        if self.opts.sectionize == 'files' or self.toc.get(item.href) == 'page':
            text.append('<section>')
            page_section_open = True
            self.section_level += 1

        text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)

        if page_section_open:
            text.append('</section>')
            self.section_level -= 1

    # Close any open sections
    while self.section_level > 0:
        text.append('</section>')
        self.section_level -= 1

    return ''.join(text) + '</body>'
def fb2mlize_images(self):
@ -184,6 +217,17 @@ class FB2MLizer(object):
'%s.' % (item.href, e))
return ''.join(images)
def create_flat_toc(self, nodes, level):
    '''
    Flatten a TOC tree into self.toc, keyed by file href.

    nodes: iterable of TOC nodes; each node has an href (optionally with a
           '#fragment') and a nodes attribute holding its children.
    level: nesting depth recorded for the entries in nodes (children are
           recorded with level + 1).

    After the call self.toc[href] is either
      * the string 'page' when some entry points at the file as a whole, or
      * a dict mapping element id -> level for entries pointing at anchors.
    A whole-file entry takes precedence: once a file is marked 'page',
    anchored entries for the same file are ignored instead of trying to
    store them inside the string (which raised TypeError).
    '''
    for item in nodes:
        # Nodes without an href carry nothing to record, but their
        # children may still point somewhere.
        if item.href:
            href, _sep, anchor = item.href.partition('#')
            if not anchor:
                self.toc[href] = 'page'
            else:
                entry = self.toc.get(href, None)
                if entry != 'page':
                    if not entry:
                        entry = self.toc[href] = {}
                    entry[anchor] = level
        # NOTE(review): the flattened source does not show whether this
        # recursion sat inside the anchored branch only; it is done for
        # every node here so no subtree is dropped -- confirm.
        self.create_flat_toc(item.nodes, level + 1)
def ensure_p(self):
if self.in_p:
return [], []
@ -254,10 +298,38 @@ class FB2MLizer(object):
# First tag in tree
tag = barename(elem_tree.tag)
# Convert TOC entries to <title>s and add <section>s
if self.opts.sectionize == 'toc':
# A section cannot be a child of any other element than another section,
# so leave the tag alone if there are parents
if not tag_stack:
# There are two reasons to start a new section here: the TOC pointed to
# this page (then we use the first non-<body> on the page as a <title>), or
# the TOC pointed to a specific element
newlevel = 0
toc_entry = self.toc.get(page.href, None)
if toc_entry == 'page':
if tag != 'body' and hasattr(elem_tree, 'text') and elem_tree.text:
newlevel = 1
self.toc[page.href] = None
elif toc_entry and elem_tree.attrib.get('id', None):
newlevel = toc_entry.get(elem_tree.attrib.get('id', None), None)
# Start a new section if necessary
if newlevel:
if not (newlevel > self.section_level):
fb2_out.append('</section>')
self.section_level -= 1
fb2_out.append('<section>')
self.section_level += 1
fb2_out.append('<title>')
tags.append('title')
if self.section_level == 0:
# If none of the prior processing made a section, make one now to be FB2 spec compliant
fb2_out.append('<section>')
self.section_level += 1
# Process the XHTML tag if it needs to be converted to an FB2 tag.
if tag == 'h1' and self.opts.h1_to_title or tag == 'h2' and self.opts.h2_to_title or tag == 'h3' and self.opts.h3_to_title:
fb2_out.append('<title>')
tags.append('title')
if tag == 'img':
if elem_tree.attrib.get('src', None):
# Only write the image tag if it is in the manifest.

View File

@ -16,15 +16,15 @@ class FB2Output(OutputFormatPlugin):
file_type = 'fb2'
# Conversion options exposed by the FB2 output plugin.
options = set([
    OptionRecommendation(name='h1_to_title',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Wrap all h1 tags with fb2 title elements.')),
    OptionRecommendation(name='h2_to_title',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Wrap all h2 tags with fb2 title elements.')),
    OptionRecommendation(name='h3_to_title',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Wrap all h3 tags with fb2 title elements.')),
    # The help text names the values exactly as they appear in `choices`
    # so that what the user reads matches what can be selected.
    OptionRecommendation(name='sectionize',
        recommended_value='files', level=OptionRecommendation.LOW,
        choices=['toc', 'files', 'nothing'],
        help=_('Specify the sectionization of elements. '
            'A value of "nothing" turns the book into a single section. '
            'A value of "files" turns each file into a separate section; use this if your device is having trouble. '
            'A value of "toc" turns the entries in the Table of Contents into titles and creates sections; '
            'if it fails, adjust the "Structure Detection" and/or "Table of Contents" settings '
            '(turn on "Force use of auto-generated Table of Contents").')),
])
def convert(self, oeb_book, output_path, input_plugin, opts, log):

View File

@ -17,6 +17,8 @@ class PluginWidget(Widget, Ui_Form):
ICON = I('mimetypes/fb2.png')
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, ['h1_to_title', 'h2_to_title', 'h3_to_title'])
Widget.__init__(self, parent, ['sectionize'])
self.db, self.book_id = db, book_id
for x in ('toc', 'files', 'nothing'):
self.opt_sectionize.addItem(x)
self.initialize_options(get_option, get_help, db, book_id)

View File

@ -14,7 +14,7 @@
<string>Form</string>
</property>
<layout class="QGridLayout" name="gridLayout">
<item row="3" column="0">
<item row="1" column="0">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
@ -28,23 +28,19 @@
</spacer>
</item>
<item row="0" column="0">
<widget class="QCheckBox" name="opt_h1_to_title">
<property name="text">
<string>Wrap h1 tags with &lt;title&gt; elements</string>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QCheckBox" name="opt_h2_to_title">
<property name="text">
<string>Wrap h2 tags with &lt;title&gt; elements</string>
</property>
</widget>
</item>
<item row="2" column="0">
<widget class="QCheckBox" name="opt_h3_to_title">
<property name="text">
<string>Wrap h3 tags with &lt;title&gt; elements</string>
<widget class="QLabel" name="label">
<property name="text">
<string>Sectionize:</string>
</property>
<property name="buddy">
<cstring>opt_sectionize</cstring>
</property>
</widget>
</item>
<item row="0" column="1">
<widget class="QComboBox" name="opt_sectionize">
<property name="minimumContentsLength">
<number>20</number>
</property>
</widget>
</item>