TXT Output: Fix stylizer so it works. Fix handling soft scene breaks created by top margins. FB2 Output: Handle soft scene breaks created by empty paragraphs and top margins. Stylizer: Enable use of lineHeight property.

This commit is contained in:
John Schember 2011-02-02 20:05:50 -05:00
parent 5b244ac857
commit 7ceeef2a3b
3 changed files with 47 additions and 35 deletions

View File

@ -71,19 +71,28 @@ class FB2MLizer(object):
return u'<?xml version="1.0" encoding="UTF-8"?>' + output return u'<?xml version="1.0" encoding="UTF-8"?>' + output
def clean_text(self, text): def clean_text(self, text):
# Condense empty paragraphs into a line break.
text = re.sub(r'(?miu)(<p>\s*</p>\s*){3,}', '<p><empty-line /></p>', text)
# Remove empty paragraphs.
text = re.sub(r'(?miu)<p>\s*</p>', '', text) text = re.sub(r'(?miu)<p>\s*</p>', '', text)
# Clean up paragraph endings.
text = re.sub(r'(?miu)\s*</p>', '</p>', text) text = re.sub(r'(?miu)\s*</p>', '</p>', text)
# Put paragraphs following a paragraph on a separate line.
text = re.sub(r'(?miu)</p>\s*<p>', '</p>\n\n<p>', text) text = re.sub(r'(?miu)</p>\s*<p>', '</p>\n\n<p>', text)
# Remove empty title elements.
text = re.sub(r'(?miu)<title>\s*</title>', '', text) text = re.sub(r'(?miu)<title>\s*</title>', '', text)
text = re.sub(r'(?miu)\s+</title>', '</title>', text) text = re.sub(r'(?miu)\s+</title>', '</title>', text)
# Remove empty sections.
text = re.sub(r'(?miu)<section>\s*</section>', '', text) text = re.sub(r'(?miu)<section>\s*</section>', '', text)
# Clean up sections start and ends.
text = re.sub(r'(?miu)\s*</section>', '\n</section>', text) text = re.sub(r'(?miu)\s*</section>', '\n</section>', text)
text = re.sub(r'(?miu)</section>\s*', '</section>\n\n', text) text = re.sub(r'(?miu)</section>\s*', '</section>\n\n', text)
text = re.sub(r'(?miu)\s*<section>', '\n<section>', text) text = re.sub(r'(?miu)\s*<section>', '\n<section>', text)
text = re.sub(r'(?miu)<section>\s*', '<section>\n', text) text = re.sub(r'(?miu)<section>\s*', '<section>\n', text)
text = re.sub(r'(?miu)</section><section>', '</section>\n\n<section>', text) # Put sections followed by sections on a separate line.
text = re.sub(r'(?miu)</section>\s*<section>', '</section>\n\n<section>', text)
if self.opts.insert_blank_line: if self.opts.insert_blank_line:
text = re.sub(r'(?miu)</p>', '</p><empty-line />', text) text = re.sub(r'(?miu)</p>', '</p><empty-line />', text)
@ -338,6 +347,11 @@ class FB2MLizer(object):
tags = [] tags = []
# First tag in tree # First tag in tree
tag = barename(elem_tree.tag) tag = barename(elem_tree.tag)
# Number of blank lines above tag
try:
ems = int(round((float(style.marginTop) / style.fontSize) - 1))
except:
ems = 0
# Convert TOC entries to <title>s and add <section>s # Convert TOC entries to <title>s and add <section>s
if self.opts.sectionize == 'toc': if self.opts.sectionize == 'toc':
@ -370,7 +384,9 @@ class FB2MLizer(object):
fb2_out.append('<section>') fb2_out.append('<section>')
self.section_level += 1 self.section_level += 1
# Process the XHTML tag if it needs to be converted to an FB2 tag. # Process the XHTML tag and its styles, converting them to FB2 tags.
# Use individual if statements rather than an if/elif chain: there can be
# only one XHTML tag, but it can have multiple styles.
if tag == 'img': if tag == 'img':
if elem_tree.attrib.get('src', None): if elem_tree.attrib.get('src', None):
# Only write the image tag if it is in the manifest. # Only write the image tag if it is in the manifest.
@ -381,7 +397,11 @@ class FB2MLizer(object):
fb2_out += p_txt fb2_out += p_txt
tags += p_tag tags += p_tag
fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])]) fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])])
elif tag == 'br': if tag in ('br', 'hr') or ems:
if not ems:
multiplier = 1
else:
multiplier = ems
if self.in_p: if self.in_p:
closed_tags = [] closed_tags = []
open_tags = tag_stack+tags open_tags = tag_stack+tags
@ -391,52 +411,38 @@ class FB2MLizer(object):
closed_tags.append(t) closed_tags.append(t)
if t == 'p': if t == 'p':
break break
fb2_out.append('<empty-line />') fb2_out.append('<empty-line />' * multiplier)
closed_tags.reverse() closed_tags.reverse()
for t in closed_tags: for t in closed_tags:
fb2_out.append('<%s>' % t) fb2_out.append('<%s>' % t)
else: else:
fb2_out.append('<empty-line />') fb2_out.append('<empty-line />' * multiplier)
elif tag in ('div', 'li', 'p'): if tag in ('div', 'li', 'p'):
p_text, added_p = self.close_open_p(tag_stack+tags) p_text, added_p = self.close_open_p(tag_stack+tags)
fb2_out += p_text fb2_out += p_text
if added_p: if added_p:
tags.append('p') tags.append('p')
elif tag == 'b': if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags) s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
fb2_out += s_out fb2_out += s_out
tags += s_tags tags += s_tags
elif tag == 'i': if tag == 'i' or style['font-style'] == 'italic':
s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags) s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
fb2_out += s_out fb2_out += s_out
tags += s_tags tags += s_tags
elif tag in ('del', 'strike'): if tag in ('del', 'strike') or style['text-decoration'] == 'line-through':
s_out, s_tags = self.handle_simple_tag('strikethrough', tag_stack+tags) s_out, s_tags = self.handle_simple_tag('strikethrough', tag_stack+tags)
fb2_out += s_out fb2_out += s_out
tags += s_tags tags += s_tags
elif tag == 'sub': if tag == 'sub':
s_out, s_tags = self.handle_simple_tag('sub', tag_stack+tags) s_out, s_tags = self.handle_simple_tag('sub', tag_stack+tags)
fb2_out += s_out fb2_out += s_out
tags += s_tags tags += s_tags
elif tag == 'sup': if tag == 'sup':
s_out, s_tags = self.handle_simple_tag('sup', tag_stack+tags) s_out, s_tags = self.handle_simple_tag('sup', tag_stack+tags)
fb2_out += s_out fb2_out += s_out
tags += s_tags tags += s_tags
# Processes style information.
if style['font-style'] == 'italic':
s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
fb2_out += s_out
tags += s_tags
elif style['font-weight'] in ('bold', 'bolder'):
s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
fb2_out += s_out
tags += s_tags
elif style['text-decoration'] == 'line-through':
s_out, s_tags = self.handle_simple_tag('strikethrough', tag_stack+tags)
fb2_out += s_out
tags += s_tags
# Process element text. # Process element text.
if hasattr(elem_tree, 'text') and elem_tree.text: if hasattr(elem_tree, 'text') and elem_tree.text:
if not self.in_p: if not self.in_p:

View File

@ -633,7 +633,7 @@ class Style(object):
def lineHeight(self): def lineHeight(self):
if self._lineHeight is None: if self._lineHeight is None:
result = None result = None
parent = self._getparent() #parent = self._getparent()
if 'line-height' in self._style: if 'line-height' in self._style:
lineh = self._style['line-height'] lineh = self._style['line-height']
if lineh == 'normal': if lineh == 'normal':
@ -642,9 +642,9 @@ class Style(object):
result = float(lineh) * self.fontSize result = float(lineh) * self.fontSize
except ValueError: except ValueError:
result = self._unit_convert(lineh, base=self.fontSize) result = self._unit_convert(lineh, base=self.fontSize)
elif parent is not None: #elif parent is not None:
# TODO: proper inheritance # # TODO: proper inheritance
result = parent.lineHeight # result = parent.lineHeight
else: else:
result = 1.2 * self.fontSize result = 1.2 * self.fontSize
self._lineHeight = result self._lineHeight = result

View File

@ -67,10 +67,11 @@ class TXTMLizer(object):
output.append(self.get_toc()) output.append(self.get_toc())
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
self.log.debug('Converting %s to TXT...' % item.href) self.log.debug('Converting %s to TXT...' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) content = unicode(etree.tostring(item.data, encoding=unicode))
content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
content = self.remove_newlines(content) content = self.remove_newlines(content)
output += self.dump_text(etree.fromstring(content), stylizer, item) content = etree.fromstring(content)
stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
output += self.dump_text(content.find(XHTML('body')), stylizer, item)
output += '\n\n\n\n\n\n' output += '\n\n\n\n\n\n'
output = u''.join(output) output = u''.join(output)
output = u'\n'.join(l.rstrip() for l in output.splitlines()) output = u'\n'.join(l.rstrip() for l in output.splitlines())
@ -219,11 +220,16 @@ class TXTMLizer(object):
if tag in SPACE_TAGS: if tag in SPACE_TAGS:
text.append(u' ') text.append(u' ')
# Scene breaks. # Hard scene breaks.
if tag == 'hr': if tag == 'hr':
text.append('\n\n* * *\n\n') text.append('\n\n* * *\n\n')
elif style['margin-top']: # Soft scene breaks.
text.append('\n\n' + '\n' * round(style['margin-top'])) try:
ems = int(round((float(style.marginTop) / style.fontSize) - 1))
if ems:
text.append('\n' * ems)
except:
pass
# Process tags that contain text. # Process tags that contain text.
if hasattr(elem, 'text') and elem.text: if hasattr(elem, 'text') and elem.text: