mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TXT Output: Clean up and produce consistent output. Add spacing around headings. Headings are not indented when using the remove paragraph spacing option.
This commit is contained in:
parent
04cf7a5e67
commit
f5a6195ceb
@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en'
|
||||
Transform OEB content into plain text
|
||||
'''
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
from lxml import etree
|
||||
@ -33,6 +32,15 @@ BLOCK_STYLES = [
|
||||
'block',
|
||||
]
|
||||
|
||||
HEADING_TAGS = [
|
||||
'h1',
|
||||
'h2',
|
||||
'h3',
|
||||
'h4',
|
||||
'h5',
|
||||
'h6',
|
||||
]
|
||||
|
||||
SPACE_TAGS = [
|
||||
'td',
|
||||
'br',
|
||||
@ -47,6 +55,10 @@ class TXTMLizer(object):
|
||||
self.log.info('Converting XHTML to TXT...')
|
||||
self.oeb_book = oeb_book
|
||||
self.opts = opts
|
||||
self.toc_ids = []
|
||||
self.last_was_heading = False
|
||||
|
||||
self.create_flat_toc(self.oeb_book.toc)
|
||||
|
||||
return self.mlize_spine()
|
||||
|
||||
@ -58,8 +70,11 @@ class TXTMLizer(object):
|
||||
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
||||
content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
|
||||
content = self.remove_newlines(content)
|
||||
output += self.dump_text(etree.fromstring(content), stylizer)
|
||||
output = self.cleanup_text(u''.join(output))
|
||||
output += self.dump_text(etree.fromstring(content), stylizer, item)
|
||||
output += '\n\n\n\n\n\n'
|
||||
output = u''.join(output)
|
||||
output = u'\n'.join(l.rstrip() for l in output.splitlines())
|
||||
output = self.cleanup_text(output)
|
||||
|
||||
return output
|
||||
|
||||
@ -68,6 +83,8 @@ class TXTMLizer(object):
|
||||
text = text.replace('\r\n', ' ')
|
||||
text = text.replace('\n', ' ')
|
||||
text = text.replace('\r', ' ')
|
||||
# Condense redundant spaces created by replacing newlines with spaces.
|
||||
text = re.sub(r'[ ]{2,}', ' ', text)
|
||||
|
||||
return text
|
||||
|
||||
@ -80,6 +97,14 @@ class TXTMLizer(object):
|
||||
toc.append(u'* %s\n\n' % item.title)
|
||||
return ''.join(toc)
|
||||
|
||||
def create_flat_toc(self, nodes):
|
||||
'''
|
||||
Turns a hierarchical list of TOC href's into a flat list.
|
||||
'''
|
||||
for item in nodes:
|
||||
self.toc_ids.append(item.href)
|
||||
self.create_flat_toc(item.nodes)
|
||||
|
||||
def cleanup_text(self, text):
|
||||
self.log.debug('\tClean up text...')
|
||||
# Replace bad characters.
|
||||
@ -92,7 +117,7 @@ class TXTMLizer(object):
|
||||
text = text.replace('\f+', ' ')
|
||||
|
||||
# Single line paragraph.
|
||||
text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text)
|
||||
text = re.sub('(?<=.)\n(?=.)', ' ', text)
|
||||
|
||||
# Remove multiple spaces.
|
||||
text = re.sub('[ ]{2,}', ' ', text)
|
||||
@ -101,13 +126,19 @@ class TXTMLizer(object):
|
||||
text = re.sub('\n[ ]+\n', '\n\n', text)
|
||||
if self.opts.remove_paragraph_spacing:
|
||||
text = re.sub('\n{2,}', '\n', text)
|
||||
text = re.sub('(?imu)^(?=.)', '\t', text)
|
||||
text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: u'%s\n\n' % mo.group('t'), text)
|
||||
text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'), mo.group('t')), text)
|
||||
else:
|
||||
text = re.sub('\n{3,}', '\n\n', text)
|
||||
text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)
|
||||
|
||||
# Replace spaces at the beginning and end of lines
|
||||
# We don't replace tabs because those are only added
|
||||
# when remove paragraph spacing is enabled.
|
||||
text = re.sub('(?imu)^[ ]+', '', text)
|
||||
text = re.sub('(?imu)[ ]+$', '', text)
|
||||
|
||||
# Remove empty space and newlines at the beginning of the document.
|
||||
text = re.sub(r'(?u)^[ \n]+', '', text)
|
||||
|
||||
if self.opts.max_line_length:
|
||||
max_length = self.opts.max_line_length
|
||||
@ -145,13 +176,11 @@ class TXTMLizer(object):
|
||||
|
||||
return text
|
||||
|
||||
def dump_text(self, elem, stylizer, end=''):
|
||||
def dump_text(self, elem, stylizer, page):
|
||||
'''
|
||||
@elem: The element in the etree that we are working on.
|
||||
@stylizer: The style information attached to the element.
|
||||
@end: The last two characters of the text from the previous element.
|
||||
This is used to determine if a blank line is needed when starting
|
||||
a new block element.
|
||||
@page: OEB page used to determine absolute urls.
|
||||
'''
|
||||
|
||||
if not isinstance(elem.tag, basestring) \
|
||||
@ -170,13 +199,22 @@ class TXTMLizer(object):
|
||||
return ['']
|
||||
|
||||
tag = barename(elem.tag)
|
||||
tag_id = elem.attrib.get('id', None)
|
||||
in_block = False
|
||||
in_heading = False
|
||||
|
||||
# Are we in a heading?
|
||||
# This can either be a heading tag or a TOC item.
|
||||
if tag in HEADING_TAGS or '%s#%s' % (page.href, tag_id) in self.toc_ids:
|
||||
in_heading = True
|
||||
if not self.last_was_heading:
|
||||
text.append('\n\n\n\n\n\n')
|
||||
|
||||
# Are we in a paragraph block?
|
||||
if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
|
||||
if self.opts.remove_paragraph_spacing and not in_heading:
|
||||
text.append(u'\t')
|
||||
in_block = True
|
||||
if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text:
|
||||
text.append(u'\n\n')
|
||||
|
||||
if tag in SPACE_TAGS:
|
||||
text.append(u' ')
|
||||
@ -185,14 +223,17 @@ class TXTMLizer(object):
|
||||
if hasattr(elem, 'text') and elem.text:
|
||||
text.append(elem.text)
|
||||
|
||||
# Recurse down into tags within the tag we are in.
|
||||
for item in elem:
|
||||
en = u''
|
||||
if len(text) >= 2:
|
||||
en = text[-1][-2:]
|
||||
text += self.dump_text(item, stylizer, en)
|
||||
text += self.dump_text(item, stylizer, page)
|
||||
|
||||
if in_block:
|
||||
text.append(u'\n\n')
|
||||
if in_heading:
|
||||
text.append(u'\n')
|
||||
self.last_was_heading = True
|
||||
else:
|
||||
self.last_was_heading = False
|
||||
|
||||
if hasattr(elem, 'tail') and elem.tail:
|
||||
text.append(elem.tail)
|
||||
|
Loading…
x
Reference in New Issue
Block a user