New TXT output processor.

This commit is contained in:
John Schember 2009-07-12 20:22:19 -04:00
parent 97c1b8a0c3
commit e09193a48f
4 changed files with 72 additions and 157 deletions

View File

@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
class TxtNewlines(object):
    '''
    Resolve the name of a newline style to the character sequence it
    stands for. The resolved sequence is stored in self.newline.
    '''

    # Recognised style names and the newline sequence each one selects.
    NEWLINE_TYPES = {
        'system': os.linesep,
        'unix': '\n',
        'old_mac': '\r',
        'windows': '\r\n',
    }

    def __init__(self, newline_type):
        '''
        @newline_type: One of the keys of NEWLINE_TYPES, matched without
        regard to case. Unknown names, None and the empty string all select
        the system newline.
        '''
        # None has no lower(); treat it like any other unrecognised value
        # instead of failing with an AttributeError.
        key = newline_type.lower() if newline_type else 'system'
        self.newline = self.NEWLINE_TYPES.get(key, os.linesep)
def specified_newlines(newline, text):
    '''
    Return text with every line ending converted to the requested one.

    @newline: The newline sequence to use (LF, CR or CRLF).
    @text: The text to convert. It may mix LF, CR and CRLF line endings.
    '''
    # Collapse every convention to a bare LF first. Replacing only the
    # platform's own separator would leave foreign endings untouched (or
    # turn an existing CRLF into CR CR LF), and the result would depend on
    # the machine the conversion runs on.
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    if newline == '\n':
        return text
    return text.replace('\n', newline)

View File

@ -8,7 +8,8 @@ import os
from calibre.customize.conversion import OutputFormatPlugin, \ from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation OptionRecommendation
from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines from calibre.ebooks.txt.txtml import TXTMLizer
from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines
class TXTOutput(OutputFormatPlugin): class TXTOutput(OutputFormatPlugin):
@ -32,12 +33,11 @@ class TXTOutput(OutputFormatPlugin):
]) ])
def convert(self, oeb_book, output_path, input_plugin, opts, log): def convert(self, oeb_book, output_path, input_plugin, opts, log):
# writer = TxtWriter(TxtNewlines(opts.newline).newline, log)
# txt = writer.dump(oeb_book.spine)
from calibre.ebooks.txt.txtml import TXTMLizer
writer = TXTMLizer(log) writer = TXTMLizer(log)
txt = writer.extract_content(oeb_book, opts) txt = writer.extract_content(oeb_book, opts)
log.debug('\tReplacing newlines with selected type...')
txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)
close = False close = False
if not hasattr(output_path, 'write'): if not hasattr(output_path, 'write'):

View File

@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
Transform OEB content into plain text Transform OEB content into plain text
''' '''
import os import os, re
from lxml import etree from lxml import etree
@ -32,6 +32,7 @@ BLOCK_STYLES = [
] ]
class TXTMLizer(object): class TXTMLizer(object):
def __init__(self, log): def __init__(self, log):
self.log = log self.log = log
@ -49,6 +50,7 @@ class TXTMLizer(object):
content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
content = self.remove_newlines(content) content = self.remove_newlines(content)
output += self.dump_text(etree.fromstring(content), stylizer) output += self.dump_text(etree.fromstring(content), stylizer)
output = self.cleanup_text(output)
return output return output
@ -60,7 +62,42 @@ class TXTMLizer(object):
return text return text
def cleanup_text(self, text):
    '''
    Tidy up the whitespace of the extracted text.

    @text: The text to clean.

    Returns the text with unwanted characters removed, tabs, vertical tabs
    and form feeds turned into spaces, isolated line breaks inside a
    paragraph joined, runs of spaces collapsed and the spaces at both ends
    of every line stripped.
    '''
    self.log.debug('\tClean up text...')
    # Replace bad characters.
    # NOTE(review): u'\xc2' followed by u'\xa0' is what a UTF-8 encoded
    # non-breaking space looks like when decoded as latin-1, which is
    # presumably why it is dropped; this also removes any genuine
    # u'\xc2' character from the text -- confirm this is acceptable.
    text = text.replace(u'\xc2', '')
    text = text.replace(u'\xa0', ' ')
    # Replace tabs, vertical tabs and form feeds with a single space.
    # A regular expression is required here: str.replace() takes its
    # argument literally, so replacing '\t+' only ever matched a tab that
    # was followed by a plus sign.
    text = re.sub('[\t\v\f]+', ' ', text)
    # Single line paragraph.
    text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text)
    # Remove multiple spaces.
    text = re.sub('[ ]+', ' ', text)
    # NOTE: collapsing of excessive newlines is currently disabled here.
    # Replace spaces at the beginning and end of lines
    text = re.sub('(?imu)^[ ]+', '', text)
    text = re.sub('(?imu)[ ]+$', '', text)
    return text
def dump_text(self, elem, stylizer, end=''):
'''
@elem: The element in the etree that we are working on.
@stylizer: The style information attached to the element.
@end: The last two characters of the text from the previous element.
This is used to determine if a blank line is needed when starting
a new block element.
'''
if not isinstance(elem.tag, basestring) \ if not isinstance(elem.tag, basestring) \
or namespace(elem.tag) != XHTML_NS: or namespace(elem.tag) != XHTML_NS:
return u'' return u''
@ -78,16 +115,15 @@ class TXTMLizer(object):
# Are we in a paragraph block? # Are we in a paragraph block?
if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
in_block = True in_block = True
#if not text.endswith(os.linesep) and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': if not end.endswith(os.linesep + os.linesep) and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
# print '"%s"' % text text += os.linesep + os.linesep
# text += os.linesep + os.linesep
# Proccess tags that contain text. # Proccess tags that contain text.
if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
text += elem.text text += elem.text
for item in elem: for item in elem:
text += self.dump_text(item, stylizer) text += self.dump_text(item, stylizer, text[-2:])
if in_block: if in_block:
text += os.linesep + os.linesep text += os.linesep + os.linesep

View File

@ -1,146 +0,0 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
'''
Write content to TXT.
'''
import os
import re
from lxml import etree
from calibre import entity_to_unicode
from calibre.ebooks.oeb.base import XHTML
class TxtWriter(object):
    '''
    Turn the XHTML documents of a spine into plain text with the help of
    regular expressions.
    '''

    def __init__(self, newline, log):
        '''
        @newline: The newline sequence to use in the generated text.
        @log: The logger; only its debug() method is used by this class.
        '''
        self.newline = newline
        self.log = log

    def dump(self, spine):
        '''
        Convert every item of the spine and return the joined text.

        @spine: An iterable of items that provide href and data, where
        data is the parsed document whose body element is converted.

        The returned text always ends with three newline sequences.
        '''
        pieces = []
        for item in spine:
            self.log.debug('Processing %s...' % item.href)
            content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
            content = self.remove_newlines(content)
            content = self.strip_html(content)
            content = self.replace_html_symbols(content)
            content = self.cleanup_text(content)
            content = self.specified_newlines(content)
            pieces.append(content)
        # Join once instead of growing a string inside the loop.
        out = u''.join(pieces)

        # Put two blank lines at end of file
        end = out[-3 * len(self.newline):]
        # A negative count yields an empty string, so nothing is added when
        # enough newlines are already present.
        out += self.newline * (3 - end.count(self.newline))

        return out

    def strip_html(self, text):
        '''
        Remove the markup from text and return what is left.

        @text: The serialized XHTML to strip.

        Block level closing tags and hr/br become blank lines, closing td
        tags become a space and headings are surrounded by markers that
        cleanup_text later turns into the proper number of newlines.
        '''
        self.log.debug('\tStripping html...')

        # Remove unnecessary tags together with their content. The content
        # is matched non-greedily so that the text between two separate
        # script/style elements (or two comments) is not thrown away, and
        # '.' matches newlines so elements spanning lines are removed too.
        for tag in ['script', 'style']:
            text = re.sub(r'(?imus)<[ ]*%s[ ]*.*?>(.*?)</[ ]*%s[ ]*>' % (tag, tag), '', text)
        text = re.sub(r'(?s)<!--.*?-->', '', text)
        text = re.sub(r'<\?.*?\?>', '', text)
        text = re.sub(r'<@.*?@>', '', text)
        text = re.sub(r'<%.*?%>', '', text)

        # Headings usually indicate Chapters.
        # We are going to use a marker to insert the proper number of
        # newline characters at the end of cleanup_text because cleanup_text
        # removes excessive (more than 2) newlines.
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            text = re.sub(r'(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text)
            text = re.sub(r'(?imu)</[ ]*%s[ ]*>' % tag, '-vlgzxey-', text)

        # Separate content with space.
        # The tag name has to be interpolated into the pattern; without it
        # the pattern contains a literal '%s' and never matches.
        for tag in ['td']:
            text = re.sub(r'(?imu)</[ ]*%s[ ]*>' % tag, ' ', text)

        # Separate content with empty line.
        for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']:
            text = re.sub(r'(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)

        for tag in ['hr', 'br']:
            text = re.sub(r'(?imu)<[ ]*%s.*?>' % tag, '\n\n', text)

        # Remove any tags that do not need special processing.
        text = re.sub(r'<.*?>', '', text)

        return text

    def replace_html_symbols(self, content):
        '''
        Replace the entities found in content using entity_to_unicode.

        @content: The text that may contain entities of the form &name;.

        entity_to_unicode is called with a match object whose first group
        is the text between the ampersand and the semicolon.
        '''
        self.log.debug('\tReplacing entities with unicode...')

        # One substitution pass: every entity is converted exactly once, so
        # the output of one replacement can never be mistaken for another
        # entity, and the entity text is never compiled as a pattern (which
        # failed when it contained regular expression metacharacters).
        return re.sub(r'&(.+?);', entity_to_unicode, content)

    def cleanup_text(self, text):
        '''
        Tidy up the whitespace of text and expand the heading markers.

        @text: The text produced by strip_html and replace_html_symbols.

        Returns the cleaned text, using LF as the line separator.
        '''
        self.log.debug('\tClean up text...')
        # Replace bad characters.
        # NOTE(review): u'\xc2' followed by u'\xa0' is what a UTF-8 encoded
        # non-breaking space looks like when decoded as latin-1, which is
        # presumably why it is dropped; this also removes any genuine
        # u'\xc2' character from the text -- confirm this is acceptable.
        text = text.replace(u'\xc2', '')
        text = text.replace(u'\xa0', ' ')

        # Replace tabs, vertical tabs and form feeds with single space.
        # A regular expression is required here: str.replace() takes its
        # argument literally, so replacing '\t+' only ever matched a tab
        # that was followed by a plus sign.
        text = re.sub('[\t\v\f]+', ' ', text)

        # Single line paragraph.
        text = re.sub('(?<=.)\n(?=.)', ' ', text)

        # Remove multiple spaces.
        text = re.sub('[ ]+', ' ', text)

        # Remove excessive newlines.
        text = re.sub('\n[ ]+\n', '\n\n', text)
        text = re.sub('\n{3,}', '\n\n', text)

        # Replace markers with the proper characters.
        text = text.replace('-vzxedxy-', '\n\n\n\n\n')
        text = text.replace('-vlgzxey-', '\n\n\n')

        # Replace spaces at the beginning and end of lines
        text = re.sub('(?imu)^[ ]+', '', text)
        text = re.sub('(?imu)[ ]+$', '', text)

        return text

    def remove_newlines(self, text):
        '''
        Return text with every CRLF, LF and CR replaced by a space.

        @text: The text to flatten onto a single line.
        '''
        self.log.debug('\tRemove newlines for processing...')
        # CRLF has to go first so it becomes one space rather than two.
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')

        return text

    def specified_newlines(self, text):
        '''
        Return text with every LF replaced by self.newline.

        @text: The text to convert; it is expected to use LF only.
        '''
        self.log.debug('\tReplacing newlines with selected type...')
        if self.newline == '\n':
            return text

        return text.replace('\n', self.newline)
class TxtNewlines(object):
    '''
    Look up the newline sequence that belongs to a named newline style and
    keep it in self.newline.
    '''

    # Newline sequence for each supported style name.
    NEWLINE_TYPES = {
        'system': os.linesep,
        'unix': '\n',
        'old_mac': '\r',
        'windows': '\r\n',
    }

    def __init__(self, newline_type):
        '''
        @newline_type: The style name, compared case insensitively with the
        keys of NEWLINE_TYPES. The system newline is used when the name is
        not recognised, is empty or is None.
        '''
        # Guard against None (which has no lower()) so that it falls back
        # to the system newline like every other unrecognised value.
        if newline_type:
            style = newline_type.lower()
        else:
            style = 'system'
        self.newline = self.NEWLINE_TYPES.get(style, os.linesep)