From e09193a48fc1966e35113af9d3817d03071ffd38 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 12 Jul 2009 20:22:19 -0400 Subject: [PATCH] New TXT output processor. --- src/calibre/ebooks/txt/newlines.py | 25 +++++ src/calibre/ebooks/txt/output.py | 10 +- src/calibre/ebooks/txt/txtml.py | 48 ++++++++-- src/calibre/ebooks/txt/writer.py | 146 ----------------------------- 4 files changed, 72 insertions(+), 157 deletions(-) create mode 100644 src/calibre/ebooks/txt/newlines.py delete mode 100644 src/calibre/ebooks/txt/writer.py diff --git a/src/calibre/ebooks/txt/newlines.py b/src/calibre/ebooks/txt/newlines.py new file mode 100644 index 0000000000..983d356206 --- /dev/null +++ b/src/calibre/ebooks/txt/newlines.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os + +class TxtNewlines(object): + + NEWLINE_TYPES = { + 'system' : os.linesep, + 'unix' : '\n', + 'old_mac' : '\r', + 'windows' : '\r\n' + } + + def __init__(self, newline_type): + self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) + +def specified_newlines(newline, text): + if newline == os.linesep: + return text + + return text.replace(os.linesep, newline) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index f1767700e0..c13949af2e 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -8,7 +8,8 @@ import os from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation -from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines +from calibre.ebooks.txt.txtml import TXTMLizer +from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines class TXTOutput(OutputFormatPlugin): @@ -32,12 +33,11 @@ class TXTOutput(OutputFormatPlugin): ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): -# writer = TxtWriter(TxtNewlines(opts.newline).newline, log) -# txt = writer.dump(oeb_book.spine) - - from calibre.ebooks.txt.txtml import TXTMLizer writer = TXTMLizer(log) txt = writer.extract_content(oeb_book, opts) + + log.debug('\tReplacing newlines with selected type...') + txt = specified_newlines(TxtNewlines(opts.newline).newline, txt) close = False if not hasattr(output_path, 'write'): diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 5bc7ed45f8..d609426d93 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en' Transform OEB content into plain text ''' -import os +import os, re from lxml import etree @@ -32,6 +32,7 @@ BLOCK_STYLES = [ ] class TXTMLizer(object): + def __init__(self, log): self.log = log @@ -49,6 +50,7 @@ class TXTMLizer(object): content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) content = self.remove_newlines(content) output += self.dump_text(etree.fromstring(content), stylizer) + output = self.cleanup_text(output) return output @@ -60,7 +62,42 @@ class TXTMLizer(object): return text - def dump_text(self, elem, stylizer): + def cleanup_text(self, text): + self.log.debug('\tClean up text...') + # Replace bad characters. + text = text.replace(u'\xc2', '') + text = text.replace(u'\xa0', ' ') + + # Replace tabs, vertical tags and form feeds with single space. + text = text.replace('\t+', ' ') + text = text.replace('\v+', ' ') + text = text.replace('\f+', ' ') + + # Single line paragraph. + text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text) + + # Remove multiple spaces. + text = re.sub('[ ]+', ' ', text) + + # Remove excessive newlines. + #text = re.sub('\n[ ]+\n', '\n\n', text) + #text = re.sub('\n{3,}', '\n\n', text) + + # Replace spaces at the beginning and end of lines + text = re.sub('(?imu)^[ ]+', '', text) + text = re.sub('(?imu)[ ]+$', '', text) + + return text + + def dump_text(self, elem, stylizer, end=''): + ''' + @elem: The element in the etree that we are working on. + @stylizer: The style information attached to the element. + @end: The last two characters of the text from the previous element. + This is used to determine if a blank line is needed when starting + a new block element. + ''' + if not isinstance(elem.tag, basestring) \ or namespace(elem.tag) != XHTML_NS: return u'' @@ -78,16 +115,15 @@ class TXTMLizer(object): # Are we in a paragraph block? if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: in_block = True - #if not text.endswith(os.linesep) and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': - # print '"%s"' % text - # text += os.linesep + os.linesep + if not end.endswith(os.linesep + os.linesep) and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + text += os.linesep + os.linesep # Proccess tags that contain text. if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': text += elem.text for item in elem: - text += self.dump_text(item, stylizer) + text += self.dump_text(item, stylizer, text[-2:]) if in_block: text += os.linesep + os.linesep diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py deleted file mode 100644 index a3fbe13199..0000000000 --- a/src/calibre/ebooks/txt/writer.py +++ /dev/null @@ -1,146 +0,0 @@ -# -*- coding: utf-8 -*- - -__license__ = 'GPL v3' -__copyright__ = '2009, John Schember ' -__docformat__ = 'restructuredtext en' - -''' -Write content to TXT. -''' - -import os -import re - -from lxml import etree - -from calibre import entity_to_unicode -from calibre.ebooks.oeb.base import XHTML - -class TxtWriter(object): - def __init__(self, newline, log): - self.newline = newline - self.log = log - - def dump(self, spine): - out = u'' - for item in spine: - self.log.debug('Processing %s...' % item.href) - content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) - content = self.remove_newlines(content) - content = self.strip_html(content) - content = self.replace_html_symbols(content) - content = self.cleanup_text(content) - content = self.specified_newlines(content) - out += content - - # Put two blank lines at end of file - end = out[-3 * len(self.newline):] - for i in range(3 - end.count(self.newline)): - out += self.newline - - return out - - def strip_html(self, text): - self.log.debug('\tStripping html...') - stripped = u'' - - # Remove unnecessary tags - for tag in ['script', 'style']: - text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)' % (tag, tag), '', text) - text = re.sub('', '', text) - text = re.sub('<\?.*?\?>', '', text) - text = re.sub('<@.*?@>', '', text) - text = re.sub('<%.*?%>', '', text) - - # Headings usually indicate Chapters. - # We are going to use a marker to insert the proper number of - # newline characters at the end of cleanup_text because cleanup_text - # remove excessive (more than 2 newlines). - for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: - text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text) - text = re.sub('(?imu)' % tag, '-vlgzxey-', text) - - # Separate content with space. - for tag in ['td']: - text = re.sub('(?imu)', ' ', text) - - # Separate content with empty line. - for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']: - text = re.sub('(?imu)' % tag, '\n\n', text) - - for tag in ['hr', 'br']: - text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text) - - # Remove any tags that do not need special processing. - text = re.sub('<.*?>', '', text) - - stripped = stripped + text - - return stripped - - def replace_html_symbols(self, content): - self.log.debug('\tReplacing entities with unicode...') - for entity in set(re.findall('&.+?;', content)): - mo = re.search('(%s)' % entity[1:-1], content) - content = content.replace(entity, entity_to_unicode(mo)) - - return content - - def cleanup_text(self, text): - self.log.debug('\tClean up text...') - # Replace bad characters. - text = text.replace(u'\xc2', '') - text = text.replace(u'\xa0', ' ') - - # Replace tabs, vertical tags and form feeds with single space. - text = text.replace('\t+', ' ') - text = text.replace('\v+', ' ') - text = text.replace('\f+', ' ') - - # Single line paragraph. - text = re.sub('(?<=.)\n(?=.)', ' ', text) - - # Remove multiple spaces. - text = re.sub('[ ]+', ' ', text) - - # Remove excessive newlines. - text = re.sub('\n[ ]+\n', '\n\n', text) - text = re.sub('\n{3,}', '\n\n', text) - - # Replace markers with the proper characters. - text = text.replace('-vzxedxy-', '\n\n\n\n\n') - text = text.replace('-vlgzxey-', '\n\n\n') - - # Replace spaces at the beginning and end of lines - text = re.sub('(?imu)^[ ]+', '', text) - text = re.sub('(?imu)[ ]+$', '', text) - - return text - - def remove_newlines(self, text): - self.log.debug('\tRemove newlines for processing...') - text = text.replace('\r\n', ' ') - text = text.replace('\n', ' ') - text = text.replace('\r', ' ') - - return text - - def specified_newlines(self, text): - self.log.debug('\tReplacing newlines with selected type...') - if self.newline == '\n': - return text - - return text.replace('\n', self.newline) - - -class TxtNewlines(object): - NEWLINE_TYPES = { - 'system' : os.linesep, - 'unix' : '\n', - 'old_mac' : '\r', - 'windows' : '\r\n' - } - - def __init__(self, newline_type): - self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) -