diff --git a/src/calibre/ebooks/pdb/palmdoc/writer.py b/src/calibre/ebooks/pdb/palmdoc/writer.py index 8eca0db124..12c1c4aaa7 100644 --- a/src/calibre/ebooks/pdb/palmdoc/writer.py +++ b/src/calibre/ebooks/pdb/palmdoc/writer.py @@ -3,6 +3,7 @@ ''' Writer content to palmdoc pdb file. ''' +import os __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' @@ -13,8 +14,8 @@ import struct from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.pdb.formatwriter import FormatWriter from calibre.ebooks.pdb.header import PdbHeaderBuilder -from calibre.ebooks.txt.writer import TxtNewlines -from calibre.ebooks.txt.writer import TxtWriter +from calibre.ebooks.txt.txtml import TXTMLizer +from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines MAX_RECORD_SIZE = 4096 @@ -27,7 +28,7 @@ class Writer(FormatWriter): def write_content(self, oeb_book, out_stream, metadata=None): title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') - txt_records, txt_length = self._generate_text(oeb_book.spine) + txt_records, txt_length = self._generate_text(oeb_book) header_record = self._header_record(txt_length, len(txt_records)) section_lengths = [len(header_record)] @@ -44,9 +45,12 @@ class Writer(FormatWriter): for record in [header_record] + txt_records: out_stream.write(record) - def _generate_text(self, spine): - txt_writer = TxtWriter(TxtNewlines('system').newline, self.log) - txt = txt_writer.dump(spine).encode(self.opts.output_encoding, 'replace') + def _generate_text(self, oeb_book): + writer = TXTMLizer(self.log) + txt = writer.extract_content(oeb_book, self.opts) + + self.log.debug('\tReplacing newlines with selected type...') + txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace') txt_length = len(txt) diff --git a/src/calibre/ebooks/pdb/ztxt/writer.py b/src/calibre/ebooks/pdb/ztxt/writer.py index 
d6bdeefc59..ee4c5752c3 100644 --- a/src/calibre/ebooks/pdb/ztxt/writer.py +++ b/src/calibre/ebooks/pdb/ztxt/writer.py @@ -11,8 +11,9 @@ __docformat__ = 'restructuredtext en' import struct, zlib from calibre.ebooks.pdb.formatwriter import FormatWriter -from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines from calibre.ebooks.pdb.header import PdbHeaderBuilder +from calibre.ebooks.txt.txtml import TXTMLizer +from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines MAX_RECORD_SIZE = 8192 @@ -25,7 +26,7 @@ class Writer(FormatWriter): def write_content(self, oeb_book, out_stream, metadata=None): title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') - txt_records, txt_length = self._generate_text(oeb_book.spine) + txt_records, txt_length = self._generate_text(oeb_book) crc32 = 0 section_lengths = [] @@ -33,7 +34,7 @@ class Writer(FormatWriter): self.log.info('Compressing data...') for i in range(0, len(txt_records)): self.log.debug('\tCompressing record %i' % i) - txt_records[i] = compressor.compress(txt_records[i].encode('cp1252', 'replace')) + txt_records[i] = compressor.compress(txt_records[i]) txt_records[i] = txt_records[i] + compressor.flush(zlib.Z_FULL_FLUSH) section_lengths.append(len(txt_records[i])) crc32 = zlib.crc32(txt_records[i], crc32) & 0xffffffff @@ -48,10 +49,13 @@ class Writer(FormatWriter): for record in [header_record]+txt_records: out_stream.write(record) - def _generate_text(self, spine): - txt_writer = TxtWriter(TxtNewlines('system').newline, self.log) - txt = txt_writer.dump(spine).encode(self.opts.output_encoding, 'replace') - + def _generate_text(self, oeb_book): + writer = TXTMLizer(self.log) + txt = writer.extract_content(oeb_book, self.opts) + + self.log.debug('\tReplacing newlines with selected type...') + txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace') + txt_length = len(txt) 
txt_records = [] diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 2ca38176d5..a96adc5772 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -38,7 +38,7 @@ PML_HTML_RULES = [ (re.compile(r'\\U(?P\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))), (re.compile(r'\\m="(?P.+?)"'), lambda match: '' % image_name(match.group('name')).strip('\x00')), (re.compile(r'\\q="(?P#.+?)"(?P.+?)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), - (re.compile(r'\\Q="(?P.+?)"'), lambda match: '
' % match.group('target')), + (re.compile(r'\\Q="(?P.+?)"'), lambda match: '' % match.group('target')), (re.compile(r'\\-'), lambda match: ''), (re.compile(r'\\Fn="(?P.+?)"(?P.+?)\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text'))), (re.compile(r'\\Sd="(?P.+?)"(?P.+?)\\Sd'), lambda match: '%s' % (match.group('target'), match.group('text'))), diff --git a/src/calibre/ebooks/txt/newlines.py b/src/calibre/ebooks/txt/newlines.py new file mode 100644 index 0000000000..983d356206 --- /dev/null +++ b/src/calibre/ebooks/txt/newlines.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os + +class TxtNewlines(object): + + NEWLINE_TYPES = { + 'system' : os.linesep, + 'unix' : '\n', + 'old_mac' : '\r', + 'windows' : '\r\n' + } + + def __init__(self, newline_type): + self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) + +def specified_newlines(newline, text): + if newline == os.linesep: + return text + + return text.replace(os.linesep, newline) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 6cb854df10..c13949af2e 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -8,7 +8,8 @@ import os from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation -from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines +from calibre.ebooks.txt.txtml import TXTMLizer +from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines class TXTOutput(OutputFormatPlugin): @@ -32,8 +33,11 @@ class TXTOutput(OutputFormatPlugin): ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): - writer = TxtWriter(TxtNewlines(opts.newline).newline, log) - txt = writer.dump(oeb_book.spine) + writer = TXTMLizer(log) + txt = writer.extract_content(oeb_book, opts) + + log.debug('\tReplacing newlines with selected type...') + txt = 
specified_newlines(TxtNewlines(opts.newline).newline, txt) close = False if not hasattr(output_path, 'write'): diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py new file mode 100644 index 0000000000..94f2a181c5 --- /dev/null +++ b/src/calibre/ebooks/txt/txtml.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into plain text +''' + +import os, re + +from lxml import etree + +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace +from calibre.ebooks.oeb.stylizer import Stylizer + +BLOCK_TAGS = [ + 'div', + 'p', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'li', +] + +BLOCK_STYLES = [ + 'block', +] + +class TXTMLizer(object): + + def __init__(self, log): + self.log = log + + def extract_content(self, oeb_book, opts): + self.log.info('Converting XHTML to TXT...') + self.oeb_book = oeb_book + self.opts = opts + return self.mlize_spine() + + def mlize_spine(self): + output = u'' + for item in self.oeb_book.spine: + self.log.debug('Converting %s to TXT...' % item.href) + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) + content = self.remove_newlines(content) + output += self.dump_text(etree.fromstring(content), stylizer) + output = self.cleanup_text(output) + + return output + + def remove_newlines(self, text): + self.log.debug('\tRemove newlines for processing...') + text = text.replace('\r\n', ' ') + text = text.replace('\n', ' ') + text = text.replace('\r', ' ') + + return text + + def cleanup_text(self, text): + self.log.debug('\tClean up text...') + # Replace bad characters. + text = text.replace(u'\xc2', '') + text = text.replace(u'\xa0', ' ') + + # Replace tabs, vertical tabs and form feeds with single space. 
+ text = text.replace('\t+', ' ') + text = text.replace('\v+', ' ') + text = text.replace('\f+', ' ') + + # Single line paragraph. + text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text) + + # Remove multiple spaces. + text = re.sub('[ ]+', ' ', text) + + # Remove excessive newlines. + #text = re.sub('\n[ ]+\n', '\n\n', text) + #text = re.sub('\n{3,}', '\n\n', text) + + # Replace spaces at the beginning and end of lines + text = re.sub('(?imu)^[ ]+', '', text) + text = re.sub('(?imu)[ ]+$', '', text) + + return text + + def dump_text(self, elem, stylizer, end=''): + ''' + @elem: The element in the etree that we are working on. + @stylizer: The style information attached to the element. + @end: The last two characters of the text from the previous element. + This is used to determine if a blank line is needed when starting + a new block element. + ''' + + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) != XHTML_NS: + return u'' + + text = u'' + style = stylizer.style(elem) + + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return u'' + + tag = barename(elem.tag) + in_block = False + + # Are we in a paragraph block? + if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: + in_block = True + if not end.endswith(os.linesep + os.linesep) and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + text += os.linesep + os.linesep + + # Process tags that contain text. 
+ if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + text += elem.text + + for item in elem: + text += self.dump_text(item, stylizer, text[-2:]) + + if in_block: + text += os.linesep + os.linesep + + if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': + text += elem.tail + + return text diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py deleted file mode 100644 index a3fbe13199..0000000000 --- a/src/calibre/ebooks/txt/writer.py +++ /dev/null @@ -1,146 +0,0 @@ -# -*- coding: utf-8 -*- - -__license__ = 'GPL v3' -__copyright__ = '2009, John Schember ' -__docformat__ = 'restructuredtext en' - -''' -Write content to TXT. -''' - -import os -import re - -from lxml import etree - -from calibre import entity_to_unicode -from calibre.ebooks.oeb.base import XHTML - -class TxtWriter(object): - def __init__(self, newline, log): - self.newline = newline - self.log = log - - def dump(self, spine): - out = u'' - for item in spine: - self.log.debug('Processing %s...' % item.href) - content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) - content = self.remove_newlines(content) - content = self.strip_html(content) - content = self.replace_html_symbols(content) - content = self.cleanup_text(content) - content = self.specified_newlines(content) - out += content - - # Put two blank lines at end of file - end = out[-3 * len(self.newline):] - for i in range(3 - end.count(self.newline)): - out += self.newline - - return out - - def strip_html(self, text): - self.log.debug('\tStripping html...') - stripped = u'' - - # Remove unnecessary tags - for tag in ['script', 'style']: - text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)' % (tag, tag), '', text) - text = re.sub('', '', text) - text = re.sub('<\?.*?\?>', '', text) - text = re.sub('<@.*?@>', '', text) - text = re.sub('<%.*?%>', '', text) - - # Headings usually indicate Chapters. 
- # We are going to use a marker to insert the proper number of - # newline characters at the end of cleanup_text because cleanup_text - # remove excessive (more than 2 newlines). - for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: - text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text) - text = re.sub('(?imu)' % tag, '-vlgzxey-', text) - - # Separate content with space. - for tag in ['td']: - text = re.sub('(?imu)', ' ', text) - - # Separate content with empty line. - for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']: - text = re.sub('(?imu)' % tag, '\n\n', text) - - for tag in ['hr', 'br']: - text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text) - - # Remove any tags that do not need special processing. - text = re.sub('<.*?>', '', text) - - stripped = stripped + text - - return stripped - - def replace_html_symbols(self, content): - self.log.debug('\tReplacing entities with unicode...') - for entity in set(re.findall('&.+?;', content)): - mo = re.search('(%s)' % entity[1:-1], content) - content = content.replace(entity, entity_to_unicode(mo)) - - return content - - def cleanup_text(self, text): - self.log.debug('\tClean up text...') - # Replace bad characters. - text = text.replace(u'\xc2', '') - text = text.replace(u'\xa0', ' ') - - # Replace tabs, vertical tags and form feeds with single space. - text = text.replace('\t+', ' ') - text = text.replace('\v+', ' ') - text = text.replace('\f+', ' ') - - # Single line paragraph. - text = re.sub('(?<=.)\n(?=.)', ' ', text) - - # Remove multiple spaces. - text = re.sub('[ ]+', ' ', text) - - # Remove excessive newlines. - text = re.sub('\n[ ]+\n', '\n\n', text) - text = re.sub('\n{3,}', '\n\n', text) - - # Replace markers with the proper characters. 
- text = text.replace('-vzxedxy-', '\n\n\n\n\n') - text = text.replace('-vlgzxey-', '\n\n\n') - - # Replace spaces at the beginning and end of lines - text = re.sub('(?imu)^[ ]+', '', text) - text = re.sub('(?imu)[ ]+$', '', text) - - return text - - def remove_newlines(self, text): - self.log.debug('\tRemove newlines for processing...') - text = text.replace('\r\n', ' ') - text = text.replace('\n', ' ') - text = text.replace('\r', ' ') - - return text - - def specified_newlines(self, text): - self.log.debug('\tReplacing newlines with selected type...') - if self.newline == '\n': - return text - - return text.replace('\n', self.newline) - - -class TxtNewlines(object): - NEWLINE_TYPES = { - 'system' : os.linesep, - 'unix' : '\n', - 'old_mac' : '\r', - 'windows' : '\r\n' - } - - def __init__(self, newline_type): - self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) -