From c354272030b3396dbe6d749a1b1038e00ff7f6dc Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 12 Jul 2009 12:47:33 -0400 Subject: [PATCH 1/8] Star of new html to text parser. --- src/calibre/ebooks/pml/pmlconverter.py | 2 +- src/calibre/ebooks/txt/output.py | 8 ++- src/calibre/ebooks/txt/txtml.py | 98 ++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 3 deletions(-) create mode 100644 src/calibre/ebooks/txt/txtml.py diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 2ca38176d5..a96adc5772 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -38,7 +38,7 @@ PML_HTML_RULES = [ (re.compile(r'\\U(?P\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))), (re.compile(r'\\m="(?P.+?)"'), lambda match: '' % image_name(match.group('name')).strip('\x00')), (re.compile(r'\\q="(?P#.+?)"(?P.+?)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), - (re.compile(r'\\Q="(?P.+?)"'), lambda match: '
' % match.group('target')), + (re.compile(r'\\Q="(?P.+?)"'), lambda match: '' % match.group('target')), (re.compile(r'\\-'), lambda match: ''), (re.compile(r'\\Fn="(?P.+?)"(?P.+?)\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text'))), (re.compile(r'\\Sd="(?P.+?)"(?P.+?)\\Sd'), lambda match: '%s' % (match.group('target'), match.group('text'))), diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 6cb854df10..f1767700e0 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -32,8 +32,12 @@ class TXTOutput(OutputFormatPlugin): ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): - writer = TxtWriter(TxtNewlines(opts.newline).newline, log) - txt = writer.dump(oeb_book.spine) +# writer = TxtWriter(TxtNewlines(opts.newline).newline, log) +# txt = writer.dump(oeb_book.spine) + + from calibre.ebooks.txt.txtml import TXTMLizer + writer = TXTMLizer(log) + txt = writer.extract_content(oeb_book, opts) close = False if not hasattr(output_path, 'write'): diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py new file mode 100644 index 0000000000..5bc7ed45f8 --- /dev/null +++ b/src/calibre/ebooks/txt/txtml.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into plain text +''' + +import os + +from lxml import etree + +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace +from calibre.ebooks.oeb.stylizer import Stylizer + +BLOCK_TAGS = [ + 'div', + 'p', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'li', +] + +BLOCK_STYLES = [ + 'block', +] + +class TXTMLizer(object): + def __init__(self, log): + self.log = log + + def extract_content(self, oeb_book, opts): + self.log.info('Converting XHTML to PML markup...') + self.oeb_book = oeb_book + self.opts = opts + return self.mlize_spine() + + def mlize_spine(self): + output = u'' + for item in self.oeb_book.spine: + self.log.debug('Converting %s to TXT...' % item.href) + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) + content = self.remove_newlines(content) + output += self.dump_text(etree.fromstring(content), stylizer) + + return output + + def remove_newlines(self, text): + self.log.debug('\tRemove newlines for processing...') + text = text.replace('\r\n', ' ') + text = text.replace('\n', ' ') + text = text.replace('\r', ' ') + + return text + + def dump_text(self, elem, stylizer): + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) != XHTML_NS: + return u'' + + text = u'' + style = stylizer.style(elem) + + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return u'' + + tag = barename(elem.tag) + in_block = False + + # Are we in a paragraph block? + if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: + in_block = True + #if not text.endswith(os.linesep) and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + # print '"%s"' % text + # text += os.linesep + os.linesep + + # Proccess tags that contain text. + if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + text += elem.text + + for item in elem: + text += self.dump_text(item, stylizer) + + if in_block: + text += os.linesep + os.linesep + + if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': + text += elem.tail + + return text From e09193a48fc1966e35113af9d3817d03071ffd38 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 12 Jul 2009 20:22:19 -0400 Subject: [PATCH 2/8] New TXT output processor. --- src/calibre/ebooks/txt/newlines.py | 25 +++++ src/calibre/ebooks/txt/output.py | 10 +- src/calibre/ebooks/txt/txtml.py | 48 ++++++++-- src/calibre/ebooks/txt/writer.py | 146 ----------------------------- 4 files changed, 72 insertions(+), 157 deletions(-) create mode 100644 src/calibre/ebooks/txt/newlines.py delete mode 100644 src/calibre/ebooks/txt/writer.py diff --git a/src/calibre/ebooks/txt/newlines.py b/src/calibre/ebooks/txt/newlines.py new file mode 100644 index 0000000000..983d356206 --- /dev/null +++ b/src/calibre/ebooks/txt/newlines.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os + +class TxtNewlines(object): + + NEWLINE_TYPES = { + 'system' : os.linesep, + 'unix' : '\n', + 'old_mac' : '\r', + 'windows' : '\r\n' + } + + def __init__(self, newline_type): + self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) + +def specified_newlines(newline, text): + if newline == os.linesep: + return text + + return text.replace(os.linesep, newline) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index f1767700e0..c13949af2e 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -8,7 +8,8 @@ import os from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation -from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines +from calibre.ebooks.txt.txtml import TXTMLizer +from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines class TXTOutput(OutputFormatPlugin): @@ -32,12 +33,11 @@ class TXTOutput(OutputFormatPlugin): ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): -# writer = TxtWriter(TxtNewlines(opts.newline).newline, log) -# txt = writer.dump(oeb_book.spine) - - from calibre.ebooks.txt.txtml import TXTMLizer writer = TXTMLizer(log) txt = writer.extract_content(oeb_book, opts) + + log.debug('\tReplacing newlines with selected type...') + txt = specified_newlines(TxtNewlines(opts.newline).newline, txt) close = False if not hasattr(output_path, 'write'): diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 5bc7ed45f8..d609426d93 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en' Transform OEB content into plain text ''' -import os +import os, re from lxml import etree @@ -32,6 +32,7 @@ BLOCK_STYLES = [ ] class TXTMLizer(object): + def __init__(self, log): self.log = log @@ -49,6 +50,7 @@ class TXTMLizer(object): content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) content = self.remove_newlines(content) output += self.dump_text(etree.fromstring(content), stylizer) + output = self.cleanup_text(output) return output @@ -60,7 +62,42 @@ class TXTMLizer(object): return text - def dump_text(self, elem, stylizer): + def cleanup_text(self, text): + self.log.debug('\tClean up text...') + # Replace bad characters. + text = text.replace(u'\xc2', '') + text = text.replace(u'\xa0', ' ') + + # Replace tabs, vertical tags and form feeds with single space. + text = text.replace('\t+', ' ') + text = text.replace('\v+', ' ') + text = text.replace('\f+', ' ') + + # Single line paragraph. + text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text) + + # Remove multiple spaces. + text = re.sub('[ ]+', ' ', text) + + # Remove excessive newlines. + #text = re.sub('\n[ ]+\n', '\n\n', text) + #text = re.sub('\n{3,}', '\n\n', text) + + # Replace spaces at the beginning and end of lines + text = re.sub('(?imu)^[ ]+', '', text) + text = re.sub('(?imu)[ ]+$', '', text) + + return text + + def dump_text(self, elem, stylizer, end=''): + ''' + @elem: The element in the etree that we are working on. + @stylizer: The style information attached to the element. + @end: The last two characters of the text from the previous element. + This is used to determine if a blank line is needed when starting + a new block element. + ''' + if not isinstance(elem.tag, basestring) \ or namespace(elem.tag) != XHTML_NS: return u'' @@ -78,16 +115,15 @@ class TXTMLizer(object): # Are we in a paragraph block? if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: in_block = True - #if not text.endswith(os.linesep) and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': - # print '"%s"' % text - # text += os.linesep + os.linesep + if not end.endswith(os.linesep + os.linesep) and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + text += os.linesep + os.linesep # Proccess tags that contain text. if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': text += elem.text for item in elem: - text += self.dump_text(item, stylizer) + text += self.dump_text(item, stylizer, text[-2:]) if in_block: text += os.linesep + os.linesep diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py deleted file mode 100644 index a3fbe13199..0000000000 --- a/src/calibre/ebooks/txt/writer.py +++ /dev/null @@ -1,146 +0,0 @@ -# -*- coding: utf-8 -*- - -__license__ = 'GPL v3' -__copyright__ = '2009, John Schember ' -__docformat__ = 'restructuredtext en' - -''' -Write content to TXT. -''' - -import os -import re - -from lxml import etree - -from calibre import entity_to_unicode -from calibre.ebooks.oeb.base import XHTML - -class TxtWriter(object): - def __init__(self, newline, log): - self.newline = newline - self.log = log - - def dump(self, spine): - out = u'' - for item in spine: - self.log.debug('Processing %s...' % item.href) - content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) - content = self.remove_newlines(content) - content = self.strip_html(content) - content = self.replace_html_symbols(content) - content = self.cleanup_text(content) - content = self.specified_newlines(content) - out += content - - # Put two blank lines at end of file - end = out[-3 * len(self.newline):] - for i in range(3 - end.count(self.newline)): - out += self.newline - - return out - - def strip_html(self, text): - self.log.debug('\tStripping html...') - stripped = u'' - - # Remove unnecessary tags - for tag in ['script', 'style']: - text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)' % (tag, tag), '', text) - text = re.sub('', '', text) - text = re.sub('<\?.*?\?>', '', text) - text = re.sub('<@.*?@>', '', text) - text = re.sub('<%.*?%>', '', text) - - # Headings usually indicate Chapters. - # We are going to use a marker to insert the proper number of - # newline characters at the end of cleanup_text because cleanup_text - # remove excessive (more than 2 newlines). - for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: - text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text) - text = re.sub('(?imu)' % tag, '-vlgzxey-', text) - - # Separate content with space. - for tag in ['td']: - text = re.sub('(?imu)', ' ', text) - - # Separate content with empty line. - for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']: - text = re.sub('(?imu)' % tag, '\n\n', text) - - for tag in ['hr', 'br']: - text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text) - - # Remove any tags that do not need special processing. - text = re.sub('<.*?>', '', text) - - stripped = stripped + text - - return stripped - - def replace_html_symbols(self, content): - self.log.debug('\tReplacing entities with unicode...') - for entity in set(re.findall('&.+?;', content)): - mo = re.search('(%s)' % entity[1:-1], content) - content = content.replace(entity, entity_to_unicode(mo)) - - return content - - def cleanup_text(self, text): - self.log.debug('\tClean up text...') - # Replace bad characters. - text = text.replace(u'\xc2', '') - text = text.replace(u'\xa0', ' ') - - # Replace tabs, vertical tags and form feeds with single space. - text = text.replace('\t+', ' ') - text = text.replace('\v+', ' ') - text = text.replace('\f+', ' ') - - # Single line paragraph. - text = re.sub('(?<=.)\n(?=.)', ' ', text) - - # Remove multiple spaces. - text = re.sub('[ ]+', ' ', text) - - # Remove excessive newlines. - text = re.sub('\n[ ]+\n', '\n\n', text) - text = re.sub('\n{3,}', '\n\n', text) - - # Replace markers with the proper characters. - text = text.replace('-vzxedxy-', '\n\n\n\n\n') - text = text.replace('-vlgzxey-', '\n\n\n') - - # Replace spaces at the beginning and end of lines - text = re.sub('(?imu)^[ ]+', '', text) - text = re.sub('(?imu)[ ]+$', '', text) - - return text - - def remove_newlines(self, text): - self.log.debug('\tRemove newlines for processing...') - text = text.replace('\r\n', ' ') - text = text.replace('\n', ' ') - text = text.replace('\r', ' ') - - return text - - def specified_newlines(self, text): - self.log.debug('\tReplacing newlines with selected type...') - if self.newline == '\n': - return text - - return text.replace('\n', self.newline) - - -class TxtNewlines(object): - NEWLINE_TYPES = { - 'system' : os.linesep, - 'unix' : '\n', - 'old_mac' : '\r', - 'windows' : '\r\n' - } - - def __init__(self, newline_type): - self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) - From bfaa45c56393b1b3f5401b439c61477c6874d21d Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 12 Jul 2009 20:42:03 -0400 Subject: [PATCH 3/8] Have palmdoc and ztxt pdb files use new txt parser. --- src/calibre/ebooks/pdb/palmdoc/writer.py | 11 +++++++---- src/calibre/ebooks/pdb/ztxt/writer.py | 12 ++++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/pdb/palmdoc/writer.py b/src/calibre/ebooks/pdb/palmdoc/writer.py index 8eca0db124..2a46308db8 100644 --- a/src/calibre/ebooks/pdb/palmdoc/writer.py +++ b/src/calibre/ebooks/pdb/palmdoc/writer.py @@ -13,8 +13,8 @@ import struct from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.pdb.formatwriter import FormatWriter from calibre.ebooks.pdb.header import PdbHeaderBuilder -from calibre.ebooks.txt.writer import TxtNewlines -from calibre.ebooks.txt.writer import TxtWriter +from calibre.ebooks.txt.txtml import TXTMLizer +from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines MAX_RECORD_SIZE = 4096 @@ -45,8 +45,11 @@ class Writer(FormatWriter): out_stream.write(record) def _generate_text(self, spine): - txt_writer = TxtWriter(TxtNewlines('system').newline, self.log) - txt = txt_writer.dump(spine).encode(self.opts.output_encoding, 'replace') + writer = TXTMLizer(log) + txt = writer.extract_content(oeb_book, opts) + + log.debug('\tReplacing newlines with selected type...') + txt = specified_newlines(TxtNewlines(opts.newline).newline, txt).encode(self.opts.output_encoding, 'replace') txt_length = len(txt) diff --git a/src/calibre/ebooks/pdb/ztxt/writer.py b/src/calibre/ebooks/pdb/ztxt/writer.py index d6bdeefc59..22f7bf002c 100644 --- a/src/calibre/ebooks/pdb/ztxt/writer.py +++ b/src/calibre/ebooks/pdb/ztxt/writer.py @@ -11,8 +11,9 @@ __docformat__ = 'restructuredtext en' import struct, zlib from calibre.ebooks.pdb.formatwriter import FormatWriter -from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines from calibre.ebooks.pdb.header import PdbHeaderBuilder +from calibre.ebooks.txt.txtml import TXTMLizer +from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines MAX_RECORD_SIZE = 8192 @@ -49,9 +50,12 @@ class Writer(FormatWriter): out_stream.write(record) def _generate_text(self, spine): - txt_writer = TxtWriter(TxtNewlines('system').newline, self.log) - txt = txt_writer.dump(spine).encode(self.opts.output_encoding, 'replace') - + writer = TXTMLizer(log) + txt = writer.extract_content(oeb_book, opts) + + log.debug('\tReplacing newlines with selected type...') + txt = specified_newlines(TxtNewlines(opts.newline).newline, txt).encode(self.opts.output_encoding, 'replace') + txt_length = len(txt) txt_records = [] From 3e3e6a234822858c68c2cbea291463904409b2d6 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 12 Jul 2009 20:44:43 -0400 Subject: [PATCH 4/8] Fix missing self. reference. --- src/calibre/ebooks/pdb/palmdoc/writer.py | 2 +- src/calibre/ebooks/pdb/ztxt/writer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/pdb/palmdoc/writer.py b/src/calibre/ebooks/pdb/palmdoc/writer.py index 2a46308db8..8ca83a8270 100644 --- a/src/calibre/ebooks/pdb/palmdoc/writer.py +++ b/src/calibre/ebooks/pdb/palmdoc/writer.py @@ -48,7 +48,7 @@ class Writer(FormatWriter): writer = TXTMLizer(log) txt = writer.extract_content(oeb_book, opts) - log.debug('\tReplacing newlines with selected type...') + self.log.debug('\tReplacing newlines with selected type...') txt = specified_newlines(TxtNewlines(opts.newline).newline, txt).encode(self.opts.output_encoding, 'replace') txt_length = len(txt) diff --git a/src/calibre/ebooks/pdb/ztxt/writer.py b/src/calibre/ebooks/pdb/ztxt/writer.py index 22f7bf002c..19824fce91 100644 --- a/src/calibre/ebooks/pdb/ztxt/writer.py +++ b/src/calibre/ebooks/pdb/ztxt/writer.py @@ -53,7 +53,7 @@ class Writer(FormatWriter): writer = TXTMLizer(log) txt = writer.extract_content(oeb_book, opts) - log.debug('\tReplacing newlines with selected type...') + self.log.debug('\tReplacing newlines with selected type...') txt = specified_newlines(TxtNewlines(opts.newline).newline, txt).encode(self.opts.output_encoding, 'replace') txt_length = len(txt) From 82f3409a598f13d5ee6d39b0543bee3619de2682 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 12 Jul 2009 20:46:30 -0400 Subject: [PATCH 5/8] Fix more typos. --- src/calibre/ebooks/pdb/palmdoc/writer.py | 6 +++--- src/calibre/ebooks/pdb/ztxt/writer.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/pdb/palmdoc/writer.py b/src/calibre/ebooks/pdb/palmdoc/writer.py index 8ca83a8270..f99c698b2d 100644 --- a/src/calibre/ebooks/pdb/palmdoc/writer.py +++ b/src/calibre/ebooks/pdb/palmdoc/writer.py @@ -27,7 +27,7 @@ class Writer(FormatWriter): def write_content(self, oeb_book, out_stream, metadata=None): title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') - txt_records, txt_length = self._generate_text(oeb_book.spine) + txt_records, txt_length = self._generate_text(oeb_book) header_record = self._header_record(txt_length, len(txt_records)) section_lengths = [len(header_record)] @@ -44,8 +44,8 @@ class Writer(FormatWriter): for record in [header_record] + txt_records: out_stream.write(record) - def _generate_text(self, spine): - writer = TXTMLizer(log) + def _generate_text(self, oeb_book): + writer = TXTMLizer(self.log) txt = writer.extract_content(oeb_book, opts) self.log.debug('\tReplacing newlines with selected type...') diff --git a/src/calibre/ebooks/pdb/ztxt/writer.py b/src/calibre/ebooks/pdb/ztxt/writer.py index 19824fce91..48600714ca 100644 --- a/src/calibre/ebooks/pdb/ztxt/writer.py +++ b/src/calibre/ebooks/pdb/ztxt/writer.py @@ -26,7 +26,7 @@ class Writer(FormatWriter): def write_content(self, oeb_book, out_stream, metadata=None): title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') - txt_records, txt_length = self._generate_text(oeb_book.spine) + txt_records, txt_length = self._generate_text(oeb_book) crc32 = 0 section_lengths = [] @@ -49,8 +49,8 @@ class Writer(FormatWriter): for record in [header_record]+txt_records: out_stream.write(record) - def _generate_text(self, spine): - writer = TXTMLizer(log) + def _generate_text(self, oeb_book): + writer = TXTMLizer(self.log) txt = writer.extract_content(oeb_book, opts) self.log.debug('\tReplacing newlines with selected type...') From edeedddeb80bc4323f3e7cbb5ac7fd74434378f2 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 12 Jul 2009 20:52:03 -0400 Subject: [PATCH 6/8] Fix more errors when moving to new txtml output. --- src/calibre/ebooks/pdb/palmdoc/writer.py | 5 +++-- src/calibre/ebooks/pdb/ztxt/writer.py | 4 ++-- src/calibre/ebooks/txt/txtml.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/pdb/palmdoc/writer.py b/src/calibre/ebooks/pdb/palmdoc/writer.py index f99c698b2d..12c1c4aaa7 100644 --- a/src/calibre/ebooks/pdb/palmdoc/writer.py +++ b/src/calibre/ebooks/pdb/palmdoc/writer.py @@ -3,6 +3,7 @@ ''' Writer content to palmdoc pdb file. ''' +import os __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' @@ -46,10 +47,10 @@ class Writer(FormatWriter): def _generate_text(self, oeb_book): writer = TXTMLizer(self.log) - txt = writer.extract_content(oeb_book, opts) + txt = writer.extract_content(oeb_book, self.opts) self.log.debug('\tReplacing newlines with selected type...') - txt = specified_newlines(TxtNewlines(opts.newline).newline, txt).encode(self.opts.output_encoding, 'replace') + txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace') txt_length = len(txt) diff --git a/src/calibre/ebooks/pdb/ztxt/writer.py b/src/calibre/ebooks/pdb/ztxt/writer.py index 48600714ca..566c0def44 100644 --- a/src/calibre/ebooks/pdb/ztxt/writer.py +++ b/src/calibre/ebooks/pdb/ztxt/writer.py @@ -51,10 +51,10 @@ class Writer(FormatWriter): def _generate_text(self, oeb_book): writer = TXTMLizer(self.log) - txt = writer.extract_content(oeb_book, opts) + txt = writer.extract_content(oeb_book, self.opts) self.log.debug('\tReplacing newlines with selected type...') - txt = specified_newlines(TxtNewlines(opts.newline).newline, txt).encode(self.opts.output_encoding, 'replace') + txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace') txt_length = len(txt) diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index d609426d93..94f2a181c5 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -37,7 +37,7 @@ class TXTMLizer(object): self.log = log def extract_content(self, oeb_book, opts): - self.log.info('Converting XHTML to PML markup...') + self.log.info('Converting XHTML to TXT...') self.oeb_book = oeb_book self.opts = opts return self.mlize_spine() From c602400a68dc3ea48ac77574ffbc90537a8dccb4 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 12 Jul 2009 20:55:21 -0400 Subject: [PATCH 7/8] ztxt pdb output encoding fix. --- src/calibre/ebooks/pdb/ztxt/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdb/ztxt/writer.py b/src/calibre/ebooks/pdb/ztxt/writer.py index 566c0def44..ee4c5752c3 100644 --- a/src/calibre/ebooks/pdb/ztxt/writer.py +++ b/src/calibre/ebooks/pdb/ztxt/writer.py @@ -34,7 +34,7 @@ class Writer(FormatWriter): self.log.info('Compressing data...') for i in range(0, len(txt_records)): self.log.debug('\tCompressing record %i' % i) - txt_records[i] = compressor.compress(txt_records[i].encode('cp1252', 'replace')) + txt_records[i] = compressor.compress(txt_records[i]) txt_records[i] = txt_records[i] + compressor.flush(zlib.Z_FULL_FLUSH) section_lengths.append(len(txt_records[i])) crc32 = zlib.crc32(txt_records[i], crc32) & 0xffffffff From f1806c4aa2c431c17cb1f126effbb5390e8441aa Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 13 Jul 2009 12:57:55 -0400 Subject: [PATCH 8/8] Show multiple authors correctly in metadata dialogs. --- src/calibre/gui2/convert/metadata.py | 11 ++++++++--- src/calibre/gui2/dialogs/metadata_single.py | 13 +++++-------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/calibre/gui2/convert/metadata.py b/src/calibre/gui2/convert/metadata.py index 82e7b21148..513535df1b 100644 --- a/src/calibre/gui2/convert/metadata.py +++ b/src/calibre/gui2/convert/metadata.py @@ -39,8 +39,8 @@ class MetadataWidget(Widget, Ui_Form): mi = self.db.get_metadata(self.book_id, index_is_id=True) self.title.setText(mi.title) - if mi.authors: - self.author.setCurrentIndex(self.author.findText(authors_to_string(mi.authors))) +# if mi.authors: +# self.author.setCurrentIndex(self.author.findText(authors_to_string(mi.authors))) if mi.publisher: self.publisher.setCurrentIndex(self.publisher.findText(mi.publisher)) self.author_sort.setText(mi.author_sort if mi.author_sort else '') @@ -75,7 +75,12 @@ class MetadataWidget(Widget, Ui_Form): id, name = i name = authors_to_string([name.strip().replace('|', ',') for n in name.split(',')]) self.author.addItem(name) - self.author.setCurrentIndex(-1) + + au = self.db.authors(self.book_id, True) + if not au: + au = _('Unknown') + au = ' & '.join([a.strip().replace('|', ',') for a in au.split(',')]) + self.author.setEditText(au) def initialize_series(self): all_series = self.db.all_series() diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index 0c2211e5c7..13acd161ae 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -330,19 +330,16 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): def initalize_authors(self): all_authors = self.db.all_authors() all_authors.sort(cmp=lambda x, y : cmp(x[1], y[1])) - author_id = self.db.author_id(self.row) - idx, c = None, 0 for i in all_authors: id, name = i - if id == author_id: - idx = c name = [name.strip().replace('|', ',') for n in name.split(',')] self.authors.addItem(authors_to_string(name)) - c += 1 - self.authors.setEditText('') - if idx is not None: - self.authors.setCurrentIndex(idx) + au = self.db.authors(self.row) + if not au: + au = _('Unknown') + au = ' & '.join([a.strip().replace('|', ',') for a in au.split(',')]) + self.authors.setEditText(au) def initialize_series(self): self.series.setSizeAdjustPolicy(self.series.AdjustToContentsOnFirstShow)