diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 6850c48b16..9edf381f1e 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -22,13 +22,23 @@ class PDBInput(InputFormatPlugin): OptionRecommendation(name='single_line_paras', recommended_value=False, help=_('Normally calibre treats blank lines as paragraph markers. ' 'With this option it will assume that every line represents ' - 'a paragraph instead.')), + 'a paragraph instead. This option is ignored by eReader format.')), OptionRecommendation(name='print_formatted_paras', recommended_value=False, help=_('Normally calibre treats blank lines as paragraph markers. ' 'With this option it will assume that every line starting with ' 'an indent (either a tab or 2+ spaces) represents a paragraph. ' 'Paragraphs end when the next line that starts with an indent ' - 'is reached.')), + 'is reached. This option is ignored by eReader format.')), + OptionRecommendation(name='preserve_spaces', recommended_value=False, + help=_('Normally extra spaces are condensed into a single space. ' + 'With this option all spaces will be displayed. This option ' + 'is ignored by eReader format.')), + OptionRecommendation(name='markdown', recommended_value=False, + help=_('Run the text input through the markdown pre-processor. To ' + 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), + OptionRecommendation(name="markdown_disable_toc", recommended_value=False, + help=_('Do not insert a Table of Contents into the output text. ' + 'This option is ignored by eReader format.')), ]) def convert(self, stream, options, file_ext, log, diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index 52b8d1361f..f1f00ea8e3 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -11,9 +11,9 @@ __docformat__ = 'restructuredtext en' import os import struct +from cStringIO import StringIO + from calibre.ebooks.pdb.formatreader import FormatReader -from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted class HeaderRecord(object): ''' @@ -33,9 +33,7 @@ class Reader(FormatReader): def __init__(self, header, stream, log, options): self.stream = stream self.log = log - self.encoding = options.input_encoding - self.single_line_paras = options.single_line_paras - self.print_formatted_paras = options.print_formatted_paras + self.options = options self.sections = [] for i in range(header.num_sections): @@ -48,34 +46,23 @@ class Reader(FormatReader): def decompress_text(self, number): if self.header_record.compression == 1: - return self.section_data(number).decode('cp1252' if self.encoding is None else self.encoding) + return self.section_data(number) if self.header_record.compression == 2 or self.header_record.compression == 258: from calibre.ebooks.compression.palmdoc import decompress_doc - return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') + return decompress_doc(self.section_data(number)) return '' def extract_content(self, output_dir): - txt = '' + raw_txt = '' self.log.info('Decompressing text...') for i in range(1, self.header_record.num_records + 1): self.log.debug('\tDecompressing text section %i' % i) - txt += self.decompress_text(i) + raw_txt += self.decompress_text(i) self.log.info('Converting text to OEB...') - if self.single_line_paras: - txt = separate_paragraphs_single_line(txt) - if self.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - html = convert_basic(txt) - with open(os.path.join(output_dir, 'index.html'), 'wb') as index: - index.write(html.encode('utf-8')) - - from calibre.ebooks.metadata.meta import get_metadata - mi = get_metadata(self.stream, 'pdb') - manifest = [('index.html', None)] - spine = ['index.html'] - opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) - - return os.path.join(output_dir, 'metadata.opf') - + stream = StringIO(raw_txt) + from calibre.customize.ui import plugin_for_input_format + stream.seek(0) + return plugin_for_input_format('txt').convert(stream, self.options, + 'txt', self.log, {}) diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index 5cac283264..7e51dae1fd 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -8,12 +8,13 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, struct, zlib +import struct +import zlib + +from cStringIO import StringIO from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ztxt import zTXTError -from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted SUPPORTED_VERSION = (1, 40) @@ -38,9 +39,7 @@ class Reader(FormatReader): def __init__(self, header, stream, log, options): self.stream = stream self.log = log - self.encoding = options.input_encoding - self.single_line_paras = options.single_line_paras - self.print_formatted_paras = options.print_formatted_paras + self.options = options self.sections = [] for i in range(header.num_sections): @@ -68,30 +67,19 @@ class Reader(FormatReader): def decompress_text(self, number): if number == 1: self.uncompressor = zlib.decompressobj() - return self.uncompressor.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') + return self.uncompressor.decompress(self.section_data(number)) def extract_content(self, output_dir): - txt = '' + raw_txt = '' self.log.info('Decompressing text...') for i in range(1, self.header_record.num_records + 1): self.log.debug('\tDecompressing text section %i' % i) - txt += self.decompress_text(i) - + raw_txt += self.decompress_text(i) + self.log.info('Converting text to OEB...') - if self.single_line_paras: - txt = separate_paragraphs_single_line(txt) - if self.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - html = convert_basic(txt) - with open(os.path.join(output_dir, 'index.html'), 'wb') as index: - index.write(html.encode('utf-8')) - - from calibre.ebooks.metadata.meta import get_metadata - mi = get_metadata(self.stream, 'pdb') - manifest = [('index.html', None)] - spine = ['index.html'] - opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) - - return os.path.join(output_dir, 'metadata.opf') - + stream = StringIO(raw_txt) + from calibre.customize.ui import plugin_for_input_format + stream.seek(0) + return plugin_for_input_format('txt').convert(stream, self.options, + 'txt', self.log, {}) diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index 67fa6ac66e..47154988a0 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -4,11 +4,9 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os +from cStringIO import StringIO from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation -from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted from calibre.ebooks.compression.tcr import decompress class TCRInput(InputFormatPlugin): @@ -29,26 +27,23 @@ class TCRInput(InputFormatPlugin): 'an indent (either a tab or 2+ spaces) represents a paragraph. ' 'Paragraphs end when the next line that starts with an indent ' 'is reached.')), + OptionRecommendation(name='preserve_spaces', recommended_value=False, + help=_('Normally extra spaces are condensed into a single space. ' + 'With this option all spaces will be displayed.')), + OptionRecommendation(name='markdown', recommended_value=False, + help=_('Run the text input through the markdown pre-processor. To ' + 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), + OptionRecommendation(name="markdown_disable_toc", recommended_value=False, + help=_('Do not insert a Table of Contents into the output text.')), ]) def convert(self, stream, options, file_ext, log, accelerators): log.info('Decompressing text...') - ienc = options.input_encoding if options.input_encoding else 'utf-8' - txt = decompress(stream).decode(ienc, 'replace') + raw_txt = decompress(stream) log.info('Converting text to OEB...') - if options.single_line_paras: - txt = separate_paragraphs_single_line(txt) - if options.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - html = convert_basic(txt) - with open(os.path.join(os.getcwd(), 'index.html'), 'wb') as index: - index.write(html.encode('utf-8')) - - from calibre.ebooks.metadata.meta import get_metadata - mi = get_metadata(stream, 'tcr') - manifest = [('index.html', None)] - spine = ['index.html'] - opf_writer(os.getcwd(), 'metadata.opf', manifest, spine, mi) - - return os.path.join(os.getcwd(), 'metadata.opf') + stream = StringIO(raw_txt) + from calibre.customize.ui import plugin_for_input_format + stream.seek(0) + return plugin_for_input_format('txt').convert(stream, options, + 'txt', log, accelerators) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 44b98304ea..1a732535b3 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces @@ -42,11 +43,19 @@ class TXTInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): - ienc = stream.encoding if stream.encoding else 'utf-8' + log.debug('Reading text from file...') + + txt = stream.read() if options.input_encoding: ienc = options.input_encoding - log.debug('Reading text from file...') - txt = stream.read().decode(ienc, 'replace') + log.debug('Using user specified input encoding of %s' % ienc) + else: + ienc = detect(txt)['encoding'] + log.debug('Detected input encoding as %s' % ienc) + if not ienc: + ienc = 'utf-8' + log.debug('No input encoding specified and could not auto detect using %s' % ienc) + txt = txt.decode(ienc, 'replace') # Adjust paragraph formatting as requested if options.single_line_paras: @@ -85,11 +94,10 @@ class TXTInput(InputFormatPlugin): htmlfile = open(fname, 'wb') with htmlfile: htmlfile.write(html.encode('utf-8')) - cwd = os.getcwdu() odi = options.debug_pipeline options.debug_pipeline = None - oeb = html_input(open(htmlfile.name, 'rb'), options, 'html', log, - {}, cwd) + oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log, + {}) options.debug_pipeline = odi os.remove(htmlfile.name) return oeb