From 38a82b049dade612732287cd15e9716b56b5f995 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 1 Jan 2011 21:39:57 -0500 Subject: [PATCH 01/10] GUI: Editable combo box with most common character encodings instead of fully free form text entry. This still allows users to specify encodings that are not part of the common list. --- src/calibre/gui2/convert/__init__.py | 4 ++- src/calibre/gui2/convert/look_and_feel.ui | 22 +++++++++---- src/calibre/gui2/convert/pdb_output.ui | 13 +++++++- src/calibre/gui2/convert/pmlz_output.ui | 39 ++++++++++++++++------- src/calibre/gui2/convert/txt_output.ui | 13 +++++++- src/calibre/gui2/widgets.py | 26 +++++++++++++++ 6 files changed, 95 insertions(+), 22 deletions(-) diff --git a/src/calibre/gui2/convert/__init__.py b/src/calibre/gui2/convert/__init__.py index c1efe5b9af..6b977afc19 100644 --- a/src/calibre/gui2/convert/__init__.py +++ b/src/calibre/gui2/convert/__init__.py @@ -191,7 +191,9 @@ class Widget(QWidget): if not val: val = '' getattr(g, 'setPlainText', g.setText)(val) getattr(g, 'setCursorPosition', lambda x: x)(0) - elif isinstance(g, QComboBox) and val: + elif isinstance(g, QComboBox): + if not val: + val = '' idx = g.findText(val, Qt.MatchFixedString) if idx < 0: g.addItem(val) diff --git a/src/calibre/gui2/convert/look_and_feel.ui b/src/calibre/gui2/convert/look_and_feel.ui index 367233e2c0..cd0426ac53 100644 --- a/src/calibre/gui2/convert/look_and_feel.ui +++ b/src/calibre/gui2/convert/look_and_feel.ui @@ -84,7 +84,7 @@ ... - + :/images/wizard.png:/images/wizard.png @@ -122,14 +122,8 @@ Input character &encoding: - - opt_input_encoding - - - - @@ -244,8 +238,22 @@ + + + + true + + + + + + EncodingComboBox + QComboBox +
widgets.h
+
+
diff --git a/src/calibre/gui2/convert/pdb_output.ui b/src/calibre/gui2/convert/pdb_output.ui index 17bdc0a984..a571a0035b 100644 --- a/src/calibre/gui2/convert/pdb_output.ui +++ b/src/calibre/gui2/convert/pdb_output.ui @@ -55,10 +55,21 @@ - + + + true + + + + + EncodingComboBox + QComboBox +
widgets.h
+
+
diff --git a/src/calibre/gui2/convert/pmlz_output.ui b/src/calibre/gui2/convert/pmlz_output.ui index 9754752c8a..bd70cf1039 100644 --- a/src/calibre/gui2/convert/pmlz_output.ui +++ b/src/calibre/gui2/convert/pmlz_output.ui @@ -14,7 +14,7 @@ Form
- + Qt::Vertical @@ -27,32 +27,47 @@ - + &Inline TOC - + Do not reduce image size and depth - - - - Output Encoding: - - - - - + + + + + + Output Encoding: + + + + + + + true + + + + + + + EncodingComboBox + QComboBox +
widgets.h
+
+
diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui index 6290a096c8..3a2516b98e 100644 --- a/src/calibre/gui2/convert/txt_output.ui +++ b/src/calibre/gui2/convert/txt_output.ui @@ -96,10 +96,21 @@ - + + + true + + + + + EncodingComboBox + QComboBox +
widgets.h
+
+
diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py index bc3c23876f..cab2e2d4df 100644 --- a/src/calibre/gui2/widgets.py +++ b/src/calibre/gui2/widgets.py @@ -616,6 +616,32 @@ class ComboBoxWithHelp(QComboBox): QComboBox.hidePopup(self) self.set_state() + +class EncodingComboBox(QComboBox): + ''' + A combobox that holds text encodings support + by Python. This is only populated with the most + common and standard encodings. There is no good + way to programatically list all supported encodings + using encodings.aliases.aliases.keys(). It + will not work. + ''' + + ENCODINGS = ['', 'ascii', 'big5', 'cp1250', 'cp1251', 'cp1252', 'cp1253', + 'cp1254', 'cp1255', 'cp1256', 'euc_jp', 'euc_kr', 'gb2312', 'gb18030', + 'hz', 'iso2022_jp', 'iso2022_kr', 'iso8859_5', 'latin_1', 'shift_jis', + 'utf_8', + ] + + def __init__(self, parent=None): + QComboBox.__init__(self, parent) + self.setEditable(True) + self.setLineEdit(EnLineEdit(self)) + + for item in self.ENCODINGS: + self.addItem(item) + + class PythonHighlighter(QSyntaxHighlighter): Rules = [] From 47aeaf10b67498bb8c8c4399abe0ab60f2d0401b Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 1 Jan 2011 23:03:58 -0500 Subject: [PATCH 02/10] TXT Input: Attempt to detect the input encoding when not specified. TCR, PDB Input: Use TXT Input converion plugin for conversion, adds encoding detection and allows for all of TXT Input options to be used (eReader PDB ignores options that do not apply to it). 
--- src/calibre/ebooks/pdb/input.py | 14 +++++++-- src/calibre/ebooks/pdb/palmdoc/reader.py | 37 +++++++--------------- src/calibre/ebooks/pdb/ztxt/reader.py | 40 +++++++++--------------- src/calibre/ebooks/tcr/input.py | 35 +++++++++------------ src/calibre/ebooks/txt/input.py | 20 ++++++++---- 5 files changed, 67 insertions(+), 79 deletions(-) diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 6850c48b16..9edf381f1e 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -22,13 +22,23 @@ class PDBInput(InputFormatPlugin): OptionRecommendation(name='single_line_paras', recommended_value=False, help=_('Normally calibre treats blank lines as paragraph markers. ' 'With this option it will assume that every line represents ' - 'a paragraph instead.')), + 'a paragraph instead. This option is ignored by eReader format.')), OptionRecommendation(name='print_formatted_paras', recommended_value=False, help=_('Normally calibre treats blank lines as paragraph markers. ' 'With this option it will assume that every line starting with ' 'an indent (either a tab or 2+ spaces) represents a paragraph. ' 'Paragraphs end when the next line that starts with an indent ' - 'is reached.')), + 'is reached. This option is ignored by eReader format.')), + OptionRecommendation(name='preserve_spaces', recommended_value=False, + help=_('Normally extra spaces are condensed into a single space. ' + 'With this option all spaces will be displayed. This option ' + 'is ignored by eReader format.')), + OptionRecommendation(name='markdown', recommended_value=False, + help=_('Run the text input through the markdown pre-processor. To ' + 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), + OptionRecommendation(name="markdown_disable_toc", recommended_value=False, + help=_('Do not insert a Table of Contents into the output text. 
' + 'This option is ignored by eReader format.')), ]) def convert(self, stream, options, file_ext, log, diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index 52b8d1361f..f1f00ea8e3 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -11,9 +11,9 @@ __docformat__ = 'restructuredtext en' import os import struct +from cStringIO import StringIO + from calibre.ebooks.pdb.formatreader import FormatReader -from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted class HeaderRecord(object): ''' @@ -33,9 +33,7 @@ class Reader(FormatReader): def __init__(self, header, stream, log, options): self.stream = stream self.log = log - self.encoding = options.input_encoding - self.single_line_paras = options.single_line_paras - self.print_formatted_paras = options.print_formatted_paras + self.options = options self.sections = [] for i in range(header.num_sections): @@ -48,34 +46,23 @@ class Reader(FormatReader): def decompress_text(self, number): if self.header_record.compression == 1: - return self.section_data(number).decode('cp1252' if self.encoding is None else self.encoding) + return self.section_data(number) if self.header_record.compression == 2 or self.header_record.compression == 258: from calibre.ebooks.compression.palmdoc import decompress_doc - return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') + return decompress_doc(self.section_data(number)) return '' def extract_content(self, output_dir): - txt = '' + raw_txt = '' self.log.info('Decompressing text...') for i in range(1, self.header_record.num_records + 1): self.log.debug('\tDecompressing text section %i' % i) - txt += self.decompress_text(i) + raw_txt += self.decompress_text(i) self.log.info('Converting text to OEB...') - if self.single_line_paras: - txt = 
separate_paragraphs_single_line(txt) - if self.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - html = convert_basic(txt) - with open(os.path.join(output_dir, 'index.html'), 'wb') as index: - index.write(html.encode('utf-8')) - - from calibre.ebooks.metadata.meta import get_metadata - mi = get_metadata(self.stream, 'pdb') - manifest = [('index.html', None)] - spine = ['index.html'] - opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) - - return os.path.join(output_dir, 'metadata.opf') - + stream = StringIO(raw_txt) + from calibre.customize.ui import plugin_for_input_format + stream.seek(0) + return plugin_for_input_format('txt').convert(stream, self.options, + 'txt', self.log, {}) diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index 5cac283264..7e51dae1fd 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -8,12 +8,13 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, struct, zlib +import struct +import zlib + +from cStringIO import StringIO from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ztxt import zTXTError -from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted SUPPORTED_VERSION = (1, 40) @@ -38,9 +39,7 @@ class Reader(FormatReader): def __init__(self, header, stream, log, options): self.stream = stream self.log = log - self.encoding = options.input_encoding - self.single_line_paras = options.single_line_paras - self.print_formatted_paras = options.print_formatted_paras + self.options = options self.sections = [] for i in range(header.num_sections): @@ -68,30 +67,19 @@ class Reader(FormatReader): def decompress_text(self, number): if number == 1: self.uncompressor = zlib.decompressobj() - return 
self.uncompressor.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') + return self.uncompressor.decompress(self.section_data(number)) def extract_content(self, output_dir): - txt = '' + raw_txt = '' self.log.info('Decompressing text...') for i in range(1, self.header_record.num_records + 1): self.log.debug('\tDecompressing text section %i' % i) - txt += self.decompress_text(i) - + raw_txt += self.decompress_text(i) + self.log.info('Converting text to OEB...') - if self.single_line_paras: - txt = separate_paragraphs_single_line(txt) - if self.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - html = convert_basic(txt) - with open(os.path.join(output_dir, 'index.html'), 'wb') as index: - index.write(html.encode('utf-8')) - - from calibre.ebooks.metadata.meta import get_metadata - mi = get_metadata(self.stream, 'pdb') - manifest = [('index.html', None)] - spine = ['index.html'] - opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) - - return os.path.join(output_dir, 'metadata.opf') - + stream = StringIO(raw_txt) + from calibre.customize.ui import plugin_for_input_format + stream.seek(0) + return plugin_for_input_format('txt').convert(stream, self.options, + 'txt', self.log, {}) diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index 67fa6ac66e..47154988a0 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -4,11 +4,9 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os +from cStringIO import StringIO from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation -from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted from calibre.ebooks.compression.tcr import decompress class TCRInput(InputFormatPlugin): @@ -29,26 +27,23 @@ class 
TCRInput(InputFormatPlugin): 'an indent (either a tab or 2+ spaces) represents a paragraph. ' 'Paragraphs end when the next line that starts with an indent ' 'is reached.')), + OptionRecommendation(name='preserve_spaces', recommended_value=False, + help=_('Normally extra spaces are condensed into a single space. ' + 'With this option all spaces will be displayed.')), + OptionRecommendation(name='markdown', recommended_value=False, + help=_('Run the text input through the markdown pre-processor. To ' + 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), + OptionRecommendation(name="markdown_disable_toc", recommended_value=False, + help=_('Do not insert a Table of Contents into the output text.')), ]) def convert(self, stream, options, file_ext, log, accelerators): log.info('Decompressing text...') - ienc = options.input_encoding if options.input_encoding else 'utf-8' - txt = decompress(stream).decode(ienc, 'replace') + raw_txt = decompress(stream) log.info('Converting text to OEB...') - if options.single_line_paras: - txt = separate_paragraphs_single_line(txt) - if options.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - html = convert_basic(txt) - with open(os.path.join(os.getcwd(), 'index.html'), 'wb') as index: - index.write(html.encode('utf-8')) - - from calibre.ebooks.metadata.meta import get_metadata - mi = get_metadata(stream, 'tcr') - manifest = [('index.html', None)] - spine = ['index.html'] - opf_writer(os.getcwd(), 'metadata.opf', manifest, spine, mi) - - return os.path.join(os.getcwd(), 'metadata.opf') + stream = StringIO(raw_txt) + from calibre.customize.ui import plugin_for_input_format + stream.seek(0) + return plugin_for_input_format('txt').convert(stream, options, + 'txt', log, accelerators) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 44b98304ea..1a732535b3 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -7,6 +7,7 @@ 
__docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces @@ -42,11 +43,19 @@ class TXTInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): - ienc = stream.encoding if stream.encoding else 'utf-8' + log.debug('Reading text from file...') + + txt = stream.read() if options.input_encoding: ienc = options.input_encoding - log.debug('Reading text from file...') - txt = stream.read().decode(ienc, 'replace') + log.debug('Using user specified input encoding of %s' % ienc) + else: + ienc = detect(txt)['encoding'] + log.debug('Detected input encoding as %s' % ienc) + if not ienc: + ienc = 'utf-8' + log.debug('No input encoding specified and could not auto detect using %s' % ienc) + txt = txt.decode(ienc, 'replace') # Adjust paragraph formatting as requested if options.single_line_paras: @@ -85,11 +94,10 @@ class TXTInput(InputFormatPlugin): htmlfile = open(fname, 'wb') with htmlfile: htmlfile.write(html.encode('utf-8')) - cwd = os.getcwdu() odi = options.debug_pipeline options.debug_pipeline = None - oeb = html_input(open(htmlfile.name, 'rb'), options, 'html', log, - {}, cwd) + oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log, + {}) options.debug_pipeline = odi os.remove(htmlfile.name) return oeb From 089d3679420b087c09dce06b3ea80ac1faf194c0 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 2 Jan 2011 09:59:41 -0500 Subject: [PATCH 03/10] PDF Output: Change call to get_printer to correct get_pdf_printer. 
--- src/calibre/ebooks/pdf/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 4ff10290c9..8938dd66c1 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -175,7 +175,7 @@ class PDFWriter(QObject): # {{{ if self.cover_data is None: return item_path = os.path.join(self.tmp_path, 'cover.pdf') - printer = self.get_printer() + printer = self.get_pdf_printer() printer.setOutputFileName(item_path) self.combine_queue.insert(0, item_path) p = QPixmap() From d9195c0632ac823e0e581e417596d1d2039aef9d Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 2 Jan 2011 17:32:16 -0500 Subject: [PATCH 04/10] TXT Input: Add confidence of detected encoding to debug log. --- src/calibre/ebooks/txt/input.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 1a732535b3..5e406216d6 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -50,8 +50,9 @@ class TXTInput(InputFormatPlugin): ienc = options.input_encoding log.debug('Using user specified input encoding of %s' % ienc) else: - ienc = detect(txt)['encoding'] - log.debug('Detected input encoding as %s' % ienc) + det_encoding = detect(txt) + ienc = det_encoding['encoding'] + log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, det_encoding['confidence'] * 100)) if not ienc: ienc = 'utf-8' log.debug('No input encoding specified and could not auto detect using %s' % ienc) From 9ec91639197e2e1dec38525984787b317c0296c9 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 2 Jan 2011 19:05:35 -0500 Subject: [PATCH 05/10] TXT Input: Auto detect paragraph structure. 
--- src/calibre/ebooks/pdb/input.py | 30 ++++++++--------- src/calibre/ebooks/tcr/input.py | 24 +++++++------- src/calibre/ebooks/txt/input.py | 51 ++++++++++++++++++----------- src/calibre/ebooks/txt/processor.py | 50 +++++++++++++++++++++++++++- 4 files changed, 104 insertions(+), 51 deletions(-) diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 9edf381f1e..b8b4b93ca1 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -19,26 +19,22 @@ class PDBInput(InputFormatPlugin): file_types = set(['pdb']) options = set([ - OptionRecommendation(name='single_line_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line represents ' - 'a paragraph instead. This option is ignored by eReader format.')), - OptionRecommendation(name='print_formatted_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line starting with ' - 'an indent (either a tab or 2+ spaces) represents a paragraph. ' - 'Paragraphs end when the next line that starts with an indent ' - 'is reached. This option is ignored by eReader format.')), + OptionRecommendation(name='paragraph_format', recommended_value='auto', + choices=['auto', 'block', 'single', 'print', 'markdown'], + help=_('How calibre splits text into paragraphs.\n' + 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' + '* auto: Try to auto detect paragraph format.\n' + '* block: Treat a blank line as a paragraph break.\n' + '* single: Assume every line is a paragraph.\n' + '* print: Assume every line starting with 2+ spaces or a tab ' + 'starts a paragraph.\n' + '* markdown: Run the input though the markdown pre-processor. 
' + 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, help=_('Normally extra spaces are condensed into a single space. ' - 'With this option all spaces will be displayed. This option ' - 'is ignored by eReader format.')), - OptionRecommendation(name='markdown', recommended_value=False, - help=_('Run the text input through the markdown pre-processor. To ' - 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), + 'With this option all spaces will be displayed.')), OptionRecommendation(name="markdown_disable_toc", recommended_value=False, - help=_('Do not insert a Table of Contents into the output text. ' - 'This option is ignored by eReader format.')), + help=_('Do not insert a Table of Contents into the output text.')), ]) def convert(self, stream, options, file_ext, log, diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index 47154988a0..47fe7e7337 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -17,22 +17,20 @@ class TCRInput(InputFormatPlugin): file_types = set(['tcr']) options = set([ - OptionRecommendation(name='single_line_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line represents ' - 'a paragraph instead.')), - OptionRecommendation(name='print_formatted_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line starting with ' - 'an indent (either a tab or 2+ spaces) represents a paragraph. 
' - 'Paragraphs end when the next line that starts with an indent ' - 'is reached.')), + OptionRecommendation(name='paragraph_format', recommended_value='auto', + choices=['auto', 'block', 'single', 'print', 'markdown'], + help=_('How calibre splits text into paragraphs.\n' + 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' + '* auto: Try to auto detect paragraph format.\n' + '* block: Treat a blank line as a paragraph break.\n' + '* single: Assume every line is a paragraph.\n' + '* print: Assume every line starting with 2+ spaces or a tab ' + 'starts a paragraph.\n' + '* markdown: Run the input though the markdown pre-processor. ' + 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, help=_('Normally extra spaces are condensed into a single space. ' 'With this option all spaces will be displayed.')), - OptionRecommendation(name='markdown', recommended_value=False, - help=_('Run the text input through the markdown pre-processor. 
To ' - 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name="markdown_disable_toc", recommended_value=False, help=_('Do not insert a Table of Contents into the output text.')), ]) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 5e406216d6..e68c47e9b3 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ - preserve_spaces + preserve_spaces, detect_paragraph_formatting from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -21,22 +21,20 @@ class TXTInput(InputFormatPlugin): file_types = set(['txt']) options = set([ - OptionRecommendation(name='single_line_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line represents ' - 'a paragraph instead.')), - OptionRecommendation(name='print_formatted_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line starting with ' - 'an indent (either a tab or 2+ spaces) represents a paragraph. 
' - 'Paragraphs end when the next line that starts with an indent ' - 'is reached.')), + OptionRecommendation(name='paragraph_format', recommended_value='auto', + choices=['auto', 'block', 'single', 'print', 'markdown'], + help=_('How calibre splits text into paragraphs.\n' + 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' + '* auto: Try to auto detect paragraph format.\n' + '* block: Treat a blank line as a paragraph break.\n' + '* single: Assume every line is a paragraph.\n' + '* print: Assume every line starting with 2+ spaces or a tab ' + 'starts a paragraph.\n' + '* markdown: Run the input though the markdown pre-processor. ' + 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, help=_('Normally extra spaces are condensed into a single space. ' 'With this option all spaces will be displayed.')), - OptionRecommendation(name='markdown', recommended_value=False, - help=_('Run the text input through the markdown pre-processor. To ' - 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name="markdown_disable_toc", recommended_value=False, help=_('Do not insert a Table of Contents into the output text.')), ]) @@ -46,6 +44,7 @@ class TXTInput(InputFormatPlugin): log.debug('Reading text from file...') txt = stream.read() + # Get the encoding of the document. if options.input_encoding: ienc = options.input_encoding log.debug('Using user specified input encoding of %s' % ienc) @@ -58,17 +57,29 @@ class TXTInput(InputFormatPlugin): log.debug('No input encoding specified and could not auto detect using %s' % ienc) txt = txt.decode(ienc, 'replace') - # Adjust paragraph formatting as requested - if options.single_line_paras: + # Determine the formatting of the document. 
+ if options.paragraph_format == 'auto': + options.paragraph_format = detect_paragraph_formatting(txt) + if options.paragraph_format == 'unknown': + log.debug('Could not reliably determine paragraph format using block format') + options.paragraph_format = 'block' + else: + log.debug('Auto detected paragraph format as %s' % options.paragraph_format) + + # We don't check for block because the processor assumes block. + # single and print at transformed to block for processing. + if options.paragraph_format == 'single': txt = separate_paragraphs_single_line(txt) - if options.print_formatted_paras: + elif options.paragraph_format == 'print': txt = separate_paragraphs_print_formatted(txt) + + txt = _ent_pat.sub(xml_entity_to_unicode, txt) + # Preserve spaces will replace multiple spaces to a space + # followed by the   entity. if options.preserve_spaces: txt = preserve_spaces(txt) - txt = _ent_pat.sub(xml_entity_to_unicode, txt) - - if options.markdown: + if options.paragraph_format == 'markdown': log.debug('Running text though markdown conversion...') try: html = convert_markdown(txt, disable_toc=options.markdown_disable_toc) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index dac1e34df7..e1014b0c7b 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -49,7 +49,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0): if isbytestring(txt): txt = txt.decode('utf-8') - lines = [] # Split into paragraphs based on having a blank line between text. for line in txt.split('\n\n'): @@ -94,3 +93,52 @@ def split_string_separator(txt, size) : xrange(0, len(txt), size)]) return txt +def detect_paragraph_formatting(txt): + ''' + Tries to determine the formatting of the document. + + block: Paragraphs are separated by a blank line. + single: Each line is a paragraph. + print: Each paragraph starts with a 2+ spaces or a tab + and ends when a new paragraph is reached. 
+ markdown: Markdown formatting is in the document. + + returns block, single, print, markdown + ''' + txt = txt.replace('\r\n', '\n') + txt = txt.replace('\r', '\n') + txt_line_count = len(re.findall('(?mu)^\s*.+$', txt)) + + # Check for markdown + # Headings + if len(re.findall('(?mu)^#+', txt)) >= 5: + return 'markdown' + if len(re.findall('(?mu)^=+$', txt)) >= 5: + return 'markdown' + if len(re.findall('(?mu)^-+$', txt)) >= 5: + return 'markdown' + # Images + if len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) >= 5: + return 'markdown' + # Links + if len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt)) >= 5:
+        return 'markdown'
+    # Escaped characters: more than 10 backslash-escapes of any single markdown special character.
+    md_escaped_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']
+    for c in md_escaped_characters:
+        if txt.count('\\'+c) > 10:
+            return 'markdown'
+
+    # Check for print: at least 25% of the text lines start with a tab or 2+ whitespace characters.
+    tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
+    if txt_line_count and tab_line_count / float(txt_line_count) >= .25:  # guard: txt_line_count is 0 when txt is empty or only newlines
+        return 'print'
+
+    # Check for block: blank lines number at least 25% of the text lines.
+    empty_line_count = len(re.findall('(?mu)^\s*$', txt))
+    if txt_line_count and empty_line_count / float(txt_line_count) >= .25:  # same zero-division guard as above
+        return 'block'
+
+    # Nothing else matched (or there is no text at all), so assume single.
+    return 'single'
+

From 521e41973aa09d00bf3a495507b03a21e4257165 Mon Sep 17 00:00:00 2001
From: John Schember 
Date: Sun, 2 Jan 2011 19:18:52 -0500
Subject: [PATCH 06/10] GUI: TXT, TCR, PDB Inputs gui conversion options
 updated.

---
 src/calibre/gui2/convert/pdb_input.py | 10 +++--
 src/calibre/gui2/convert/pdb_input.ui | 48 ---------------------
 src/calibre/gui2/convert/tcr_input.py | 23 ++++++++++
 src/calibre/gui2/convert/txt_input.py |  5 ++-
 src/calibre/gui2/convert/txt_input.ui | 60 +++++++--------------------
 5 files changed, 48 insertions(+), 98 deletions(-)
 delete mode 100644 src/calibre/gui2/convert/pdb_input.ui
 create mode 100644 src/calibre/gui2/convert/tcr_input.py

diff --git a/src/calibre/gui2/convert/pdb_input.py b/src/calibre/gui2/convert/pdb_input.py
index 4510cf81ba..655f4025a7 100644
--- a/src/calibre/gui2/convert/pdb_input.py
+++ b/src/calibre/gui2/convert/pdb_input.py
@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 
 __license__ = 'GPL 3'
-__copyright__ = '2009, John Schember '
+__copyright__ = '2011, John Schember '
 __docformat__ = 'restructuredtext en'
 
-from calibre.gui2.convert.pdb_input_ui import Ui_Form
+from calibre.gui2.convert.txt_input_ui import Ui_Form
 from calibre.gui2.convert import Widget
 
 class PluginWidget(Widget, Ui_Form):
@@ -12,10 +12,12 @@ class PluginWidget(Widget, Ui_Form):
     TITLE = _('PDB Input')
     HELP = _('Options specific to')+' PDB '+_('input')
     COMMIT_NAME = 'pdb_input'
-    ICON = I('mimetypes/unknown.png')
+    ICON = I('mimetypes/txt.png')
 
     def __init__(self, parent, get_option, get_help, db=None, book_id=None):
         Widget.__init__(self, parent,
-            ['single_line_paras', 'print_formatted_paras'])
+            ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces'])
         self.db, self.book_id = db, book_id
+        for x in get_option('paragraph_format').option.choices:
+            self.opt_paragraph_format.addItem(x)
         self.initialize_options(get_option, get_help, db, book_id)
diff --git a/src/calibre/gui2/convert/pdb_input.ui b/src/calibre/gui2/convert/pdb_input.ui
deleted file mode 100644
index 2b632b1a33..0000000000
--- a/src/calibre/gui2/convert/pdb_input.ui
+++ /dev/null
@@ -1,48 +0,0 @@
-
-
- Form
- 
-  
-   
-    0
-    0
-    400
-    300
-   
-  
-  
-   Form
-  
-  
-   
-    
-     
-      Qt::Vertical
-     
-     
-      
-       20
-       213
-      
-     
-    
-   
-   
-    
-     
-      Treat each &line as a paragraph
-     
-    
-   
-   
-    
-     
-      Assume print formatting
-     
-    
-   
-  
- 
- 
- 
-
diff --git a/src/calibre/gui2/convert/tcr_input.py b/src/calibre/gui2/convert/tcr_input.py
new file mode 100644
index 0000000000..2aa877ce4d
--- /dev/null
+++ b/src/calibre/gui2/convert/tcr_input.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember '
+__docformat__ = 'restructuredtext en'
+
+from calibre.gui2.convert.txt_input_ui import Ui_Form
+from calibre.gui2.convert import Widget
+
+class PluginWidget(Widget, Ui_Form):
+
+    TITLE = _('TCR Input')
+    HELP = _('Options specific to')+' TCR '+_('input')
+    COMMIT_NAME = 'tcr_input'
+    ICON = I('mimetypes/txt.png')
+
+    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
+        Widget.__init__(self, parent,
+            ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces'])
+        self.db, self.book_id = db, book_id
+        for x in get_option('paragraph_format').option.choices:
+            self.opt_paragraph_format.addItem(x)
+        self.initialize_options(get_option, get_help, db, book_id)
diff --git a/src/calibre/gui2/convert/txt_input.py b/src/calibre/gui2/convert/txt_input.py
index 31019251e2..99d04fe2f4 100644
--- a/src/calibre/gui2/convert/txt_input.py
+++ b/src/calibre/gui2/convert/txt_input.py
@@ -16,7 +16,8 @@ class PluginWidget(Widget, Ui_Form):
 
     def __init__(self, parent, get_option, get_help, db=None, book_id=None):
         Widget.__init__(self, parent,
-            ['single_line_paras', 'print_formatted_paras', 'markdown',
-                'markdown_disable_toc', 'preserve_spaces'])
+            ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces'])
         self.db, self.book_id = db, book_id
+        for x in get_option('paragraph_format').option.choices:
+            self.opt_paragraph_format.addItem(x)
         self.initialize_options(get_option, get_help, db, book_id)
diff --git a/src/calibre/gui2/convert/txt_input.ui b/src/calibre/gui2/convert/txt_input.ui
index 186783c277..b45297fdf2 100644
--- a/src/calibre/gui2/convert/txt_input.ui
+++ b/src/calibre/gui2/convert/txt_input.ui
@@ -6,7 +6,7 @@
    
     0
     0
-    470
+    488
     300
    
   
@@ -15,27 +15,16 @@
   
   
    
-    
+    
      
-      Treat each &line as a paragraph
+      Document structure detection
      
     
    
-   
-    
-     
-      Assume print formatting
-     
-    
+   
+    
    
-   
-    
-     
-      Process using markdown
-     
-    
-   
-   
+   
     
      
       <p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>.
@@ -48,14 +37,21 @@
      
     
    
-   
+   
     
      
       Do not insert Table of Contents into output text when using markdown
      
     
    
-   
+   
+    
+     
+      Preserve &spaces
+     
+    
+   
+   
     
      
       Qt::Vertical
@@ -68,32 +64,8 @@
      
     
    
-   
-    
-     
-      Preserve &spaces
-     
-    
-   
   
  
  
- 
-  
-   opt_markdown
-   toggled(bool)
-   opt_markdown_disable_toc
-   setEnabled(bool)
-   
-    
-     76
-     80
-    
-    
-     418
-     105
-    
-   
-  
- 
+ 
 

From 2427c5bdd01d9c94abd3e887dd9d1cfcc3e2f5fc Mon Sep 17 00:00:00 2001
From: John Schember 
Date: Mon, 3 Jan 2011 20:53:41 -0500
Subject: [PATCH 07/10] FB2 Output: Fix bug #8172, Include cover page in output
 when it is not referenced in the oeb spine.

---
 src/calibre/ebooks/fb2/fb2ml.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index f9ce9befb4..8d23a5f0b2 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -161,6 +161,17 @@ class FB2MLizer(object):
             text.append('
') self.section_level += 1 + # Insert the title page / cover into the spine if it is not already referenced. + title_name = u'' + if 'titlepage' in self.oeb_book.guide: + title_name = 'titlepage' + elif 'cover' in self.oeb_book.guide: + title_name = 'cover' + if title_name: + title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href] + if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml': + self.oeb_book.spine.insert(0, title_item, True) + for item in self.oeb_book.spine: self.log.debug('Converting %s to FictionBook2 XML' % item.href) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) From d23ce51b98629014b0d4ba899b89d74d9ba51812 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 5 Jan 2011 18:30:50 -0500 Subject: [PATCH 08/10] FB2 Output: Insert image based covers into document. --- src/calibre/ebooks/fb2/fb2ml.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 8d23a5f0b2..f6deab677a 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -16,6 +16,7 @@ import uuid from lxml import etree +from calibre import guess_type from calibre import prepare_string_for_xml from calibre.constants import __appname__, __version__ from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace @@ -171,6 +172,12 @@ class FB2MLizer(object): title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href] if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml': self.oeb_book.spine.insert(0, title_item, True) + # Create xhtml page to reference cover image so it can be used. 
+ if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids: + id = unicode(self.oeb_book.metadata.cover[0]) + cover_item = self.oeb_book.manifest.ids[id] + if cover_item.media_type in OEB_RASTER_IMAGES: + self.insert_image_cover(cover_item.href) for item in self.oeb_book.spine: self.log.debug('Converting %s to FictionBook2 XML' % item.href) @@ -196,6 +203,17 @@ class FB2MLizer(object): return ''.join(text) + '' + def insert_image_cover(self, image_href): + from calibre.ebooks.oeb.base import RECOVER_PARSER + try: + root = etree.fromstring(u'' % (XHTML_NS, image_href), parser=RECOVER_PARSER) + except: + root = etree.fromstring(u'', parser=RECOVER_PARSER) + + id, href = self.oeb_book.manifest.generate('fb2_cover', 'fb2_cover.xhtml') + item = self.oeb_book.manifest.add(id, href, guess_type(href)[0], data=root) + self.oeb_book.spine.insert(0, item, True) + def fb2mlize_images(self): ''' This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function. From 3bb40c9911b8cae50cedaa4490d6c5d731f8ddc3 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 5 Jan 2011 18:39:55 -0500 Subject: [PATCH 09/10] TCR, PDB (PalmDoc, zTXT) Input: Call TXT plugin while setting default values for options that are not set by calling plugin. 
--- src/calibre/ebooks/pdb/palmdoc/reader.py | 10 ++++++++-- src/calibre/ebooks/pdb/ztxt/reader.py | 10 ++++++++-- src/calibre/ebooks/tcr/input.py | 9 ++++++++- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index f1f00ea8e3..dd9706f00c 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -62,7 +62,13 @@ class Reader(FormatReader): self.log.info('Converting text to OEB...') stream = StringIO(raw_txt) + from calibre.customize.ui import plugin_for_input_format + + txt_plugin = plugin_for_input_format('txt') + for option in txt_plugin.options: + if not hasattr(self.options, option.option.name): + setattr(self.options, option.name, option.recommend_val) + stream.seek(0) - return plugin_for_input_format('txt').convert(stream, self.options, - 'txt', self.log, {}) + return txt_plugin.convert(stream, self.options, 'txt', self.log, {}) diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index 7e51dae1fd..8d51c07e97 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -79,7 +79,13 @@ class Reader(FormatReader): self.log.info('Converting text to OEB...') stream = StringIO(raw_txt) + from calibre.customize.ui import plugin_for_input_format + + txt_plugin = plugin_for_input_format('txt') + for option in txt_plugin.options: + if not hasattr(self.options, option.option.name): + setattr(self.options, option.name, option.recommend_val) + stream.seek(0) - return plugin_for_input_format('txt').convert(stream, self.options, - 'txt', self.log, {}) + return txt_plugin.convert(stream, self.options, 'txt', self.log, {}) diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index 47fe7e7337..5f9554665b 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -41,7 +41,14 @@ class 
TCRInput(InputFormatPlugin): log.info('Converting text to OEB...') stream = StringIO(raw_txt) + from calibre.customize.ui import plugin_for_input_format + + txt_plugin = plugin_for_input_format('txt') + for option in txt_plugin.options: + if not hasattr(options, option.option.name): + setattr(options, option.name, option.recommend_val) + stream.seek(0) - return plugin_for_input_format('txt').convert(stream, options, + return txt_plugin.convert(stream, options, 'txt', log, accelerators) From dea9ae683217159626407e622848c1481c1dcbef Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 5 Jan 2011 20:03:49 -0500 Subject: [PATCH 10/10] TXT Input: Split paragraph and formatting into two different options. --- src/calibre/ebooks/pdb/input.py | 15 ++++-- src/calibre/ebooks/tcr/input.py | 15 ++++-- src/calibre/ebooks/txt/input.py | 54 ++++++++++++--------- src/calibre/ebooks/txt/processor.py | 30 ++++++------ src/calibre/gui2/convert/pdb_input.py | 8 ++-- src/calibre/gui2/convert/tcr_input.py | 8 ++-- src/calibre/gui2/convert/txt_input.py | 8 ++-- src/calibre/gui2/convert/txt_input.ui | 69 +++++++++++++++++---------- 8 files changed, 126 insertions(+), 81 deletions(-) diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index b8b4b93ca1..3688abff3f 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -19,15 +19,20 @@ class PDBInput(InputFormatPlugin): file_types = set(['pdb']) options = set([ - OptionRecommendation(name='paragraph_format', recommended_value='auto', - choices=['auto', 'block', 'single', 'print', 'markdown'], - help=_('How calibre splits text into paragraphs.\n' + OptionRecommendation(name='paragraph_type', recommended_value='auto', + choices=['auto', 'block', 'single', 'print'], + help=_('Paragraph structure.\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' - '* auto: Try to auto detect paragraph format.\n' + '* auto: Try to auto detect paragraph type.\n' '* block: 
Treat a blank line as a paragraph break.\n' '* single: Assume every line is a paragraph.\n' '* print: Assume every line starting with 2+ spaces or a tab ' - 'starts a paragraph.\n' + 'starts a paragraph.')), + OptionRecommendation(name='formatting_type', recommended_value='auto', + choices=['auto', 'none', 'markdown'], + help=_('Formatting used within the document.' + '* auto: Try to auto detect the document formatting.\n' + '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' '* markdown: Run the input though the markdown pre-processor. ' 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index 5f9554665b..e4118c1c0a 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -17,15 +17,20 @@ class TCRInput(InputFormatPlugin): file_types = set(['tcr']) options = set([ - OptionRecommendation(name='paragraph_format', recommended_value='auto', - choices=['auto', 'block', 'single', 'print', 'markdown'], - help=_('How calibre splits text into paragraphs.\n' + OptionRecommendation(name='paragraph_type', recommended_value='auto', + choices=['auto', 'block', 'single', 'print'], + help=_('Paragraph structure.\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' - '* auto: Try to auto detect paragraph format.\n' + '* auto: Try to auto detect paragraph type.\n' '* block: Treat a blank line as a paragraph break.\n' '* single: Assume every line is a paragraph.\n' '* print: Assume every line starting with 2+ spaces or a tab ' - 'starts a paragraph.\n' + 'starts a paragraph.')), + OptionRecommendation(name='formatting_type', recommended_value='auto', + choices=['auto', 'none', 'markdown'], + help=_('Formatting used within the document.' 
+ '* auto: Try to auto detect the document formatting.\n' + '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' '* markdown: Run the input though the markdown pre-processor. ' 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index e68c47e9b3..47e92a45a9 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ - preserve_spaces, detect_paragraph_formatting + preserve_spaces, detect_paragraph_type, detect_formatting_type from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -21,15 +21,20 @@ class TXTInput(InputFormatPlugin): file_types = set(['txt']) options = set([ - OptionRecommendation(name='paragraph_format', recommended_value='auto', - choices=['auto', 'block', 'single', 'print', 'markdown'], - help=_('How calibre splits text into paragraphs.\n' + OptionRecommendation(name='paragraph_type', recommended_value='auto', + choices=['auto', 'block', 'single', 'print'], + help=_('Paragraph structure.\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' - '* auto: Try to auto detect paragraph format.\n' + '* auto: Try to auto detect paragraph type.\n' '* block: Treat a blank line as a paragraph break.\n' '* single: Assume every line is a paragraph.\n' '* print: Assume every line starting with 2+ spaces or a tab ' - 'starts a paragraph.\n' + 'starts a paragraph.')), + OptionRecommendation(name='formatting_type', recommended_value='auto', + choices=['auto', 'none', 'markdown'], + 
help=_('Formatting used within the document.' + '* auto: Try to auto detect the document formatting.\n' + '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' '* markdown: Run the input though the markdown pre-processor. ' 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, @@ -57,29 +62,16 @@ class TXTInput(InputFormatPlugin): log.debug('No input encoding specified and could not auto detect using %s' % ienc) txt = txt.decode(ienc, 'replace') - # Determine the formatting of the document. - if options.paragraph_format == 'auto': - options.paragraph_format = detect_paragraph_formatting(txt) - if options.paragraph_format == 'unknown': - log.debug('Could not reliably determine paragraph format using block format') - options.paragraph_format = 'block' - else: - log.debug('Auto detected paragraph format as %s' % options.paragraph_format) - - # We don't check for block because the processor assumes block. - # single and print at transformed to block for processing. - if options.paragraph_format == 'single': - txt = separate_paragraphs_single_line(txt) - elif options.paragraph_format == 'print': - txt = separate_paragraphs_print_formatted(txt) - txt = _ent_pat.sub(xml_entity_to_unicode, txt) # Preserve spaces will replace multiple spaces to a space # followed by the   entity. if options.preserve_spaces: txt = preserve_spaces(txt) + + if options.formatting_type == 'auto': + options.formatting_type = detect_formatting_type(txt) - if options.paragraph_format == 'markdown': + if options.formatting_type == 'markdown': log.debug('Running text though markdown conversion...') try: html = convert_markdown(txt, disable_toc=options.markdown_disable_toc) @@ -87,6 +79,22 @@ class TXTInput(InputFormatPlugin): raise ValueError('This txt file has malformed markup, it cannot be' ' converted by calibre. 
See http://daringfireball.net/projects/markdown/syntax') else: + # Determine the paragraph type of the document. + if options.paragraph_type == 'auto': + options.paragraph_type = detect_paragraph_type(txt) + if options.paragraph_type == 'unknown': + log.debug('Could not reliably determine paragraph type using block') + options.paragraph_type = 'block' + else: + log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + + # We don't check for block because the processor assumes block. + # single and print at transformed to block for processing. + if options.paragraph_type == 'single': + txt = separate_paragraphs_single_line(txt) + elif options.paragraph_type == 'print': + txt = separate_paragraphs_print_formatted(txt) + flow_size = getattr(options, 'flow_size', 0) html = convert_basic(txt, epub_split_size_kb=flow_size) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index e1014b0c7b..f6d628e7c5 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -93,7 +93,7 @@ def split_string_separator(txt, size) : xrange(0, len(txt), size)]) return txt -def detect_paragraph_formatting(txt): +def detect_paragraph_type(txt): ''' Tries to determine the formatting of the document. @@ -109,6 +109,20 @@ def detect_paragraph_formatting(txt): txt = txt.replace('\r', '\n') txt_line_count = len(re.findall('(?mu)^\s*.+$', txt)) + # Check for print + tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) + if tab_line_count / float(txt_line_count) >= .25: + return 'print' + + # Check for block + empty_line_count = len(re.findall('(?mu)^\s*$', txt)) + if empty_line_count / float(txt_line_count) >= .25: + return 'block' + + # Nothing else matched to assume single. 
+ return 'single' + +def detect_formatting_type(txt): # Check for markdown # Headings if len(re.findall('(?mu)^#+', txt)) >= 5: @@ -129,16 +143,4 @@ def detect_paragraph_formatting(txt): if txt.count('\\'+c) > 10: return 'markdown' - # Check for print - tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) - if tab_line_count / float(txt_line_count) >= .25: - return 'print' - - # Check for block - empty_line_count = len(re.findall('(?mu)^\s*$', txt)) - if empty_line_count / float(txt_line_count) >= .25: - return 'block' - - # Nothing else matched to assume single. - return 'single' - + return 'none' diff --git a/src/calibre/gui2/convert/pdb_input.py b/src/calibre/gui2/convert/pdb_input.py index 655f4025a7..16ff1ff236 100644 --- a/src/calibre/gui2/convert/pdb_input.py +++ b/src/calibre/gui2/convert/pdb_input.py @@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces']) + ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) self.db, self.book_id = db, book_id - for x in get_option('paragraph_format').option.choices: - self.opt_paragraph_format.addItem(x) + for x in get_option('paragraph_type').option.choices: + self.opt_paragraph_type.addItem(x) + for x in get_option('formatting_type').option.choices: + self.opt_formatting_type.addItem(x) self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/tcr_input.py b/src/calibre/gui2/convert/tcr_input.py index 2aa877ce4d..366643ad5b 100644 --- a/src/calibre/gui2/convert/tcr_input.py +++ b/src/calibre/gui2/convert/tcr_input.py @@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces']) + ['paragraph_type', 
'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) self.db, self.book_id = db, book_id - for x in get_option('paragraph_format').option.choices: - self.opt_paragraph_format.addItem(x) + for x in get_option('paragraph_type').option.choices: + self.opt_paragraph_type.addItem(x) + for x in get_option('formatting_type').option.choices: + self.opt_formatting_type.addItem(x) self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_input.py b/src/calibre/gui2/convert/txt_input.py index 99d04fe2f4..62672cc0f9 100644 --- a/src/calibre/gui2/convert/txt_input.py +++ b/src/calibre/gui2/convert/txt_input.py @@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces']) + ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) self.db, self.book_id = db, book_id - for x in get_option('paragraph_format').option.choices: - self.opt_paragraph_format.addItem(x) + for x in get_option('paragraph_type').option.choices: + self.opt_paragraph_type.addItem(x) + for x in get_option('formatting_type').option.choices: + self.opt_formatting_type.addItem(x) self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_input.ui b/src/calibre/gui2/convert/txt_input.ui index b45297fdf2..6cbd68135f 100644 --- a/src/calibre/gui2/convert/txt_input.ui +++ b/src/calibre/gui2/convert/txt_input.ui @@ -6,7 +6,7 @@ 0 0 - 488 + 518 300 @@ -17,41 +17,21 @@ - Document structure detection + Paragraph style: - + - - - - <p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>. 
- - - true - - - true - - - - - - - Do not insert Table of Contents into output text when using markdown - - - - + Preserve &spaces - + Qt::Vertical @@ -64,6 +44,45 @@ + + + + + + + Formatting style: + + + + + + + Markdown Options + + + + + + <p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>. + + + true + + + true + + + + + + + Do not insert Table of Contents into output text when using markdown + + + + + +