From 9ec91639197e2e1dec38525984787b317c0296c9 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 2 Jan 2011 19:05:35 -0500 Subject: [PATCH] TXT Input: Auto detect paragraph structure. --- src/calibre/ebooks/pdb/input.py | 30 ++++++++--------- src/calibre/ebooks/tcr/input.py | 24 +++++++------- src/calibre/ebooks/txt/input.py | 51 ++++++++++++++++++----------- src/calibre/ebooks/txt/processor.py | 50 +++++++++++++++++++++++++++- 4 files changed, 104 insertions(+), 51 deletions(-) diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 9edf381f1e..b8b4b93ca1 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -19,26 +19,22 @@ class PDBInput(InputFormatPlugin): file_types = set(['pdb']) options = set([ - OptionRecommendation(name='single_line_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line represents ' - 'a paragraph instead. This option is ignored by eReader format.')), - OptionRecommendation(name='print_formatted_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line starting with ' - 'an indent (either a tab or 2+ spaces) represents a paragraph. ' - 'Paragraphs end when the next line that starts with an indent ' - 'is reached. 
This option is ignored by eReader format.')), + OptionRecommendation(name='paragraph_format', recommended_value='auto', + choices=['auto', 'block', 'single', 'print', 'markdown'], + help=_('How calibre splits text into paragraphs.\n' + 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' + '* auto: Try to auto detect paragraph format.\n' + '* block: Treat a blank line as a paragraph break.\n' + '* single: Assume every line is a paragraph.\n' + '* print: Assume every line starting with 2+ spaces or a tab ' + 'starts a paragraph.\n' + '* markdown: Run the input though the markdown pre-processor. ' + 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, help=_('Normally extra spaces are condensed into a single space. ' - 'With this option all spaces will be displayed. This option ' - 'is ignored by eReader format.')), - OptionRecommendation(name='markdown', recommended_value=False, - help=_('Run the text input through the markdown pre-processor. To ' - 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), + 'With this option all spaces will be displayed.')), OptionRecommendation(name="markdown_disable_toc", recommended_value=False, - help=_('Do not insert a Table of Contents into the output text. ' - 'This option is ignored by eReader format.')), + help=_('Do not insert a Table of Contents into the output text.')), ]) def convert(self, stream, options, file_ext, log, diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index 47154988a0..47fe7e7337 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -17,22 +17,20 @@ class TCRInput(InputFormatPlugin): file_types = set(['tcr']) options = set([ - OptionRecommendation(name='single_line_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. 
' - 'With this option it will assume that every line represents ' - 'a paragraph instead.')), - OptionRecommendation(name='print_formatted_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line starting with ' - 'an indent (either a tab or 2+ spaces) represents a paragraph. ' - 'Paragraphs end when the next line that starts with an indent ' - 'is reached.')), + OptionRecommendation(name='paragraph_format', recommended_value='auto', + choices=['auto', 'block', 'single', 'print', 'markdown'], + help=_('How calibre splits text into paragraphs.\n' + 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' + '* auto: Try to auto detect paragraph format.\n' + '* block: Treat a blank line as a paragraph break.\n' + '* single: Assume every line is a paragraph.\n' + '* print: Assume every line starting with 2+ spaces or a tab ' + 'starts a paragraph.\n' + '* markdown: Run the input though the markdown pre-processor. ' + 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, help=_('Normally extra spaces are condensed into a single space. ' 'With this option all spaces will be displayed.')), - OptionRecommendation(name='markdown', recommended_value=False, - help=_('Run the text input through the markdown pre-processor. 
To ' - 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name="markdown_disable_toc", recommended_value=False, help=_('Do not insert a Table of Contents into the output text.')), ]) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 5e406216d6..e68c47e9b3 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ - preserve_spaces + preserve_spaces, detect_paragraph_formatting from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -21,22 +21,20 @@ class TXTInput(InputFormatPlugin): file_types = set(['txt']) options = set([ - OptionRecommendation(name='single_line_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line represents ' - 'a paragraph instead.')), - OptionRecommendation(name='print_formatted_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line starting with ' - 'an indent (either a tab or 2+ spaces) represents a paragraph. 
' - 'Paragraphs end when the next line that starts with an indent ' - 'is reached.')), + OptionRecommendation(name='paragraph_format', recommended_value='auto', + choices=['auto', 'block', 'single', 'print', 'markdown'], + help=_('How calibre splits text into paragraphs.\n' + 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' + '* auto: Try to auto detect paragraph format.\n' + '* block: Treat a blank line as a paragraph break.\n' + '* single: Assume every line is a paragraph.\n' + '* print: Assume every line starting with 2+ spaces or a tab ' + 'starts a paragraph.\n' + '* markdown: Run the input though the markdown pre-processor. ' + 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, help=_('Normally extra spaces are condensed into a single space. ' 'With this option all spaces will be displayed.')), - OptionRecommendation(name='markdown', recommended_value=False, - help=_('Run the text input through the markdown pre-processor. To ' - 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name="markdown_disable_toc", recommended_value=False, help=_('Do not insert a Table of Contents into the output text.')), ]) @@ -46,6 +44,7 @@ class TXTInput(InputFormatPlugin): log.debug('Reading text from file...') txt = stream.read() + # Get the encoding of the document. if options.input_encoding: ienc = options.input_encoding log.debug('Using user specified input encoding of %s' % ienc) @@ -58,17 +57,29 @@ class TXTInput(InputFormatPlugin): log.debug('No input encoding specified and could not auto detect using %s' % ienc) txt = txt.decode(ienc, 'replace') - # Adjust paragraph formatting as requested - if options.single_line_paras: + # Determine the formatting of the document. 
+ if options.paragraph_format == 'auto': + options.paragraph_format = detect_paragraph_formatting(txt) + if options.paragraph_format == 'unknown': + log.debug('Could not reliably determine paragraph format using block format') + options.paragraph_format = 'block' + else: + log.debug('Auto detected paragraph format as %s' % options.paragraph_format) + + # We don't check for block because the processor assumes block. + # single and print are transformed to block for processing. + if options.paragraph_format == 'single': txt = separate_paragraphs_single_line(txt) - if options.print_formatted_paras: + elif options.paragraph_format == 'print': txt = separate_paragraphs_print_formatted(txt) + + txt = _ent_pat.sub(xml_entity_to_unicode, txt) + # Preserve spaces will replace multiple spaces to a space + # followed by the &nbsp; entity. if options.preserve_spaces: txt = preserve_spaces(txt) - txt = _ent_pat.sub(xml_entity_to_unicode, txt) - - if options.markdown: + if options.paragraph_format == 'markdown': log.debug('Running text though markdown conversion...') try: html = convert_markdown(txt, disable_toc=options.markdown_disable_toc) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index dac1e34df7..e1014b0c7b 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -49,7 +49,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0): if isbytestring(txt): txt = txt.decode('utf-8') - lines = [] # Split into paragraphs based on having a blank line between text. for line in txt.split('\n\n'): @@ -94,3 +93,52 @@ def split_string_separator(txt, size) : xrange(0, len(txt), size)]) return txt +def detect_paragraph_formatting(txt): + ''' + Tries to determine the formatting of the document. + + block: Paragraphs are separated by a blank line. + single: Each line is a paragraph. + print: Each paragraph starts with 2+ spaces or a tab + and ends when a new paragraph is reached. 
+ markdown: Markdown formatting is in the document. + + returns block, single, print, markdown + ''' + txt = txt.replace('\r\n', '\n') + txt = txt.replace('\r', '\n') + txt_line_count = len(re.findall('(?mu)^\s*.+$', txt)) + + # Check for markdown + # Headings + if len(re.findall('(?mu)^#+', txt)) >= 5: + return 'markdown' + if len(re.findall('(?mu)^=+$', txt)) >= 5: + return 'markdown' + if len(re.findall('(?mu)^-+$', txt)) >= 5: + return 'markdown' + # Images + if len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) >= 5: + return 'markdown' + # Links + if len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt)) >= 5:
+        return 'markdown'
+    # Escaped characters
+    md_escapted_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']
+    for c in md_escapted_characters:
+        if txt.count('\\'+c) > 10:
+            return 'markdown'
+    
+    # Check for print
+    tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
+    if tab_line_count / float(txt_line_count) >= .25:
+        return 'print'
+    
+    # Check for block
+    empty_line_count = len(re.findall('(?mu)^\s*$', txt))
+    if empty_line_count / float(txt_line_count) >= .25:
+        return 'block'
+    
+    # Nothing else matched, so assume single.
+    return 'single'
+