From 9ec91639197e2e1dec38525984787b317c0296c9 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 2 Jan 2011 19:05:35 -0500
Subject: [PATCH] TXT Input: Auto detect paragraph structure.

---
 src/calibre/ebooks/pdb/input.py     | 30 ++++++++---------
 src/calibre/ebooks/tcr/input.py     | 24 +++++++-------
 src/calibre/ebooks/txt/input.py     | 51 ++++++++++++++++++-----------
 src/calibre/ebooks/txt/processor.py | 50 +++++++++++++++++++++++++++-
 4 files changed, 104 insertions(+), 51 deletions(-)

diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index 9edf381f1e..b8b4b93ca1 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -19,26 +19,22 @@ class PDBInput(InputFormatPlugin):
     file_types  = set(['pdb'])
 
     options = set([
-        OptionRecommendation(name='single_line_paras', recommended_value=False,
-            help=_('Normally calibre treats blank lines as paragraph markers. '
-                'With this option it will assume that every line represents '
-                'a paragraph instead. This option is ignored by eReader format.')),
-        OptionRecommendation(name='print_formatted_paras', recommended_value=False,
-            help=_('Normally calibre treats blank lines as paragraph markers. '
-                'With this option it will assume that every line starting with '
-                'an indent (either a tab or 2+ spaces) represents a paragraph. '
-                'Paragraphs end when the next line that starts with an indent '
-                'is reached. This option is ignored by eReader format.')),
+        OptionRecommendation(name='paragraph_format', recommended_value='auto',
+            choices=['auto', 'block', 'single', 'print', 'markdown'],
+            help=_('How calibre splits text into paragraphs.\n'
+                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
+                   '* auto: Try to auto detect paragraph format.\n'
+                   '* block: Treat a blank line as a paragraph break.\n'
+                   '* single: Assume every line is a paragraph.\n'
+                   '* print:  Assume every line starting with 2+ spaces or a tab '
+                   'starts a paragraph.\n'
+                   '* markdown: Run the input though the markdown pre-processor. '
+                   'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
         OptionRecommendation(name='preserve_spaces', recommended_value=False,
             help=_('Normally extra spaces are condensed into a single space. '
-                'With this option all spaces will be displayed. This option '
-                'is ignored by eReader format.')),
-        OptionRecommendation(name='markdown', recommended_value=False,
-            help=_('Run the text input through the markdown pre-processor. To '
-                'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
+                'With this option all spaces will be displayed.')),
         OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
-            help=_('Do not insert a Table of Contents into the output text. '
-                   'This option is ignored by eReader format.')),
+            help=_('Do not insert a Table of Contents into the output text.')),
     ])
 
     def convert(self, stream, options, file_ext, log,
diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py
index 47154988a0..47fe7e7337 100644
--- a/src/calibre/ebooks/tcr/input.py
+++ b/src/calibre/ebooks/tcr/input.py
@@ -17,22 +17,20 @@ class TCRInput(InputFormatPlugin):
     file_types  = set(['tcr'])
 
     options = set([
-        OptionRecommendation(name='single_line_paras', recommended_value=False,
-            help=_('Normally calibre treats blank lines as paragraph markers. '
-                'With this option it will assume that every line represents '
-                'a paragraph instead.')),
-        OptionRecommendation(name='print_formatted_paras', recommended_value=False,
-            help=_('Normally calibre treats blank lines as paragraph markers. '
-                'With this option it will assume that every line starting with '
-                'an indent (either a tab or 2+ spaces) represents a paragraph. '
-                'Paragraphs end when the next line that starts with an indent '
-                'is reached.')),
+        OptionRecommendation(name='paragraph_format', recommended_value='auto',
+            choices=['auto', 'block', 'single', 'print', 'markdown'],
+            help=_('How calibre splits text into paragraphs.\n'
+                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
+                   '* auto: Try to auto detect paragraph format.\n'
+                   '* block: Treat a blank line as a paragraph break.\n'
+                   '* single: Assume every line is a paragraph.\n'
+                   '* print:  Assume every line starting with 2+ spaces or a tab '
+                   'starts a paragraph.\n'
+                   '* markdown: Run the input though the markdown pre-processor. '
+                   'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
         OptionRecommendation(name='preserve_spaces', recommended_value=False,
             help=_('Normally extra spaces are condensed into a single space. '
                 'With this option all spaces will be displayed.')),
-        OptionRecommendation(name='markdown', recommended_value=False,
-            help=_('Run the text input through the markdown pre-processor. To '
-                'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
         OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
             help=_('Do not insert a Table of Contents into the output text.')),
     ])
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 5e406216d6..e68c47e9b3 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
     separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
-    preserve_spaces
+    preserve_spaces, detect_paragraph_formatting
 from calibre import _ent_pat, xml_entity_to_unicode
 
 class TXTInput(InputFormatPlugin):
@@ -21,22 +21,20 @@ class TXTInput(InputFormatPlugin):
     file_types  = set(['txt'])
 
     options = set([
-        OptionRecommendation(name='single_line_paras', recommended_value=False,
-            help=_('Normally calibre treats blank lines as paragraph markers. '
-                'With this option it will assume that every line represents '
-                'a paragraph instead.')),
-        OptionRecommendation(name='print_formatted_paras', recommended_value=False,
-            help=_('Normally calibre treats blank lines as paragraph markers. '
-                'With this option it will assume that every line starting with '
-                'an indent (either a tab or 2+ spaces) represents a paragraph. '
-                'Paragraphs end when the next line that starts with an indent '
-                'is reached.')),
+        OptionRecommendation(name='paragraph_format', recommended_value='auto',
+            choices=['auto', 'block', 'single', 'print', 'markdown'],
+            help=_('How calibre splits text into paragraphs.\n'
+                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
+                   '* auto: Try to auto detect paragraph format.\n'
+                   '* block: Treat a blank line as a paragraph break.\n'
+                   '* single: Assume every line is a paragraph.\n'
+                   '* print:  Assume every line starting with 2+ spaces or a tab '
+                   'starts a paragraph.\n'
+                   '* markdown: Run the input though the markdown pre-processor. '
+                   'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
         OptionRecommendation(name='preserve_spaces', recommended_value=False,
             help=_('Normally extra spaces are condensed into a single space. '
                 'With this option all spaces will be displayed.')),
-        OptionRecommendation(name='markdown', recommended_value=False,
-            help=_('Run the text input through the markdown pre-processor. To '
-                'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
         OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
             help=_('Do not insert a Table of Contents into the output text.')),
     ])
@@ -46,6 +44,7 @@ class TXTInput(InputFormatPlugin):
         log.debug('Reading text from file...')
         
         txt = stream.read()
+        # Get the encoding of the document.
         if options.input_encoding:
             ienc = options.input_encoding
             log.debug('Using user specified input encoding of %s' % ienc)
@@ -58,17 +57,29 @@ class TXTInput(InputFormatPlugin):
             log.debug('No input encoding specified and could not auto detect using %s' % ienc)
         txt = txt.decode(ienc, 'replace')
 
-        # Adjust paragraph formatting as requested
-        if options.single_line_paras:
+        # Determine the formatting of the document.
+        if options.paragraph_format == 'auto':
+            options.paragraph_format = detect_paragraph_formatting(txt)
+            if options.paragraph_format == 'unknown':
+                log.debug('Could not reliably determine paragraph format using block format')
+                options.paragraph_format = 'block'
+            else:
+                log.debug('Auto detected paragraph format as %s' % options.paragraph_format) 
+        
+        # We don't check for block because the processor assumes block.
+        # single and print are transformed to block for processing.
+        if options.paragraph_format == 'single':
             txt = separate_paragraphs_single_line(txt)
-        if options.print_formatted_paras:
+        elif options.paragraph_format == 'print':
             txt = separate_paragraphs_print_formatted(txt)
+
+        txt = _ent_pat.sub(xml_entity_to_unicode, txt)
+        # Preserve spaces will replace multiple spaces with a space
+        # followed by the &nbsp; entity.
         if options.preserve_spaces:
             txt = preserve_spaces(txt)
 
-        txt = _ent_pat.sub(xml_entity_to_unicode, txt)
-
-        if options.markdown:
+        if options.paragraph_format == 'markdown':
             log.debug('Running text though markdown conversion...')
             try:
                 html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index dac1e34df7..e1014b0c7b 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -49,7 +49,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
     if isbytestring(txt):
         txt = txt.decode('utf-8')
 
-
     lines = []
     # Split into paragraphs based on having a blank line between text.
     for line in txt.split('\n\n'):
@@ -94,3 +93,52 @@ def split_string_separator(txt, size) :
             xrange(0, len(txt), size)])
     return txt
 
+def detect_paragraph_formatting(txt):
+    '''
+    Tries to determine the paragraph formatting of the document text ``txt``.
+
+    block: Paragraphs are separated by a blank line.
+    single: Each line is a paragraph.
+    print: Each paragraph starts with 2+ spaces or a tab
+           and ends when a new paragraph is reached.
+    markdown: Markdown formatting is in the document.
+    unknown: The document is empty or contains only line breaks.
+
+    returns block, single, print, markdown or unknown
+    '''
+    txt = txt.replace('\r\n', '\n').replace('\r', '\n')
+    txt_line_count = len(re.findall(r'(?mu)^\s*.+$', txt))
+    # No text lines: nothing to measure, and the ratios below would divide by 0.
+    if not txt_line_count:
+        return 'unknown'
+
+    # Check for markdown: 5+ headings, images or links are taken as proof.
+    md_patterns = (
+        r'(?mu)^#+',                                # '#' style headings
+        r'(?mu)^=+$',                               # '=' underlined headings
+        r'(?mu)^-+$',                               # '-' underlined headings
+        r'(?u)!\[.*?\]\(.+?\)',                     # images
+        r'(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)',   # links
+    )
+    for pat in md_patterns:
+        if len(re.findall(pat, txt)) >= 5:
+            return 'markdown'
+    # Escaped characters
+    md_escaped_characters = '\\`*_{}[]()#+-.!'
+    if any(txt.count('\\' + c) > 10 for c in md_escaped_characters):
+        return 'markdown'
+
+    # Check for print. [^\S\n] is whitespace other than a newline, so a run
+    # of blank lines before a text line is not counted as an indent.
+    tab_line_count = len(re.findall(r'(?mu)^(\t|[^\S\n]{2,}).+$', txt))
+    if tab_line_count / float(txt_line_count) >= .25:
+        return 'print'
+
+    # Check for block
+    empty_line_count = len(re.findall(r'(?mu)^\s*$', txt))
+    if empty_line_count / float(txt_line_count) >= .25:
+        return 'block'
+
+    # Nothing else matched so assume single.
+    return 'single'
+