TXT Input: Auto detect paragraph structure.

2025-07-09 03:04:10 -04:00 · 2011-01-02 19:05:35 -05:00 · 2011-01-02 19:05:35 -05:00 · 9ec9163919
commit 9ec9163919
parent d9195c0632
4 changed files with 104 additions and 51 deletions
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@ -19,26 +19,22 @@ class PDBInput(InputFormatPlugin):
    file_types  = set(['pdb'])

    options = set([
-        OptionRecommendation(name='single_line_paras', recommended_value=False,
-            help=_('Normally calibre treats blank lines as paragraph markers. '
-                'With this option it will assume that every line represents '
-                'a paragraph instead. This option is ignored by eReader format.')),
-        OptionRecommendation(name='print_formatted_paras', recommended_value=False,
-            help=_('Normally calibre treats blank lines as paragraph markers. '
-                'With this option it will assume that every line starting with '
-                'an indent (either a tab or 2+ spaces) represents a paragraph. '
-                'Paragraphs end when the next line that starts with an indent '
-                'is reached. This option is ignored by eReader format.')),
+        OptionRecommendation(name='paragraph_format', recommended_value='auto',
+            choices=['auto', 'block', 'single', 'print', 'markdown'],
+            help=_('How calibre splits text into paragraphs.\n'
+                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
+                   '* auto: Try to auto detect paragraph format.\n'
+                   '* block: Treat a blank line as a paragraph break.\n'
+                   '* single: Assume every line is a paragraph.\n'
+                   '* print:  Assume every line starting with 2+ spaces or a tab '
+                   'starts a paragraph.\n'
+                   '* markdown: Run the input though the markdown pre-processor. '
+                   'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
        OptionRecommendation(name='preserve_spaces', recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
-                'With this option all spaces will be displayed. This option '
-                'is ignored by eReader format.')),
-        OptionRecommendation(name='markdown', recommended_value=False,
-            help=_('Run the text input through the markdown pre-processor. To '
-                'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
+                'With this option all spaces will be displayed.')),
        OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
-            help=_('Do not insert a Table of Contents into the output text. '
-                   'This option is ignored by eReader format.')),
+            help=_('Do not insert a Table of Contents into the output text.')),
    ])

    def convert(self, stream, options, file_ext, log,
--- a/src/calibre/ebooks/tcr/input.py
+++ b/src/calibre/ebooks/tcr/input.py
@ -17,22 +17,20 @@ class TCRInput(InputFormatPlugin):
    file_types  = set(['tcr'])

    options = set([
-        OptionRecommendation(name='single_line_paras', recommended_value=False,
-            help=_('Normally calibre treats blank lines as paragraph markers. '
-                'With this option it will assume that every line represents '
-                'a paragraph instead.')),
-        OptionRecommendation(name='print_formatted_paras', recommended_value=False,
-            help=_('Normally calibre treats blank lines as paragraph markers. '
-                'With this option it will assume that every line starting with '
-                'an indent (either a tab or 2+ spaces) represents a paragraph. '
-                'Paragraphs end when the next line that starts with an indent '
-                'is reached.')),
+        OptionRecommendation(name='paragraph_format', recommended_value='auto',
+            choices=['auto', 'block', 'single', 'print', 'markdown'],
+            help=_('How calibre splits text into paragraphs.\n'
+                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
+                   '* auto: Try to auto detect paragraph format.\n'
+                   '* block: Treat a blank line as a paragraph break.\n'
+                   '* single: Assume every line is a paragraph.\n'
+                   '* print:  Assume every line starting with 2+ spaces or a tab '
+                   'starts a paragraph.\n'
+                   '* markdown: Run the input though the markdown pre-processor. '
+                   'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
        OptionRecommendation(name='preserve_spaces', recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
                'With this option all spaces will be displayed.')),
-        OptionRecommendation(name='markdown', recommended_value=False,
-            help=_('Run the text input through the markdown pre-processor. To '
-                'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
        OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
            help=_('Do not insert a Table of Contents into the output text.')),
    ])
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
    separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
-    preserve_spaces
+    preserve_spaces, detect_paragraph_formatting
 from calibre import _ent_pat, xml_entity_to_unicode

 class TXTInput(InputFormatPlugin):
@ -21,22 +21,20 @@ class TXTInput(InputFormatPlugin):
    file_types  = set(['txt'])

    options = set([
-        OptionRecommendation(name='single_line_paras', recommended_value=False,
-            help=_('Normally calibre treats blank lines as paragraph markers. '
-                'With this option it will assume that every line represents '
-                'a paragraph instead.')),
-        OptionRecommendation(name='print_formatted_paras', recommended_value=False,
-            help=_('Normally calibre treats blank lines as paragraph markers. '
-                'With this option it will assume that every line starting with '
-                'an indent (either a tab or 2+ spaces) represents a paragraph. '
-                'Paragraphs end when the next line that starts with an indent '
-                'is reached.')),
+        OptionRecommendation(name='paragraph_format', recommended_value='auto',
+            choices=['auto', 'block', 'single', 'print', 'markdown'],
+            help=_('How calibre splits text into paragraphs.\n'
+                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
+                   '* auto: Try to auto detect paragraph format.\n'
+                   '* block: Treat a blank line as a paragraph break.\n'
+                   '* single: Assume every line is a paragraph.\n'
+                   '* print:  Assume every line starting with 2+ spaces or a tab '
+                   'starts a paragraph.\n'
+                   '* markdown: Run the input though the markdown pre-processor. '
+                   'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
        OptionRecommendation(name='preserve_spaces', recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
                'With this option all spaces will be displayed.')),
-        OptionRecommendation(name='markdown', recommended_value=False,
-            help=_('Run the text input through the markdown pre-processor. To '
-                'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
        OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
            help=_('Do not insert a Table of Contents into the output text.')),
    ])
@ -46,6 +44,7 @@ class TXTInput(InputFormatPlugin):
        log.debug('Reading text from file...')
        
        txt = stream.read()
+        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
@ -58,17 +57,29 @@ class TXTInput(InputFormatPlugin):
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
        txt = txt.decode(ienc, 'replace')

-        # Adjust paragraph formatting as requested
-        if options.single_line_paras:
+        # Determine the formatting of the document.
+        if options.paragraph_format == 'auto':
+            options.paragraph_format = detect_paragraph_formatting(txt)
+            if options.paragraph_format == 'unknown':
+                log.debug('Could not reliably determine paragraph format using block format')
+                options.paragraph_format = 'block'
+            else:
+                log.debug('Auto detected paragraph format as %s' % options.paragraph_format) 
+        
+        # We don't check for block because the processor assumes block.
+        # single and print at transformed to block for processing.
+        if options.paragraph_format == 'single':
            txt = separate_paragraphs_single_line(txt)
-        if options.print_formatted_paras:
+        elif options.paragraph_format == 'print':
            txt = separate_paragraphs_print_formatted(txt)
+
+        txt = _ent_pat.sub(xml_entity_to_unicode, txt)
+        # Preserve spaces will replace multiple spaces to a space
+        # followed by the &nbsp; entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

-        txt = _ent_pat.sub(xml_entity_to_unicode, txt)
-
-        if options.markdown:
+        if options.paragraph_format == 'markdown':
            log.debug('Running text though markdown conversion...')
            try:
                html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@ -49,7 +49,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
    if isbytestring(txt):
        txt = txt.decode('utf-8')

-
    lines = []
    # Split into paragraphs based on having a blank line between text.
    for line in txt.split('\n\n'):
@ -94,3 +93,52 @@ def split_string_separator(txt, size) :
            xrange(0, len(txt), size)])
    return txt

+def detect_paragraph_formatting(txt):
+    '''
+    Tries to determine the formatting of the document.
+    
+    block: Paragraphs are separated by a blank line.
+    single: Each line is a paragraph.
+    print: Each paragraph starts with a 2+ spaces or a tab
+           and ends when a new paragraph is reached.
+    markdown: Markdown formatting is in the document.
+    
+    returns block, single, print, markdown
+    '''
+    txt = txt.replace('\r\n', '\n')
+    txt = txt.replace('\r', '\n')
+    txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
+    
+    # Check for markdown
+    # Headings
+    if len(re.findall('(?mu)^#+', txt)) >= 5:
+        return 'markdown'
+    if len(re.findall('(?mu)^=+$', txt)) >= 5:
+        return 'markdown'
+    if len(re.findall('(?mu)^-+$', txt)) >= 5:
+        return 'markdown'
+    # Images
+    if len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) >= 5:
+        return 'markdown'
+    # Links
+    if len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt)) >= 5:
+        return 'markdown'
+    # Escaped characters
+    md_escapted_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']
+    for c in md_escapted_characters:
+        if txt.count('\\'+c) > 10:
+            return 'markdown'
+    
+    # Check for print
+    tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
+    if tab_line_count / float(txt_line_count) >= .25:
+        return 'print'
+    
+    # Check for block
+    empty_line_count = len(re.findall('(?mu)^\s*$', txt))
+    if empty_line_count / float(txt_line_count) >= .25:
+        return 'block'
+    
+    # Nothing else matched to assume single.
+    return 'single'
+