TXT Input: Auto detect paragraph structure.

2025-07-09 03:04:10 -04:00 · 2011-01-02 19:05:35 -05:00 · 2011-01-02 19:05:35 -05:00 · 9ec9163919
commit 9ec9163919
parent d9195c0632
4 changed files with 104 additions and 51 deletions
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@ -19,26 +19,22 @@ class PDBInput(InputFormatPlugin):
    file_types  = set(['pdb'])
    options = set([
-        OptionRecommendation(name='single_line_paras', recommended_value=False,
+        OptionRecommendation(name='paragraph_format', recommended_value='auto',
-            help=_('Normally calibre treats blank lines as paragraph markers. '
+            choices=['auto', 'block', 'single', 'print', 'markdown'],
-                'With this option it will assume that every line represents '
+            help=_('How calibre splits text into paragraphs.\n'
-                'a paragraph instead. This option is ignored by eReader format.')),
+                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
-        OptionRecommendation(name='print_formatted_paras', recommended_value=False,
+                   '* auto: Try to auto detect paragraph format.\n'
-            help=_('Normally calibre treats blank lines as paragraph markers. '
+                   '* block: Treat a blank line as a paragraph break.\n'
-                'With this option it will assume that every line starting with '
+                   '* single: Assume every line is a paragraph.\n'
-                'an indent (either a tab or 2+ spaces) represents a paragraph. '
+                   '* print:  Assume every line starting with 2+ spaces or a tab '
-                'Paragraphs end when the next line that starts with an indent '
+                   'starts a paragraph.\n'
-                'is reached. This option is ignored by eReader format.')),
+                   '* markdown: Run the input though the markdown pre-processor. '
                   'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
        OptionRecommendation(name='preserve_spaces', recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
-                'With this option all spaces will be displayed. This option '
+                'With this option all spaces will be displayed.')),
                'is ignored by eReader format.')),
        OptionRecommendation(name='markdown', recommended_value=False,
            help=_('Run the text input through the markdown pre-processor. To '
                'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
        OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
-            help=_('Do not insert a Table of Contents into the output text. '
+            help=_('Do not insert a Table of Contents into the output text.')),
                   'This option is ignored by eReader format.')),
    ])
    def convert(self, stream, options, file_ext, log,
--- a/src/calibre/ebooks/tcr/input.py
+++ b/src/calibre/ebooks/tcr/input.py
@ -17,22 +17,20 @@ class TCRInput(InputFormatPlugin):
    file_types  = set(['tcr'])
    options = set([
-        OptionRecommendation(name='single_line_paras', recommended_value=False,
+        OptionRecommendation(name='paragraph_format', recommended_value='auto',
-            help=_('Normally calibre treats blank lines as paragraph markers. '
+            choices=['auto', 'block', 'single', 'print', 'markdown'],
-                'With this option it will assume that every line represents '
+            help=_('How calibre splits text into paragraphs.\n'
-                'a paragraph instead.')),
+                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
-        OptionRecommendation(name='print_formatted_paras', recommended_value=False,
+                   '* auto: Try to auto detect paragraph format.\n'
-            help=_('Normally calibre treats blank lines as paragraph markers. '
+                   '* block: Treat a blank line as a paragraph break.\n'
-                'With this option it will assume that every line starting with '
+                   '* single: Assume every line is a paragraph.\n'
-                'an indent (either a tab or 2+ spaces) represents a paragraph. '
+                   '* print:  Assume every line starting with 2+ spaces or a tab '
-                'Paragraphs end when the next line that starts with an indent '
+                   'starts a paragraph.\n'
-                'is reached.')),
+                   '* markdown: Run the input though the markdown pre-processor. '
                   'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
        OptionRecommendation(name='preserve_spaces', recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
                'With this option all spaces will be displayed.')),
        OptionRecommendation(name='markdown', recommended_value=False,
            help=_('Run the text input through the markdown pre-processor. To '
                'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
        OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
            help=_('Do not insert a Table of Contents into the output text.')),
    ])
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
    separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
-    preserve_spaces
+    preserve_spaces, detect_paragraph_formatting
 from calibre import _ent_pat, xml_entity_to_unicode
 class TXTInput(InputFormatPlugin):
@ -21,22 +21,20 @@ class TXTInput(InputFormatPlugin):
    file_types  = set(['txt'])
    options = set([
-        OptionRecommendation(name='single_line_paras', recommended_value=False,
+        OptionRecommendation(name='paragraph_format', recommended_value='auto',
-            help=_('Normally calibre treats blank lines as paragraph markers. '
+            choices=['auto', 'block', 'single', 'print', 'markdown'],
-                'With this option it will assume that every line represents '
+            help=_('How calibre splits text into paragraphs.\n'
-                'a paragraph instead.')),
+                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
-        OptionRecommendation(name='print_formatted_paras', recommended_value=False,
+                   '* auto: Try to auto detect paragraph format.\n'
-            help=_('Normally calibre treats blank lines as paragraph markers. '
+                   '* block: Treat a blank line as a paragraph break.\n'
-                'With this option it will assume that every line starting with '
+                   '* single: Assume every line is a paragraph.\n'
-                'an indent (either a tab or 2+ spaces) represents a paragraph. '
+                   '* print:  Assume every line starting with 2+ spaces or a tab '
-                'Paragraphs end when the next line that starts with an indent '
+                   'starts a paragraph.\n'
-                'is reached.')),
+                   '* markdown: Run the input though the markdown pre-processor. '
                   'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
        OptionRecommendation(name='preserve_spaces', recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
                'With this option all spaces will be displayed.')),
        OptionRecommendation(name='markdown', recommended_value=False,
            help=_('Run the text input through the markdown pre-processor. To '
                'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
        OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
            help=_('Do not insert a Table of Contents into the output text.')),
    ])
@ -46,6 +44,7 @@ class TXTInput(InputFormatPlugin):
        log.debug('Reading text from file...')
        txt = stream.read()
        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
@ -58,17 +57,29 @@ class TXTInput(InputFormatPlugin):
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
        txt = txt.decode(ienc, 'replace')
-        # Adjust paragraph formatting as requested
+        # Determine the formatting of the document.
-        if options.single_line_paras:
+        if options.paragraph_format == 'auto':
            options.paragraph_format = detect_paragraph_formatting(txt)
            if options.paragraph_format == 'unknown':
                log.debug('Could not reliably determine paragraph format using block format')
                options.paragraph_format = 'block'
            else:
                log.debug('Auto detected paragraph format as %s' % options.paragraph_format) 
        # We don't check for block because the processor assumes block.
        # single and print are transformed to block for processing.
        if options.paragraph_format == 'single':
            txt = separate_paragraphs_single_line(txt)
-        if options.print_formatted_paras:
+        elif options.paragraph_format == 'print':
            txt = separate_paragraphs_print_formatted(txt)
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)
        # Preserve spaces will replace multiple spaces with a space
        # followed by the &nbsp; entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)
-        txt = _ent_pat.sub(xml_entity_to_unicode, txt)
+        if options.paragraph_format == 'markdown':
        if options.markdown:
            log.debug('Running text though markdown conversion...')
            try:
                html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@ -49,7 +49,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
    if isbytestring(txt):
        txt = txt.decode('utf-8')
    lines = []
    # Split into paragraphs based on having a blank line between text.
    for line in txt.split('\n\n'):
@ -94,3 +93,52 @@ def split_string_separator(txt, size) :
            xrange(0, len(txt), size)])
    return txt
 def detect_paragraph_formatting(txt):
    '''
    Tries to determine the formatting of the document.

    block: Paragraphs are separated by a blank line.
    single: Each line is a paragraph.
    print: Each paragraph starts with a 2+ spaces or a tab
           and ends when a new paragraph is reached.
    markdown: Markdown formatting is in the document.
    unknown: The text contains no non-blank lines, so there is
             nothing to base a decision on.

    returns block, single, print, markdown or unknown
    '''
    # Normalise line endings so every check below only has to deal with '\n'.
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
    # Number of lines that contain some text; used as the denominator for
    # the print and block ratios below.
    txt_line_count = len(re.findall(r'(?mu)^\s*.+$', txt))
    if not txt_line_count:
        # Empty or whitespace only input: avoid dividing by zero and let
        # the caller pick its own fallback.
        return 'unknown'

    # Check for markdown
    # Headings
    if len(re.findall(r'(?mu)^#+', txt)) >= 5:
        return 'markdown'
    if len(re.findall(r'(?mu)^=+$', txt)) >= 5:
        return 'markdown'
    if len(re.findall(r'(?mu)^-+$', txt)) >= 5:
        return 'markdown'
    # Images
    if len(re.findall(r'(?u)!\[.*?\]\(.+?\)', txt)) >= 5:
        return 'markdown'
    # Links
    if len(re.findall(r'(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt)) >= 5:
        return 'markdown'
    # Escaped characters
    md_escaped_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']
    for c in md_escaped_characters:
        if txt.count('\\'+c) > 10:
            return 'markdown'

    # Check for print
    # [^\S\n] is "whitespace except newline": \s alone would let preceding
    # blank lines count as the indent of the following line.
    tab_line_count = len(re.findall(r'(?mu)^(\t|[^\S\n]{2,}).+$', txt))
    if tab_line_count / float(txt_line_count) >= .25:
        return 'print'

    # Check for block
    empty_line_count = len(re.findall(r'(?mu)^\s*$', txt))
    if empty_line_count / float(txt_line_count) >= .25:
        return 'block'

    # Nothing else matched so assume single.
    return 'single'