tied line histogram into txt paragraph structure detection

2025-07-09 03:04:10 -04:00 · 2011-01-07 14:19:12 +08:00 · 2011-01-07 14:19:12 +08:00 · dd96c645f0
commit dd96c645f0
parent ccd1a633ef
3 changed files with 24 additions and 11 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -78,6 +78,8 @@ class DocAnalysis(object):
            linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
        elif format == 'spanned_html':
            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+        elif format == 'txt':
+            linere = re.compile('.*?\n', re.DOTALL)
        self.lines = linere.findall(raw)

    def line_length(self, percent):
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -90,7 +90,7 @@ class TXTInput(InputFormatPlugin):
            
            # We don't check for block because the processor assumes block.
            # single and print at transformed to block for processing.
-            if options.paragraph_type == 'single':
+            if options.paragraph_type in ('single', 'unformatted'):  # both are split per line
                txt = separate_paragraphs_single_line(txt)
            elif options.paragraph_type == 'print':
                txt = separate_paragraphs_print_formatted(txt)
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -9,6 +9,7 @@ import os, re
 from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.markdown import markdown
 from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.conversion.preprocess import DocAnalysis

 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
@@ -102,26 +103,36 @@ def detect_paragraph_type(txt):
    print: Each paragraph starts with a 2+ spaces or a tab
           and ends when a new paragraph is reached.
    markdown: Markdown formatting is in the document.
+    unformatted: most lines have hard line breaks, few/no spaces or indents
    
-    returns block, single, print, markdown
+    returns block, single, print, markdown, unformatted
    '''
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
    txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
    
-    # Check for print
-    tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
-    if tab_line_count / float(txt_line_count) >= .25:
-        return 'print'
+    # Check for hard line breaks - true if 55% of the doc breaks in the same region
+    docanalysis = DocAnalysis('txt', txt)
+    hardbreaks = docanalysis.line_histogram(.55)
    
-    # Check for block
-    empty_line_count = len(re.findall('(?mu)^\s*$', txt))
-    if empty_line_count / float(txt_line_count) >= .25:
-        return 'block'
+    if hardbreaks:
+        # Check for print
+        tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
+        if tab_line_count / float(txt_line_count) >= .25:
+            return 'print'
        
-    # Nothing else matched to assume single.
+        # Check for block
+        empty_line_count = len(re.findall('(?mu)^\s*$', txt))
+        if empty_line_count / float(txt_line_count) >= .25:
+            return 'block'
+
+        # Assume unformatted text with hardbreaks if nothing else matches        
+        return 'unformatted'
+    
+    # return single if hardbreaks is false
    return 'single'

+
 def detect_formatting_type(txt):
    # Check for markdown
    # Headings