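Tweaked the paragraph-type auto-detection to handle cases where the vast majority of the lines are formatted as block or print: the more frequent type is chosen only when it covers between 15% and 75% of the lines.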

2025-07-09 03:04:10 -04:00 · 2011-01-09 18:38:52 +08:00 · 2011-01-09 18:38:52 +08:00 · 0f109d699f
commit 0f109d699f
parent 696d925232
1 changed files with 13 additions and 7 deletions
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -133,14 +133,20 @@ def detect_paragraph_type(txt):
    hardbreaks = docanalysis.line_histogram(.55)
    
    if hardbreaks:
-        # Check for print
+        # Determine print percentage
        tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
-        if tab_line_count / float(txt_line_count) >= .15:
-            return 'print'
+        print_percent = tab_line_count / float(txt_line_count)
     
-        # Check for block
+        # Determine block percentage
        empty_line_count = len(re.findall('(?mu)^\s*$', txt))
-        if empty_line_count / float(txt_line_count) >= .15:
+        block_percent = empty_line_count / float(txt_line_count)
+        
+        # Compare the two types - the type with the larger share of lines wins (print wins a tie).
+        # If that type covers less than 15% or more than 75% of the lines (the vast majority), neither wins.
+        if print_percent >= block_percent:
+            if .15 <= print_percent <= .75:
+                return 'print'
+        elif .15 <= block_percent <= .75:
            return 'block'     

        # Assume unformatted text with hardbreaks if nothing else matches