mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Tweaked the auto-detection to handle cases where the vast majority of the lines are formatted as block or print.
This commit is contained in:
parent
696d925232
commit
0f109d699f
@ -133,15 +133,21 @@ def detect_paragraph_type(txt):
|
|||||||
hardbreaks = docanalysis.line_histogram(.55)
|
hardbreaks = docanalysis.line_histogram(.55)
|
||||||
|
|
||||||
if hardbreaks:
|
if hardbreaks:
|
||||||
# Check for print
|
# Determine print percentage
|
||||||
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
|
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
|
||||||
if tab_line_count / float(txt_line_count) >= .15:
|
print_percent = tab_line_count / float(txt_line_count)
|
||||||
return 'print'
|
|
||||||
|
# Determine block percentage
|
||||||
# Check for block
|
|
||||||
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
|
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
|
||||||
if empty_line_count / float(txt_line_count) >= .15:
|
block_percent = empty_line_count / float(txt_line_count)
|
||||||
return 'block'
|
|
||||||
|
# Compare the two types - the type with the larger number of instances wins
|
||||||
|
# in cases where only one or the other represents the vast majority of the document neither wins
|
||||||
|
if print_percent >= block_percent:
|
||||||
|
if .15 <= print_percent <= .75:
|
||||||
|
return 'print'
|
||||||
|
elif .15 <= block_percent <= .75:
|
||||||
|
return 'block'
|
||||||
|
|
||||||
# Assume unformatted text with hardbreaks if nothing else matches
|
# Assume unformatted text with hardbreaks if nothing else matches
|
||||||
return 'unformatted'
|
return 'unformatted'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user