mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
tied line histogram into txt paragraph structure detection
This commit is contained in:
parent
ccd1a633ef
commit
dd96c645f0
@ -78,6 +78,8 @@ class DocAnalysis(object):
|
||||
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
|
||||
elif format == 'spanned_html':
|
||||
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
|
||||
elif format == 'txt':
|
||||
linere = re.compile('.*?\n', re.DOTALL)
|
||||
self.lines = linere.findall(raw)
|
||||
|
||||
def line_length(self, percent):
|
||||
|
@ -90,7 +90,7 @@ class TXTInput(InputFormatPlugin):
|
||||
|
||||
# We don't check for block because the processor assumes block.
|
||||
# single and print are transformed to block for processing.
|
||||
if options.paragraph_type == 'single':
|
||||
if options.paragraph_type == 'single' or 'unformatted':
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
elif options.paragraph_type == 'print':
|
||||
txt = separate_paragraphs_print_formatted(txt)
|
||||
|
@ -9,6 +9,7 @@ import os, re
|
||||
from calibre import prepare_string_for_xml, isbytestring
|
||||
from calibre.ebooks.markdown import markdown
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
@ -102,26 +103,36 @@ def detect_paragraph_type(txt):
|
||||
print: Each paragraph starts with 2+ spaces or a tab
|
||||
and ends when a new paragraph is reached.
|
||||
markdown: Markdown formatting is in the document.
|
||||
unformatted: most lines have hard line breaks, few/no spaces or indents
|
||||
|
||||
returns block, single, print, markdown
|
||||
returns block, single, print, markdown, unformatted
|
||||
'''
|
||||
txt = txt.replace('\r\n', '\n')
|
||||
txt = txt.replace('\r', '\n')
|
||||
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
|
||||
|
||||
# Check for print
|
||||
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
|
||||
if tab_line_count / float(txt_line_count) >= .25:
|
||||
return 'print'
|
||||
# Check for hard line breaks - true if 55% of the doc breaks in the same region
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
hardbreaks = docanalysis.line_histogram(.55)
|
||||
|
||||
# Check for block
|
||||
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
|
||||
if empty_line_count / float(txt_line_count) >= .25:
|
||||
return 'block'
|
||||
if hardbreaks:
|
||||
# Check for print
|
||||
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
|
||||
if tab_line_count / float(txt_line_count) >= .25:
|
||||
return 'print'
|
||||
|
||||
# Nothing else matched to assume single.
|
||||
# Check for block
|
||||
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
|
||||
if empty_line_count / float(txt_line_count) >= .25:
|
||||
return 'block'
|
||||
|
||||
# Assume unformatted text with hardbreaks if nothing else matches
|
||||
return 'unformatted'
|
||||
|
||||
# return single if hardbreaks is false
|
||||
return 'single'
|
||||
|
||||
|
||||
def detect_formatting_type(txt):
|
||||
# Check for markdown
|
||||
# Headings
|
||||
|
Loading…
x
Reference in New Issue
Block a user