Tie the line histogram into TXT paragraph structure detection

This commit is contained in:
ldolse 2011-01-07 14:19:12 +08:00
parent ccd1a633ef
commit dd96c645f0
3 changed files with 24 additions and 11 deletions

View File

@ -78,6 +78,8 @@ class DocAnalysis(object):
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL) linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
elif format == 'spanned_html': elif format == 'spanned_html':
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL) linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
elif format == 'txt':
linere = re.compile('.*?\n', re.DOTALL)
self.lines = linere.findall(raw) self.lines = linere.findall(raw)
def line_length(self, percent): def line_length(self, percent):

View File

@ -90,7 +90,7 @@ class TXTInput(InputFormatPlugin):
# We don't check for block because the processor assumes block. # We don't check for block because the processor assumes block.
# single and print are transformed to block for processing. # single and print are transformed to block for processing.
if options.paragraph_type == 'single': if options.paragraph_type == 'single' or 'unformatted':
txt = separate_paragraphs_single_line(txt) txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print': elif options.paragraph_type == 'print':
txt = separate_paragraphs_print_formatted(txt) txt = separate_paragraphs_print_formatted(txt)

View File

@ -9,6 +9,7 @@ import os, re
from calibre import prepare_string_for_xml, isbytestring from calibre import prepare_string_for_xml, isbytestring
from calibre.ebooks.markdown import markdown from calibre.ebooks.markdown import markdown
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.conversion.preprocess import DocAnalysis
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
@ -102,26 +103,36 @@ def detect_paragraph_type(txt):
print: Each paragraph starts with 2+ spaces or a tab print: Each paragraph starts with 2+ spaces or a tab
and ends when a new paragraph is reached. and ends when a new paragraph is reached.
markdown: Markdown formatting is in the document. markdown: Markdown formatting is in the document.
unformatted: most lines have hard line breaks, few/no spaces or indents
returns block, single, print, markdown returns block, single, print, markdown, unformatted
''' '''
txt = txt.replace('\r\n', '\n') txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n') txt = txt.replace('\r', '\n')
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt)) txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
# Check for print # Check for hard line breaks - true if 55% of the doc breaks in the same region
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) docanalysis = DocAnalysis('txt', txt)
if tab_line_count / float(txt_line_count) >= .25: hardbreaks = docanalysis.line_histogram(.55)
return 'print'
# Check for block if hardbreaks:
empty_line_count = len(re.findall('(?mu)^\s*$', txt)) # Check for print
if empty_line_count / float(txt_line_count) >= .25: tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
return 'block' if tab_line_count / float(txt_line_count) >= .25:
return 'print'
# Check for block
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
if empty_line_count / float(txt_line_count) >= .25:
return 'block'
# Assume unformatted text with hardbreaks if nothing else matches
return 'unformatted'
# Nothing else matched to assume single. # return single if hardbreaks is false
return 'single' return 'single'
def detect_formatting_type(txt): def detect_formatting_type(txt):
# Check for markdown # Check for markdown
# Headings # Headings