mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TXT Input: Paragraph detection tweaks
This commit is contained in:
commit
4a61dd50b7
@ -78,6 +78,8 @@ class DocAnalysis(object):
|
|||||||
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
|
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
|
||||||
elif format == 'spanned_html':
|
elif format == 'spanned_html':
|
||||||
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
|
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
|
||||||
|
elif format == 'txt':
|
||||||
|
linere = re.compile('.*?\n', re.DOTALL)
|
||||||
self.lines = linere.findall(raw)
|
self.lines = linere.findall(raw)
|
||||||
|
|
||||||
def line_length(self, percent):
|
def line_length(self, percent):
|
||||||
|
@ -154,7 +154,7 @@ class PreProcessor(object):
|
|||||||
default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
|
default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
|
||||||
|
|
||||||
chapter_types = [
|
chapter_types = [
|
||||||
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
|
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
|
||||||
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
|
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
|
||||||
[r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
|
[r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
|
||||||
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
|
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
|
||||||
@ -184,7 +184,22 @@ class PreProcessor(object):
|
|||||||
self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
|
self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
def punctuation_unwrap(self, length, content, format):
    """Join hard-wrapped lines whose ending suggests the sentence continues.

    A line break is replaced by a single space only when it is preceded,
    on the same line, by at least ``length`` characters followed by one
    character that rarely closes a paragraph: a lower-case (optionally
    accented) letter, a comma, a colon, a closing parenthesis, ``I``,
    ``A``, or a semicolon that is not the end of a five-character entity
    such as ``&nbsp;``.

    :param length: minimum number of characters that must precede the
        final character of a line for the break to be removed.
    :param content: the text or markup to process.
    :param format: ``'txt'`` treats optional trailing spaces/tabs plus a
        newline as the break; any other value treats closing and
        re-opening ``span``/``p``/``div`` tags (with up to three empty
        blocks in between) as the break.
    :return: ``content`` with every qualifying break replaced by a space.
    """
    # Raw strings keep the regex escapes (\s, \w, \&) away from Python's
    # own string-escape processing.  The character class lists I and A
    # plainly: the former "\I" is an unknown escape that the re module
    # rejects on Python 3.7+.
    # (?<!\&\w{4}); is a semicolon that is not part of an entity.
    lookbehind = (
        r"(?<=.{%d}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)IA]"
        r"|(?<!\&\w{4});))" % int(length)
    )

    if format == 'txt':
        # Optional trailing spaces or tabs followed by the hard newline.
        unwrap_regex = lookbehind + r"[ \t]*\n"
    else:
        line_ending = r"\s*</(span|p|div)>\s*(</(p|span|div)>)?"
        blanklines = (
            r"\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*"
            r"(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)"
            r"</(span|p|div)>\s*){0,3}\s*"
        )
        line_opening = r"<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
        unwrap_regex = lookbehind + line_ending + blanklines + line_opening

    unwrap = re.compile(unwrap_regex, re.UNICODE)
    return unwrap.sub(' ', content)
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, html):
|
def __call__(self, html):
|
||||||
self.log("********* Preprocessing HTML *********")
|
self.log("********* Preprocessing HTML *********")
|
||||||
@ -194,7 +209,7 @@ class PreProcessor(object):
|
|||||||
totalwords = 0
|
totalwords = 0
|
||||||
totalwords = self.get_word_count(html)
|
totalwords = self.get_word_count(html)
|
||||||
|
|
||||||
if totalwords < 20:
|
if totalwords < 50:
|
||||||
self.log("not enough text, not preprocessing")
|
self.log("not enough text, not preprocessing")
|
||||||
return html
|
return html
|
||||||
|
|
||||||
@ -312,8 +327,7 @@ class PreProcessor(object):
|
|||||||
self.log("Done dehyphenating")
|
self.log("Done dehyphenating")
|
||||||
# Unwrap lines using punctuation and line length
|
# Unwrap lines using punctuation and line length
|
||||||
#unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
|
#unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
|
||||||
unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
html = self.punctuation_unwrap(length, html, 'html')
|
||||||
html = unwrap.sub(' ', html)
|
|
||||||
#check any remaining hyphens, but only unwrap if there is a match
|
#check any remaining hyphens, but only unwrap if there is a match
|
||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator()
|
||||||
html = dehyphenator(html,'html_cleanup', length)
|
html = dehyphenator(html,'html_cleanup', length)
|
||||||
|
@ -90,11 +90,21 @@ class TXTInput(InputFormatPlugin):
|
|||||||
|
|
||||||
# We don't check for block because the processor assumes block.
|
# We don't check for block because the processor assumes block.
|
||||||
# single and print are transformed to block for processing.
|
# single and print are transformed to block for processing.
|
||||||
if options.paragraph_type == 'single':
|
if options.paragraph_type == 'single' or 'unformatted':
|
||||||
txt = separate_paragraphs_single_line(txt)
|
txt = separate_paragraphs_single_line(txt)
|
||||||
elif options.paragraph_type == 'print':
|
elif options.paragraph_type == 'print':
|
||||||
txt = separate_paragraphs_print_formatted(txt)
|
txt = separate_paragraphs_print_formatted(txt)
|
||||||
|
|
||||||
|
if options.paragraph_type == 'unformatted':
|
||||||
|
from calibre.ebooks.conversion.utils import PreProcessor
|
||||||
|
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
||||||
|
# get length
|
||||||
|
docanalysis = DocAnalysis('txt', txt)
|
||||||
|
length = docanalysis.line_length(.5)
|
||||||
|
# unwrap lines based on punctuation
|
||||||
|
preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
|
||||||
|
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
||||||
|
|
||||||
flow_size = getattr(options, 'flow_size', 0)
|
flow_size = getattr(options, 'flow_size', 0)
|
||||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@ import os, re
|
|||||||
from calibre import prepare_string_for_xml, isbytestring
|
from calibre import prepare_string_for_xml, isbytestring
|
||||||
from calibre.ebooks.markdown import markdown
|
from calibre.ebooks.markdown import markdown
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
|
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
@ -101,27 +102,36 @@ def detect_paragraph_type(txt):
|
|||||||
single: Each line is a paragraph.
|
single: Each line is a paragraph.
|
||||||
print: Each paragraph starts with a 2+ spaces or a tab
|
print: Each paragraph starts with a 2+ spaces or a tab
|
||||||
and ends when a new paragraph is reached.
|
and ends when a new paragraph is reached.
|
||||||
markdown: Markdown formatting is in the document.
|
unformatted: most lines have hard line breaks, few/no spaces or indents
|
||||||
|
|
||||||
returns block, single, print, markdown
|
returns block, single, print, unformatted
|
||||||
'''
|
'''
|
||||||
txt = txt.replace('\r\n', '\n')
|
txt = txt.replace('\r\n', '\n')
|
||||||
txt = txt.replace('\r', '\n')
|
txt = txt.replace('\r', '\n')
|
||||||
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
|
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
|
||||||
|
|
||||||
# Check for print
|
# Check for hard line breaks - true if 55% of the doc breaks in the same region
|
||||||
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
|
docanalysis = DocAnalysis('txt', txt)
|
||||||
if tab_line_count / float(txt_line_count) >= .25:
|
hardbreaks = docanalysis.line_histogram(.55)
|
||||||
return 'print'
|
|
||||||
|
|
||||||
# Check for block
|
if hardbreaks:
|
||||||
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
|
# Check for print
|
||||||
if empty_line_count / float(txt_line_count) >= .25:
|
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
|
||||||
return 'block'
|
if tab_line_count / float(txt_line_count) >= .25:
|
||||||
|
return 'print'
|
||||||
|
|
||||||
|
# Check for block
|
||||||
|
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
|
||||||
|
if empty_line_count / float(txt_line_count) >= .25:
|
||||||
|
return 'block'
|
||||||
|
|
||||||
|
# Assume unformatted text with hardbreaks if nothing else matches
|
||||||
|
return 'unformatted'
|
||||||
|
|
||||||
# Nothing else matched to assume single.
|
# return single if hardbreaks is false
|
||||||
return 'single'
|
return 'single'
|
||||||
|
|
||||||
|
|
||||||
def detect_formatting_type(txt):
|
def detect_formatting_type(txt):
|
||||||
# Check for markdown
|
# Check for markdown
|
||||||
# Headings
|
# Headings
|
||||||
|
Loading…
x
Reference in New Issue
Block a user