diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 29006ffd9b..97aaa653a9 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -78,6 +78,8 @@ class DocAnalysis(object):
             linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
elif format == 'spanned_html':
             linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+ elif format == 'txt':
+ linere = re.compile('.*?\n', re.DOTALL)
self.lines = linere.findall(raw)
def line_length(self, percent):
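
The new 'txt' branch treats every chunk of text up to a newline as a line, so DocAnalysis can measure plain text the same way it already measures html/pdf/spanned_html input; the txt input plugin below asks for line_length(.5) to get a typical wrapped-line length. A minimal standalone sketch of that idea (txt_line_lengths and percentile_length are hypothetical names local to this example, and percentile_length is a simplified stand-in for DocAnalysis.line_length, not the actual implementation):

import re

def txt_line_lengths(raw):
    # Same split as the new 'txt' branch: every chunk up to a newline is a line.
    lines = re.compile('.*?\n', re.DOTALL).findall(raw)
    return [len(line.strip()) for line in lines if line.strip()]

def percentile_length(raw, percent=0.5):
    # Simplified stand-in for DocAnalysis.line_length(percent): pick the line
    # length at the given percentile; this later serves as the "long enough to
    # be hard-wrapped" threshold when unwrapping.
    lengths = sorted(txt_line_lengths(raw))
    if not lengths:
        return 0
    return lengths[min(int(len(lengths) * percent), len(lengths) - 1)]

sample = "This is a hard-wrapped\nparagraph of plain text\nthat spans three lines.\n"
print(percentile_length(sample))  # prints a typical line length for the sample
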
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 1bb232c911..27dacdf5fb 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -154,7 +154,7 @@ class PreProcessor(object):
default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?([ibu][^>]*>)?(?=<)"
chapter_types = [
- [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+ [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
[r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, "Searching for emphasized lines"], # Emphasized lines
[r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
@@ -184,7 +184,22 @@ class PreProcessor(object):
self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
return html
-
+ def punctuation_unwrap(self, length, content, format):
+ # define the pieces of the regex
+ lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?\s*((p|span|div)>)?"
+ blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*"
+ line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
+ txt_line_wrap = u"(\u0020|\u0009)*\n"
+
+ unwrap_regex = lookahead+line_ending+blanklines+line_opening
+ if format == 'txt':
+ unwrap_regex = lookahead+txt_line_wrap
+
+ unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
+ content = unwrap.sub(' ', content)
+ return content
+
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
@@ -194,7 +209,7 @@ class PreProcessor(object):
totalwords = 0
totalwords = self.get_word_count(html)
- if totalwords < 20:
+ if totalwords < 50:
self.log("not enough text, not preprocessing")
return html
@@ -312,8 +327,7 @@ class PreProcessor(object):
self.log("Done dehyphenating")
             # Unwrap lines using punctuation and line length
             #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
-            unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
- html = unwrap.sub(' ', html)
+ html = self.punctuation_unwrap(length, html, 'html')
#check any remaining hyphens, but only unwrap if there is a match
dehyphenator = Dehyphenator()
html = dehyphenator(html,'html_cleanup', length)
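
punctuation_unwrap now takes a format argument: for html it assembles the original lookbehind plus the tag-aware line_ending/blanklines/line_opening pieces, while for plain text it only needs the lookbehind plus optional trailing whitespace and a newline. A rough standalone sketch of the plain-text branch, with a deliberately reduced character class so the example stays short (unwrap_txt is a hypothetical helper, not the method above):

import re

def unwrap_txt(content, length):
    # A line break is removed only when at least `length` characters precede it
    # on the same line and the last character suggests the sentence continues
    # (lowercase letter, comma, colon or closing parenthesis in this sketch).
    lookahead = '(?<=.{' + str(length) + '}[a-z,:)])'
    txt_line_wrap = '[ \t]*\n'
    return re.compile(lookahead + txt_line_wrap, re.UNICODE).sub(' ', content)

wrapped = "The quick brown fox jumps over\nthe lazy dog, then keeps\nrunning.\nShort line\n"
print(unwrap_txt(wrapped, 20))
# -> The quick brown fox jumps over the lazy dog, then keeps running.
#    Short line
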
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 47e92a45a9..98756c5fa1 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -90,11 +90,21 @@ class TXTInput(InputFormatPlugin):
# We don't check for block because the processor assumes block.
         # single and print are transformed to block for processing.
- if options.paragraph_type == 'single':
+        if options.paragraph_type in ('single', 'unformatted'):
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print':
txt = separate_paragraphs_print_formatted(txt)
+ if options.paragraph_type == 'unformatted':
+ from calibre.ebooks.conversion.utils import PreProcessor
+ from calibre.ebooks.conversion.preprocess import DocAnalysis
+ # get length
+ docanalysis = DocAnalysis('txt', txt)
+ length = docanalysis.line_length(.5)
+ # unwrap lines based on punctuation
+ preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
+ txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
+
flow_size = getattr(options, 'flow_size', 0)
html = convert_basic(txt, epub_split_size_kb=flow_size)
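
For 'unformatted' text the plugin now splits every line into its own paragraph first, measures a typical line length with DocAnalysis, and then rejoins the lines that clearly end mid-sentence. A self-contained sketch of that order of operations (separate_single_line and unwrap_on_punctuation are simplified local stand-ins for separate_paragraphs_single_line and punctuation_unwrap, not the calibre implementations):

import re

def separate_single_line(txt):
    # every line becomes its own paragraph (simplified single-line split)
    return re.sub('(?<=.)\n(?=.)', '\n\n', txt)

def unwrap_on_punctuation(txt, length):
    # rejoin paragraphs whose first part is long and ends mid-sentence
    return re.sub('(?<=.{%i}[a-z,:)])[ \t]*\n+' % length, ' ', txt)

raw = "A hard wrapped sentence that keeps\ngoing on the next line.\nA short title\n"
txt = separate_single_line(raw)
txt = unwrap_on_punctuation(txt, 20)
print(txt)
# The wrapped sentence is rejoined into one paragraph; "A short title" stays on
# its own because it is shorter than the measured line length.
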
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index f6d628e7c5..c6cf1078cd 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -9,6 +9,7 @@ import os, re
from calibre import prepare_string_for_xml, isbytestring
from calibre.ebooks.markdown import markdown
from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.conversion.preprocess import DocAnalysis
__license__ = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
@@ -101,27 +102,36 @@ def detect_paragraph_type(txt):
single: Each line is a paragraph.
     print: Each paragraph starts with 2+ spaces or a tab
and ends when a new paragraph is reached.
- markdown: Markdown formatting is in the document.
+    unformatted: most lines have hard line breaks, few/no empty lines or indents
- returns block, single, print, markdown
+ returns block, single, print, unformatted
'''
txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n')
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
- # Check for print
- tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
- if tab_line_count / float(txt_line_count) >= .25:
- return 'print'
+ # Check for hard line breaks - true if 55% of the doc breaks in the same region
+ docanalysis = DocAnalysis('txt', txt)
+ hardbreaks = docanalysis.line_histogram(.55)
- # Check for block
- empty_line_count = len(re.findall('(?mu)^\s*$', txt))
- if empty_line_count / float(txt_line_count) >= .25:
- return 'block'
+ if hardbreaks:
+ # Check for print
+ tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
+ if tab_line_count / float(txt_line_count) >= .25:
+ return 'print'
+
+ # Check for block
+ empty_line_count = len(re.findall('(?mu)^\s*$', txt))
+ if empty_line_count / float(txt_line_count) >= .25:
+ return 'block'
+
+ # Assume unformatted text with hardbreaks if nothing else matches
+ return 'unformatted'
- # Nothing else matched to assume single.
+ # return single if hardbreaks is false
return 'single'
+
def detect_formatting_type(txt):
# Check for markdown
# Headings
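
The reworked detect_paragraph_type above first asks whether the document is hard-wrapped at all (line_histogram returns True when at least 55% of the measured lines land in one length bucket) and only then applies the old print/block ratios, with 'unformatted' as the new fallback; without hard breaks it returns 'single'. A compact standalone sketch of that decision order (has_hard_breaks and guess_paragraph_type are hypothetical names, and has_hard_breaks is a simplified stand-in for DocAnalysis.line_histogram):

import re

def has_hard_breaks(txt, percent=0.55):
    # If most non-trivial lines fall into the same 100-character length bucket,
    # the text was probably wrapped at a fixed width.
    lines = [line for line in txt.splitlines() if 20 < len(line) < 1900]
    if not lines:
        return False
    buckets = {}
    for line in lines:
        buckets[len(line) // 100] = buckets.get(len(line) // 100, 0) + 1
    return max(buckets.values()) / float(len(lines)) >= percent

def guess_paragraph_type(txt):
    # Mirrors the decision order above: print, then block, then unformatted,
    # falling back to single when there is no sign of hard wrapping.
    txt = txt.replace('\r\n', '\n').replace('\r', '\n')
    line_count = len(re.findall(r'(?mu)^\s*.+$', txt))
    if has_hard_breaks(txt):
        if len(re.findall(r'(?mu)^(\t|\s{2,}).+$', txt)) / float(line_count) >= .25:
            return 'print'
        if len(re.findall(r'(?mu)^\s*$', txt)) / float(line_count) >= .25:
            return 'block'
        return 'unformatted'
    return 'single'
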