From da679b885faf07c3218946d072b529259c5e6955 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 7 Jan 2011 11:26:45 +0800
Subject: [PATCH 1/5] chapter heading tweaks

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 4bb96ac088..2090cff12d 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -153,7 +153,7 @@ class PreProcessor(object):
         default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
         
         chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
             [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
             [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
             [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"],  # Spaced Lettering

From dd96c645f020cd57682bbeba8501c21b8b77b0b9 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 7 Jan 2011 14:19:12 +0800
Subject: [PATCH 2/5] tied line histogram into txt paragraph structure
 detection

---
 src/calibre/ebooks/conversion/preprocess.py |  2 ++
 src/calibre/ebooks/txt/input.py             |  2 +-
 src/calibre/ebooks/txt/processor.py         | 31 ++++++++++++++-------
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 29006ffd9b..97aaa653a9 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -78,6 +78,8 @@ class DocAnalysis(object):
             linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
         elif format == 'spanned_html':
             linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+        elif format == 'txt':
+            linere = re.compile('.*?\n', re.DOTALL)
         self.lines = linere.findall(raw)
 
     def line_length(self, percent):
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 47e92a45a9..7fb22755de 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -90,7 +90,7 @@ class TXTInput(InputFormatPlugin):
             
             # We don't check for block because the processor assumes block.
             # single and print at transformed to block for processing.
-            if options.paragraph_type == 'single':
+            if options.paragraph_type in ('single', 'unformatted'):  # both types go through the single-line separator
                 txt = separate_paragraphs_single_line(txt)
             elif options.paragraph_type == 'print':
                 txt = separate_paragraphs_print_formatted(txt)
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index f6d628e7c5..53935584d2 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -9,6 +9,7 @@ import os, re
 from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.markdown import markdown
 from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.conversion.preprocess import DocAnalysis
 
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
@@ -102,26 +103,36 @@ def detect_paragraph_type(txt):
     print: Each paragraph starts with a 2+ spaces or a tab
            and ends when a new paragraph is reached.
     markdown: Markdown formatting is in the document.
+    unformatted: most lines have hard line breaks, few/no spaces or indents
     
-    returns block, single, print, markdown
+    returns block, single, print, markdown, unformatted
     '''
     txt = txt.replace('\r\n', '\n')
     txt = txt.replace('\r', '\n')
     txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
     
-    # Check for print
-    tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
-    if tab_line_count / float(txt_line_count) >= .25:
-        return 'print'
+    # Check for hard line breaks - true if 55% of the doc breaks in the same region
+    docanalysis = DocAnalysis('txt', txt)
+    hardbreaks = docanalysis.line_histogram(.55)
     
-    # Check for block
-    empty_line_count = len(re.findall('(?mu)^\s*$', txt))
-    if empty_line_count / float(txt_line_count) >= .25:
-        return 'block'
+    if hardbreaks:
+        # Check for print
+        tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
+        if tab_line_count / float(txt_line_count) >= .25:
+            return 'print'
+        
+        # Check for block
+        empty_line_count = len(re.findall('(?mu)^\s*$', txt))
+        if empty_line_count / float(txt_line_count) >= .25:
+            return 'block'
+
+        # Assume unformatted text with hardbreaks if nothing else matches        
+        return 'unformatted'
     
-    # Nothing else matched to assume single.
+    # return single if hardbreaks is false
     return 'single'
 
+
 def detect_formatting_type(txt):
     # Check for markdown
     # Headings

From 90177a42053f29c302faf7483de6dd3fc455d400 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 8 Jan 2011 09:23:32 +0800
Subject: [PATCH 3/5] tweaked threshold for preprocess

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index f367aa02d7..5db920b01d 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -194,7 +194,7 @@ class PreProcessor(object):
         totalwords = 0
         totalwords = self.get_word_count(html)
 
-        if totalwords < 20:
+        if totalwords < 50:
             self.log("not enough text, not preprocessing")
             return html
 

From 5854f5308e46d1be747cf85d789d9ca9de78e80b Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 8 Jan 2011 19:45:51 +0800
Subject: [PATCH 4/5] moved punctuation unwrap into a function, tied to txt
 input

---
 src/calibre/ebooks/conversion/utils.py | 20 +++++++++++++++++---
 src/calibre/ebooks/txt/input.py        | 10 ++++++++++
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 5db920b01d..27dacdf5fb 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -184,7 +184,22 @@ class PreProcessor(object):
         self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
         return html
 
-
+    def punctuation_unwrap(self, length, content, format):
+        '''Return content with wrapped lines re-joined: a break preceded by at least length characters and an unwrap character is replaced by one space. format selects the txt pattern; any other value uses the html one.'''
+        lookahead = u"(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))" # unicode literal: the class holds non-ASCII letters; (?<!\&\w{4});) is a semicolon not part of an entity
+        line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
+        blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
+        line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
+        txt_line_wrap = u"(\u0020|\u0009)*\n"
+        # html break: closing tag(s), up to three empty blocks, opening tag(s); txt break: trailing spaces/tabs and the newline
+        unwrap_regex = lookahead+line_ending+blanklines+line_opening
+        if format == 'txt':
+            unwrap_regex = lookahead+txt_line_wrap
+        # note: "lookahead" is really a lookbehind assertion anchoring the break to the end of a long-enough line
+        unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
+        content = unwrap.sub(' ', content)
+        return content
+       
 
     def __call__(self, html):
         self.log("*********  Preprocessing HTML  *********")
@@ -312,8 +327,7 @@ class PreProcessor(object):
             self.log("Done dehyphenating")
             # Unwrap lines using punctation and line length
             #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
-            unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-            html = unwrap.sub(' ', html)
+            html = self.punctuation_unwrap(length, html, 'html')
             #check any remaining hyphens, but only unwrap if there is a match
             dehyphenator = Dehyphenator()
             html = dehyphenator(html,'html_cleanup', length)
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 7fb22755de..98756c5fa1 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -95,6 +95,16 @@ class TXTInput(InputFormatPlugin):
             elif options.paragraph_type == 'print':
                 txt = separate_paragraphs_print_formatted(txt)
 
+            if options.paragraph_type == 'unformatted':
+                from calibre.ebooks.conversion.utils import PreProcessor
+                from calibre.ebooks.conversion.preprocess import DocAnalysis
+                # get length
+                docanalysis = DocAnalysis('txt', txt)
+                length = docanalysis.line_length(.5)
+                # unwrap lines based on punctuation
+                preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
+                txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
+
             flow_size = getattr(options, 'flow_size', 0)
             html = convert_basic(txt, epub_split_size_kb=flow_size)
 

From f88045c16266474ed625a0e38b0a9fa12aded75d Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 8 Jan 2011 20:35:19 +0800
Subject: [PATCH 5/5] fixed comments

---
 src/calibre/ebooks/txt/processor.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 53935584d2..c6cf1078cd 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -102,10 +102,9 @@ def detect_paragraph_type(txt):
     single: Each line is a paragraph.
     print: Each paragraph starts with a 2+ spaces or a tab
            and ends when a new paragraph is reached.
-    markdown: Markdown formatting is in the document.
     unformatted: most lines have hard line breaks, few/no spaces or indents
     
-    returns block, single, print, markdown, unformatted
+    returns block, single, print, unformatted
     '''
     txt = txt.replace('\r\n', '\n')
     txt = txt.replace('\r', '\n')