From 843e1f2068cf1707f7f002be7c05c37282e9fa36 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 8 Jan 2011 13:17:32 -0500
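Subject: [PATCH 1/4] TXT Input: Basic heuristic processor.

Add TXTHeuristicProcessor and a new 'heuristic' formatting type. It
italicizes common abbreviations and text wrapped in italic-style markers,
and marks paragraphs of at most 40 characters that contain only letters,
digits, spaces, colons and hyphens as headings.

Split convert_basic into the reusable helpers clean_txt and split_txt so
the heuristic processor can share them.

Also lower the print and block paragraph detection thresholds from 25% to
15%, and make formatting auto-detection fall back to 'heuristic' instead
of 'none'.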

---
 src/calibre/ebooks/txt/heuristicprocessor.py | 88 ++++++++++++++++++++
 src/calibre/ebooks/txt/input.py              | 12 ++-
 src/calibre/ebooks/txt/processor.py          | 23 ++++-
 3 files changed, 116 insertions(+), 7 deletions(-)
 create mode 100644 src/calibre/ebooks/txt/heuristicprocessor.py
diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
new file mode 100644
index 0000000000..cbfa33a96a
--- /dev/null
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import re
+import string
+
+from calibre import prepare_string_for_xml
+from calibre.ebooks.unidecode.unidecoder import Unidecoder
+
+class TXTHeuristicProcessor(object):
+
+    def __init__(self):
+        self.ITALICIZE_WORDS = [
+            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
+            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
+            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
+            'Mlle.', 'Mons.', 'PS.', 'PPS.', 
+        ]
+        self.ITALICIZE_STYLE_PATS = [
+            r'(?msu)_(?P<words>.+?)_',
+            r'(?msu)/(?P<words>.+?)/',
+            r'(?msu)~~(?P<words>.+?)~~',
+            r'(?msu)\*(?P<words>.+?)\*',
+            r'(?msu)~(?P<words>.+?)~',
+            r'(?msu)_/(?P<words>.+?)/_',
+            r'(?msu)_\*(?P<words>.+?)\*_',
+            r'(?msu)\*/(?P<words>.+?)/\*',
+            r'(?msu)_\*/(?P<words>.+?)/\*_',
+            r'(?msu)/:(?P<words>.+?):/',
+            r'(?msu)\|:(?P<words>.+?):\|',
+        ]
+
+    def del_maketrans(self, deletechars):
+        return dict([(ord(x), u'') for x in deletechars])
+
+    def is_heading(self, line):
+        if not line:
+            return False
+        if len(line) > 40:
+            return False
+        
+        line = Unidecoder().decode(line)
+
+        # punctuation.
+        if line.translate(self.del_maketrans(string.letters + string.digits + ' :-')):
+            return False
+        
+        # All upper case.
+        #if line.isupper():
+        #    return True
+        # Roman numerals.
+        #if not line.translate(self.del_maketrans('IVXYCivxyc ')):
+        #    return True
+        
+        return True
+
+    def process_paragraph(self, paragraph):
+        for word in self.ITALICIZE_WORDS:
+            paragraph = paragraph.replace(word, '<i>%s</i>' % word)
+        for pat in self.ITALICIZE_STYLE_PATS:
+            paragraph = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
+        return paragraph
+
+    def convert(self, txt, title='', epub_split_size_kb=0):
+        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
+        txt = clean_txt(txt)
+        txt = split_txt(txt, epub_split_size_kb)
+        
+        processed = []
+        last_was_heading = False
+        for line in txt.split('\n\n'):
+            if self.is_heading(line):
+                if not last_was_heading:
+                    processed.append(u'<h1>%s</h1>' % prepare_string_for_xml(line.replace('\n', ' ')))
+                else:
+                    processed.append(u'<h2>%s</h2>' % prepare_string_for_xml(line.replace('\n', ' ')))
+                last_was_heading = True
+            else:
+                processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
+                last_was_heading = False
+                
+        txt = u'\n'.join(processed)
+        txt = re.sub('[ ]{2,}', ' ', txt)
+
+        return HTML_TEMPLATE % (title, txt)
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 47e92a45a9..fd805f8ce8 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -10,7 +10,8 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
     separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
-    preserve_spaces, detect_paragraph_type, detect_formatting_type
+    preserve_spaces, detect_paragraph_type, detect_formatting_type, \
+    convert_heuristic
 from calibre import _ent_pat, xml_entity_to_unicode
 
 class TXTInput(InputFormatPlugin):
@@ -31,7 +32,7 @@ class TXTInput(InputFormatPlugin):
                    '* print:  Assume every line starting with 2+ spaces or a tab '
                    'starts a paragraph.')),
         OptionRecommendation(name='formatting_type', recommended_value='auto',
-            choices=['auto', 'none', 'markdown'],
+            choices=['auto', 'none', 'heuristic', 'markdown'],
             help=_('Formatting used within the document.'
                    '* auto: Try to auto detect the document formatting.\n'
                    '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
@@ -96,7 +97,12 @@ class TXTInput(InputFormatPlugin):
                 txt = separate_paragraphs_print_formatted(txt)
 
             flow_size = getattr(options, 'flow_size', 0)
-            html = convert_basic(txt, epub_split_size_kb=flow_size)
+            
+            if options.formatting_type == 'heuristic':
+                html = convert_heuristic(txt, epub_split_size_kb=flow_size)
+            else:
+                html = convert_basic(txt, epub_split_size_kb=flow_size)
+            
 
         from calibre.customize.ui import plugin_for_input_format
         html_input = plugin_for_input_format('html')
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index f6d628e7c5..79eee79c29 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -9,6 +9,7 @@ import os, re
 from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.markdown import markdown
 from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
 
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
@@ -16,7 +17,7 @@ __docformat__ = 'restructuredtext en'
 
 HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
 
-def convert_basic(txt, title='', epub_split_size_kb=0):
+def clean_txt(txt):
     if isbytestring(txt):
         txt = txt.decode('utf-8', 'replace')
     # Strip whitespace from the beginning and end of the line. Also replace
@@ -35,6 +36,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
     chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
     illegal_chars = re.compile(u'|'.join(map(unichr, chars)))
     txt = illegal_chars.sub('', txt)
+    
+    return txt
+
+def split_txt(txt, epub_split_size_kb=0):
     #Takes care if there is no point to split
     if epub_split_size_kb > 0:
         if isinstance(txt, unicode):
@@ -49,6 +54,12 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
     if isbytestring(txt):
         txt = txt.decode('utf-8')
 
+    return txt
+
+def convert_basic(txt, title='', epub_split_size_kb=0):
+    txt = clean_txt(txt)
+    txt = split_txt(txt, epub_split_size_kb)
+
     lines = []
     # Split into paragraphs based on having a blank line between text.
     for line in txt.split('\n\n'):
@@ -57,6 +68,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
 
     return HTML_TEMPLATE % (title, u'\n'.join(lines))
 
+def convert_heuristic(txt, title='', epub_split_size_kb=0):
+    tp = TXTHeuristicProcessor()
+    return tp.convert(txt, title, epub_split_size_kb)
+
 def convert_markdown(txt, title='', disable_toc=False):
     md = markdown.Markdown(
           extensions=['footnotes', 'tables', 'toc'],
@@ -111,12 +126,12 @@ def detect_paragraph_type(txt):
     
     # Check for print
     tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
-    if tab_line_count / float(txt_line_count) >= .25:
+    if tab_line_count / float(txt_line_count) >= .15:
         return 'print'
     
     # Check for block
     empty_line_count = len(re.findall('(?mu)^\s*$', txt))
-    if empty_line_count / float(txt_line_count) >= .25:
+    if empty_line_count / float(txt_line_count) >= .15:
         return 'block'
     
     # Nothing else matched to assume single.
@@ -143,4 +158,4 @@ def detect_formatting_type(txt):
         if txt.count('\\'+c) > 10:
             return 'markdown'
     
-    return 'none'
+    return 'heuristic'

From f593b2163154bcd61e21b0e06f8cf0e29514af86 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 8 Jan 2011 13:53:32 -0500
Subject: [PATCH 2/4] TXT Input: Tweak Heuristic italicizing.

---
 src/calibre/ebooks/txt/heuristicprocessor.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
index cbfa33a96a..b0bbd49961 100644
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@@ -21,15 +21,15 @@ class TXTHeuristicProcessor(object):
         ]
         self.ITALICIZE_STYLE_PATS = [
             r'(?msu)_(?P<words>.+?)_',
-            r'(?msu)/(?P<words>.+?)/',
+            r'(?msu)/(?P<words>[^<>]+?)/',
             r'(?msu)~~(?P<words>.+?)~~',
             r'(?msu)\*(?P<words>.+?)\*',
             r'(?msu)~(?P<words>.+?)~',
-            r'(?msu)_/(?P<words>.+?)/_',
+            r'(?msu)_/(?P<words>[^<>]+?)/_',
             r'(?msu)_\*(?P<words>.+?)\*_',
-            r'(?msu)\*/(?P<words>.+?)/\*',
-            r'(?msu)_\*/(?P<words>.+?)/\*_',
-            r'(?msu)/:(?P<words>.+?):/',
+            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
+            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
+            r'(?msu)/:(?P<words>[^<>]+?):/',
             r'(?msu)\|:(?P<words>.+?):\|',
         ]
 
@@ -84,5 +84,6 @@ class TXTHeuristicProcessor(object):
                 
         txt = u'\n'.join(processed)
         txt = re.sub('[ ]{2,}', ' ', txt)
+        print txt
 
         return HTML_TEMPLATE % (title, txt)

From c8f18ff02e32f56220f83872f4def00cca58e73d Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 8 Jan 2011 15:49:10 -0500
Subject: [PATCH 3/4] TXT Input: Heuristic processor, use PreProcessor to mark
 chapter headings.

---
 src/calibre/ebooks/txt/heuristicprocessor.py | 43 ++++----------------
 src/calibre/ebooks/txt/processor.py          |  3 --
 2 files changed, 7 insertions(+), 39 deletions(-)

diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
index b0bbd49961..c4489badc5 100644
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@@ -33,30 +33,6 @@ class TXTHeuristicProcessor(object):
             r'(?msu)\|:(?P<words>.+?):\|',
         ]
 
-    def del_maketrans(self, deletechars):
-        return dict([(ord(x), u'') for x in deletechars])
-
-    def is_heading(self, line):
-        if not line:
-            return False
-        if len(line) > 40:
-            return False
-        
-        line = Unidecoder().decode(line)
-
-        # punctuation.
-        if line.translate(self.del_maketrans(string.letters + string.digits + ' :-')):
-            return False
-        
-        # All upper case.
-        #if line.isupper():
-        #    return True
-        # Roman numerals.
-        #if not line.translate(self.del_maketrans('IVXYCivxyc ')):
-        #    return True
-        
-        return True
-
     def process_paragraph(self, paragraph):
         for word in self.ITALICIZE_WORDS:
             paragraph = paragraph.replace(word, '<i>%s</i>' % word)
@@ -70,20 +46,15 @@ class TXTHeuristicProcessor(object):
         txt = split_txt(txt, epub_split_size_kb)
         
         processed = []
-        last_was_heading = False
         for line in txt.split('\n\n'):
-            if self.is_heading(line):
-                if not last_was_heading:
-                    processed.append(u'<h1>%s</h1>' % prepare_string_for_xml(line.replace('\n', ' ')))
-                else:
-                    processed.append(u'<h2>%s</h2>' % prepare_string_for_xml(line.replace('\n', ' ')))
-                last_was_heading = True
-            else:
-                processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
-                last_was_heading = False
+            processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
                 
         txt = u'\n'.join(processed)
         txt = re.sub('[ ]{2,}', ' ', txt)
-        print txt
+        html = HTML_TEMPLATE % (title, txt)
+        
+        from calibre.ebooks.conversion.utils import PreProcessor
+        pp = PreProcessor()
+        html = pp.markup_chapters(html, pp.get_word_count(html), False)
 
-        return HTML_TEMPLATE % (title, txt)
+        return html
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 1e67caccc6..9dc29e45dd 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -9,11 +9,8 @@ import os, re
 from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.markdown import markdown
 from calibre.ebooks.metadata.opf2 import OPFCreator
-<<<<<<< TREE
 from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
-=======
 from calibre.ebooks.conversion.preprocess import DocAnalysis
->>>>>>> MERGE-SOURCE
 
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'

From bd14205637cbf71fe4aad655de50f4f0fea98a60 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 8 Jan 2011 15:53:51 -0500
Subject: [PATCH 4/4] TXT Input: Heuristic processor, remove unused string
 import and trailing whitespace.

---
 src/calibre/ebooks/txt/heuristicprocessor.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
index c4489badc5..c4c6a56123 100644
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@@ -5,7 +5,6 @@ __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
 import re
-import string
 
 from calibre import prepare_string_for_xml
 from calibre.ebooks.unidecode.unidecoder import Unidecoder
@@ -48,7 +47,7 @@ class TXTHeuristicProcessor(object):
         processed = []
         for line in txt.split('\n\n'):
             processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
-                
+
         txt = u'\n'.join(processed)
         txt = re.sub('[ ]{2,}', ' ', txt)
         html = HTML_TEMPLATE % (title, txt)