From cfaa113f9557b9359208409a538302d9ec0af1d4 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 15 Jan 2011 09:05:08 -0500
Subject: [PATCH] Move italic marking to preprocessor. Have TXT input use the
 preprocessor for heuristics. Change preprocessor getattr to default to False;
 otherwise every option set to off will run.

---
 src/calibre/ebooks/conversion/utils.py       | 46 +++++++++++++---
 src/calibre/ebooks/txt/heuristicprocessor.py | 58 --------------------
 src/calibre/ebooks/txt/input.py              | 13 +++--
 src/calibre/ebooks/txt/processor.py          |  5 --
 4 files changed, 48 insertions(+), 74 deletions(-)
 delete mode 100644 src/calibre/ebooks/txt/heuristicprocessor.py
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 2a88d371cc..56d4339d8c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -128,6 +128,36 @@ class PreProcessor(object):
         wordcount = get_wordcount_obj(word_count_text)
         return wordcount.words
 
+    def markup_italicis(self, html):
+        ITALICIZE_WORDS = [
+            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
+            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
+            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
+            'Mlle.', 'Mons.', 'PS.', 'PPS.',
+        ]
+        
+        ITALICIZE_STYLE_PATS = [
+            r'(?msu)_(?P<words>.+?)_',
+            r'(?msu)/(?P<words>[^<>]+?)/',
+            r'(?msu)~~(?P<words>.+?)~~',
+            r'(?msu)\*(?P<words>.+?)\*',
+            r'(?msu)~(?P<words>.+?)~',
+            r'(?msu)_/(?P<words>[^<>]+?)/_',
+            r'(?msu)_\*(?P<words>.+?)\*_',
+            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
+            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
+            r'(?msu)/:(?P<words>[^<>]+?):/',
+            r'(?msu)\|:(?P<words>.+?):\|',
+        ]
+        
+        for word in ITALICIZE_WORDS:
+            html = html.replace(word, '<i>%s</i>' % word)
+
+        for pat in ITALICIZE_STYLE_PATS:
+            html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)
+
+        return html
+
     def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
         '''
         Searches for common chapter headings throughout the document
@@ -360,7 +390,7 @@ class PreProcessor(object):
             html = self.markup_pre(html)
 
         # Replace series of non-breaking spaces with text-indent
-        if getattr(self.extra_opts, 'fix_indents', True):
+        if getattr(self.extra_opts, 'fix_indents', False):
             html = self.fix_nbsp_indents(html)
 
         if self.cleanup_required():
@@ -375,19 +405,21 @@ class PreProcessor(object):
         #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
 
-        if getattr(self.extra_opts, 'markup_chapter_headings', True):
+        if getattr(self.extra_opts, 'markup_chapter_headings', False):
             html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
 
+        if getattr(self.extra_opts, 'italicize_common_cases', False): 
+            html = self.markup_italicis(html)
+
         # If more than 40% of the lines are empty paragraphs and the user has enabled delete
         # blank paragraphs then delete blank lines to clean up spacing
-        if blanks_between_paragraphs and getattr(self.extra_opts,
-        'delete_blank_paragraphs', False):
+        if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
             self.log("deleting blank lines")
             html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
             html = self.blankreg.sub('', html)
             
         ###### Unwrap lines ######
-        if getattr(self.extra_opts, 'unwrap_lines', True):
+        if getattr(self.extra_opts, 'unwrap_lines', False):
             # Determine line ending type
             # Some OCR sourced files have line breaks in the html using a combination of span & p tags
             # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
@@ -416,7 +448,7 @@ class PreProcessor(object):
                 dehyphenator = Dehyphenator()
                 html = dehyphenator(html,'html_cleanup', length)
 
-        if getattr(self.extra_opts, 'dehyphenate', True):
+        if getattr(self.extra_opts, 'dehyphenate', False):
             # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
             self.log("Fixing hyphenated content")
             dehyphenator = Dehyphenator()
@@ -435,7 +467,7 @@ class PreProcessor(object):
         doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
         html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
 
-        if getattr(self.extra_opts, 'format_scene_breaks', True):
+        if getattr(self.extra_opts, 'format_scene_breaks', False):
             # Center separator lines
             html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
 
diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
deleted file mode 100644
index b9d18fd23a..0000000000
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# -*- coding: utf-8 -*-
-
-__license__ = 'GPL 3'
-__copyright__ = '2011, John Schember <john@nachtimwald.com>'
-__docformat__ = 'restructuredtext en'
-
-import re
-
-from calibre import prepare_string_for_xml
-
-class TXTHeuristicProcessor(object):
-
-    def __init__(self):
-        self.ITALICIZE_WORDS = [
-            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
-            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
-            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
-            'Mlle.', 'Mons.', 'PS.', 'PPS.',
-        ]
-        self.ITALICIZE_STYLE_PATS = [
-            r'(?msu)_(?P<words>.+?)_',
-            r'(?msu)/(?P<words>[^<>]+?)/',
-            r'(?msu)~~(?P<words>.+?)~~',
-            r'(?msu)\*(?P<words>.+?)\*',
-            r'(?msu)~(?P<words>.+?)~',
-            r'(?msu)_/(?P<words>[^<>]+?)/_',
-            r'(?msu)_\*(?P<words>.+?)\*_',
-            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
-            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
-            r'(?msu)/:(?P<words>[^<>]+?):/',
-            r'(?msu)\|:(?P<words>.+?):\|',
-        ]
-
-    def process_paragraph(self, paragraph):
-        for word in self.ITALICIZE_WORDS:
-            paragraph = paragraph.replace(word, '<i>%s</i>' % word)
-        for pat in self.ITALICIZE_STYLE_PATS:
-            paragraph = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
-        return paragraph
-
-    def convert(self, txt, title='', epub_split_size_kb=0):
-        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
-        txt = clean_txt(txt)
-        txt = split_txt(txt, epub_split_size_kb)
-
-        processed = []
-        for line in txt.split('\n\n'):
-            processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
-
-        txt = u'\n'.join(processed)
-        txt = re.sub('[ ]{2,}', ' ', txt)
-        html = HTML_TEMPLATE % (title, txt)
-
-        from calibre.ebooks.conversion.utils import PreProcessor
-        pp = PreProcessor()
-        html = pp.markup_chapters(html, pp.get_word_count(html), False)
-
-        return html
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 0b0bd6d570..5cffbafe21 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
     separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
     preserve_spaces, detect_paragraph_type, detect_formatting_type, \
-    convert_heuristic, normalize_line_endings, convert_textile
+    normalize_line_endings, convert_textile
 from calibre import _ent_pat, xml_entity_to_unicode
 
 class TXTInput(InputFormatPlugin):
@@ -126,11 +126,16 @@ class TXTInput(InputFormatPlugin):
                 txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
 
             flow_size = getattr(options, 'flow_size', 0)
+            html = convert_basic(txt, epub_split_size_kb=flow_size)
 
             if options.formatting_type == 'heuristic':
-                html = convert_heuristic(txt, epub_split_size_kb=flow_size)
-            else:
-                html = convert_basic(txt, epub_split_size_kb=flow_size)
+                setattr(options, 'enable_heuristics', True)
+                setattr(options, 'markup_chapter_headings', True)
+                setattr(options, 'italicize_common_cases', True)
+                setattr(options, 'fix_indents', True)
+                setattr(options, 'delete_blank_paragraphs', True)
+                setattr(options, 'format_scene_breaks', True)
+                setattr(options, 'dehyphenate', True)
 
         # Dehyphenate in cleanup mode for missed txt and markdown conversion
         dehyphenator = Dehyphenator()
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index e1979063c0..9fd8af0d70 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -12,7 +12,6 @@ import os, re
 
 from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.metadata.opf2 import OPFCreator
-from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
 from calibre.ebooks.conversion.preprocess import DocAnalysis
 from calibre.utils.cleantext import clean_ascii_chars
 
@@ -67,10 +66,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
 
     return HTML_TEMPLATE % (title, u'\n'.join(lines))
 
-def convert_heuristic(txt, title='', epub_split_size_kb=0):
-    tp = TXTHeuristicProcessor()
-    return tp.convert(txt, title, epub_split_size_kb)
-
 def convert_markdown(txt, title='', disable_toc=False):
     from calibre.ebooks.markdown import markdown
     md = markdown.Markdown(