Move italic marking to the preprocessor. Have TXT input use the preprocessor for heuristics. Change the preprocessor's getattr calls to default to False; otherwise every option that is set to off would still run.

2025-07-09 03:04:10 -04:00 · 2011-01-15 09:05:08 -05:00 · 2011-01-15 09:05:08 -05:00 · cfaa113f95
commit cfaa113f95
parent fabd4f5fdf
4 changed files with 48 additions and 74 deletions
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -128,6 +128,36 @@ class PreProcessor(object):
        wordcount = get_wordcount_obj(word_count_text)
        return wordcount.words
    def markup_italicis(self, html):
        '''
        Wrap common abbreviations/phrases and plain-text emphasis markers in
        <i> tags.

        Two passes are made over ``html``:

        1. Every entry of ITALICIZE_WORDS that occurs as a standalone token
           (not glued to a word character on either side) is wrapped in
           <i>...</i>.
        2. Text enclosed by the delimiters listed in ITALICIZE_STYLE_PATS
           (for example _text_, *text*, /text/ or ~text~) is wrapped in
           <i>...</i> and the delimiters themselves are dropped.

        :param html: the markup (or text) to process
        :return: the processed markup
        '''
        ITALICIZE_WORDS = [
            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
            'Mlle.', 'Mons.', 'PS.', 'PPS.',
        ]

        # Compound delimiters must be tried before the single-character ones,
        # otherwise e.g. the plain '_' pattern consumes the outer markers of
        # '_*text*_' and the compound patterns can never match.
        ITALICIZE_STYLE_PATS = [
            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
            r'(?msu)_/(?P<words>[^<>]+?)/_',
            r'(?msu)_\*(?P<words>.+?)\*_',
            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
            r'(?msu)/:(?P<words>[^<>]+?):/',
            r'(?msu)\|:(?P<words>.+?):\|',
            r'(?msu)~~(?P<words>.+?)~~',
            r'(?msu)_(?P<words>.+?)_',
            r'(?msu)/(?P<words>[^<>]+?)/',
            r'(?msu)\*(?P<words>.+?)\*',
            r'(?msu)~(?P<words>.+?)~',
        ]

        # A single alternation, longest entries first, guarded by
        # "no word character next to it" look-arounds.  This keeps 'PS.' from
        # matching inside 'PPS.' and 'eg.' from matching inside 'leg.', and
        # guarantees that no occurrence is wrapped twice.
        words_pat = re.compile(
            r'(?u)(?<!\w)(?:%s)(?!\w)' % '|'.join(
                re.escape(word) for word in
                sorted(ITALICIZE_WORDS, key=len, reverse=True)))
        html = words_pat.sub(lambda mo: '<i>%s</i>' % mo.group(0), html)

        # NOTE(review): the '.+?' patterns are DOTALL and not tag-aware, so a
        # stray delimiter inside a tag or URL can pair with a later one.
        for pat in ITALICIZE_STYLE_PATS:
            html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)

        return html
    def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
        '''
        Searches for common chapter headings throughout the document
@ -360,7 +390,7 @@ class PreProcessor(object):
            html = self.markup_pre(html)
        # Replace series of non-breaking spaces with text-indent
-        if getattr(self.extra_opts, 'fix_indents', True):
+        if getattr(self.extra_opts, 'fix_indents', False):
            html = self.fix_nbsp_indents(html)
        if self.cleanup_required():
@ -375,19 +405,21 @@ class PreProcessor(object):
        #self.dump(html, 'before_chapter_markup')
        # detect chapters/sections to match xpath or splitting logic
-        if getattr(self.extra_opts, 'markup_chapter_headings', True):
+        if getattr(self.extra_opts, 'markup_chapter_headings', False):
            html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
        if getattr(self.extra_opts, 'italicize_common_cases', False): 
            html = self.markup_italicis(html)
        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
        # blank paragraphs then delete blank lines to clean up spacing
-        if blanks_between_paragraphs and getattr(self.extra_opts,
+        if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
        'delete_blank_paragraphs', False):
            self.log("deleting blank lines")
            html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
            html = self.blankreg.sub('', html)
        ###### Unwrap lines ######
-        if getattr(self.extra_opts, 'unwrap_lines', True):
+        if getattr(self.extra_opts, 'unwrap_lines', False):
            # Determine line ending type
            # Some OCR sourced files have line breaks in the html using a combination of span & p tags
            # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
@ -416,7 +448,7 @@ class PreProcessor(object):
                dehyphenator = Dehyphenator()
                html = dehyphenator(html,'html_cleanup', length)
-        if getattr(self.extra_opts, 'dehyphenate', True):
+        if getattr(self.extra_opts, 'dehyphenate', False):
            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
            self.log("Fixing hyphenated content")
            dehyphenator = Dehyphenator()
@ -435,7 +467,7 @@ class PreProcessor(object):
        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
        html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
-        if getattr(self.extra_opts, 'format_scene_breaks', True):
+        if getattr(self.extra_opts, 'format_scene_breaks', False):
            # Center separator lines
            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@ -1,58 +0,0 @@
 # -*- coding: utf-8 -*-
 __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 import re
 from calibre import prepare_string_for_xml
 class TXTHeuristicProcessor(object):
    '''
    Converts plain text to HTML, italicizing common abbreviations/phrases
    and text wrapped in plain-text emphasis markers along the way.
    '''

    def __init__(self):
        # Abbreviations and phrases that are italicized when they occur as
        # standalone tokens.
        self.ITALICIZE_WORDS = [
            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
            'Mlle.', 'Mons.', 'PS.', 'PPS.',
        ]
        # Delimiter pairs whose enclosed text is italicized.  Compound
        # delimiters come first: if the single-character patterns ran first
        # they would consume the outer markers of e.g. '_*text*_' and the
        # compound patterns could never match.
        self.ITALICIZE_STYLE_PATS = [
            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
            r'(?msu)_/(?P<words>[^<>]+?)/_',
            r'(?msu)_\*(?P<words>.+?)\*_',
            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
            r'(?msu)/:(?P<words>[^<>]+?):/',
            r'(?msu)\|:(?P<words>.+?):\|',
            r'(?msu)~~(?P<words>.+?)~~',
            r'(?msu)_(?P<words>.+?)_',
            r'(?msu)/(?P<words>[^<>]+?)/',
            r'(?msu)\*(?P<words>.+?)\*',
            r'(?msu)~(?P<words>.+?)~',
        ]
        # One alternation over ITALICIZE_WORDS, longest entries first and
        # guarded by "no adjacent word character" look-arounds, so 'PS.' does
        # not match inside 'PPS.' and 'eg.' does not match inside 'leg.'.
        # Built once here; it reflects ITALICIZE_WORDS as of construction.
        self._italicize_words_re = re.compile(
            r'(?u)(?<!\w)(?:%s)(?!\w)' % '|'.join(
                re.escape(word) for word in
                sorted(self.ITALICIZE_WORDS, key=len, reverse=True)))

    def process_paragraph(self, paragraph):
        '''
        Return ``paragraph`` with the known words and the delimiter-marked
        spans wrapped in <i>...</i>.  The emphasis delimiters are removed.
        '''
        paragraph = self._italicize_words_re.sub(
            lambda mo: '<i>%s</i>' % mo.group(0), paragraph)
        for pat in self.ITALICIZE_STYLE_PATS:
            paragraph = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
        return paragraph

    def convert(self, txt, title='', epub_split_size_kb=0):
        '''
        Build an HTML document from ``txt``.

        The text is passed through clean_txt and split_txt, broken into
        paragraphs on blank lines, and each paragraph (with its internal line
        breaks turned into spaces) is passed through prepare_string_for_xml
        and process_paragraph, then wrapped in <p>...</p>.  The result is
        inserted into HTML_TEMPLATE together with ``title`` and finally run
        through PreProcessor.markup_chapters.

        :param txt: the text to convert
        :param title: value substituted into HTML_TEMPLATE alongside the body
        :param epub_split_size_kb: forwarded unchanged to split_txt
        :return: the value returned by PreProcessor.markup_chapters
        '''
        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
        txt = clean_txt(txt)
        txt = split_txt(txt, epub_split_size_kb)
        processed = []
        for line in txt.split('\n\n'):
            processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
        txt = u'\n'.join(processed)
        # Collapse runs of spaces left over from joining wrapped lines.
        txt = re.sub('[ ]{2,}', ' ', txt)
        html = HTML_TEMPLATE % (title, txt)
        from calibre.ebooks.conversion.utils import PreProcessor
        pp = PreProcessor()
        html = pp.markup_chapters(html, pp.get_word_count(html), False)
        return html
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
    separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
    preserve_spaces, detect_paragraph_type, detect_formatting_type, \
-    convert_heuristic, normalize_line_endings, convert_textile
+    normalize_line_endings, convert_textile
 from calibre import _ent_pat, xml_entity_to_unicode
 class TXTInput(InputFormatPlugin):
@ -126,11 +126,16 @@ class TXTInput(InputFormatPlugin):
                txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            flow_size = getattr(options, 'flow_size', 0)
            html = convert_basic(txt, epub_split_size_kb=flow_size)
            if options.formatting_type == 'heuristic':
-                html = convert_heuristic(txt, epub_split_size_kb=flow_size)
+                setattr(options, 'enable_heuristics', True)
-            else:
+                setattr(options, 'markup_chapter_headings', True)
-                html = convert_basic(txt, epub_split_size_kb=flow_size)
+                setattr(options, 'italicize_common_cases', True)
                setattr(options, 'fix_indents', True)
                setattr(options, 'delete_blank_paragraphs', True)
                setattr(options, 'format_scene_breaks', True)
                setattr(options, 'dehyphenate', True)
        # Dehyphenate in cleanup mode for missed txt and markdown conversion
        dehyphenator = Dehyphenator()
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@ -12,7 +12,6 @@ import os, re
 from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
 from calibre.ebooks.conversion.preprocess import DocAnalysis
 from calibre.utils.cleantext import clean_ascii_chars
@ -67,10 +66,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
    return HTML_TEMPLATE % (title, u'\n'.join(lines))
 def convert_heuristic(txt, title='', epub_split_size_kb=0):
    '''
    Convert ``txt`` by handing it to a freshly created TXTHeuristicProcessor.

    All three arguments are forwarded positionally to
    TXTHeuristicProcessor.convert, whose return value is returned unchanged.
    '''
    return TXTHeuristicProcessor().convert(txt, title, epub_split_size_kb)
 def convert_markdown(txt, title='', disable_toc=False):
    from calibre.ebooks.markdown import markdown
    md = markdown.Markdown(