TXT Input: Add a heuristic formatting processor

2025-07-09 03:04:10 -04:00 · 2011-01-08 14:51:08 -07:00 · 2011-01-08 14:51:08 -07:00 · 3130c67919
commit 3130c67919
parent 5483bb6f94 bd14205637
3 changed files with 90 additions and 10 deletions
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from calibre import prepare_string_for_xml
+
+class TXTHeuristicProcessor(object):
+
+    def __init__(self):
+        self.ITALICIZE_WORDS = [
+            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
+            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
+            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
+            'Mlle.', 'Mons.', 'PS.', 'PPS.',
+        ]
+        self.ITALICIZE_STYLE_PATS = [
+            r'(?msu)_(?P<words>.+?)_',
+            r'(?msu)/(?P<words>[^<>]+?)/',
+            r'(?msu)~~(?P<words>.+?)~~',
+            r'(?msu)\*(?P<words>.+?)\*',
+            r'(?msu)~(?P<words>.+?)~',
+            r'(?msu)_/(?P<words>[^<>]+?)/_',
+            r'(?msu)_\*(?P<words>.+?)\*_',
+            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
+            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
+            r'(?msu)/:(?P<words>[^<>]+?):/',
+            r'(?msu)\|:(?P<words>.+?):\|',
+        ]
+
+    def process_paragraph(self, paragraph):
+        for word in self.ITALICIZE_WORDS:
+            paragraph = paragraph.replace(word, '<i>%s</i>' % word)
+        for pat in self.ITALICIZE_STYLE_PATS:
+            paragraph = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
+        return paragraph
+
+    def convert(self, txt, title='', epub_split_size_kb=0):
+        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
+        txt = clean_txt(txt)
+        txt = split_txt(txt, epub_split_size_kb)
+
+        processed = []
+        for line in txt.split('\n\n'):
+            processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
+
+        txt = u'\n'.join(processed)
+        txt = re.sub('[ ]{2,}', ' ', txt)
+        html = HTML_TEMPLATE % (title, txt)
+
+        from calibre.ebooks.conversion.utils import PreProcessor
+        pp = PreProcessor()
+        html = pp.markup_chapters(html, pp.get_word_count(html), False)
+
+        return html
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@ -10,7 +10,8 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
    separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
-    preserve_spaces, detect_paragraph_type, detect_formatting_type
+    preserve_spaces, detect_paragraph_type, detect_formatting_type, \
+    convert_heuristic
 from calibre import _ent_pat, xml_entity_to_unicode

 class TXTInput(InputFormatPlugin):
@ -24,14 +25,15 @@ class TXTInput(InputFormatPlugin):
        OptionRecommendation(name='paragraph_type', recommended_value='auto',
            choices=['auto', 'block', 'single', 'print'],
            help=_('Paragraph structure.\n'
-                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
+                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
                   '* auto: Try to auto detect paragraph type.\n'
                   '* block: Treat a blank line as a paragraph break.\n'
                   '* single: Assume every line is a paragraph.\n'
                   '* print:  Assume every line starting with 2+ spaces or a tab '
-                   'starts a paragraph.')),
+                   'starts a paragraph.'
+                   '* unformatted: Most lines have hard line breaks, few/no spaces or indents.')),
        OptionRecommendation(name='formatting_type', recommended_value='auto',
-            choices=['auto', 'none', 'markdown'],
+            choices=['auto', 'none', 'heuristic', 'markdown'],
            help=_('Formatting used within the document.'
                   '* auto: Try to auto detect the document formatting.\n'
                   '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
@ -90,7 +92,7 @@ class TXTInput(InputFormatPlugin):
            
            # We don't check for block because the processor assumes block.
            # single and print at transformed to block for processing.
-            if options.paragraph_type == 'single' or 'unformatted':
+            if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted':
                txt = separate_paragraphs_single_line(txt)
            elif options.paragraph_type == 'print':
                txt = separate_paragraphs_print_formatted(txt)
@ -106,7 +108,12 @@ class TXTInput(InputFormatPlugin):
                txt = preprocessor.punctuation_unwrap(length, txt, 'txt')

            flow_size = getattr(options, 'flow_size', 0)
-            html = convert_basic(txt, epub_split_size_kb=flow_size)
+            
+            if options.formatting_type == 'heuristic':
+                html = convert_heuristic(txt, epub_split_size_kb=flow_size)
+            else:
+                html = convert_basic(txt, epub_split_size_kb=flow_size)
+            

        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@ -9,6 +9,7 @@ import os, re
 from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.markdown import markdown
 from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
 from calibre.ebooks.conversion.preprocess import DocAnalysis

 __license__   = 'GPL v3'
@ -17,7 +18,7 @@ __docformat__ = 'restructuredtext en'

 HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'

-def convert_basic(txt, title='', epub_split_size_kb=0):
+def clean_txt(txt):
    if isbytestring(txt):
        txt = txt.decode('utf-8', 'replace')
    # Strip whitespace from the beginning and end of the line. Also replace
@ -36,6 +37,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
    chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
    illegal_chars = re.compile(u'|'.join(map(unichr, chars)))
    txt = illegal_chars.sub('', txt)
+    
+    return txt
+
+def split_txt(txt, epub_split_size_kb=0):
    #Takes care if there is no point to split
    if epub_split_size_kb > 0:
        if isinstance(txt, unicode):
@ -50,6 +55,12 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
    if isbytestring(txt):
        txt = txt.decode('utf-8')

+    return txt
+
+def convert_basic(txt, title='', epub_split_size_kb=0):
+    txt = clean_txt(txt)
+    txt = split_txt(txt, epub_split_size_kb)
+
    lines = []
    # Split into paragraphs based on having a blank line between text.
    for line in txt.split('\n\n'):
@ -58,6 +69,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):

    return HTML_TEMPLATE % (title, u'\n'.join(lines))

+def convert_heuristic(txt, title='', epub_split_size_kb=0):
+    tp = TXTHeuristicProcessor()
+    return tp.convert(txt, title, epub_split_size_kb)
+
 def convert_markdown(txt, title='', disable_toc=False):
    md = markdown.Markdown(
          extensions=['footnotes', 'tables', 'toc'],
@ -117,12 +132,12 @@ def detect_paragraph_type(txt):
    if hardbreaks:
        # Check for print
        tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
-        if tab_line_count / float(txt_line_count) >= .25:
+        if tab_line_count / float(txt_line_count) >= .15:
            return 'print'
        
        # Check for block
        empty_line_count = len(re.findall('(?mu)^\s*$', txt))
-        if empty_line_count / float(txt_line_count) >= .25:
+        if empty_line_count / float(txt_line_count) >= .15:
            return 'block'

        # Assume unformatted text with hardbreaks if nothing else matches        
@ -153,4 +168,4 @@ def detect_formatting_type(txt):
        if txt.count('\\'+c) > 10:
            return 'markdown'
    
-    return 'none'
+    return 'heuristic'