TXT Input: Heuristic processor, use PreProcessor to mark chapter headings.

2025-08-30 23:00:21 -04:00 · 2011-01-08 15:49:10 -05:00 · 2011-01-08 15:49:10 -05:00 · c8f18ff02e
commit c8f18ff02e
parent 0b08042d46
2 changed files with 7 additions and 39 deletions
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@@ -33,30 +33,6 @@ class TXTHeuristicProcessor(object):
            r'(?msu)\|:(?P<words>.+?):\|',
        ]
    def del_maketrans(self, deletechars):
        '''
        Build a translation table that deletes characters.

        Returns a dict mapping the code point (ord) of every character in
        deletechars to an empty unicode string, in the form accepted by
        unicode.translate().
        '''
        table = {}
        for char in deletechars:
            table[ord(char)] = u''
        return table
    def is_heading(self, line):
        '''
        Guess whether a block of text is a heading.

        A block is rejected when it is empty, when it is longer than 40
        characters, or when (after being run through Unidecoder().decode)
        it contains any character other than letters, digits, space,
        colon and hyphen. Everything else is accepted as a heading.

        Additional all-upper-case and Roman-numeral checks existed here
        only as commented-out code and are not applied.
        '''
        # Empty or over-long blocks are never headings.
        if not line or len(line) > 40:
            return False

        # NOTE(review): presumably transliterates to plain ASCII so the
        # character whitelist below is meaningful -- confirm against Unidecoder.
        decoded = Unidecoder().decode(line)

        # Strip every permitted character; whatever is left is punctuation
        # (or another disallowed character) and disqualifies the line.
        permitted = string.letters + string.digits + ' :-'
        leftover = decoded.translate(self.del_maketrans(permitted))
        return not leftover
    def process_paragraph(self, paragraph):
        for word in self.ITALICIZE_WORDS:
            paragraph = paragraph.replace(word, '<i>%s</i>' % word)
@@ -70,20 +46,15 @@ class TXTHeuristicProcessor(object):
        txt = split_txt(txt, epub_split_size_kb)
        processed = []
        last_was_heading = False
        for line in txt.split('\n\n'):
-            if self.is_heading(line):
+            processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
                if not last_was_heading:
                    processed.append(u'<h1>%s</h1>' % prepare_string_for_xml(line.replace('\n', ' ')))
                else:
                    processed.append(u'<h2>%s</h2>' % prepare_string_for_xml(line.replace('\n', ' ')))
                last_was_heading = True
            else:
                processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
                last_was_heading = False
        txt = u'\n'.join(processed)
        txt = re.sub('[ ]{2,}', ' ', txt)
-        print txt
+        html = HTML_TEMPLATE % (title, txt)
        from calibre.ebooks.conversion.utils import PreProcessor
        pp = PreProcessor()
        html = pp.markup_chapters(html, pp.get_word_count(html), False)
-        return HTML_TEMPLATE % (title, txt)
+        return html
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -9,11 +9,8 @@ import os, re
 from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.markdown import markdown
 from calibre.ebooks.metadata.opf2 import OPFCreator
 <<<<<<< TREE
 from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
 =======
 from calibre.ebooks.conversion.preprocess import DocAnalysis
 >>>>>>> MERGE-SOURCE
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'