TXT Input: Heuristic processor, use PreProcessor to mark chapter headings.

2025-07-09 03:04:10 -04:00 · 2011-01-08 15:49:10 -05:00 · 2011-01-08 15:49:10 -05:00 · c8f18ff02e
commit c8f18ff02e
parent 0b08042d46
2 changed files with 7 additions and 39 deletions
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@@ -33,30 +33,6 @@ class TXTHeuristicProcessor(object):
            r'(?msu)\|:(?P<words>.+?):\|',
        ]

-    def del_maketrans(self, deletechars):
-        return dict([(ord(x), u'') for x in deletechars])
-
-    def is_heading(self, line):
-        if not line:
-            return False
-        if len(line) > 40:
-            return False
-        
-        line = Unidecoder().decode(line)
-
-        # punctuation.
-        if line.translate(self.del_maketrans(string.letters + string.digits + ' :-')):
-            return False
-        
-        # All upper case.
-        #if line.isupper():
-        #    return True
-        # Roman numerals.
-        #if not line.translate(self.del_maketrans('IVXYCivxyc ')):
-        #    return True
-        
-        return True
-
    def process_paragraph(self, paragraph):
        for word in self.ITALICIZE_WORDS:
            paragraph = paragraph.replace(word, '<i>%s</i>' % word)
@@ -70,20 +46,15 @@ class TXTHeuristicProcessor(object):
        txt = split_txt(txt, epub_split_size_kb)
        
        processed = []
-        last_was_heading = False
        for line in txt.split('\n\n'):
-            if self.is_heading(line):
-                if not last_was_heading:
-                    processed.append(u'<h1>%s</h1>' % prepare_string_for_xml(line.replace('\n', ' ')))
-                else:
-                    processed.append(u'<h2>%s</h2>' % prepare_string_for_xml(line.replace('\n', ' ')))
-                last_was_heading = True
-            else:
-                processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
-                last_was_heading = False
+            processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
                
        txt = u'\n'.join(processed)
        txt = re.sub('[ ]{2,}', ' ', txt)
-        print txt
+        html = HTML_TEMPLATE % (title, txt)
        
-        return HTML_TEMPLATE % (title, txt)
+        from calibre.ebooks.conversion.utils import PreProcessor
+        pp = PreProcessor()
+        html = pp.markup_chapters(html, pp.get_word_count(html), False)
+
+        return html
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -9,11 +9,8 @@ import os, re
 from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.markdown import markdown
 from calibre.ebooks.metadata.opf2 import OPFCreator
-<<<<<<< TREE
 from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
-=======
 from calibre.ebooks.conversion.preprocess import DocAnalysis
->>>>>>> MERGE-SOURCE

 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'