TXT Input: Heuristic processor, use PreProcessor to mark chapter headings.

This commit is contained in:
John Schember 2011-01-08 15:49:10 -05:00
parent 0b08042d46
commit c8f18ff02e
2 changed files with 7 additions and 39 deletions

View File

@ -33,30 +33,6 @@ class TXTHeuristicProcessor(object):
r'(?msu)\|:(?P<words>.+?):\|',
]
def del_maketrans(self, deletechars):
    """Build a translation table that deletes every character in *deletechars*.

    The returned dict maps each character's ordinal to the empty unicode
    string, which is the form ``unicode.translate`` accepts for removing
    characters.
    """
    # Every key shares the same immutable empty string, so fromkeys is safe.
    return dict.fromkeys(map(ord, deletechars), u'')
def is_heading(self, line):
    """Return True when *line* looks like a chapter heading.

    A line qualifies when it is non-empty, at most 40 characters long and,
    after being run through ``Unidecoder().decode``, consists solely of
    letters, digits, spaces, colons and hyphens.

    NOTE(review): ``Unidecoder`` presumably transliterates the text to
    ASCII -- confirm against its implementation.

    The stricter "all upper case" and "Roman numerals" checks that used to
    sit here as commented-out code are not applied.
    """
    # Empty lines and lines longer than 40 characters are never headings.
    if not line or len(line) > 40:
        return False

    decoded = Unidecoder().decode(line)
    allowed_chars = string.letters + string.digits + ' :-'
    # Strip every allowed character; whatever survives is punctuation,
    # which disqualifies the line.
    leftover = decoded.translate(self.del_maketrans(allowed_chars))
    return not leftover
def process_paragraph(self, paragraph):
for word in self.ITALICIZE_WORDS:
paragraph = paragraph.replace(word, '<i>%s</i>' % word)
@ -70,20 +46,15 @@ class TXTHeuristicProcessor(object):
txt = split_txt(txt, epub_split_size_kb)
processed = []
last_was_heading = False
for line in txt.split('\n\n'):
if self.is_heading(line):
if not last_was_heading:
processed.append(u'<h1>%s</h1>' % prepare_string_for_xml(line.replace('\n', ' ')))
else:
processed.append(u'<h2>%s</h2>' % prepare_string_for_xml(line.replace('\n', ' ')))
last_was_heading = True
else:
processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
last_was_heading = False
processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
txt = u'\n'.join(processed)
txt = re.sub('[ ]{2,}', ' ', txt)
print txt
html = HTML_TEMPLATE % (title, txt)
from calibre.ebooks.conversion.utils import PreProcessor
pp = PreProcessor()
html = pp.markup_chapters(html, pp.get_word_count(html), False)
return HTML_TEMPLATE % (title, txt)
return html

View File

@ -9,11 +9,8 @@ import os, re
from calibre import prepare_string_for_xml, isbytestring
from calibre.ebooks.markdown import markdown
from calibre.ebooks.metadata.opf2 import OPFCreator
<<<<<<< TREE
from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
=======
from calibre.ebooks.conversion.preprocess import DocAnalysis
>>>>>>> MERGE-SOURCE
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'