mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TXT Input: Heuristic processor, use PreProcessor to mark chapter headings.
This commit is contained in:
parent
0b08042d46
commit
c8f18ff02e
@ -33,30 +33,6 @@ class TXTHeuristicProcessor(object):
|
|||||||
r'(?msu)\|:(?P<words>.+?):\|',
|
r'(?msu)\|:(?P<words>.+?):\|',
|
||||||
]
|
]
|
||||||
|
|
||||||
def del_maketrans(self, deletechars):
|
|
||||||
return dict([(ord(x), u'') for x in deletechars])
|
|
||||||
|
|
||||||
def is_heading(self, line):
|
|
||||||
if not line:
|
|
||||||
return False
|
|
||||||
if len(line) > 40:
|
|
||||||
return False
|
|
||||||
|
|
||||||
line = Unidecoder().decode(line)
|
|
||||||
|
|
||||||
# punctuation.
|
|
||||||
if line.translate(self.del_maketrans(string.letters + string.digits + ' :-')):
|
|
||||||
return False
|
|
||||||
|
|
||||||
# All upper case.
|
|
||||||
#if line.isupper():
|
|
||||||
# return True
|
|
||||||
# Roman numerals.
|
|
||||||
#if not line.translate(self.del_maketrans('IVXYCivxyc ')):
|
|
||||||
# return True
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
def process_paragraph(self, paragraph):
|
def process_paragraph(self, paragraph):
|
||||||
for word in self.ITALICIZE_WORDS:
|
for word in self.ITALICIZE_WORDS:
|
||||||
paragraph = paragraph.replace(word, '<i>%s</i>' % word)
|
paragraph = paragraph.replace(word, '<i>%s</i>' % word)
|
||||||
@ -70,20 +46,15 @@ class TXTHeuristicProcessor(object):
|
|||||||
txt = split_txt(txt, epub_split_size_kb)
|
txt = split_txt(txt, epub_split_size_kb)
|
||||||
|
|
||||||
processed = []
|
processed = []
|
||||||
last_was_heading = False
|
|
||||||
for line in txt.split('\n\n'):
|
for line in txt.split('\n\n'):
|
||||||
if self.is_heading(line):
|
processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
|
||||||
if not last_was_heading:
|
|
||||||
processed.append(u'<h1>%s</h1>' % prepare_string_for_xml(line.replace('\n', ' ')))
|
|
||||||
else:
|
|
||||||
processed.append(u'<h2>%s</h2>' % prepare_string_for_xml(line.replace('\n', ' ')))
|
|
||||||
last_was_heading = True
|
|
||||||
else:
|
|
||||||
processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
|
|
||||||
last_was_heading = False
|
|
||||||
|
|
||||||
txt = u'\n'.join(processed)
|
txt = u'\n'.join(processed)
|
||||||
txt = re.sub('[ ]{2,}', ' ', txt)
|
txt = re.sub('[ ]{2,}', ' ', txt)
|
||||||
print txt
|
html = HTML_TEMPLATE % (title, txt)
|
||||||
|
|
||||||
|
from calibre.ebooks.conversion.utils import PreProcessor
|
||||||
|
pp = PreProcessor()
|
||||||
|
html = pp.markup_chapters(html, pp.get_word_count(html), False)
|
||||||
|
|
||||||
return HTML_TEMPLATE % (title, txt)
|
return html
|
||||||
|
@ -9,11 +9,8 @@ import os, re
|
|||||||
from calibre import prepare_string_for_xml, isbytestring
|
from calibre import prepare_string_for_xml, isbytestring
|
||||||
from calibre.ebooks.markdown import markdown
|
from calibre.ebooks.markdown import markdown
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
<<<<<<< TREE
|
|
||||||
from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
|
from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
|
||||||
=======
|
|
||||||
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
||||||
>>>>>>> MERGE-SOURCE
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user