TXT Processing: Comments.

This commit is contained in:
John Schember 2011-02-06 10:12:43 -05:00
parent 0916a9dc34
commit b1b4e7bac5

View File

@ -18,6 +18,10 @@ from calibre.utils.cleantext import clean_ascii_chars
HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
def clean_txt(txt):
'''
Run transformations on the text to put it into
consistent state.
'''
if isbytestring(txt):
txt = txt.decode('utf-8', 'replace')
# Strip whitespace from the end of the line. Also replace
@ -42,6 +46,15 @@ def clean_txt(txt):
return txt
def split_txt(txt, epub_split_size_kb=0):
'''
Ensure there are split points for converting
to EPUB. A misdetected paragraph type can
result in the entire document being one giant
paragraph. In this case the EPUB parser will not
be able to determine where to split the file
to accommodate the EPUB file size limitation
and will fail.
'''
#Takes care if there is no point to split
if epub_split_size_kb > 0:
if isinstance(txt, unicode):
@ -60,6 +73,9 @@ def split_txt(txt, epub_split_size_kb=0):
def convert_basic(txt, title='', epub_split_size_kb=0):
'''
Converts plain text to html by putting all paragraphs in
<p> tags. It condenses and retains blank lines when necessary.
Requires paragraphs to be in single line format.
'''
txt = clean_txt(txt)
@ -110,11 +126,17 @@ def block_to_single_line(txt):
return txt
def preserve_spaces(txt):
    '''
    Keep runs of spaces and tabs from collapsing in HTML.

    Every run of two or more spaces becomes one literal space followed
    by an &nbsp; entity for each remaining space in the run. Every tab
    becomes four &nbsp; entities.
    '''
    def _pad(match):
        # The first space of the run stays a literal space; the rest of
        # the run is emitted as entities.
        run_length = len(match.group('space'))
        return ' ' + '&nbsp;' * (run_length - 1)

    spaced = re.sub('(?P<space>[ ]{2,})', _pad, txt)
    return spaced.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
def remove_indents(txt):
    '''
    Remove whitespace at the beginning of each line.

    Only horizontal whitespace is stripped. Line breaks are left alone,
    so blank lines (which separate paragraphs in block formatted text)
    are kept.
    '''
    # [^\S\r\n] means "any whitespace character except CR and LF". A plain
    # \s also matches newlines, so with the multiline flag a blank line in
    # front of an indented line would be deleted together with the indent.
    # The pattern is a raw string so that \S is not treated as a (invalid)
    # string escape sequence.
    return re.sub(r'(?mu)^[^\S\r\n]+', '', txt)
@ -125,7 +147,10 @@ def opf_writer(path, opf_name, manifest, spine, mi):
with open(os.path.join(path, opf_name), 'wb') as opffile:
opf.render(opffile)
def split_string_separator(txt, size) :
def split_string_separator(txt, size):
'''
Splits the text by putting \n\n at the point size.
'''
if len(txt) > size:
txt = ''.join([re.sub(u'\.(?P<ends>[^.]*)$', '.\n\n\g<ends>',
txt[i:i+size], 1) for i in
@ -134,7 +159,7 @@ def split_string_separator(txt, size) :
def detect_paragraph_type(txt):
'''
Tries to determine the formatting of the document.
Tries to determine the paragraph type of the document.
block: Paragraphs are separated by a blank line.
single: Each line is a paragraph.
@ -177,6 +202,16 @@ def detect_paragraph_type(txt):
def detect_formatting_type(txt):
'''
Tries to determine the formatting of the document.
markdown: Markdown formatting is used.
textile: Textile formatting is used.
heuristic: When none of the above formatting types are
detected heuristic is returned.
'''
# Keep a count of the number of format-specific objects
# that are found in the text.
markdown_count = 0
textile_count = 0
@ -200,6 +235,8 @@ def detect_formatting_type(txt):
# Links
textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
# Decide if either markdown or textile is used in the text
# based on the number of unique formatting elements found.
if markdown_count > 5 or textile_count > 5:
if markdown_count > textile_count:
return 'markdown'