TXT Processing: Comments.

2025-07-09 03:04:10 -04:00 · 2011-02-06 10:12:43 -05:00 · 2011-02-06 10:12:43 -05:00 · b1b4e7bac5
commit b1b4e7bac5
parent 0916a9dc34
1 changed files with 39 additions and 2 deletions
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@ -18,6 +18,10 @@ from calibre.utils.cleantext import clean_ascii_chars
 HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
 def clean_txt(txt):
    '''
    Run transformations on the text to put it into
    a consistent state.
    '''
    if isbytestring(txt):
        txt = txt.decode('utf-8', 'replace')
    # Strip whitespace from the end of the line. Also replace
@ -42,6 +46,15 @@ def clean_txt(txt):
    return txt
 def split_txt(txt, epub_split_size_kb=0):
    '''
    Ensure there are split points for converting
    to EPUB. A misdetected paragraph type can
    result in the entire document being one giant
    paragraph. In this case the EPUB parser will not
    be able to determine where to split the file
    to accommodate the EPUB file size limitation
    and will fail.
    '''
    #Takes care if there is no point to split
    if epub_split_size_kb > 0:
        if isinstance(txt, unicode):
@ -60,6 +73,9 @@ def split_txt(txt, epub_split_size_kb=0):
 def convert_basic(txt, title='', epub_split_size_kb=0):
    '''
    Converts plain text to html by putting all paragraphs in
    <p> tags. It condenses and retains blank lines when necessary.
    Requires paragraphs to be in single line format.
    '''
    txt = clean_txt(txt)
@ -110,11 +126,17 @@ def block_to_single_line(txt):
    return txt
 def preserve_spaces(txt):
    '''
    Make runs of spaces and tab characters survive HTML rendering.

    Every run of two or more spaces keeps its first space as a normal
    space and has each remaining space replaced by an &nbsp; entity.
    Every tab character is replaced by four &nbsp; entities.
    '''
    def _pad_run(match):
        # Leave one real space in place and turn the rest of the run
        # into non-breaking space entities.
        run_length = len(match.group('space'))
        return ' ' + '&nbsp;' * (run_length - 1)

    spaced = re.sub('(?P<space>[ ]{2,})', _pad_run, txt)
    return spaced.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
 def remove_indents(txt):
    '''
    Remove horizontal whitespace at the beginning of each line.

    Only indentation is stripped; line breaks are left alone, so blank
    lines (and lines holding nothing but whitespace) survive as empty
    lines instead of being merged into the following line.

    txt: The text to process.

    Returns the text with every line starting at its first
    non-whitespace character.
    '''
    # [^\S\r\n] is "whitespace other than CR/LF". A plain \s also matches
    # newlines, so a match starting on a blank line would run on into the
    # next line and delete the blank line itself.
    return re.sub(r'(?mu)^[^\S\r\n]+', '', txt)
@ -125,7 +147,10 @@ def opf_writer(path, opf_name, manifest, spine, mi):
    with open(os.path.join(path, opf_name), 'wb') as opffile:
        opf.render(opffile)
-def split_string_separator(txt, size) :
+def split_string_separator(txt, size):
    '''
    Splits the text by putting \n\n at the point size.
    '''
    if len(txt) > size:
        txt = ''.join([re.sub(u'\.(?P<ends>[^.]*)$', '.\n\n\g<ends>',
            txt[i:i+size], 1) for i in
@ -134,7 +159,7 @@ def split_string_separator(txt, size) :
 def detect_paragraph_type(txt):
    '''
-    Tries to determine the formatting of the document.
+    Tries to determine the paragraph type of the document.
    block: Paragraphs are separated by a blank line.
    single: Each line is a paragraph.
@ -177,6 +202,16 @@ def detect_paragraph_type(txt):
 def detect_formatting_type(txt):
    '''
    Tries to determine the formatting of the document.
    markdown: Markdown formatting is used.
    textile: Textile formatting is used.
    heuristic: When none of the above formatting types are
               detected heuristic is returned.
    '''
    # Keep a count of the number of format-specific objects
    # that are found in the text.
    markdown_count = 0
    textile_count = 0
@ -200,6 +235,8 @@ def detect_formatting_type(txt):
    # Links
    textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
    # Decide if either markdown or textile is used in the text
    # based on the number of unique formatting elements found.
    if markdown_count > 5 or textile_count > 5:
        if markdown_count > textile_count:
            return 'markdown'