TXT Processing: Comments.

2025-07-09 03:04:10 -04:00 · 2011-02-06 10:12:43 -05:00 · 2011-02-06 10:12:43 -05:00 · b1b4e7bac5
commit b1b4e7bac5
parent 0916a9dc34
1 changed files with 39 additions and 2 deletions
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -18,6 +18,10 @@ from calibre.utils.cleantext import clean_ascii_chars
 HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'

 def clean_txt(txt):
+    '''
+    Run transformations on the text to put it into
+    a consistent state.
+    '''
    if isbytestring(txt):
        txt = txt.decode('utf-8', 'replace')
    # Strip whitespace from the end of the line. Also replace
@@ -42,6 +46,15 @@ def clean_txt(txt):
    return txt

 def split_txt(txt, epub_split_size_kb=0):
+    '''
+    Ensure there are split points for converting
+    to EPUB. A misdetected paragraph type can
+    result in the entire document being one giant
+    paragraph. In this case the EPUB parser will not
+    be able to determine where to split the file
+    to accommodate the EPUB file size limitation
+    and will fail.
+    '''
    #Takes care if there is no point to split
    if epub_split_size_kb > 0:
        if isinstance(txt, unicode):
@@ -60,6 +73,9 @@ def split_txt(txt, epub_split_size_kb=0):

 def convert_basic(txt, title='', epub_split_size_kb=0):
    '''
+    Converts plain text to html by putting all paragraphs in
+    <p> tags. It condenses and retains blank lines when necessary.
+    
    Requires paragraphs to be in single line format.
    '''
    txt = clean_txt(txt)
@@ -110,11 +126,17 @@ def block_to_single_line(txt):
    return txt

 def preserve_spaces(txt):
+    '''
+    Replaces runs of multiple spaces, and tabs, with &nbsp; entities.
+    '''
    txt = re.sub('(?P<space>[ ]{2,})', lambda mo: ' ' + ('&nbsp;' * (len(mo.group('space')) - 1)), txt)
    txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
    return txt

 def remove_indents(txt):
+    '''
+    Remove whitespace at the beginning of each line.
+    '''
    txt = re.sub('(?miu)^\s+', '', txt)
    return txt

@@ -125,7 +147,10 @@ def opf_writer(path, opf_name, manifest, spine, mi):
    with open(os.path.join(path, opf_name), 'wb') as opffile:
        opf.render(opffile)

-def split_string_separator(txt, size) :
+def split_string_separator(txt, size):
+    '''
+    Splits the text by inserting \n\n after the last period in each size-character chunk.
+    '''
    if len(txt) > size:
        txt = ''.join([re.sub(u'\.(?P<ends>[^.]*)$', '.\n\n\g<ends>',
            txt[i:i+size], 1) for i in
@@ -134,7 +159,7 @@ def split_string_separator(txt, size) :

 def detect_paragraph_type(txt):
    '''
-    Tries to determine the formatting of the document.
+    Tries to determine the paragraph type of the document.

    block: Paragraphs are separated by a blank line.
    single: Each line is a paragraph.
@@ -177,6 +202,16 @@ def detect_paragraph_type(txt):


 def detect_formatting_type(txt):
+    '''
+    Tries to determine the formatting of the document.
+    
+    markdown: Markdown formatting is used.
+    textile: Textile formatting is used.
+    heuristic: When none of the above formatting types are
+               detected heuristic is returned.
+    '''
+    # Keep a count of the number of format-specific objects
+    # that are found in the text.
    markdown_count = 0
    textile_count = 0

@@ -200,6 +235,8 @@ def detect_formatting_type(txt):
    # Links
    textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))

+    # Decide if either markdown or textile is used in the text
+    # based on the number of unique formatting elements found.
    if markdown_count > 5 or textile_count > 5:
        if markdown_count > textile_count:
            return 'markdown'