diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index b91191e9fe..f7b6cce234 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -18,6 +18,10 @@ from calibre.utils.cleantext import clean_ascii_chars
HTML_TEMPLATE = u'
%s\n%s\n'
def clean_txt(txt):
+ '''
+ Run transformations on the text to put it into a
+ consistent state.
+ '''
if isbytestring(txt):
txt = txt.decode('utf-8', 'replace')
# Strip whitespace from the end of the line. Also replace
@@ -42,6 +46,15 @@ def clean_txt(txt):
return txt
def split_txt(txt, epub_split_size_kb=0):
+ '''
+ Ensure there are split points for converting
+ to EPUB. A misdetected paragraph type can
+ result in the entire document being one giant
+ paragraph. In this case the EPUB parser will not
+ be able to determine where to split the file
+ to accommodate the EPUB file size limitation
+ and will fail.
+ '''
#Takes care if there is no point to split
if epub_split_size_kb > 0:
if isinstance(txt, unicode):
@@ -60,6 +73,9 @@ def split_txt(txt, epub_split_size_kb=0):
def convert_basic(txt, title='', epub_split_size_kb=0):
'''
+ Converts plain text to html by putting all paragraphs in
+ <p> tags. It condenses and retains blank lines when necessary.
+
Requires paragraphs to be in single line format.
'''
txt = clean_txt(txt)
@@ -110,11 +126,17 @@ def block_to_single_line(txt):
return txt
def preserve_spaces(txt):
+ '''
+ Replaces multiple spaces with entities.
+ '''
txt = re.sub('(?P[ ]{2,})', lambda mo: ' ' + (' ' * (len(mo.group('space')) - 1)), txt)
txt = txt.replace('\t', ' ')
return txt
def remove_indents(txt):
+ '''
+ Remove whitespace at the beginning of each line.
+ '''
txt = re.sub('(?miu)^\s+', '', txt)
return txt
@@ -125,7 +147,10 @@ def opf_writer(path, opf_name, manifest, spine, mi):
with open(os.path.join(path, opf_name), 'wb') as opffile:
opf.render(opffile)
-def split_string_separator(txt, size) :
+def split_string_separator(txt, size):
+ '''
+ Splits the text by putting \n\n at the point size.
+ '''
if len(txt) > size:
txt = ''.join([re.sub(u'\.(?P[^.]*)$', '.\n\n\g',
txt[i:i+size], 1) for i in
@@ -134,7 +159,7 @@ def split_string_separator(txt, size) :
def detect_paragraph_type(txt):
'''
- Tries to determine the formatting of the document.
+ Tries to determine the paragraph type of the document.
block: Paragraphs are separated by a blank line.
single: Each line is a paragraph.
@@ -177,6 +202,16 @@ def detect_paragraph_type(txt):
def detect_formatting_type(txt):
+ '''
+ Tries to determine the formatting of the document.
+
+ markdown: Markdown formatting is used.
+ textile: Textile formatting is used.
+ heuristic: When none of the above formatting types are
+ detected, 'heuristic' is returned.
+ '''
+ # Keep a count of the number of format-specific objects
+ # that are found in the text.
markdown_count = 0
textile_count = 0
@@ -200,6 +235,8 @@ def detect_formatting_type(txt):
# Links
textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
+ # Decide if either markdown or textile is used in the text
+ # based on the number of unique formatting elements found.
if markdown_count > 5 or textile_count > 5:
if markdown_count > textile_count:
return 'markdown'