mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TXT Processing: Comments.
This commit is contained in:
parent
0916a9dc34
commit
b1b4e7bac5
@ -18,6 +18,10 @@ from calibre.utils.cleantext import clean_ascii_chars
|
|||||||
HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
|
HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
|
||||||
|
|
||||||
def clean_txt(txt):
|
def clean_txt(txt):
|
||||||
|
'''
|
||||||
|
Run transformations on the text to put it into
|
||||||
|
consistent state.
|
||||||
|
'''
|
||||||
if isbytestring(txt):
|
if isbytestring(txt):
|
||||||
txt = txt.decode('utf-8', 'replace')
|
txt = txt.decode('utf-8', 'replace')
|
||||||
# Strip whitespace from the end of the line. Also replace
|
# Strip whitespace from the end of the line. Also replace
|
||||||
@ -42,6 +46,15 @@ def clean_txt(txt):
|
|||||||
return txt
|
return txt
|
||||||
|
|
||||||
def split_txt(txt, epub_split_size_kb=0):
|
def split_txt(txt, epub_split_size_kb=0):
|
||||||
|
'''
|
||||||
|
Ensure there are split points for converting
|
||||||
|
to EPUB. A misdetected paragraph type can
|
||||||
|
result in the entire document being one giant
|
||||||
|
paragraph. In this case the EPUB parser will not
|
||||||
|
be able to determine where to split the file
|
||||||
|
to accommodate the EPUB file size limitation
|
||||||
|
and will fail.
|
||||||
|
'''
|
||||||
#Takes care if there is no point to split
|
#Takes care if there is no point to split
|
||||||
if epub_split_size_kb > 0:
|
if epub_split_size_kb > 0:
|
||||||
if isinstance(txt, unicode):
|
if isinstance(txt, unicode):
|
||||||
@ -60,6 +73,9 @@ def split_txt(txt, epub_split_size_kb=0):
|
|||||||
|
|
||||||
def convert_basic(txt, title='', epub_split_size_kb=0):
|
def convert_basic(txt, title='', epub_split_size_kb=0):
|
||||||
'''
|
'''
|
||||||
|
Converts plain text to html by putting all paragraphs in
|
||||||
|
<p> tags. It condenses and retains blank lines when necessary.
|
||||||
|
|
||||||
Requires paragraphs to be in single line format.
|
Requires paragraphs to be in single line format.
|
||||||
'''
|
'''
|
||||||
txt = clean_txt(txt)
|
txt = clean_txt(txt)
|
||||||
@ -110,11 +126,17 @@ def block_to_single_line(txt):
|
|||||||
return txt
|
return txt
|
||||||
|
|
||||||
def preserve_spaces(txt):
|
def preserve_spaces(txt):
|
||||||
|
'''
|
||||||
|
Replaces multiple spaces with entities.
|
||||||
|
'''
|
||||||
txt = re.sub('(?P<space>[ ]{2,})', lambda mo: ' ' + (' ' * (len(mo.group('space')) - 1)), txt)
|
txt = re.sub('(?P<space>[ ]{2,})', lambda mo: ' ' + (' ' * (len(mo.group('space')) - 1)), txt)
|
||||||
txt = txt.replace('\t', ' ')
|
txt = txt.replace('\t', ' ')
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
def remove_indents(txt):
|
def remove_indents(txt):
|
||||||
|
'''
|
||||||
|
Remove whitespace at the beginning of each line.
|
||||||
|
'''
|
||||||
txt = re.sub('(?miu)^\s+', '', txt)
|
txt = re.sub('(?miu)^\s+', '', txt)
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
@ -125,7 +147,10 @@ def opf_writer(path, opf_name, manifest, spine, mi):
|
|||||||
with open(os.path.join(path, opf_name), 'wb') as opffile:
|
with open(os.path.join(path, opf_name), 'wb') as opffile:
|
||||||
opf.render(opffile)
|
opf.render(opffile)
|
||||||
|
|
||||||
def split_string_separator(txt, size) :
|
def split_string_separator(txt, size):
|
||||||
|
'''
|
||||||
|
Splits the text by putting \n\n at the point size.
|
||||||
|
'''
|
||||||
if len(txt) > size:
|
if len(txt) > size:
|
||||||
txt = ''.join([re.sub(u'\.(?P<ends>[^.]*)$', '.\n\n\g<ends>',
|
txt = ''.join([re.sub(u'\.(?P<ends>[^.]*)$', '.\n\n\g<ends>',
|
||||||
txt[i:i+size], 1) for i in
|
txt[i:i+size], 1) for i in
|
||||||
@ -134,7 +159,7 @@ def split_string_separator(txt, size) :
|
|||||||
|
|
||||||
def detect_paragraph_type(txt):
|
def detect_paragraph_type(txt):
|
||||||
'''
|
'''
|
||||||
Tries to determine the formatting of the document.
|
Tries to determine the paragraph type of the document.
|
||||||
|
|
||||||
block: Paragraphs are separated by a blank line.
|
block: Paragraphs are separated by a blank line.
|
||||||
single: Each line is a paragraph.
|
single: Each line is a paragraph.
|
||||||
@ -177,6 +202,16 @@ def detect_paragraph_type(txt):
|
|||||||
|
|
||||||
|
|
||||||
def detect_formatting_type(txt):
|
def detect_formatting_type(txt):
|
||||||
|
'''
|
||||||
|
Tries to determine the formatting of the document.
|
||||||
|
|
||||||
|
markdown: Markdown formatting is used.
|
||||||
|
textile: Textile formatting is used.
|
||||||
|
heuristic: When none of the above formatting types are
|
||||||
|
detected, heuristic is returned.
|
||||||
|
'''
|
||||||
|
# Keep a count of the number of format-specific objects
|
||||||
|
# that are found in the text.
|
||||||
markdown_count = 0
|
markdown_count = 0
|
||||||
textile_count = 0
|
textile_count = 0
|
||||||
|
|
||||||
@ -200,6 +235,8 @@ def detect_formatting_type(txt):
|
|||||||
# Links
|
# Links
|
||||||
textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
|
textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
|
||||||
|
|
||||||
|
# Decide if either markdown or textile is used in the text
|
||||||
|
# based on the number of unique formatting elements found.
|
||||||
if markdown_count > 5 or textile_count > 5:
|
if markdown_count > 5 or textile_count > 5:
|
||||||
if markdown_count > textile_count:
|
if markdown_count > textile_count:
|
||||||
return 'markdown'
|
return 'markdown'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user