This commit is contained in:
Kovid Goyal 2019-02-07 10:07:18 +05:30
parent 2e141a6175
commit 6b64c0111d
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -37,8 +37,8 @@ def clean_txt(txt):
txt = re.sub('[ ]{2,}', ' ', txt) txt = re.sub('[ ]{2,}', ' ', txt)
# Remove blank space from the beginning and end of the document. # Remove blank space from the beginning and end of the document.
txt = re.sub('^\s+(?=.)', '', txt) txt = re.sub(r'^\s+(?=.)', '', txt)
txt = re.sub('(?<=.)\s+$', '', txt) txt = re.sub(r'(?<=.)\s+$', '', txt)
# Remove excessive line breaks. # Remove excessive line breaks.
txt = re.sub('\n{5,}', '\n\n\n\n', txt) txt = re.sub('\n{5,}', '\n\n\n\n', txt)
# remove ASCII invalid chars : 0 to 8 and 11-14 to 24 # remove ASCII invalid chars : 0 to 8 and 11-14 to 24
@ -178,7 +178,7 @@ def separate_hard_scene_breaks(txt):
return '\n%s\n' % line return '\n%s\n' % line
else: else:
return line return line
txt = re.sub(u'(?miu)^[ \t-=~\/_]+$', lambda mo: sep_break(mo.group()), txt) txt = re.sub(type(u'')(r'(?miu)^[ \t-=~\/_]+$'), lambda mo: sep_break(mo.group()), txt)
return txt return txt
@ -216,7 +216,7 @@ def split_string_separator(txt, size):
Splits the text by putting \n\n at the point size. Splits the text by putting \n\n at the point size.
''' '''
if len(txt) > size: if len(txt) > size:
txt = ''.join([re.sub(u'\.(?P<ends>[^.]*)$', '.\n\n\g<ends>', txt = ''.join([re.sub(type(u'')(r'\.(?P<ends>[^.]*)$'), r'.\n\n\g<ends>',
txt[i:i+size], 1) for i in txt[i:i+size], 1) for i in
xrange(0, len(txt), size)]) xrange(0, len(txt), size)])
return txt return txt
@ -236,7 +236,7 @@ def detect_paragraph_type(txt):
''' '''
txt = txt.replace('\r\n', '\n') txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n') txt = txt.replace('\r', '\n')
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt)) txt_line_count = len(re.findall(r'(?mu)^\s*.+$', txt))
# Check for hard line breaks - true if 55% of the doc breaks in the same region # Check for hard line breaks - true if 55% of the doc breaks in the same region
docanalysis = DocAnalysis('txt', txt) docanalysis = DocAnalysis('txt', txt)
@ -244,11 +244,11 @@ def detect_paragraph_type(txt):
if hardbreaks: if hardbreaks:
# Determine print percentage # Determine print percentage
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) tab_line_count = len(re.findall(r'(?mu)^(\t|\s{2,}).+$', txt))
print_percent = tab_line_count / float(txt_line_count) print_percent = tab_line_count / float(txt_line_count)
# Determine block percentage # Determine block percentage
empty_line_count = len(re.findall('(?mu)^\s*$', txt)) empty_line_count = len(re.findall(r'(?mu)^\s*$', txt))
block_percent = empty_line_count / float(txt_line_count) block_percent = empty_line_count / float(txt_line_count)
# Compare the two types - the type with the larger number of instances wins # Compare the two types - the type with the larger number of instances wins
@ -286,9 +286,9 @@ def detect_formatting_type(txt):
markdown_count += len(re.findall('(?mu)^=+$', txt)) markdown_count += len(re.findall('(?mu)^=+$', txt))
markdown_count += len(re.findall('(?mu)^-+$', txt)) markdown_count += len(re.findall('(?mu)^-+$', txt))
# Images # Images
markdown_count += len(re.findall('(?u)!\[.*?\](\[|\()', txt)) markdown_count += len(re.findall(r'(?u)!\[.*?\](\[|\()', txt))
# Links # Links
markdown_count += len(re.findall('(?u)^|[^!]\[.*?\](\[|\()', txt)) markdown_count += len(re.findall(r'(?u)^|[^!]\[.*?\](\[|\()', txt))
# Check for textile # Check for textile
# Headings # Headings