diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 22ace69037..6a20703cbb 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -37,8 +37,8 @@ def clean_txt(txt): txt = re.sub('[ ]{2,}', ' ', txt) # Remove blank space from the beginning and end of the document. - txt = re.sub('^\s+(?=.)', '', txt) - txt = re.sub('(?<=.)\s+$', '', txt) + txt = re.sub(r'^\s+(?=.)', '', txt) + txt = re.sub(r'(?<=.)\s+$', '', txt) # Remove excessive line breaks. txt = re.sub('\n{5,}', '\n\n\n\n', txt) # remove ASCII invalid chars : 0 to 8 and 11-14 to 24 @@ -178,7 +178,7 @@ def separate_hard_scene_breaks(txt): return '\n%s\n' % line else: return line - txt = re.sub(u'(?miu)^[ \t-=~\/_]+$', lambda mo: sep_break(mo.group()), txt) + txt = re.sub(type(u'')(r'(?miu)^[ \t-=~\/_]+$'), lambda mo: sep_break(mo.group()), txt) return txt @@ -216,7 +216,7 @@ def split_string_separator(txt, size): Splits the text by putting \n\n at the point size. ''' if len(txt) > size: - txt = ''.join([re.sub(u'\.(?P[^.]*)$', '.\n\n\g', + txt = ''.join([re.sub(type(u'')(r'\.(?P[^.]*)$'), r'.\n\n\g', txt[i:i+size], 1) for i in xrange(0, len(txt), size)]) return txt @@ -236,7 +236,7 @@ def detect_paragraph_type(txt): ''' txt = txt.replace('\r\n', '\n') txt = txt.replace('\r', '\n') - txt_line_count = len(re.findall('(?mu)^\s*.+$', txt)) + txt_line_count = len(re.findall(r'(?mu)^\s*.+$', txt)) # Check for hard line breaks - true if 55% of the doc breaks in the same region docanalysis = DocAnalysis('txt', txt) @@ -244,11 +244,11 @@ def detect_paragraph_type(txt): if hardbreaks: # Determine print percentage - tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) + tab_line_count = len(re.findall(r'(?mu)^(\t|\s{2,}).+$', txt)) print_percent = tab_line_count / float(txt_line_count) # Determine block percentage - empty_line_count = len(re.findall('(?mu)^\s*$', txt)) + empty_line_count = len(re.findall(r'(?mu)^\s*$', txt)) block_percent = empty_line_count / float(txt_line_count) # Compare the two types - the type with the larger number of instances wins @@ -286,9 +286,9 @@ def detect_formatting_type(txt): markdown_count += len(re.findall('(?mu)^=+$', txt)) markdown_count += len(re.findall('(?mu)^-+$', txt)) # Images - markdown_count += len(re.findall('(?u)!\[.*?\](\[|\()', txt)) + markdown_count += len(re.findall(r'(?u)!\[.*?\](\[|\()', txt)) # Links - markdown_count += len(re.findall('(?u)^|[^!]\[.*?\](\[|\()', txt)) + markdown_count += len(re.findall(r'(?u)^|[^!]\[.*?\](\[|\()', txt)) # Check for textile # Headings