py3: make pmlz output work

In Python 3, the re module is stricter about the arguments passed to it:
invalid escapes no longer fall back to being treated as literal
characters, but raise an error. Use raw strings to ensure that the escaped
backslashes are preserved all the way to the regular expressions
themselves.
This commit is contained in:
Eli Schwartz 2019-05-19 14:33:52 -04:00
parent 8e368c0d46
commit 1ed017fabd
No known key found for this signature in database
GPG Key ID: CEB167EFB5722BD6

View File

@ -174,8 +174,8 @@ class PMLMLizer(object):
return text
def prepare_text(self, text):
# Replace empty paragraphs with \c pml codes used to denote emtpy lines.
text = re.sub(unicode_type(r'(?<=</p>)\s*<p[^>]*>[\xc2\xa0\s]*</p>'), '\\c\n\\c', text)
# Replace empty paragraphs with \c pml codes used to denote empty lines.
text = re.sub(unicode_type(r'(?<=</p>)\s*<p[^>]*>[\xc2\xa0\s]*</p>'), r'\\c\n\\c', text)
return text
def clean_text(self, text):
@ -207,7 +207,7 @@ class PMLMLizer(object):
text = re.sub('[ ]{2,}', ' ', text)
# Condense excessive \c empty line sequences.
text = re.sub('(\\c\\s*\\c\\s*){2,}', '\\c \n\\c\n', text)
text = re.sub(r'(\\c\\s*\\c\\s*){2,}', r'\\c \n\\c\n', text)
# Remove excessive newlines.
text = re.sub('\n[ ]+\n', '\n\n', text)