Add more characters referecned by bug #822744 to the line unwrap lookahead character list.

This commit is contained in:
John Schember 2011-08-24 18:20:43 -04:00
parent 140b1181d1
commit 78dc53abae

View File

@ -315,7 +315,7 @@ class HeuristicProcessor(object):
supports a range of html markup and text files
'''
# define the pieces of the regex
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßôľščťžňďěřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
soft_hyphen = u"\xad"
line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"