remove full stop punctuation from the line unwrap heuristic

This commit is contained in:
Lee 2012-04-21 00:24:32 +08:00
parent 7a0f6ec510
commit 9217e6bed3

View File

@ -319,7 +319,7 @@ class HeuristicProcessor(object):
'''
# define the pieces of the regex
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
soft_hyphen = u"\xad"
line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"