Escaping meta-characters before compiling words as a regex for removing hyphens

This commit is contained in:
ldolse 2010-09-26 10:23:02 +09:00
parent a0a984c5b0
commit 394f09e7f4
2 changed files with 7 additions and 1 deletion

View File

@ -61,6 +61,7 @@ class SonyReaderInput(InputProfile):
dpi = 168.451
fbase = 12
fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
#unsupported_unicode_chars = [\u2018, \u2019, \u201a, \u201b, \u201c, \u201d, \u201e, \u201f]
class SonyReader300Input(SonyReaderInput):
@ -251,6 +252,9 @@ class OutputProfile(Plugin):
#: The character used to represent a star in ratings
ratings_char = u'*'
#: Unsupported Unicode characters to be replaced during preprocessing
unsupported_unicode_chars = []
# Builds a single string from `tags`: the items are joined with ', ' and the
# joined text is passed through escape() before being returned.
# NOTE(review): escape() is defined outside this excerpt - presumably a
# markup-escaping helper; confirm against the module's imports.
@classmethod
def tags_to_string(cls, tags):
return escape(', '.join(tags))

View File

@ -182,8 +182,10 @@ class Dehyphenator(object):
lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
# escape any meta-characters which may be in the lookup word
lookupword = re.sub(r'(?P<meta>[\[\]\\\^\$\.\|\?\*\+\(\)])', r'\\\g<meta>', lookupword)
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
if self.format == 'html_cleanup':
match = booklookup.search(self.html)
hyphenmatch = re.search(u'%s' % hyphenated, self.html)