Escaping meta-characters before compiling words as a regex for removing hyphens

2025-07-09 03:04:10 -04:00 · 2010-09-26 10:23:02 +09:00 · 2010-09-26 10:23:02 +09:00 · 394f09e7f4
commit 394f09e7f4
parent a0a984c5b0
2 changed files with 7 additions and 1 deletion
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@@ -61,6 +61,7 @@ class SonyReaderInput(InputProfile):
    dpi                       = 168.451
    fbase                     = 12
    fsizes                    = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
+    #unsupported_unicode_chars = [\u2018, \u2019, \u201a, \u201b, \u201c, \u201d, \u201e, \u201f]

 class SonyReader300Input(SonyReaderInput):

@@ -251,6 +252,9 @@ class OutputProfile(Plugin):
    #: The character used to represent a star in ratings
    ratings_char = u'*'
    
+    #: Unsupported unicode characters to be replaced during preprocessing
+    unsupported_unicode_chars = []
+
    @classmethod
    def tags_to_string(cls, tags):
        return escape(', '.join(tags))
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -182,8 +182,10 @@ class Dehyphenator(object):
        lookupword = self.removesuffixes.sub('', dehyphenated)
        if self.prefixes.match(firsthalf) is None:
           lookupword = self.removeprefix.sub('', lookupword)
-        booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
+        # escape any meta-characters which may be in the lookup word
+        lookupword = re.sub(r'(?P<meta>[\[\]\\\^\$\.\|\?\*\+\(\)])', r'\\\g<meta>', lookupword)
        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
        if self.format == 'html_cleanup':
           match = booklookup.search(self.html)
           hyphenmatch = re.search(u'%s' % hyphenated, self.html)