From 2677a9296b07acb750d48a70933ee5402e3081bc Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 27 Sep 2010 17:59:31 +0800
Subject: [PATCH] String searches - avoid regex compilations entirely

---
 src/calibre/ebooks/conversion/preprocess.py | 17 ++++++-----------
 src/calibre/ebooks/conversion/utils.py      |  3 +--
 2 files changed, 7 insertions(+), 13 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 840eff4c12..36221f486b 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -167,8 +167,8 @@ class Dehyphenator(object):
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
         self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
+        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
+        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
 
     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
@@ -182,17 +182,13 @@ class Dehyphenator(object):
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)
-        # escape any meta-characters which may be in the lookup word
-        lookupword = re.sub(r'(?P<meta>[\[\]\\\^\$\.\|\?\*\+\(\)])', r'\\\g<meta>', lookupword)
         #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
         booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
         if self.format == 'html_cleanup':
-           match = booklookup.search(self.html)
-           hyphenmatch = re.search(u'%s' % hyphenated, self.html)
-           if match:
+           if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
                return dehyphenated
-           elif hyphenmatch:
+           elif self.html.find(hyphenated) != -1:
                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
                return hyphenated
            else:
@@ -200,8 +196,7 @@ class Dehyphenator(object):
                return firsthalf+u'\u2014'+wraptags+secondhalf
                
         else:
-            match = booklookup.search(self.html)
-            if match:
+            if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
                 #print "returned dehyphenated word: " + str(dehyphenated)
                 return dehyphenated
             else:
@@ -461,7 +456,7 @@ class HTMLPreProcessor(object):
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'), 'median')
             if length:
-                print "The pdf line length returned is " + str(length)
+                #print "The pdf line length returned is " + str(length)
                 end_rules.append(
                     # Un wrap using punctuation
                     (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index f41f6abd08..28c92eb7d8 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -163,7 +163,7 @@ class PreProcessor(object):
         default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
         typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
         numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
-        uppercase_chapters = r"\s*.?([A-Z#\-]+\s{0,3}){1,5}\s*"
+        uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
         
         chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
         #print chapter_marker     
@@ -185,7 +185,6 @@ class PreProcessor(object):
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
             chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
             chapdetect2 = re.compile(r'%s' % chapter_marker,  re.UNICODE)
-            print str(chapter_marker)
             #chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
         ###### Unwrap lines ######