From 54ee947072e7c9f7de9e6ae56aa793543b5ab8e2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 30 Sep 2010 17:03:51 -0600 Subject: [PATCH] Fix unicode issues in new preprocess code --- src/calibre/ebooks/conversion/preprocess.py | 6 ++-- src/calibre/ebooks/conversion/utils.py | 32 ++++++++++++--------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index bb5c26a50c..c5ebae4bba 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -184,14 +184,14 @@ class Dehyphenator(object): wraptags = match.group('wraptags') except: wraptags = '' - hyphenated = str(firsthalf) + "-" + str(secondhalf) - dehyphenated = str(firsthalf) + str(secondhalf) + hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf) + dehyphenated = unicode(firsthalf) + unicode(secondhalf) lookupword = self.removesuffixes.sub('', dehyphenated) if self.prefixes.match(firsthalf) is None: lookupword = self.removeprefix.sub('', lookupword) #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) try: - searchresult = self.html.find(str.lower(lookupword)) + searchresult = self.html.find(lookupword.lower()) except: return hyphenated if self.format == 'html_cleanup': diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 5f5c12a703..2faec27b68 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -22,18 +22,21 @@ class PreProcessor(object): title = match.group('title') if not title: self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) + self.log("found " + unicode(self.html_preprocess_sections) + + " chapters. - " + unicode(chap)) return '

'+chap+'

\n' else: self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) + self.log("found " + unicode(self.html_preprocess_sections) + + " chapters & titles. - " + unicode(chap) + ", " + unicode(title)) return '

'+chap+'

\n

'+title+'

\n' def chapter_break(self, match): chap = match.group('section') styles = match.group('styles') self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap)) + self.log("marked " + unicode(self.html_preprocess_sections) + + " section markers based on punctuation. - " + unicode(chap)) return '<'+styles+' style="page-break-before:always">'+chap def insert_indent(self, match): @@ -63,7 +66,8 @@ class PreProcessor(object): line_end = line_end_ere.findall(raw) tot_htm_ends = len(htm_end) tot_ln_fds = len(line_end) - self.log("There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked up endings") + self.log("There are " + unicode(tot_ln_fds) + " total Line feeds, and " + + unicode(tot_htm_ends) + " marked up endings") if percent > 1: percent = 1 @@ -71,7 +75,7 @@ class PreProcessor(object): percent = 0 min_lns = tot_ln_fds * percent - self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup") + self.log("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup") if min_lns > tot_htm_ends: return True @@ -112,7 +116,7 @@ class PreProcessor(object): txtindent = re.compile(ur'[^>]*)>\s*(?P(]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE) html = txtindent.sub(self.insert_indent, html) if self.found_indents > 1: - self.log("replaced "+str(self.found_indents)+ " nbsp indents with inline styles") + self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles") # remove remaining non-breaking spaces html = re.sub(ur'\u00a0', ' ', html) # Get rid of empty tags to simplify other processing @@ -131,7 +135,8 @@ class PreProcessor(object): lines = linereg.findall(html) blanks_between_paragraphs = False if len(lines) > 1: - self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") + self.log("There are " + unicode(len(blanklines)) + " blank lines. " + + unicode(float(len(blanklines)) / float(len(lines))) + " percent blank") if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts, 'remove_paragraph_spacing', False): self.log("deleting blank lines") @@ -170,20 +175,20 @@ class PreProcessor(object): #print chapter_marker heading = re.compile(']*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) - self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings") + self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") # # Start with most typical chapter headings, get more aggressive until one works if self.html_preprocess_sections < 10: chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) html = chapdetect.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters") + self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters") chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) html = chapdetect2.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words") + self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words") chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE) html = chapdetect2.sub(self.chapter_head, html) @@ -207,11 +212,11 @@ class PreProcessor(object): # more of the lines break in the same region of the document then unwrapping is required docanalysis = DocAnalysis(format, html) hardbreaks = docanalysis.line_histogram(.50) - self.log("Hard line breaks check returned "+str(hardbreaks)) + self.log("Hard line breaks check returned "+unicode(hardbreaks)) # Calculate Length unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4) length = docanalysis.line_length(unwrap_factor) - self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***") + self.log("*** Median line length is " + unicode(length) + ", calculated with " + format + " format ***") # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor if hardbreaks or unwrap_factor < 0.4: self.log("Unwrapping required, unwrapping Lines") @@ -240,7 +245,8 @@ class PreProcessor(object): # If still no sections after unwrapping mark split points on lines with no punctuation if self.html_preprocess_sections < 10: - self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections)) + self.log("Looking for more split points based on punctuation," + " currently have " + unicode(self.html_preprocess_sections)) chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P
(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?(){0,2}\s*()?\s*(){0,2}\s*()?\s*)', re.IGNORECASE) html = chapdetect3.sub(self.chapter_break, html) # search for places where a first or second level heading is immediately followed by another