From bbbea665d33f53b6ec029c4c146987c44d1ba35a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 26 Nov 2014 22:36:49 +0530 Subject: [PATCH] Change str() to unicode(); this should prevent errors when running heuristics on UTF-8 encoded books on Linux/Windows systems with a non-UTF-8 character encoding --- src/calibre/ebooks/conversion/utils.py | 50 +++++++++++++------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 2dc91b7530..e626747717 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -177,10 +177,10 @@ class HeuristicProcessor(object): search_text = re.sub(r'<[^>]*>', '', search_text) for pat in ITALICIZE_STYLE_PATS: for match in re.finditer(pat, search_text): - ital_string = str(match.group('words')) - #self.log.debug("italicising "+str(match.group(0))+" with "+ital_string+"") + ital_string = unicode(match.group('words')) + #self.log.debug("italicising "+unicode(match.group(0))+" with "+ital_string+"") try: - html = re.sub(re.escape(str(match.group(0))), '%s' % ital_string, html) + html = re.sub(re.escape(unicode(match.group(0))), '%s' % ital_string, html) except OverflowError: # match.group(0) was too large to be compiled into a regex continue @@ -205,7 +205,7 @@ class HeuristicProcessor(object): if wordcount > 200000: typical_chapters = 15000. 
self.min_chapters = int(ceil(wordcount / typical_chapters)) - self.log.debug("minimum chapters required are: "+str(self.min_chapters)) + self.log.debug("minimum chapters required are: "+unicode(self.min_chapters)) heading = re.compile(']*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") @@ -319,7 +319,7 @@ class HeuristicProcessor(object): words_per_chptr = wordcount if words_per_chptr > 0 and self.html_preprocess_sections > 0: words_per_chptr = wordcount / self.html_preprocess_sections - self.log.debug("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters") + self.log.debug("Total wordcount is: "+ unicode(wordcount)+", Average words per section is: "+unicode(words_per_chptr)+", Marked up "+unicode(self.html_preprocess_sections)+" chapters") return html def punctuation_unwrap(self, length, content, format): @@ -349,8 +349,8 @@ class HeuristicProcessor(object): # define the pieces of the regex - lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?)?\s*()?" blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*" @@ -482,9 +482,9 @@ class HeuristicProcessor(object): lines = float(len(self.single_blank.findall(to_merge))) - 1. em = base_em + (em_per_line * lines) if to_merge.find('whitespace'): - newline = self.any_multi_blank.sub('\n

', match.group(0)) + newline = self.any_multi_blank.sub('\n

', match.group(0)) else: - newline = self.any_multi_blank.sub('\n

', match.group(0)) + newline = self.any_multi_blank.sub('\n

', match.group(0)) return newline html = self.any_multi_blank.sub(merge_matches, html) @@ -502,9 +502,9 @@ class HeuristicProcessor(object): top_margin = '' bottom_margin = '' if initblanks is not None: - top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;' + top_margin = 'margin-top:'+unicode(len(self.single_blank.findall(initblanks)))+'em;' if endblanks is not None: - bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;' + bottom_margin = 'margin-bottom:'+unicode(len(self.single_blank.findall(endblanks)))+'em;' if initblanks == None and endblanks == None: return content @@ -577,7 +577,7 @@ class HeuristicProcessor(object): else: replacement_break = re.sub('(?i)(width=\d+\%?|width:\s*\d+(\%|px|pt|em)?;?)', '', replacement_break) divpercent = (100 - width) / 2 - hr_open = re.sub('45', str(divpercent), hr_open) + hr_open = re.sub('45', unicode(divpercent), hr_open) scene_break = hr_open+replacement_break+'' else: scene_break = hr_open+'
' @@ -637,12 +637,12 @@ class HeuristicProcessor(object): else: styles = match.group('styles').split(';') is_paragraph = self.check_paragraph(content) - #print "styles for this line are: "+str(styles) + #print "styles for this line are: "+unicode(styles) split_styles = [] for style in styles: - #print "style is: "+str(style) + #print "style is: "+unicode(style) newstyle = style.split(':') - #print "newstyle is: "+str(newstyle) + #print "newstyle is: "+unicode(newstyle) split_styles.append(newstyle) styles = split_styles for style, setting in styles: @@ -653,7 +653,7 @@ class HeuristicProcessor(object): if 9 < setting < 14: text_indent = indented_text else: - text_indent = style+':'+str(setting)+'pt;' + text_indent = style+':'+unicode(setting)+'pt;' if style == 'padding': setting = re.sub('pt', '', setting).split(' ') if int(setting[1]) < 16 and int(setting[3]) < 16: @@ -674,26 +674,26 @@ class HeuristicProcessor(object): blockquote_open_loop = blockquote_open if debugabby: self.log.debug('\n\n******\n') - self.log.debug('padding top is: '+str(setting[0])) + self.log.debug('padding top is: '+unicode(setting[0])) self.log.debug('padding right is:' - +str(setting[1])) + +unicode(setting[1])) self.log.debug('padding bottom is: ' + - str(setting[2])) + unicode(setting[2])) self.log.debug('padding left is: ' - +str(setting[3])) + +unicode(setting[3])) - #print "text-align is: "+str(text_align) - #print "\n***\nline is:\n "+str(match.group(0))+'\n' + #print "text-align is: "+unicode(text_align) + #print "\n***\nline is:\n "+unicode(match.group(0))+'\n' if debugabby: - #print "this line is a paragraph = "+str(is_paragraph)+", previous line was "+str(self.previous_was_paragraph) + #print "this line is a paragraph = "+unicode(is_paragraph)+", previous line was "+unicode(self.previous_was_paragraph) self.log.debug("styles for this line were:", styles) self.log.debug('newline is:') self.log.debug(blockquote_open_loop+blockquote_close_loop+ paragraph_before+'

'+content+'

'+paragraph_after+'\n\n\n\n\n') - #print "is_paragraph is "+str(is_paragraph)+", previous_was_paragraph is "+str(self.previous_was_paragraph) + #print "is_paragraph is "+unicode(is_paragraph)+", previous_was_paragraph is "+unicode(self.previous_was_paragraph) self.previous_was_paragraph = is_paragraph - #print "previous_was_paragraph is now set to "+str(self.previous_was_paragraph)+"\n\n\n" + #print "previous_was_paragraph is now set to "+unicode(self.previous_was_paragraph)+"\n\n\n" return blockquote_open_loop+blockquote_close_loop+paragraph_before+'

'+content+'

'+paragraph_after html = abbyy_line.sub(convert_styles, html)