diff --git a/src/calibre/ebooks/conversion/__init__.py b/src/calibre/ebooks/conversion/__init__.py
index 6f7f017f6b..be49b37591 100644
--- a/src/calibre/ebooks/conversion/__init__.py
+++ b/src/calibre/ebooks/conversion/__init__.py
@@ -5,6 +5,8 @@ __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
+from polyglot.builtins import native_string_type
+
 
 class ConversionUserFeedBack(Exception):
 
@@ -25,4 +27,4 @@ class ConversionUserFeedBack(Exception):
 
 # Ensure exception uses fully qualified name as this is used to detect it in
 # the GUI.
-ConversionUserFeedBack.__name__ = str('calibre.ebooks.conversion.ConversionUserFeedBack')
+ConversionUserFeedBack.__name__ = native_string_type('calibre.ebooks.conversion.ConversionUserFeedBack')
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 846f27198b..11f3e1025e 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -75,8 +75,8 @@ def smarten_punctuation(html, log=None):
     from calibre.ebooks.conversion.utils import HeuristicProcessor
     preprocessor = HeuristicProcessor(log=log)
     from uuid import uuid4
-    start = 'calibre-smartypants-'+str(uuid4())
-    stop = 'calibre-smartypants-'+str(uuid4())
+    start = 'calibre-smartypants-'+unicode_type(uuid4())
+    stop = 'calibre-smartypants-'+unicode_type(uuid4())
     html = html.replace('<!--', start)
     html = html.replace('-->', stop)
     html = preprocessor.fix_nbsp_indents(html)
@@ -152,20 +152,20 @@ class DocAnalysis(object):
         maxLineLength=1900  # Discard larger than this to stay in range
         buckets=20  # Each line is divided into a bucket based on length
 
-        # print "there are "+str(len(lines))+" lines"
+        # print("there are "+unicode_type(len(lines))+" lines")
         # max = 0
         # for line in self.lines:
         #    l = len(line)
         #    if l > max:
         #        max = l
-        # print "max line found is "+str(max)
+        # print("max line found is "+unicode_type(max))
         # Build the line length histogram
         hRaw = [0 for i in range(0,buckets)]
         for line in self.lines:
             l = len(line)
             if l > minLineLength and l < maxLineLength:
                 l = int(l/100)
-                # print "adding "+str(l)
+                # print("adding "+unicode_type(l))
                 hRaw[l]+=1
 
         # Normalize the histogram into percents
@@ -174,8 +174,8 @@ class DocAnalysis(object):
             h = [float(count)/totalLines for count in hRaw]
         else:
             h = []
-        # print "\nhRaw histogram lengths are: "+str(hRaw)
-        # print " percents are: "+str(h)+"\n"
+        # print("\nhRaw histogram lengths are: "+unicode_type(hRaw))
+        # print(" percents are: "+unicode_type(h)+"\n")
 
         # Find the biggest bucket
         maxValue = 0
@@ -184,10 +184,10 @@ class DocAnalysis(object):
                 maxValue = h[i]
 
         if maxValue < percent:
-            # print "Line lengths are too variable. Not unwrapping."
+            # print("Line lengths are too variable. Not unwrapping.")
             return False
         else:
-            # print str(maxValue)+" of the lines were in one bucket"
+            # print(unicode_type(maxValue)+" of the lines were in one bucket")
             return True
 
 
@@ -232,7 +232,7 @@ class Dehyphenator(object):
         if len(firsthalf) > 4 and self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
         if self.verbose > 2:
-            self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))
+            self.log("lookup word is: "+lookupword+", orig is: " + hyphenated)
         try:
             searchresult = self.html.find(lookupword.lower())
         except:
@@ -240,33 +240,33 @@ class Dehyphenator(object):
         if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
             if self.html.find(lookupword) != -1 or searchresult != -1:
                 if self.verbose > 2:
-                    self.log(" Cleanup:returned dehyphenated word: " + str(dehyphenated))
+                    self.log(" Cleanup:returned dehyphenated word: " + dehyphenated)
                 return dehyphenated
             elif self.html.find(hyphenated) != -1:
                 if self.verbose > 2:
-                    self.log(" Cleanup:returned hyphenated word: " + str(hyphenated))
+                    self.log(" Cleanup:returned hyphenated word: " + hyphenated)
                 return hyphenated
             else:
                 if self.verbose > 2:
-                    self.log(" Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
+                    self.log(" Cleanup:returning original text "+firsthalf+" + linefeed "+secondhalf)
                 return firsthalf+'\u2014'+wraptags+secondhalf
         else:
             if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
                 if self.verbose > 2:
-                    self.log("too short, returned hyphenated word: " + str(hyphenated))
+                    self.log("too short, returned hyphenated word: " + hyphenated)
                 return hyphenated
             if len(firsthalf) <= 2 and len(secondhalf) <= 2:
                 if self.verbose > 2:
-                    self.log("too short, returned hyphenated word: " + str(hyphenated))
+                    self.log("too short, returned hyphenated word: " + hyphenated)
                 return hyphenated
             if self.html.find(lookupword) != -1 or searchresult != -1:
                 if self.verbose > 2:
-                    self.log(" returned dehyphenated word: " + str(dehyphenated))
+                    self.log(" returned dehyphenated word: " + dehyphenated)
                 return dehyphenated
             else:
                 if self.verbose > 2:
-                    self.log(" returned hyphenated word: " + str(hyphenated))
+                    self.log(" returned hyphenated word: " + hyphenated)
                 return hyphenated
 
     def __call__(self, html, format, length=1):
         self.html = html
@@ -595,7 +595,7 @@ class HTMLPreProcessor(object):
             docanalysis = DocAnalysis('pdf', html)
             length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
             if length:
-                # print "The pdf line length returned is " + str(length)
+                # print("The pdf line length returned is " + unicode_type(length))
                 # unwrap em/en dashes
                 end_rules.append((re.compile(
                     r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))
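
Reviewer note: the two aliases come from calibre's polyglot compatibility layer. Below is a minimal sketch of the semantics this diff relies on, inferred from how the aliases are used here rather than copied from src/polyglot/builtins.py: `unicode_type` is the text type on both Python versions, while `native_string_type` is whatever `str` natively is, which matters because a class's `__name__` must be a native `str` (bytes on Python 2, text on Python 3).

```python
# Sketch of the polyglot.builtins aliases this diff depends on (assumed
# semantics; the real shims live in src/polyglot/builtins.py).
import sys

if sys.version_info.major >= 3:
    unicode_type = str          # text is the native string type on py3
    native_string_type = str
else:
    unicode_type = unicode      # noqa: F821 -- the py2 text type
    native_string_type = str    # native str is bytes on py2

# Why unicode_type(uuid4()): uuid4() returns a UUID object, not a string,
# so it needs an explicit conversion that yields text on both versions.
from uuid import uuid4
marker = 'calibre-smartypants-' + unicode_type(uuid4())

# Why native_string_type for __name__: py2 requires a byte string here,
# py3 requires text; the native `str` satisfies both with one code path.
class ConversionUserFeedBack(Exception):
    pass

ConversionUserFeedBack.__name__ = native_string_type(
    'calibre.ebooks.conversion.ConversionUserFeedBack')
```

The dropped `str()` calls around `lookupword`, `hyphenated`, `dehyphenated`, `firsthalf`, and `secondhalf` in the Dehyphenator logging follow the same reasoning: those values are already text when logged, so the py2 `str()` added nothing and could raise UnicodeEncodeError on non-ASCII input.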