diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index ccbddb2eaa..d791d45aad 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -584,7 +584,10 @@ class HTMLPreProcessor(object):
                 end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\\\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p[^>]*>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),  # noqa
+                    (re.compile((
+                        u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IA\u00DF]'
+                        u'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p[^>]*>\s*)+\s*(?=(<(i|b|u)>)?'
+                        u'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines),
                 )
 
         for rule in self.PREPROCESS + start_rules:
diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py
index 452bdb7d63..11a9c74f4f 100644
--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@@ -24,7 +24,7 @@ from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.mobi.utils import read_font_record
 from calibre.ebooks.oeb.parse_utils import parse_html
 from calibre.ebooks.oeb.base import XPath, XHTML, xml2text
-from polyglot.builtins import range, zip, unicode_type, getcwd
+from polyglot.builtins import range, zip, unicode_type, getcwd, as_unicode
 from polyglot.urllib import urldefrag
 
 Part = namedtuple('Part',
@@ -403,7 +403,7 @@ class Mobi8Reader(object):
                 continue
 
             entry['href'] = href
-            entry['idtag'] = idtag.decode(self.header.codec)
+            entry['idtag'] = as_unicode(idtag, self.header.codec or 'utf-8')
 
         for e in remove:
             index_entries.remove(e)
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 98b6ef5c7b..25bb2ad62b 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -600,17 +600,18 @@ class DirContainer(object):
     def namelist(self):
         names = []
         base = self.rootdir
-        if isinstance(base, unicode_type):
-            base = base.encode(filesystem_encoding)
         for root, dirs, files in os.walk(base):
             for fname in files:
                 fname = os.path.join(root, fname)
-                fname = fname.replace(b'\\', b'/')
-                if not isinstance(fname, unicode_type):
+                if isinstance(fname, bytes):
                     try:
                         fname = fname.decode(filesystem_encoding)
-                    except:
-                        continue
+                    except Exception:
+                        try:
+                            fname = fname.decode('utf-8')
+                        except Exception:
+                            continue
+                fname = fname.replace('\\', '/')
                 names.append(fname)
         return names
 
diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py
index 854bd7fa5c..4a0df0d7b4 100644
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -207,7 +207,7 @@ class PMLMLizer(object):
         text = re.sub('[ ]{2,}', ' ', text)
 
         # Condense excessive \c empty line sequences.
-        text = re.sub(r'(\\c\\s*\\c\\s*){2,}', r'\\c \n\\c\n', text)
+        text = re.sub(r'(\\c\s*\\c\s*){2,}', r'\\c \n\\c\n', text)
 
         # Remove excessive newlines.
         text = re.sub('\n[ ]+\n', '\n\n', text)
diff --git a/src/calibre/ebooks/rtf2xml/paragraph_def.py b/src/calibre/ebooks/rtf2xml/paragraph_def.py
index 0812e15776..3243a6d157 100755
--- a/src/calibre/ebooks/rtf2xml/paragraph_def.py
+++ b/src/calibre/ebooks/rtf2xml/paragraph_def.py
@@ -608,10 +608,9 @@ if another paragraph_def is found, the state changes to collect_tokens.
         # when determining uniqueness for a style, ingorne these values, since
         # they don't tell us if the style is unique
         ignore_values = ['style-num', 'nest-level', 'in-table']
-        for k, v in self.__att_val_dict.items():
-            if k in ignore_values:
-                continue
-            my_string += '%s:%s' % (k, v)
+        for k in sorted(self.__att_val_dict):
+            if k not in ignore_values:
+                my_string += '%s:%s' % (k, self.__att_val_dict[k])
         if my_string in self.__style_num_strings:
             num = self.__style_num_strings.index(my_string)
             num += 1  # since indexing starts at zero, rather than 1
@@ -635,9 +634,10 @@ if another paragraph_def is found, the state changes to collect_tokens.
             the_value = self.__att_val_dict['tabs']
             # the_value = the_value[:-1]
             style_string += ('<%s>%s' % ('tabs', the_value))
-        for k, v in self.__att_val_dict.items():
-            if k not in ['name', 'style-num', 'in-table'] + tabs_list:
-                style_string += ('<%s>%s' % (k, v))
+        exclude = frozenset(['name', 'style-num', 'in-table'] + tabs_list)
+        for k in sorted(self.__att_val_dict):
+            if k not in exclude:
+                style_string += ('<%s>%s' % (k, self.__att_val_dict[k]))
         style_string += '\n'
         self.__body_style_strings.append(style_string)
 
@@ -685,9 +685,10 @@ if another paragraph_def is found, the state changes to collect_tokens.
             the_value = self.__att_val_dict['tabs']
             # the_value = the_value[:-1]
             self.__write_obj.write('<%s>%s' % ('tabs', the_value))
-        keys = sorted(self.__att_val_dict.keys())
+        keys = sorted(self.__att_val_dict)
+        exclude = frozenset(['name', 'style-num', 'in-table'] + tabs_list)
         for key in keys:
-            if key not in ['name', 'style-num', 'in-table'] + tabs_list:
+            if key not in exclude:
                 self.__write_obj.write('<%s>%s' % (key, self.__att_val_dict[key]))
         self.__write_obj.write('\n')
         self.__write_obj.write(self.__start2_marker)
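
Note on the pmlml.py hunk above: in the old pattern, `\\s` inside a raw string is a literal backslash followed by `s`, not the whitespace class, so runs of `\c` codes separated by real whitespace were never condensed. A minimal standalone sketch of the difference (the sample text is made up for illustration, not taken from the patch):

    import re

    # Illustrative PML-like fragment: four \c codes separated by whitespace.
    sample = '\\c \\c \\c \\c\nsome text'

    # Old pattern: r'\\s*' matches a literal backslash plus zero or more 's'
    # characters, so the whitespace between the \c codes never matches.
    old = re.sub(r'(\\c\\s*\\c\\s*){2,}', r'\\c \n\\c\n', sample)

    # New pattern: r'\s*' is the whitespace class, so the run is condensed.
    new = re.sub(r'(\\c\s*\\c\s*){2,}', r'\\c \n\\c\n', sample)

    print(repr(old))  # '\\c \\c \\c \\c\nsome text'  (unchanged)
    print(repr(new))  # '\\c \n\\c\nsome text'        (condensed)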