commit 4c5e9a20a4 by Kovid Goyal, 2019-05-20 12:20:25 +05:30
GPG Key ID: 06BC317B515ACE7C (no known key found for this signature in database)
18 changed files with 82 additions and 84 deletions

View File

@@ -14,16 +14,16 @@ ${head_content}$
 ${for title in meta.titles():}$
 ${if pos1:}$
 <h1>
-    <a href="${tocUrl}$">${print title}$</a>
+    <a href="${tocUrl}$">${print(title)}$</a>
 </h1>
 ${:else:}$
-<div class="calibreMetaSubtitle">${print title}$</div>
+<div class="calibreMetaSubtitle">${print(title)}$</div>
 ${:endif}$
 ${pos1=0}$
 ${:endfor}$
 </div>
 <div class="calibreMetaAuthor">
-    ${print ', '.join(meta.creators())}$
+    ${print(', '.join(meta.creators()))}$
 </div>
 </div>
@@ -33,13 +33,13 @@ ${head_content}$
 ${if prevLink or nextLink:}$
 <div class="calibreEbNavTop">
 ${if prevLink:}$
-    <a href="${prevLink}$" class="calibreAPrev">${print _('previous page'),}$</a>
+    <a href="${prevLink}$" class="calibreAPrev">${print(_('previous page'))}$</a>
 ${:else:}$
-    <a href="${tocUrl}$" class="calibreAPrev">${print _('previous page'),}$</a>
+    <a href="${tocUrl}$" class="calibreAPrev">${print(_('previous page'))}$</a>
 ${:endif}$
 ${if nextLink:}$
-    <a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
+    <a href="${nextLink}$" class="calibreANext">${print(_('next page'))}$</a>
 ${:endif}$
 </div>
 ${:endif}$
@@ -49,22 +49,22 @@ ${head_content}$
 ${if has_toc:}$
 <div class="calibreToc">
-    <h2><a href="${tocUrl}$">${print _('Table of contents'),}$</a></h2>
-    ${print toc()}$
+    <h2><a href="${tocUrl}$">${print(_('Table of contents'))}$</a></h2>
+    ${print(toc())}$
 </div>
 ${:endif}$
 <div class="calibreEbNav">
 ${if prevLink:}$
-    <a href="${prevLink}$" class="calibreAPrev">${print _('previous page'),}$</a>
+    <a href="${prevLink}$" class="calibreAPrev">${print(_('previous page'))}$</a>
 ${:else:}$
-    <a href="${tocUrl}$" class="calibreAPrev">${print _('previous page'),}$</a>
+    <a href="${tocUrl}$" class="calibreAPrev">${print(_('previous page'))}$</a>
 ${:endif}$
-    <a href="${tocUrl}$" class="calibreAHome">${print _('start'),}$</a>
+    <a href="${tocUrl}$" class="calibreAHome">${print(_('start'))}$</a>
 ${if nextLink:}$
-    <a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
+    <a href="${nextLink}$" class="calibreANext">${print(_('next page'))}$</a>
 ${:endif}$
 </div>

View File

@@ -6,10 +6,10 @@
 <link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
 <link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" />
-<title>${print ', '.join(meta.creators()),}$ - ${print meta.titles().next(); meta.titles().close()}$</title>
+<title>${print(', '.join(meta.creators()))}$ - ${print(next(meta.titles())); print(meta.titles().close())}$</title>
 ${for item in meta:}$
-<meta ${print 'name="DC.'+item['name']+'"',}$ ${print 'content="'+item['value']+'"',}$ />
+<meta ${print('name="DC.'+item['name']+'"')}$ ${print('content="'+item['value']+'"')}$ />
 ${:endfor}$
 <link href="${cssLink}$" type="text/css" rel="stylesheet" />
@@ -22,16 +22,16 @@ ${:endfor}$
 ${for title in meta.titles():}$
 ${if pos1:}$
 <h1>
-    <a href="${tocUrl}$">${print title}$</a>
+    <a href="${tocUrl}$">${print(title)}$</a>
 </h1>
 ${:else:}$
-<div class="calibreMetaSubtitle">${print title}$</div>
+<div class="calibreMetaSubtitle">${print(title)}$</div>
 ${:endif}$
 ${pos1=0}$
 ${:endfor}$
 </div>
 <div class="calibreMetaAuthor">
-    ${print ', '.join(meta.creators()),}$
+    ${print(', '.join(meta.creators()))}$
 </div>
 </div>
@@ -40,19 +40,19 @@ ${:endfor}$
 ${if has_toc:}$
 <div class="calibreTocIndex">
-    <h2>${print _('Table of contents'),}$</h2>
+    <h2>${print(_('Table of contents'))}$</h2>
 ${toc}$
 </div>
 ${:else:}$
-<h2>${print _('No table of contents present'),}$</h2>
-<div><strong><a href="${nextLink}$">${print _('begin to read'),}$</a></strong></div>
+<h2>${print(_('No table of contents present'))}$</h2>
+<div><strong><a href="${nextLink}$">${print(_('begin to read'))}$</a></strong></div>
 ${:endif}$
 </div>
 <div class="calibreEbNav">
 ${if nextLink:}$
-    <a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
+    <a href="${nextLink}$" class="calibreANext">${print(_('next page'))}$</a>
 ${:endif}$
 </div>
 </div>
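Both template diffs apply the same two Python 3 fixes: the Python 2 print statement (whose trailing comma suppressed the newline) becomes the print() function, and the iterator method meta.titles().next() becomes the builtin next(), the only spelling Python 3 supports. A minimal sketch of the print change outside the template syntax; the _() helper is an illustrative stand-in for the templates' gettext hook:

    # Python 2 statement form, as on the old side of these hunks:
    #     print _('previous page'),          # trailing comma suppressed the newline
    # Function form, required by Python 3 and available on 2 via the future import:
    from __future__ import print_function

    def _(s):   # illustrative stand-in for the gettext translation function
        return s

    print(_('previous page'), end='')   # end='' replaces the trailing comma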

View File

@@ -90,7 +90,8 @@ class FB2Input(InputFormatPlugin):
         css = re.sub(r'name\s*=\s*', 'class=', css)
         self.extract_embedded_content(doc)
         log.debug('Converting XML to HTML...')
-        ss = open(P('templates/fb2.xsl'), 'rb').read()
+        with open(P('templates/fb2.xsl'), 'rb') as f:
+            ss = f.read().decode('utf-8')
         ss = ss.replace("__FB_NS__", fb_ns)
         if options.no_inline_fb2_toc:
             log('Disabling generation of inline FB2 TOC')
@@ -124,8 +125,10 @@ class FB2Input(InputFormatPlugin):
             src = img.get('src')
             img.set('src', self.binary_map.get(src, src))
         index = transform.tostring(result)
-        open(u'index.xhtml', 'wb').write(index)
-        open(u'inline-styles.css', 'wb').write(css)
+        with open(u'index.xhtml', 'wb') as f:
+            f.write(index.encode('utf-8'))
+        with open(u'inline-styles.css', 'wb') as f:
+            f.write(css.encode('utf-8'))
         stream.seek(0)
         mi = get_metadata(stream, 'fb2')
         if not mi.title:
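Both hunks follow the same decode-at-the-boundary rule: open files in binary mode under a with block, decode immediately after reading, and encode immediately before writing, so everything in between handles only str. A minimal sketch, with read_text/write_text as illustrative helpers rather than calibre API:

    def read_text(path, encoding='utf-8'):
        with open(path, 'rb') as f:            # closed promptly, unlike open(...).read()
            return f.read().decode(encoding)   # bytes -> str on the way in

    def write_text(path, text, encoding='utf-8'):
        with open(path, 'wb') as f:
            f.write(text.encode(encoding))     # str -> bytes on the way out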

View File

@@ -79,7 +79,7 @@ class HTMLOutput(OutputFormatPlugin):
         from lxml import etree
         root = self.generate_toc(oeb_book, ref_url, output_dir)
-        return etree.tostring(root, pretty_print=True, encoding='utf-8',
+        return etree.tostring(root, pretty_print=True, encoding='unicode',
                 xml_declaration=False)

     def convert(self, oeb_book, output_path, input_plugin, opts, log):
@@ -161,14 +161,14 @@ class HTMLOutput(OutputFormatPlugin):
             # get & clean HTML <HEAD>-data
             head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
-            head_content = etree.tostring(head, pretty_print=True, encoding='utf-8')
+            head_content = etree.tostring(head, pretty_print=True, encoding='unicode')
             head_content = re.sub(r'\<\/?head.*\>', '', head_content)
             head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)
             head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>', head_content)

             # get & clean HTML <BODY>-data
             body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
-            ebook_content = etree.tostring(body, pretty_print=True, encoding='utf-8')
+            ebook_content = etree.tostring(body, pretty_print=True, encoding='unicode')
             ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
             ebook_content = re.sub(r'<(div|a|span)([^>]*)/>', r'<\1\2></\1>', ebook_content)
@@ -202,7 +202,7 @@ class HTMLOutput(OutputFormatPlugin):
             # write html to file
             with open(path, 'wb') as f:
-                f.write(t)
+                f.write(t.encode('utf-8'))
             item.unload_data_from_memory(memory=path)

         zfile = zipfile.ZipFile(output_path, "w")
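The key change here is lxml's tostring contract: encoding='utf-8' returns encoded bytes, while encoding='unicode' returns str, which is what the re.sub cleanup and the templating downstream expect on Python 3. A minimal sketch:

    from lxml import etree

    root = etree.fromstring('<p>hello</p>')
    as_bytes = etree.tostring(root, encoding='utf-8')   # serialized bytes
    as_text = etree.tostring(root, encoding='unicode')  # a str
    assert isinstance(as_bytes, bytes) and isinstance(as_text, str)
    # str patterns like r'\<\/?head.*\>' only match str input on Python 3,
    # so the markup must be text before the re.sub() cleanup runs.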

View File

@@ -41,7 +41,9 @@ class PMLInput(InputFormatPlugin):
         else:
             html_stream = html_path

-        ienc = pml_stream.encoding if pml_stream.encoding else 'cp1252'
+        ienc = getattr(pml_stream, 'encoding', None)
+        if ienc is None:
+            ienc = 'cp1252'
         if self.options.input_encoding:
             ienc = self.options.input_encoding
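getattr with a default tolerates streams that have no encoding attribute at all (plain binary file objects), where the old attribute access would raise AttributeError. A small sketch of the pattern, with guess_encoding as an illustrative helper:

    import io

    def guess_encoding(stream, default='cp1252'):
        # Binary file objects have no .encoding; text wrappers may carry one.
        # Missing or None both fall back to the default.
        enc = getattr(stream, 'encoding', None)
        return enc if enc else default

    assert guess_encoding(io.BytesIO(b'raw pml bytes')) == 'cp1252'
    text = io.TextIOWrapper(io.BytesIO(b'x'), encoding='utf-8')
    assert guess_encoding(text) == 'utf-8'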

View File

@@ -142,7 +142,7 @@ class SNBOutput(OutputFormatPlugin):
         for tocitem in oeb_book.toc:
             if tocitem.href.find('#') != -1:
-                item = string.split(tocitem.href, '#')
+                item = tocitem.href.split('#')
                 if len(item) != 2:
                     log.error('Error in TOC item: %s' % tocitem)
                 else:
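The string module's function versions of str methods were removed in Python 3; the method spelling works on both. For example:

    href = 'chapter1.html#section2'
    item = href.split('#')          # ['chapter1.html', 'section2'] on 2 and 3
    assert len(item) == 2
    # import string; string.split(href, '#')   # AttributeError on Python 3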

View File

@@ -138,7 +138,7 @@ class TXTInput(InputFormatPlugin):
                 block_to_single_line, separate_hard_scene_breaks)

         self.log = log
-        txt = ''
+        txt = b''
         log.debug('Reading text from file...')
         length = 0
         base_dir = getcwd()
@@ -151,7 +151,7 @@ class TXTInput(InputFormatPlugin):
             for x in walk('.'):
                 if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                     with open(x, 'rb') as tf:
-                        txt += tf.read() + '\n\n'
+                        txt += tf.read() + b'\n\n'
         else:
             if getattr(stream, 'name', None):
                 base_dir = os.path.dirname(stream.name)
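Because the files are opened in binary mode, the accumulator and separator must be bytes as well; mixing str and bytes raises TypeError on Python 3. A minimal sketch of the pattern:

    import os

    txt = b''                      # bytes accumulator to match the 'rb' reads
    for name in sorted(os.listdir('.')):
        if os.path.splitext(name)[1].lower() in ('.txt', '.text'):
            with open(name, 'rb') as tf:
                txt += tf.read() + b'\n\n'   # b'' + b'' is fine; '' + b'' is not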

View File

@@ -584,7 +584,7 @@ class HTMLPreProcessor(object):
             end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
             end_rules.append(
                 # Un wrap using punctuation
-                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),  # noqa
+                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\\\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),  # noqa
             )

         for rule in self.PREPROCESS + start_rules:
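The only change is \I becoming \\\\I inside the character class: in a non-raw string literal, \I is an invalid string escape that newer Pythons deprecate, while \\\\ unambiguously puts one escaped backslash into the pattern. A reduced sketch:

    import re

    # '\\\\' in a plain literal is two backslashes to Python, which the regex
    # engine reads as one escaped (literal) backslash inside the class.
    pat = re.compile('[a-z,:)\\\\IA]')
    assert pat.match('\\')   # the literal backslash matches
    assert pat.match('I')    # so do the plain letters
    assert pat.match('q')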

View File

@@ -391,7 +391,7 @@ class Mobi8Reader(object):
                 fi = self.get_file_info(pos)
                 if fi.filename is None:
                     raise ValueError('Index entry has invalid pos: %d'%pos)
-                idtag = self.get_id_tag(pos).decode(self.header.codec)
+                idtag = self.get_id_tag(pos)
                 href = '%s/%s'%(fi.type, fi.filename)
             else:
                 try:
@@ -403,7 +403,7 @@ class Mobi8Reader(object):
                     continue

             entry['href'] = href
-            entry['idtag'] = idtag
+            entry['idtag'] = idtag.decode(self.header.codec)

         for e in remove:
             index_entries.remove(e)
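The decode moves from the branch that produced the tag to the single point where it is stored, so every code path hands entry['idtag'] a str decoded with the book's declared codec. A sketch with illustrative stand-ins for the reader internals:

    def store_entry(get_id_tag, pos, codec):
        idtag = get_id_tag(pos)                 # raw bytes from the MOBI index
        return {'idtag': idtag.decode(codec)}   # decode once, at the use site

    entry = store_entry(lambda pos: b'filepos:0042', 7, 'utf-8')
    assert entry['idtag'] == 'filepos:0042'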

View File

@@ -605,7 +605,7 @@ class DirContainer(object):
         for root, dirs, files in os.walk(base):
             for fname in files:
                 fname = os.path.join(root, fname)
-                fname = fname.replace('\\', '/')
+                fname = fname.replace(b'\\', b'/')
                 if not isinstance(fname, unicode_type):
                     try:
                         fname = fname.decode(filesystem_encoding)

View File

@@ -24,7 +24,7 @@ class PdbHeaderReader(object):
     def identity(self):
         self.stream.seek(60)
         ident = self.stream.read(8)
-        return ident
+        return ident.decode('utf-8')

     def section_count(self):
         self.stream.seek(76)
@@ -67,8 +67,8 @@ class PdbHeaderReader(object):
 class PdbHeaderBuilder(object):

     def __init__(self, identity, title):
-        self.identity = identity.ljust(3, '\x00')[:8]
-        self.title = '%s\x00' % re.sub('[^-A-Za-z0-9 ]+', '_', title).ljust(31, '\x00')[:31].encode('ascii', 'replace')
+        self.identity = identity.ljust(3, '\x00')[:8].encode('utf-8')
+        self.title = b'%s\x00' % re.sub('[^-A-Za-z0-9 ]+', '_', title).ljust(31, '\x00')[:31].encode('ascii', 'replace')

     def build_header(self, section_lengths, out_stream):
         '''
@@ -85,4 +85,4 @@ class PdbHeaderBuilder(object):
         for id, record in enumerate(section_lengths):
             out_stream.write(struct.pack('>LBBBB', long_type(offset), 0, 0, 0, 0))
             offset += record
-        out_stream.write('\x00\x00')
+        out_stream.write(b'\x00\x00')
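A PDB header is a run of fixed-width binary fields, so the builder must produce bytes throughout: identifiers get encoded, the terminator becomes a bytes literal, and bytes %-formatting (available on Python 2 and again on 3.5+) keeps the NUL padding. A reduced sketch of the framing; the field values are illustrative:

    import io
    import struct

    out = io.BytesIO()
    identity = 'BOOKMOBI'.ljust(8, '\x00')[:8].encode('utf-8')  # 8-byte type/creator field
    out.write(identity)
    out.write(struct.pack('>H', 1))    # struct.pack always returns bytes
    out.write(b'\x00\x00')             # terminator must be bytes, not '\x00\x00'
    assert len(out.getvalue()) == 12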

View File

@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import division

 '''
 Writer content to palmdoc pdb file.
@@ -57,13 +58,13 @@ class Writer(FormatWriter):
         txt_length = len(txt)

         txt_records = []
-        for i in range(0, (len(txt) / MAX_RECORD_SIZE) + 1):
+        for i in range(0, (len(txt) // MAX_RECORD_SIZE) + 1):
             txt_records.append(txt[i * MAX_RECORD_SIZE: (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE])

         return txt_records, txt_length

     def _header_record(self, txt_length, record_count):
-        record = ''
+        record = b''

         record += struct.pack('>H', 2)   # [0:2], PalmDoc compression. (1 = No compression).
         record += struct.pack('>H', 0)   # [2:4], Always 0.
@@ -73,4 +74,3 @@ class Writer(FormatWriter):
         record += struct.pack('>L', 0)   # [12-16], Current reading position, as an offset into the uncompressed text.

         return record
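With true division, len(txt) / MAX_RECORD_SIZE is a float and range() rejects it; // keeps the record count integral on both Pythons, and the future import makes Python 2's / behave like Python 3's. A runnable sketch of the chunking:

    from __future__ import division   # Python 2: make / mean true division too

    MAX_RECORD_SIZE = 4096
    txt = b'x' * 10000

    records = [txt[i * MAX_RECORD_SIZE:(i + 1) * MAX_RECORD_SIZE]
               for i in range(len(txt) // MAX_RECORD_SIZE + 1)]  # // stays an int
    assert len(records) == 3
    assert b''.join(records) == txt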

View File

@@ -174,8 +174,8 @@ class PMLMLizer(object):
         return text

     def prepare_text(self, text):
-        # Replace empty paragraphs with \c pml codes used to denote emtpy lines.
-        text = re.sub(unicode_type(r'(?<=</p>)\s*<p[^>]*>[\xc2\xa0\s]*</p>'), '\\c\n\\c', text)
+        # Replace empty paragraphs with \c pml codes used to denote empty lines.
+        text = re.sub(unicode_type(r'(?<=</p>)\s*<p[^>]*>[\xc2\xa0\s]*</p>'), r'\\c\n\\c', text)
         return text

     def clean_text(self, text):
@@ -207,7 +207,7 @@ class PMLMLizer(object):
         text = re.sub('[ ]{2,}', ' ', text)

         # Condense excessive \c empty line sequences.
-        text = re.sub('(\\c\\s*\\c\\s*){2,}', '\\c \n\\c\n', text)
+        text = re.sub(r'(\\c\\s*\\c\\s*){2,}', r'\\c \n\\c\n', text)

         # Remove excessive newlines.
         text = re.sub('\n[ ]+\n', '\n\n', text)
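In a replacement template, a lone backslash before a letter ('\c' after Python unescaping) is an error on newer Pythons; the raw-string form hands re an escaped backslash, producing a literal \c in the output. A reduced sketch (the character class is simplified here):

    import re

    text = '<p>one</p> <p>\xa0</p>'
    # r'\\c\n\\c' reaches re.sub as: escaped backslash + c, newline, and again.
    out = re.sub(r'(?<=</p>)\s*<p[^>]*>[\xa0\s]*</p>', r'\\c\n\\c', text)
    assert out == '<p>one</p>\\c\n\\c'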

View File

@@ -562,7 +562,7 @@ class ParseRtf:
     def __make_temp_file(self,file):
         """Make a temporary file to parse"""
         write_file="rtf_write_file"
-        read_obj = file if hasattr(file, 'read') else open(file,'r')
+        read_obj = file if hasattr(file, 'read') else open(file,'rb')
         with open(write_file, 'wb') as write_obj:
             for line in read_obj:
                 write_obj.write(line)

View File

@@ -36,11 +36,11 @@ class FixLineEndings:
     def fix_endings(self):
         # read
-        with open(self.__file, 'r') as read_obj:
+        with open(self.__file, 'rb') as read_obj:
             input_file = read_obj.read()
         # calibre go from win and mac to unix
-        input_file = input_file.replace('\r\n', '\n')
-        input_file = input_file.replace('\r', '\n')
+        input_file = input_file.replace(b'\r\n', b'\n')
+        input_file = input_file.replace(b'\r', b'\n')
         # remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
         if self.__replace_illegals:
             input_file = clean_ascii_chars(input_file)
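As in the ParseRtf hunk above, the file is opened in 'rb' so Python 3 neither guesses a text encoding nor translates newlines; the RTF bytes arrive untouched and the replacements operate on bytes. A minimal sketch:

    def to_unix_endings(data):
        # bytes in, bytes out: Windows CRLF first, then bare old-Mac CR.
        data = data.replace(b'\r\n', b'\n')
        return data.replace(b'\r', b'\n')

    assert to_unix_endings(b'a\r\nb\rc\n') == b'a\nb\nc\n'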

View File

@@ -608,12 +608,10 @@ if another paragraph_def is found, the state changes to collect_tokens.
         # when determining uniqueness for a style, ignore these values, since
         # they don't tell us if the style is unique
         ignore_values = ['style-num', 'nest-level', 'in-table']
-        keys = self.__att_val_dict.keys()
-        keys.sort()
-        for key in keys:
-            if key in ignore_values:
+        for k, v in self.__att_val_dict.items():
+            if k in ignore_values:
                 continue
-            my_string += '%s:%s' % (key, self.__att_val_dict[key])
+            my_string += '%s:%s' % (k, v)
         if my_string in self.__style_num_strings:
             num = self.__style_num_strings.index(my_string)
             num += 1  # since indexing starts at zero, rather than 1
@@ -637,12 +635,9 @@ if another paragraph_def is found, the state changes to collect_tokens.
             the_value = self.__att_val_dict['tabs']
             # the_value = the_value[:-1]
             style_string += ('<%s>%s' % ('tabs', the_value))
-        keys = self.__att_val_dict.keys()
-        keys.sort()
-        for key in keys:
-            if key != 'name' and key !='style-num' and key != 'in-table'\
-                and key not in tabs_list:
-                style_string += ('<%s>%s' % (key, self.__att_val_dict[key]))
+        for k, v in self.__att_val_dict.items():
+            if k not in ['name', 'style-num', 'in-table'] + tabs_list:
+                style_string += ('<%s>%s' % (k, v))
         style_string += '\n'
         self.__body_style_strings.append(style_string)
@@ -690,11 +685,9 @@ if another paragraph_def is found, the state changes to collect_tokens.
             the_value = self.__att_val_dict['tabs']
             # the_value = the_value[:-1]
             self.__write_obj.write('<%s>%s' % ('tabs', the_value))
-        keys = self.__att_val_dict.keys()
-        keys.sort()
+        keys = sorted(self.__att_val_dict.keys())
         for key in keys:
-            if key != 'name' and key !='style-num' and key != 'in-table'\
-                and key not in tabs_list:
+            if key not in ['name', 'style-num', 'in-table'] + tabs_list:
                 self.__write_obj.write('<%s>%s' % (key, self.__att_val_dict[key]))
         self.__write_obj.write('\n')
         self.__write_obj.write(self.__start2_marker)
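On Python 3, dict.keys() returns a view, so the Python 2 idiom keys = d.keys(); keys.sort() fails with AttributeError; these hunks substitute sorted() where ordering matters and items() where only membership does. A Python 3 sketch of both replacements:

    d = {'b': 2, 'a': 1, 'style-num': 9}

    # keys = d.keys(); keys.sort()   # AttributeError: view has no .sort()
    keys = sorted(d.keys())          # explicit, deterministic ordering
    assert keys == ['a', 'b', 'style-num']

    # Where only membership matters, iterate the pairs directly:
    s = ''
    for k, v in d.items():
        if k not in ['style-num']:
            s += '%s:%s' % (k, v)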

View File

@@ -43,8 +43,8 @@ class ProcessTokens:
         self.__bug_handler = bug_handler

     def compile_expressions(self):
-        self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
-        self.__utf_exp = re.compile(r'(&.*?;)')
+        self.__num_exp = re.compile(br"([a-zA-Z]+)(.*)")
+        self.__utf_exp = re.compile(br'(&.*?;)')

     def initiate_token_dict(self):
         self.__return_code = 0
@@ -762,10 +762,10 @@ class ProcessTokens:
     def process_cw(self, token):
         """Change the value of the control word by determining what dictionary
         it belongs to"""
-        special = ['*', ':', '}', '{', '~', '_', '-', ';']
+        special = [b'*', b':', b'}', b'{', b'~', b'_', b'-', b';']
         # if token != "{" or token != "}":
         token = token[1:]  # strip off leading \
-        token = token.replace(" ", "")
+        token = token.replace(b" ", b"")
         # if not token: return
         only_alpha = token.isalpha()
         num = None
@@ -784,24 +784,24 @@ class ProcessTokens:
     def process_tokens(self):
         """Main method for handling other methods. """
         line_count = 0
-        with open(self.__file, 'r') as read_obj:
+        with open(self.__file, 'rb') as read_obj:
             with open(self.__write_to, 'wb') as write_obj:
                 for line in read_obj:
-                    token = line.replace("\n","")
+                    token = line.replace(b"\n",b"")
                     line_count += 1
-                    if line_count == 1 and token != '\\{':
+                    if line_count == 1 and token != b'\\{':
                         msg = '\nInvalid RTF: document doesn\'t start with {\n'
                         raise self.__exception_handler(msg)
-                    elif line_count == 2 and token[0:4] != '\\rtf':
+                    elif line_count == 2 and token[0:4] != b'\\rtf':
                         msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
                         raise self.__exception_handler(msg)

-                    the_index = token.find('\\ ')
+                    the_index = token.find(b'\\ ')
                     if token is not None and the_index > -1:
                         msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
                             % line_count
                         raise self.__exception_handler(msg)
-                    elif token[:1] == "\\":
+                    elif token[:1] == b"\\":
                         try:
                             token.decode('us-ascii')
                         except UnicodeError as msg:
@@ -816,10 +816,10 @@ class ProcessTokens:
                     for field in fields:
                         if not field:
                             continue
-                        if field[0:1] == '&':
-                            write_obj.write('tx<ut<__________<%s\n' % field)
+                        if field[0:1] == b'&':
+                            write_obj.write(b'tx<ut<__________<%s\n' % field)
                         else:
-                            write_obj.write('tx<nu<__________<%s\n' % field)
+                            write_obj.write(b'tx<nu<__________<%s\n' % field)

         if not line_count:
             msg = '\nInvalid RTF: file appears to be empty.\n'
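Since the RTF source is now read in binary mode, every literal compared against or substituted into it must be bytes, including the regex patterns (br'...'); on Python 3 a bytes pattern only matches bytes input. A minimal sketch:

    import re

    num_exp = re.compile(br'([a-zA-Z]+)(.*)')   # bytes pattern for bytes lines
    line = b'\\rtf1'
    m = num_exp.match(line[1:])                 # strip the leading backslash
    assert m.groups() == (b'rtf', b'1')
    # num_exp.match('\\rtf1')                   # TypeError: str on a bytes pattern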

View File

@@ -94,7 +94,7 @@ class Tokenize:
         uni_len = len(match_obj.group(0))
         if uni_char < 0:
             uni_char += 65536
-        uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace')
+        uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace').decode('ascii')
         self.__uc_char = self.__uc_value[-1]
         # there is only an unicode char
         if len(token)<= uni_len:
@@ -113,11 +113,11 @@ class Tokenize:
     def __sub_reg_split(self,input_file):
         input_file = self.__replace_spchar.mreplace(input_file)
         # this is for older RTF
-        input_file = self.__par_exp.sub('\n\\par \n', input_file)
-        input_file = self.__cwdigit_exp.sub("\\g<1>\n\\g<2>", input_file)
+        input_file = self.__par_exp.sub(r'\n\\par \n', input_file)
+        input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file)
         input_file = self.__cs_ast.sub(r"\g<1>", input_file)
-        input_file = self.__ms_hex_exp.sub("\\mshex0\\g<1> ", input_file)
-        input_file = self.__utf_ud.sub("\\{\\uc0 \\g<1>\\}", input_file)
+        input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
+        input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
         # remove \n in bin data
         input_file = self.__bin_exp.sub(lambda x:
             x.group().replace('\n', '') + '\n', input_file)
@@ -188,7 +188,7 @@ class Tokenize:
         # write
         with open(self.__write_to, 'wb') as write_obj:
-            write_obj.write('\n'.join(tokens))
+            write_obj.write('\n'.join(tokens).encode('utf-8'))
         # Move and copy
         copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
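Two patterns close out the port: raw strings make the group references (\g<1>) and literal backslashes in the replacement templates explicit, and the token stream is encoded exactly once, at the binary write. A minimal sketch; the pattern and token values are illustrative:

    import re

    # r"\g<1>\n\g<2>": group 1, a real newline, group 2, with no double-escaping.
    assert re.sub(r'(\\pard)(\d)', r"\g<1>\n\g<2>", '\\pard4') == '\\pard\n4'

    tokens = ['\\rtf1', '\\ansi', 'Hello']
    with open('tokens.txt', 'wb') as write_obj:             # binary sink
        write_obj.write('\n'.join(tokens).encode('utf-8'))  # encode at the boundary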