Global overhaul of rtf2xml : RTF fixes (2) -> first tokenize modifications (not completely working without preprocessing)

2025-07-09 03:04:10 -04:00 · 2010-08-09 00:05:51 +02:00 · 2010-08-09 00:05:51 +02:00 · adcad1cb60
commit adcad1cb60
parent 1f237c99bf 983ff06f35
7 changed files with 256 additions and 160 deletions
--- a/resources/images/mimetypes/odt.svg
+++ b/resources/images/mimetypes/odt.svg
@ -0,0 +1,63 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+<svg xmlns:svg="http://www.w3.org/2000/svg" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" version="1.0" width="128" height="128" id="svg2176">
+  <defs id="defs2178">
+    <linearGradient x1="406.065" y1="290.50299" x2="406.065" y2="276.29501" id="linearGradient4819" xlink:href="#linearGradient7431" gradientUnits="userSpaceOnUse" gradientTransform="matrix(4.80814, 0, 0, 2.87475, -1569.44, -758.786)" spreadMethod="pad"/>
+    <linearGradient x1="68.374298" y1="-410.099" x2="67.912201" y2="-478.508" id="linearGradient4817" xlink:href="#linearGradient11367" gradientUnits="userSpaceOnUse" gradientTransform="matrix(1.58223, 0, 0, -0.727268, 275.522, -213.417)" spreadMethod="pad"/>
+    <linearGradient x1="436.48801" y1="-278.91299" x2="436.51199" y2="-299.88699" id="linearGradient4815" xlink:href="#linearGradient11377" gradientUnits="userSpaceOnUse" gradientTransform="matrix(4.59378, 0, 0, 0.359494, -1920.95, 434.897)" spreadMethod="pad"/>
+    <linearGradient id="linearGradient11377">
+      <stop id="stop11379" style="stop-color: rgb(255, 255, 255); stop-opacity: 0;" offset="0"/>
+      <stop id="stop11385" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.575472;" offset="1"/>
+    </linearGradient>
+    <linearGradient id="linearGradient11367">
+      <stop id="stop11369" style="stop-color: rgb(0, 0, 0); stop-opacity: 0;" offset="0"/>
+      <stop id="stop11371" style="stop-color: rgb(0, 0, 0); stop-opacity: 0.254717;" offset="0.72131097"/>
+      <stop id="stop18428" style="stop-color: rgb(0, 0, 0); stop-opacity: 0.12549;" offset="0.91000003"/>
+      <stop id="stop11375" style="stop-color: rgb(0, 0, 0); stop-opacity: 0;" offset="1"/>
+    </linearGradient>
+    <linearGradient id="linearGradient7431">
+      <stop id="stop7433" style="stop-color: rgb(255, 255, 255); stop-opacity: 0;" offset="0"/>
+      <stop id="stop7439" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.858491;" offset="0.72000003"/>
+      <stop id="stop8224" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.707547;" offset="0.89999998"/>
+      <stop id="stop7435" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.320755;" offset="1"/>
+    </linearGradient>
+    <filter id="filter3659">
+      <feGaussianBlur inkscape:collect="always" stdDeviation="0.25192676" id="feGaussianBlur3661"/>
+    </filter>
+  </defs>
+  <g id="layer1">
+    <g transform="matrix(1.1475, 0, 0, 1.1475, -368.661, -33.5075)" id="g4500">
+      <path d="M 326.964,34.4298 L 423.481,34.4298 C 437.856,66.0223 403.767,104.222 423.481,135.402 L 326.964,135.402 L 326.964,34.4298 z" id="path21978" style="fill: rgb(95, 123, 141); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
+      <path d="M 326.964,34.4298 L 353.143,34.4298 C 367.518,66.0223 333.429,104.222 353.143,135.402 L 326.964,135.402 L 326.964,34.4298 z" id="path21980" style="fill: rgb(29, 70, 89); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
+      <rect width="101.089" height="7.9108801" x="34.429798" y="326.96399" transform="matrix(0, 1, 1, 0, 0, 0)" id="rect21982" style="fill: url(&quot;#linearGradient4815&quot;) rgb(0, 0, 0); fill-opacity: 1; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
+      <g transform="matrix(3.17412, 0, 0, 3.17412, 1038.99, -354.131)" id="g21984">
+        <path d="M -218.445,122.416 C -213.917,132.369 -224.656,144.405 -218.445,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -218.445,122.416 z" id="path21986" style="fill: rgb(0, 0, 0); fill-opacity: 0.0798122; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
+        <path d="M -217.955,122.416 C -213.426,132.369 -224.166,144.405 -217.955,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -217.955,122.416 z" id="path21988" style="fill: rgb(0, 0, 0); fill-opacity: 0.131455; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
+        <path d="M -217.403,122.416 C -212.875,132.369 -223.614,144.405 -217.403,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -217.403,122.416 z" id="path21990" style="fill: rgb(0, 0, 0); fill-opacity: 0.197183; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
+        <path d="M -216.852,122.416 C -212.323,132.369 -223.063,144.405 -216.852,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -216.852,122.416 z" id="path21992" style="fill: rgb(0, 0, 0); fill-opacity: 0.267606; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
+      </g>
+      <path d="M 326.964,135.402 L 422.488,135.402 C 412.274,118.171 416.819,101.345 421.306,83.5374 L 326.964,83.2034 L 326.964,135.402 z" id="path21994" style="fill: url(&quot;#linearGradient4817&quot;) rgb(0, 0, 0); fill-opacity: 1; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
+      <g transform="matrix(2.41517, 0, 0, 2.41517, 876.456, -197.189)" id="g21996">
+        <path d="M 602.125,190.59375 C 599.45874,190.67075 596.74504,191.16798 594.53125,192.78125 C 591.5146,192.34561 588.3664,192.55749 585.5,193.6875 C 583.62824,194.43267 582.15635,195.77855 580.96875,197.34375 C 580.95544,197.36301 580.94492,197.38405 580.9375,197.40625 C 580.92091,197.43509 580.91029,197.46697 580.90625,197.5 C 580.91029,197.53303 580.92091,197.56491 580.9375,197.59375 C 580.94492,197.61595 580.95544,197.63699 580.96875,197.65625 C 580.97822,197.66757 580.98868,197.67803 581,197.6875 C 581.02605,197.71524 581.05813,197.73662 581.09375,197.75 C 581.12472,197.75595 581.15653,197.75595 581.1875,197.75 C 581.20825,197.75263 581.22925,197.75263 581.25,197.75 C 584.80749,196.49944 588.39295,195.15225 592.15625,195.5 C 593.28385,195.58867 594.35616,196.00271 595.46875,196.375 C 595.50974,196.38565 595.55276,196.38565 595.59375,196.375 C 595.62678,196.37096 595.65866,196.36034 595.6875,196.34375 C 595.7097,196.33633 595.73074,196.32581 595.75,196.3125 C 598.71379,193.45164 603.00891,192.72955 606.96875,191.90625 C 606.98007,191.89678 606.99053,191.88632 607,191.875 C 607.19563,191.80037 607.32956,191.73576 607.4375,191.625 C 607.49147,191.56962 607.55414,191.50784 607.5625,191.40625 C 607.57086,191.30466 607.51945,191.21518 607.46875,191.15625 C 607.36735,191.03839 607.25573,190.98239 607.125,190.9375 C 606.99427,190.89261 606.8215,190.87546 606.65625,190.84375 C 605.99526,190.71692 605.12704,190.6454 604.8125,190.625 C 604.80209,190.62434 604.79166,190.62434 604.78125,190.625 C 603.91011,190.58739 603.02603,190.56773 602.125,190.59375 z" transform="translate(-806.724, -92.8004)" id="path21998" style="fill: rgb(0, 0, 0); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1pt; stroke-linecap: butt; stroke-linejoin: miter; stroke-opacity: 1;"/>
+        <path d="M -224.344,103.835 C -219.295,101.9 -214.705,101.331 -211.263,102.86 C -208.45,100.119 -202.237,98.6242 -200.227,98.6199 C -207.528,97.8352 -210.552,99.4967 -212,100.582 C -216.698,100.015 -221.096,100.522 -224.344,103.834" id="path22000" style="fill: rgb(255, 255, 255); fill-opacity: 1; stroke: none;"/>
+      </g>
+      <g transform="matrix(2.41517, 0, 0, 2.41517, 876.456, -197.189)" id="g22002">
+        <path d="M 596.25,27.53125 C 587.9033,27.701471 579.93436,30.011449 573.84375,35.03125 C 565.49276,31.223728 554.44432,30.751141 544.375,32.28125 C 538.97209,33.102263 533.8987,34.480363 529.6875,36.34375 C 525.4884,38.201779 521.99675,40.484175 520.1875,43.8125 C 519.57732,44.883163 519.7128,46.206199 520.46875,47.15625 C 521.22471,48.106278 522.5049,48.470375 523.65625,48.125 C 544.63433,42.131263 561.86554,43.038041 573.6875,52.875 C 574.80806,53.806374 576.46471,53.801852 577.5625,52.84375 C 587.80668,43.812696 604.05857,37.910216 621.5625,38.875 C 622.98852,38.943575 624.2874,37.997938 624.625,36.625 C 624.96264,35.252044 624.25302,33.783013 622.96875,33.1875 C 614.73544,29.461107 605.28688,27.346954 596.25,27.53125 z" transform="matrix(0.292292, -0.0677077, 0.0677077, 0.292292, -381.543, 134.276)" id="path22004" style="fill: rgb(0, 0, 0); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1pt; stroke-linecap: butt; stroke-linejoin: miter; stroke-opacity: 1;"/>
+        <path d="M -225.696,112.15 C -219.345,108.564 -214.201,107.915 -209.842,110.097 C -206.945,105.454 -199.625,102.766 -197.16,102.691 C -204.796,101.053 -210.086,104.587 -211.05,106.575 C -215.328,105.394 -224.104,108.305 -225.696,112.149" id="path22006" style="fill: rgb(255, 255, 255); fill-opacity: 1; stroke: none;"/>
+      </g>
+      <g transform="matrix(1.14159, 0, 0, 1.14159, 265.142, -259.674)" id="g22010">
+        <path d="M 134.221,257.626 C 146.813,285.3 116.952,318.766 134.221,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 134.221,257.626 z" id="path22012" style="fill: rgb(0, 0, 0); fill-opacity: 0.0798122; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
+        <path d="M 135.222,257.626 C 147.814,285.3 117.953,318.766 135.222,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 135.222,257.626 z" id="path22014" style="fill: rgb(0, 0, 0); fill-opacity: 0.131455; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
+        <path d="M 136.393,257.626 C 148.985,285.3 119.124,318.766 136.393,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 136.393,257.626 z" id="path22016" style="fill: rgb(0, 0, 0); fill-opacity: 0.197183; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
+        <path d="M 137.564,257.626 C 150.156,285.3 120.295,318.766 137.564,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 137.564,257.626 z" id="path22018" style="fill: rgb(0, 0, 0); fill-opacity: 0.267606; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
+        <path d="M 133.134,257.626 C 145.726,285.3 115.865,318.766 133.134,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 133.134,257.626 z" id="path22020" style="fill: rgb(0, 0, 0); fill-opacity: 0.0798122; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
+      </g>
+      <g transform="matrix(1.14159, 0, 0, 1.14159, -389.722, -484.947)" id="g22103">
+        <path d="M 653.161,498.44 L 693.751,498.44" id="path21604" style="fill: rgb(96, 123, 142); fill-opacity: 1; fill-rule: nonzero; stroke: rgb(255, 254, 255); stroke-width: 8; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-dashoffset: 0pt; stroke-opacity: 0.698039;"/>
+        <path d="M 653.161,515.294 L 683.452,515.294" id="path21606" style="fill: rgb(96, 123, 142); fill-opacity: 1; fill-rule: nonzero; stroke: rgb(255, 254, 255); stroke-width: 8; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-dashoffset: 0pt; stroke-opacity: 0.698039;"/>
+        <path d="M 653.161,532.46 L 693.751,532.46" id="path22101" style="fill: rgb(96, 123, 142); fill-opacity: 1; fill-rule: nonzero; stroke: rgb(255, 254, 255); stroke-width: 8; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-dashoffset: 0pt; stroke-opacity: 0.698039;"/>
+      </g>
+      <path d="M 326.964,34.4298 L 423.285,34.4298 C 433.146,54.4709 420.531,82.4058 417.826,102.327 L 326.964,102.327 L 326.964,34.4298 z" id="path22008" style="fill: url(&quot;#linearGradient4819&quot;) rgb(0, 0, 0); fill-opacity: 1; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
+    </g>
+  </g>
+</svg>
--- a/setup.py
+++ b/setup.py
@ -1,99 +0,0 @@
-#!/usr/bin/env python
-# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
-from __future__ import with_statement
-
-__license__   = 'GPL v3'
-__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
-import sys, os, optparse
-
-sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
-
-import setup.commands as commands
-from setup import prints, get_warnings
-
-def check_version_info():
-    vi = sys.version_info
-    if vi[0] == 2 and vi[1] > 5:
-        return None
-    return 'calibre requires python >= 2.6'
-
-def option_parser():
-    parser = optparse.OptionParser()
-    parser.add_option('-c', '--clean', default=False, action='store_true',
-            help=('Instead of running the command delete all files generated '
-                'by the command'))
-    parser.add_option('--clean-backups', default=False, action='store_true',
-            help='Delete all backup files from the source tree')
-    parser.add_option('--clean-all', default=False, action='store_true',
-            help='Delete all machine generated files from the source tree')
-    return parser
-
-def clean_backups():
-    for root, _, files in os.walk('.'):
-        for name in files:
-            for t in ('.pyc', '.pyo', '~', '.swp', '.swo'):
-                if name.endswith(t):
-                    os.remove(os.path.join(root, name))
-
-
-def main(args=sys.argv):
-    if len(args) == 1 or args[1] in ('-h', '--help'):
-        print 'Usage: python', args[0], 'command', '[options]'
-        print '\nWhere command is one of:'
-        print
-        for x in sorted(commands.__all__):
-            print '%-20s -'%x,
-            c = getattr(commands, x)
-            desc = getattr(c, 'short_description', c.description)
-            print desc
-
-        print '\nTo get help on a particular command, run:'
-        print '\tpython', args[0], 'command -h'
-        return 1
-
-    command = args[1]
-    if command not in commands.__all__:
-        print command, 'is not a recognized command.'
-        print 'Valid commands:', ', '.join(commands.__all__)
-        return 1
-
-    command = getattr(commands, command)
-
-    parser = option_parser()
-    command.add_all_options(parser)
-    parser.set_usage('Usage: python setup.py %s [options]\n\n'%args[1]+\
-            command.description)
-
-    opts, args = parser.parse_args(args)
-
-    if opts.clean_backups:
-        clean_backups()
-
-    if opts.clean:
-        prints('Cleaning', args[1])
-        command.clean()
-        return 0
-
-    if opts.clean_all:
-        for cmd in commands.__all__:
-            prints('Cleaning', cmd)
-            getattr(commands, cmd).clean()
-        return 0
-
-    command.run_all(opts)
-
-    warnings = get_warnings()
-    if warnings:
-        print
-        prints('There were', len(warnings), 'warning(s):')
-        print
-        for args, kwargs in warnings:
-            prints('*', *args, **kwargs)
-            print
-
-    return 0
-
-if __name__ == '__main__':
-    sys.exit(main())
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@ -50,7 +50,7 @@ class RTFInput(InputFormatPlugin):
        parser = ParseRtf(
            in_file    = stream,
            out_file   = ofile,
-            #deb_dir = 'I:\\Calibre\\rtfdebug',
+            deb_dir = 'I:\\Calibre\\rtfdebug',
            # Convert symbol fonts to unicode equivalents. Default
            # is 1
            convert_symbol = 1,
@ -187,16 +187,17 @@ class RTFInput(InputFormatPlugin):
        self.log = log
        self.log('Converting RTF to XML...')
        #Name of the preprocesssed RTF file
-        fname = self.preprocess(stream.name)
+        #fname = self.preprocess(stream.name)
+        fname = stream.name
        try:
            xml = self.generate_xml(fname)
        except RtfInvalidCodeException, e:
            raise ValueError(_('This RTF file has a feature calibre does not '
            'support. Convert it to HTML first and then try it.\n%s')%e)
        
-        '''dataxml = open('dataxml.xml', 'w')
+        dataxml = open('dataxml.xml', 'w')
        dataxml.write(xml)
-        dataxml.close'''
+        dataxml.close
        
        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
        if d:
--- a/src/calibre/ebooks/rtf/preprocess.py
+++ b/src/calibre/ebooks/rtf/preprocess.py
@ -228,8 +228,9 @@ class RtfTokenizer():
    def tokenize(self):
        i = 0
        lastDataStart = -1
+        #parse the data character by character
        while i < len(self.rtfData):
-
+            #if this starts a group
            if isChar(self.rtfData[i], '{'):
                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
@ -237,7 +238,7 @@ class RtfTokenizer():
                self.tokens.append(tokenDelimitatorStart())
                i = i + 1
                continue
-
+            #if this ends a group
            if isChar(self.rtfData[i], '}'):
                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
@ -245,7 +246,7 @@ class RtfTokenizer():
                self.tokens.append(tokenDelimitatorEnd())
                i = i + 1
                continue
-
+            #copy if there is a control character
            if isChar(self.rtfData[i], '\\'):
                if i + 1 >= len(self.rtfData):
                    raise Exception('Error: Control character found at the end of the document.')
@ -254,6 +255,7 @@ class RtfTokenizer():
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1

+                # the token starts here
                tokenStart = i
                i = i + 1

--- a/src/calibre/ebooks/rtf2xml/line_endings.py
+++ b/src/calibre/ebooks/rtf2xml/line_endings.py
@ -32,7 +32,7 @@ class FixLineEndings:
        self.__write_to = tempfile.mktemp()
        self.__replace_illegals = replace_illegals
    def fix_endings(self):
-        illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
+        illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
        # always check since I have to get rid of illegal characters
        #read
        read_obj = open(self.__file, 'r')
--- a/src/calibre/ebooks/rtf2xml/tokenize.py
+++ b/src/calibre/ebooks/rtf2xml/tokenize.py
@ -16,7 +16,10 @@
 #                                                                       #
 #########################################################################
 import os, re, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+from calibre.utils.mreplace import MReplace
+
 class Tokenize:
    """Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
    def __init__(self,
@ -28,20 +31,162 @@ class Tokenize:
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
-        self.__special_tokens = [ '_', '~', "'", '{', '}' ]
        self.__write_to = tempfile.mktemp()
+        self.__compile_expressions()
+        #variables
+        self.__uc_char = 0
+        self.__uc_bin = False
+        self.__uc_value = [1]
+        
    def __from_ms_to_utf8(self,match_obj):
        uni_char = int(match_obj.group(1))
        if uni_char < 0:
            uni_char +=  65536
        return   '&#x' + str('%X' % uni_char) + ';'
-    def __neg_unicode_func(self, match_obj):
-        neg_uni_char = int(match_obj.group(1)) * -1
-        # sys.stderr.write(str( neg_uni_char))
-        uni_char = neg_uni_char + 65536
-        return   '&#x' + str('%X' % uni_char) + ';'
-    def __sub_line_reg(self,line):
-        line = line.replace("\\\\", "\\backslash ")
+        
+    def __reini_utf8_counters(self):
+        self.__uc_char = 0
+        self.__uc_bin = False
+
+    def __unicode_process(self, token):
+        #change scope in
+        if token == '\{':
+            self.__uc_value.append(self.__uc_value[-1])
+            #basic error handling
+            self.__reini_utf8_counters()
+            return token
+        #change scope out: evaluate dict and rebuild
+        elif token == '\}':
+            #self.__uc_value.pop()
+            self.__reini_utf8_counters()
+            return token
+        #add a uc control
+        elif token[:3] == '\uc':
+            self.__uc_value[-1] = int(token[3:])
+            self.__reini_utf8_counters()
+            return token
+        #handle uc skippable char
+        elif self.__uc_char:
+            #if token[:1] == "\" and token[:1] == "\"
+            pass
+        #go for real \u token
+        match_obj = self.__utf_exp.match(token)
+        if match_obj is not None:
+            #get value and handle negative case
+            uni_char = int(match_obj.group(1))
+            uni_len = len(match_obj.group(1)) + 2
+            if uni_char < 0:
+                uni_char += 65536
+            uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
+            #if not uc0
+            if self.__uc_value[-1]:
+                self.__uc_char = self.__uc_value[-1]
+            #there is only a unicode char
+            if len(token)<= uni_len:
+                return uni_char
+            #a unicode char and something else
+            #must be after as it is split on \
+            elif not self.__uc_value[-1]:
+                print('not only token uc0 token: ' + uni_char + token[uni_len:])
+                return uni_char + token[uni_len:]
+            #if not uc0 and chars
+            else:
+                for i in xrange(uni_len, len(token)):
+                    if token[i] == " ":
+                        continue
+                    elif self.__uc_char > 0:
+                        self.__uc_char -= 1
+                    else:
+                        return uni_char + token[i:]
+            #print('uc: ' + str(self.__uc_value) + 'uni: ' + str(uni_char) + 'token: ' + token)
+        #default
+        return token
+    
+    def __sub_reg_split(self,input_file):
+        input_file = self.__replace_spchar.mreplace(input_file)
+        #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
+        # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
+        # this is for older RTF
+        #line = re.sub(self.__par_exp, '\\par ', line)
+        input_file = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", input_file)
+        #split
+        tokens = re.split(self.__splitexp, input_file)
+        #remove empty tokens and \n
+        return filter(lambda x: len(x) > 0 and x != '\n', tokens)
+        #return filter(lambda x: len(x) > 0, \
+            #(self.__remove_line.sub('', x) for x in tokens))
+        
+        
+    def __compile_expressions(self):
+        SIMPLE_RPL = {
+            "\\\\": "\\backslash ",
+            "\\~": "\\~ ",
+            "\\;": "\\; ",
+            "&": "&amp;",
+            "<": "&lt;",
+            ">": "&gt;",
+            "\\~": "\\~ ",
+            "\\_": "\\_ ",
+            "\\:": "\\: ",
+            "\\-": "\\- ",
+            # turn into a generic token to eliminate special
+            # cases and make processing easier
+            "\\{": "\\ob ",
+            # turn into a generic token to eliminate special
+            # cases and make processing easier
+            "\\}": "\\cb ",
+            # put a backslash in front of to eliminate special cases and
+            # make processing easier
+            "{": "\\{",
+            # put a backslash in front of to eliminate special cases and
+            # make processing easier
+            "}": "\\}",
+            # this is for older RTF
+            r'\\$': '\\par ',
+            }
+        self.__replace_spchar = MReplace(SIMPLE_RPL)
+        self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
+        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}") #modify this
+        #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
+        #add \n in split for whole file reading
+        #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
+        #why keep backslash whereas \is replaced before?
+        self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
+        #self.__par_exp = re.compile(r'\\$')
+        #self.__remove_line = re.compile(r'\n+')
+        #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
+        ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
+        
+    def tokenize(self):
+        """Main class for handling other methods. Reads the file \
+        , uses method self.sub_reg to make basic substitutions,\
+        and process tokens by itself"""
+        #read
+        read_obj = open(self.__file, 'r')
+        input_file = read_obj.read()
+        read_obj.close()
+        
+        #process simple replacements and split giving us a correct list
+        #remove '' and \n in the process
+        tokens = self.__sub_reg_split(input_file)
+        #correct unicode
+        #tokens = map(self.__unicode_process, tokens)
+        #remove empty items created by removing \uc
+        #tokens = filter(lambda x: len(x) > 0, tokens)
+        
+        #write
+        write_obj = open(self.__write_to, 'wb')
+        write_obj.write('\n'.join(tokens))
+        write_obj.close()
+        #Move and copy
+        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "tokenize.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
+        
+        #self.__special_tokens = [ '_', '~', "'", '{', '}' ]
+        '''line = line.replace("\\\\", "\\backslash ")
        line = line.replace("\\~", "\\~ ")
        line = line.replace("\\;", "\\; ")
        line = line.replace("&", "&amp;")
@ -63,54 +208,37 @@ class Tokenize:
        # put a backslash in front of to eliminate special cases and
        # make processing easier
        line = line.replace("}", "\\}")
-        line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
-        # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
-        line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line)
-        ##line = line.replace("\\backslash", "\\\\")
-        # this is for older RTF
-        line = re.sub(self.__par_exp, '\\par ', line)
-        return line
-    def __compile_expressions(self):
-        self.__ms_hex_exp = re.compile(r"\\\'(..)")
-        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}")
-        self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
-        self.__par_exp = re.compile(r'\\$')
-        self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
-        ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
-    def __create_tokens(self):
-        self.__compile_expressions()
-        read_obj = open(self.__file, 'r')
-        write_obj = open(self.__write_to, 'wb')
+        
        line_to_read = "dummy"
        while line_to_read:
            line_to_read = read_obj.readline()
            line = line_to_read
            line = line.replace("\n", "")
-            line =  self.__sub_line_reg(line)
-            tokens = re.split(self.__splitexp, line)
-            ##print tokens
-            for token in tokens:
-                if token != "":
+        '''
+        '''if token != "":
+                write_obj.write(token + "\n")
+                
+                match_obj = re.search(self.__mixed_exp, token)
+                if match_obj != None:
+                    first = match_obj.group(1)
+                    second = match_obj.group(2)
+                    write_obj.write(first + "\n")
+                    write_obj.write(second + "\n")
+                else:
                    write_obj.write(token + "\n")
-                    """
-                    match_obj = re.search(self.__mixed_exp, token)
-                    if match_obj != None:
-                        first = match_obj.group(1)
-                        second = match_obj.group(2)
-                        write_obj.write(first + "\n")
-                        write_obj.write(second + "\n")
-                    else:
-                        write_obj.write(token + "\n")
-                    """
-        read_obj.close()
-        write_obj.close()
-    def tokenize(self):
-        """Main class for handling other methods. Reads in one line \
-        at a time, uses method self.sub_line to make basic substitutions,\
-        uses ? to process tokens"""
-        self.__create_tokens()
-        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
-        if self.__copy:
-            copy_obj.copy_file(self.__write_to, "tokenize.data")
-        copy_obj.rename(self.__write_to, self.__file)
-        os.remove(self.__write_to)
+            '''
+        '''
+        for line in read_obj:
+            #make all replacements
+            line = self.__sub_reg(line)
+            #split token and remove empty tokens
+            tokens = filter(lambda x: len(x) > 0,
+                re.split(self.__splitexp, line))
+            if tokens:
+                write_obj.write('\n'.join(tokens)+'\n')'''
+                
+        '''def __neg_unicode_func(self, match_obj):
+        neg_uni_char = int(match_obj.group(1)) * -1
+        # sys.stderr.write(str( neg_uni_char))
+        uni_char = neg_uni_char + 65536
+        return   '&#x' + str('%X' % uni_char) + ';'''
--- a/src/calibre/gui2/init.py
+++ b/src/calibre/gui2/init.py
@ -329,6 +329,7 @@ class FileIconProvider(QFileIconProvider):
             'epub'    : 'epub',
             'fb2'     : 'fb2',
             'rtf'     : 'rtf',
+             'odt'     : 'odt',
             }

    def __init__(self):