Global overhaul of rtf2xml : RTF fixes (2) -> first tokenize modifications (not completely working without preprocessing)

This commit is contained in:
Sengian 2010-08-09 00:05:51 +02:00
commit adcad1cb60
7 changed files with 256 additions and 160 deletions

View File

@ -0,0 +1,63 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg xmlns:svg="http://www.w3.org/2000/svg" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" version="1.0" width="128" height="128" id="svg2176">
<defs id="defs2178">
<linearGradient x1="406.065" y1="290.50299" x2="406.065" y2="276.29501" id="linearGradient4819" xlink:href="#linearGradient7431" gradientUnits="userSpaceOnUse" gradientTransform="matrix(4.80814, 0, 0, 2.87475, -1569.44, -758.786)" spreadMethod="pad"/>
<linearGradient x1="68.374298" y1="-410.099" x2="67.912201" y2="-478.508" id="linearGradient4817" xlink:href="#linearGradient11367" gradientUnits="userSpaceOnUse" gradientTransform="matrix(1.58223, 0, 0, -0.727268, 275.522, -213.417)" spreadMethod="pad"/>
<linearGradient x1="436.48801" y1="-278.91299" x2="436.51199" y2="-299.88699" id="linearGradient4815" xlink:href="#linearGradient11377" gradientUnits="userSpaceOnUse" gradientTransform="matrix(4.59378, 0, 0, 0.359494, -1920.95, 434.897)" spreadMethod="pad"/>
<linearGradient id="linearGradient11377">
<stop id="stop11379" style="stop-color: rgb(255, 255, 255); stop-opacity: 0;" offset="0"/>
<stop id="stop11385" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.575472;" offset="1"/>
</linearGradient>
<linearGradient id="linearGradient11367">
<stop id="stop11369" style="stop-color: rgb(0, 0, 0); stop-opacity: 0;" offset="0"/>
<stop id="stop11371" style="stop-color: rgb(0, 0, 0); stop-opacity: 0.254717;" offset="0.72131097"/>
<stop id="stop18428" style="stop-color: rgb(0, 0, 0); stop-opacity: 0.12549;" offset="0.91000003"/>
<stop id="stop11375" style="stop-color: rgb(0, 0, 0); stop-opacity: 0;" offset="1"/>
</linearGradient>
<linearGradient id="linearGradient7431">
<stop id="stop7433" style="stop-color: rgb(255, 255, 255); stop-opacity: 0;" offset="0"/>
<stop id="stop7439" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.858491;" offset="0.72000003"/>
<stop id="stop8224" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.707547;" offset="0.89999998"/>
<stop id="stop7435" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.320755;" offset="1"/>
</linearGradient>
<filter id="filter3659">
<feGaussianBlur inkscape:collect="always" stdDeviation="0.25192676" id="feGaussianBlur3661"/>
</filter>
</defs>
<g id="layer1">
<g transform="matrix(1.1475, 0, 0, 1.1475, -368.661, -33.5075)" id="g4500">
<path d="M 326.964,34.4298 L 423.481,34.4298 C 437.856,66.0223 403.767,104.222 423.481,135.402 L 326.964,135.402 L 326.964,34.4298 z" id="path21978" style="fill: rgb(95, 123, 141); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M 326.964,34.4298 L 353.143,34.4298 C 367.518,66.0223 333.429,104.222 353.143,135.402 L 326.964,135.402 L 326.964,34.4298 z" id="path21980" style="fill: rgb(29, 70, 89); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<rect width="101.089" height="7.9108801" x="34.429798" y="326.96399" transform="matrix(0, 1, 1, 0, 0, 0)" id="rect21982" style="fill: url(&quot;#linearGradient4815&quot;) rgb(0, 0, 0); fill-opacity: 1; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<g transform="matrix(3.17412, 0, 0, 3.17412, 1038.99, -354.131)" id="g21984">
<path d="M -218.445,122.416 C -213.917,132.369 -224.656,144.405 -218.445,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -218.445,122.416 z" id="path21986" style="fill: rgb(0, 0, 0); fill-opacity: 0.0798122; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M -217.955,122.416 C -213.426,132.369 -224.166,144.405 -217.955,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -217.955,122.416 z" id="path21988" style="fill: rgb(0, 0, 0); fill-opacity: 0.131455; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M -217.403,122.416 C -212.875,132.369 -223.614,144.405 -217.403,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -217.403,122.416 z" id="path21990" style="fill: rgb(0, 0, 0); fill-opacity: 0.197183; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M -216.852,122.416 C -212.323,132.369 -223.063,144.405 -216.852,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -216.852,122.416 z" id="path21992" style="fill: rgb(0, 0, 0); fill-opacity: 0.267606; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
</g>
<path d="M 326.964,135.402 L 422.488,135.402 C 412.274,118.171 416.819,101.345 421.306,83.5374 L 326.964,83.2034 L 326.964,135.402 z" id="path21994" style="fill: url(&quot;#linearGradient4817&quot;) rgb(0, 0, 0); fill-opacity: 1; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<g transform="matrix(2.41517, 0, 0, 2.41517, 876.456, -197.189)" id="g21996">
<path d="M 602.125,190.59375 C 599.45874,190.67075 596.74504,191.16798 594.53125,192.78125 C 591.5146,192.34561 588.3664,192.55749 585.5,193.6875 C 583.62824,194.43267 582.15635,195.77855 580.96875,197.34375 C 580.95544,197.36301 580.94492,197.38405 580.9375,197.40625 C 580.92091,197.43509 580.91029,197.46697 580.90625,197.5 C 580.91029,197.53303 580.92091,197.56491 580.9375,197.59375 C 580.94492,197.61595 580.95544,197.63699 580.96875,197.65625 C 580.97822,197.66757 580.98868,197.67803 581,197.6875 C 581.02605,197.71524 581.05813,197.73662 581.09375,197.75 C 581.12472,197.75595 581.15653,197.75595 581.1875,197.75 C 581.20825,197.75263 581.22925,197.75263 581.25,197.75 C 584.80749,196.49944 588.39295,195.15225 592.15625,195.5 C 593.28385,195.58867 594.35616,196.00271 595.46875,196.375 C 595.50974,196.38565 595.55276,196.38565 595.59375,196.375 C 595.62678,196.37096 595.65866,196.36034 595.6875,196.34375 C 595.7097,196.33633 595.73074,196.32581 595.75,196.3125 C 598.71379,193.45164 603.00891,192.72955 606.96875,191.90625 C 606.98007,191.89678 606.99053,191.88632 607,191.875 C 607.19563,191.80037 607.32956,191.73576 607.4375,191.625 C 607.49147,191.56962 607.55414,191.50784 607.5625,191.40625 C 607.57086,191.30466 607.51945,191.21518 607.46875,191.15625 C 607.36735,191.03839 607.25573,190.98239 607.125,190.9375 C 606.99427,190.89261 606.8215,190.87546 606.65625,190.84375 C 605.99526,190.71692 605.12704,190.6454 604.8125,190.625 C 604.80209,190.62434 604.79166,190.62434 604.78125,190.625 C 603.91011,190.58739 603.02603,190.56773 602.125,190.59375 z" transform="translate(-806.724, -92.8004)" id="path21998" style="fill: rgb(0, 0, 0); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1pt; stroke-linecap: butt; stroke-linejoin: miter; stroke-opacity: 1;"/>
<path d="M -224.344,103.835 C -219.295,101.9 -214.705,101.331 -211.263,102.86 C -208.45,100.119 -202.237,98.6242 -200.227,98.6199 C -207.528,97.8352 -210.552,99.4967 -212,100.582 C -216.698,100.015 -221.096,100.522 -224.344,103.834" id="path22000" style="fill: rgb(255, 255, 255); fill-opacity: 1; stroke: none;"/>
</g>
<g transform="matrix(2.41517, 0, 0, 2.41517, 876.456, -197.189)" id="g22002">
<path d="M 596.25,27.53125 C 587.9033,27.701471 579.93436,30.011449 573.84375,35.03125 C 565.49276,31.223728 554.44432,30.751141 544.375,32.28125 C 538.97209,33.102263 533.8987,34.480363 529.6875,36.34375 C 525.4884,38.201779 521.99675,40.484175 520.1875,43.8125 C 519.57732,44.883163 519.7128,46.206199 520.46875,47.15625 C 521.22471,48.106278 522.5049,48.470375 523.65625,48.125 C 544.63433,42.131263 561.86554,43.038041 573.6875,52.875 C 574.80806,53.806374 576.46471,53.801852 577.5625,52.84375 C 587.80668,43.812696 604.05857,37.910216 621.5625,38.875 C 622.98852,38.943575 624.2874,37.997938 624.625,36.625 C 624.96264,35.252044 624.25302,33.783013 622.96875,33.1875 C 614.73544,29.461107 605.28688,27.346954 596.25,27.53125 z" transform="matrix(0.292292, -0.0677077, 0.0677077, 0.292292, -381.543, 134.276)" id="path22004" style="fill: rgb(0, 0, 0); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1pt; stroke-linecap: butt; stroke-linejoin: miter; stroke-opacity: 1;"/>
<path d="M -225.696,112.15 C -219.345,108.564 -214.201,107.915 -209.842,110.097 C -206.945,105.454 -199.625,102.766 -197.16,102.691 C -204.796,101.053 -210.086,104.587 -211.05,106.575 C -215.328,105.394 -224.104,108.305 -225.696,112.149" id="path22006" style="fill: rgb(255, 255, 255); fill-opacity: 1; stroke: none;"/>
</g>
<g transform="matrix(1.14159, 0, 0, 1.14159, 265.142, -259.674)" id="g22010">
<path d="M 134.221,257.626 C 146.813,285.3 116.952,318.766 134.221,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 134.221,257.626 z" id="path22012" style="fill: rgb(0, 0, 0); fill-opacity: 0.0798122; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M 135.222,257.626 C 147.814,285.3 117.953,318.766 135.222,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 135.222,257.626 z" id="path22014" style="fill: rgb(0, 0, 0); fill-opacity: 0.131455; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M 136.393,257.626 C 148.985,285.3 119.124,318.766 136.393,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 136.393,257.626 z" id="path22016" style="fill: rgb(0, 0, 0); fill-opacity: 0.197183; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M 137.564,257.626 C 150.156,285.3 120.295,318.766 137.564,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 137.564,257.626 z" id="path22018" style="fill: rgb(0, 0, 0); fill-opacity: 0.267606; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M 133.134,257.626 C 145.726,285.3 115.865,318.766 133.134,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 133.134,257.626 z" id="path22020" style="fill: rgb(0, 0, 0); fill-opacity: 0.0798122; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
</g>
<g transform="matrix(1.14159, 0, 0, 1.14159, -389.722, -484.947)" id="g22103">
<path d="M 653.161,498.44 L 693.751,498.44" id="path21604" style="fill: rgb(96, 123, 142); fill-opacity: 1; fill-rule: nonzero; stroke: rgb(255, 254, 255); stroke-width: 8; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-dashoffset: 0pt; stroke-opacity: 0.698039;"/>
<path d="M 653.161,515.294 L 683.452,515.294" id="path21606" style="fill: rgb(96, 123, 142); fill-opacity: 1; fill-rule: nonzero; stroke: rgb(255, 254, 255); stroke-width: 8; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-dashoffset: 0pt; stroke-opacity: 0.698039;"/>
<path d="M 653.161,532.46 L 693.751,532.46" id="path22101" style="fill: rgb(96, 123, 142); fill-opacity: 1; fill-rule: nonzero; stroke: rgb(255, 254, 255); stroke-width: 8; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-dashoffset: 0pt; stroke-opacity: 0.698039;"/>
</g>
<path d="M 326.964,34.4298 L 423.285,34.4298 C 433.146,54.4709 420.531,82.4058 417.826,102.327 L 326.964,102.327 L 326.964,34.4298 z" id="path22008" style="fill: url(&quot;#linearGradient4819&quot;) rgb(0, 0, 0); fill-opacity: 1; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 12 KiB

View File

@ -1,99 +0,0 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os, optparse
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
import setup.commands as commands
from setup import prints, get_warnings
def check_version_info():
    """Check that the running interpreter is supported.

    Returns ``None`` when the interpreter is acceptable (CPython 2.6+
    in the 2.x series), otherwise a human readable error message.
    """
    major, minor = sys.version_info[:2]
    if major == 2 and minor > 5:
        return None
    return 'calibre requires python >= 2.6'
def option_parser():
    """Build the option parser shared by every setup command.

    Each command later adds its own options on top of these global
    cleaning flags.
    """
    parser = optparse.OptionParser()
    parser.add_option(
        '-c', '--clean', action='store_true', default=False,
        help=('Instead of running the command delete all files generated '
              'by the command'))
    parser.add_option(
        '--clean-backups', action='store_true', default=False,
        help='Delete all backup files from the source tree')
    parser.add_option(
        '--clean-all', action='store_true', default=False,
        help='Delete all machine generated files from the source tree')
    return parser
def clean_backups():
    """Remove byte-compiled and editor backup files from the working tree.

    Walks the current directory recursively and deletes any file whose
    name ends with one of the known backup/compiled suffixes.  Uses a
    single ``str.endswith`` tuple test instead of the original
    per-suffix loop, which also removes the latent risk of calling
    ``os.remove`` twice on the same path.
    """
    suffixes = ('.pyc', '.pyo', '~', '.swp', '.swo')
    for root, _, files in os.walk('.'):
        for name in files:
            if name.endswith(suffixes):
                os.remove(os.path.join(root, name))
def main(args=sys.argv):
    # Entry point: dispatch to the named setup command, honouring the
    # global --clean / --clean-backups / --clean-all flags.
    # Returns a process exit code: 0 on success, 1 on usage error.
    # NOTE: Python 2 syntax (statement `print`) — this file predates py3.
    if len(args) == 1 or args[1] in ('-h', '--help'):
        # No command given (or help requested): list every available command.
        print 'Usage: python', args[0], 'command', '[options]'
        print '\nWhere command is one of:'
        print
        for x in sorted(commands.__all__):
            # Trailing comma keeps the description on the same line.
            print '%-20s -'%x,
            c = getattr(commands, x)
            # Prefer the short description when the command defines one.
            desc = getattr(c, 'short_description', c.description)
            print desc
        print '\nTo get help on a particular command, run:'
        print '\tpython', args[0], 'command -h'
        return 1
    command = args[1]
    if command not in commands.__all__:
        print command, 'is not a recognized command.'
        print 'Valid commands:', ', '.join(commands.__all__)
        return 1
    # Resolve the command name to its command object.
    command = getattr(commands, command)
    parser = option_parser()
    # Let the command contribute its own options before parsing.
    command.add_all_options(parser)
    parser.set_usage('Usage: python setup.py %s [options]\n\n'%args[1]+\
            command.description)
    opts, args = parser.parse_args(args)
    if opts.clean_backups:
        clean_backups()
    if opts.clean:
        # --clean: delete this command's generated files instead of running it.
        prints('Cleaning', args[1])
        command.clean()
        return 0
    if opts.clean_all:
        # --clean-all: clean every registered command, then stop.
        for cmd in commands.__all__:
            prints('Cleaning', cmd)
            getattr(commands, cmd).clean()
        return 0
    command.run_all(opts)
    # Report any warnings collected while the command ran.
    warnings = get_warnings()
    if warnings:
        print
        prints('There were', len(warnings), 'warning(s):')
        print
        for args, kwargs in warnings:
            prints('*', *args, **kwargs)
        print
    return 0
# Run as a script: exit with the code returned by main().
if __name__ == '__main__':
    sys.exit(main())

View File

@ -50,7 +50,7 @@ class RTFInput(InputFormatPlugin):
parser = ParseRtf( parser = ParseRtf(
in_file = stream, in_file = stream,
out_file = ofile, out_file = ofile,
#deb_dir = 'I:\\Calibre\\rtfdebug', deb_dir = 'I:\\Calibre\\rtfdebug',
# Convert symbol fonts to unicode equivalents. Default # Convert symbol fonts to unicode equivalents. Default
# is 1 # is 1
convert_symbol = 1, convert_symbol = 1,
@ -187,16 +187,17 @@ class RTFInput(InputFormatPlugin):
self.log = log self.log = log
self.log('Converting RTF to XML...') self.log('Converting RTF to XML...')
#Name of the preprocesssed RTF file #Name of the preprocesssed RTF file
fname = self.preprocess(stream.name) #fname = self.preprocess(stream.name)
fname = stream.name
try: try:
xml = self.generate_xml(fname) xml = self.generate_xml(fname)
except RtfInvalidCodeException, e: except RtfInvalidCodeException, e:
raise ValueError(_('This RTF file has a feature calibre does not ' raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.\n%s')%e) 'support. Convert it to HTML first and then try it.\n%s')%e)
'''dataxml = open('dataxml.xml', 'w') dataxml = open('dataxml.xml', 'w')
dataxml.write(xml) dataxml.write(xml)
dataxml.close''' dataxml.close
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
if d: if d:

View File

@ -228,8 +228,9 @@ class RtfTokenizer():
def tokenize(self): def tokenize(self):
i = 0 i = 0
lastDataStart = -1 lastDataStart = -1
#on parse caractère par caractère
while i < len(self.rtfData): while i < len(self.rtfData):
#si ça commence un groupe
if isChar(self.rtfData[i], '{'): if isChar(self.rtfData[i], '{'):
if lastDataStart > -1: if lastDataStart > -1:
self.tokens.append(tokenData(self.rtfData[lastDataStart : i])) self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
@ -237,7 +238,7 @@ class RtfTokenizer():
self.tokens.append(tokenDelimitatorStart()) self.tokens.append(tokenDelimitatorStart())
i = i + 1 i = i + 1
continue continue
#si ça finit un groupe
if isChar(self.rtfData[i], '}'): if isChar(self.rtfData[i], '}'):
if lastDataStart > -1: if lastDataStart > -1:
self.tokens.append(tokenData(self.rtfData[lastDataStart : i])) self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
@ -245,7 +246,7 @@ class RtfTokenizer():
self.tokens.append(tokenDelimitatorEnd()) self.tokens.append(tokenDelimitatorEnd())
i = i + 1 i = i + 1
continue continue
#on copie s'il y a un caractère de contrôle
if isChar(self.rtfData[i], '\\'): if isChar(self.rtfData[i], '\\'):
if i + 1 >= len(self.rtfData): if i + 1 >= len(self.rtfData):
raise Exception('Error: Control character found at the end of the document.') raise Exception('Error: Control character found at the end of the document.')
@ -254,6 +255,7 @@ class RtfTokenizer():
self.tokens.append(tokenData(self.rtfData[lastDataStart : i])) self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
lastDataStart = -1 lastDataStart = -1
# le token commence ici
tokenStart = i tokenStart = i
i = i + 1 i = i + 1

View File

@ -32,7 +32,7 @@ class FixLineEndings:
self.__write_to = tempfile.mktemp() self.__write_to = tempfile.mktemp()
self.__replace_illegals = replace_illegals self.__replace_illegals = replace_illegals
def fix_endings(self): def fix_endings(self):
illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13') illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
# always check since I have to get rid of illegal characters # always check since I have to get rid of illegal characters
#read #read
read_obj = open(self.__file, 'r') read_obj = open(self.__file, 'r')

View File

@ -16,7 +16,10 @@
# # # #
######################################################################### #########################################################################
import os, re, tempfile import os, re, tempfile
from calibre.ebooks.rtf2xml import copy from calibre.ebooks.rtf2xml import copy
from calibre.utils.mreplace import MReplace
class Tokenize: class Tokenize:
"""Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script""" """Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
def __init__(self, def __init__(self,
@ -28,20 +31,162 @@ class Tokenize:
self.__file = in_file self.__file = in_file
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
self.__copy = copy self.__copy = copy
self.__special_tokens = [ '_', '~', "'", '{', '}' ]
self.__write_to = tempfile.mktemp() self.__write_to = tempfile.mktemp()
self.__compile_expressions()
#variables
self.__uc_char = 0
self.__uc_bin = False
self.__uc_value = [1]
def __from_ms_to_utf8(self,match_obj): def __from_ms_to_utf8(self,match_obj):
uni_char = int(match_obj.group(1)) uni_char = int(match_obj.group(1))
if uni_char < 0: if uni_char < 0:
uni_char += 65536 uni_char += 65536
return '&#x' + str('%X' % uni_char) + ';' return '&#x' + str('%X' % uni_char) + ';'
def __neg_unicode_func(self, match_obj):
neg_uni_char = int(match_obj.group(1)) * -1 def __reini_utf8_counters(self):
# sys.stderr.write(str( neg_uni_char)) self.__uc_char = 0
uni_char = neg_uni_char + 65536 self.__uc_bin = False
return '&#x' + str('%X' % uni_char) + ';'
def __sub_line_reg(self,line): def __unicode_process(self, token):
line = line.replace("\\\\", "\\backslash ") #change scope in
if token == '\{':
self.__uc_value.append(self.__uc_value[-1])
#basic error handling
self.__reini_utf8_counters()
return token
#change scope out: evaluate dict and rebuild
elif token == '\}':
#self.__uc_value.pop()
self.__reini_utf8_counters()
return token
#add a uc control
elif token[:3] == '\uc':
self.__uc_value[-1] = int(token[3:])
self.__reini_utf8_counters()
return token
#handle uc skippable char
elif self.__uc_char:
#if token[:1] == "\" and token[:1] == "\"
pass
#go for real \u token
match_obj = self.__utf_exp.match(token)
if match_obj is not None:
#get value and handle negative case
uni_char = int(match_obj.group(1))
uni_len = len(match_obj.group(1)) + 2
if uni_char < 0:
uni_char += 65536
uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
#if not uc0
if self.__uc_value[-1]:
self.__uc_char = self.__uc_value[-1]
#there is only an unicode char
if len(token)<= uni_len:
return uni_char
#an unicode char and something else
#must be after as it is splited on \
elif not self.__uc_value[-1]:
print('not only token uc0 token: ' + uni_char + token[uni_len:])
return uni_char + token[uni_len:]
#if not uc0 and chars
else:
for i in xrange(uni_len, len(token)):
if token[i] == " ":
continue
elif self.__uc_char > 0:
self.__uc_char -= 1
else:
return uni_char + token[i:]
#print('uc: ' + str(self.__uc_value) + 'uni: ' + str(uni_char) + 'token: ' + token)
#default
return token
def __sub_reg_split(self,input_file):
input_file = self.__replace_spchar.mreplace(input_file)
#input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
# this is for older RTF
#line = re.sub(self.__par_exp, '\\par ', line)
input_file = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", input_file)
#split
tokens = re.split(self.__splitexp, input_file)
#remove empty tokens and \n
return filter(lambda x: len(x) > 0 and x != '\n', tokens)
#return filter(lambda x: len(x) > 0, \
#(self.__remove_line.sub('', x) for x in tokens))
def __compile_expressions(self):
SIMPLE_RPL = {
"\\\\": "\\backslash ",
"\\~": "\\~ ",
"\\;": "\\; ",
"&": "&amp;",
"<": "&lt;",
">": "&gt;",
"\\~": "\\~ ",
"\\_": "\\_ ",
"\\:": "\\: ",
"\\-": "\\- ",
# turn into a generic token to eliminate special
# cases and make processing easier
"\\{": "\\ob ",
# turn into a generic token to eliminate special
# cases and make processing easier
"\\}": "\\cb ",
# put a backslash in front of to eliminate special cases and
# make processing easier
"{": "\\{",
# put a backslash in front of to eliminate special cases and
# make processing easier
"}": "\\}",
# this is for older RTF
r'\\$': '\\par ',
}
self.__replace_spchar = MReplace(SIMPLE_RPL)
self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}") #modify this
#self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
#add \n in split for whole file reading
#self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
#why keep backslash whereas \is replaced before?
self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
#self.__par_exp = re.compile(r'\\$')
#self.__remove_line = re.compile(r'\n+')
#self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
def tokenize(self):
"""Main class for handling other methods. Reads the file \
, uses method self.sub_reg to make basic substitutions,\
and process tokens by itself"""
#read
read_obj = open(self.__file, 'r')
input_file = read_obj.read()
read_obj.close()
#process simple replacements and split giving us a correct list
#remove '' and \n in the process
tokens = self.__sub_reg_split(input_file)
#correct unicode
#tokens = map(self.__unicode_process, tokens)
#remove empty items created by removing \uc
#tokens = filter(lambda x: len(x) > 0, tokens)
#write
write_obj = open(self.__write_to, 'wb')
write_obj.write('\n'.join(tokens))
write_obj.close()
#Move and copy
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "tokenize.data")
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)
#self.__special_tokens = [ '_', '~', "'", '{', '}' ]
'''line = line.replace("\\\\", "\\backslash ")
line = line.replace("\\~", "\\~ ") line = line.replace("\\~", "\\~ ")
line = line.replace("\\;", "\\; ") line = line.replace("\\;", "\\; ")
line = line.replace("&", "&amp;") line = line.replace("&", "&amp;")
@ -63,54 +208,37 @@ class Tokenize:
# put a backslash in front of to eliminate special cases and # put a backslash in front of to eliminate special cases and
# make processing easier # make processing easier
line = line.replace("}", "\\}") line = line.replace("}", "\\}")
line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line)
##line = line.replace("\\backslash", "\\\\")
# this is for older RTF
line = re.sub(self.__par_exp, '\\par ', line)
return line
def __compile_expressions(self):
self.__ms_hex_exp = re.compile(r"\\\'(..)")
self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}")
self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
self.__par_exp = re.compile(r'\\$')
self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
def __create_tokens(self):
self.__compile_expressions()
read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'wb')
line_to_read = "dummy" line_to_read = "dummy"
while line_to_read: while line_to_read:
line_to_read = read_obj.readline() line_to_read = read_obj.readline()
line = line_to_read line = line_to_read
line = line.replace("\n", "") line = line.replace("\n", "")
line = self.__sub_line_reg(line) '''
tokens = re.split(self.__splitexp, line) '''if token != "":
##print tokens write_obj.write(token + "\n")
for token in tokens:
if token != "": match_obj = re.search(self.__mixed_exp, token)
if match_obj != None:
first = match_obj.group(1)
second = match_obj.group(2)
write_obj.write(first + "\n")
write_obj.write(second + "\n")
else:
write_obj.write(token + "\n") write_obj.write(token + "\n")
""" '''
match_obj = re.search(self.__mixed_exp, token) '''
if match_obj != None: for line in read_obj:
first = match_obj.group(1) #make all replacements
second = match_obj.group(2) line = self.__sub_reg(line)
write_obj.write(first + "\n") #split token and remove empty tokens
write_obj.write(second + "\n") tokens = filter(lambda x: len(x) > 0,
else: re.split(self.__splitexp, line))
write_obj.write(token + "\n") if tokens:
""" write_obj.write('\n'.join(tokens)+'\n')'''
read_obj.close()
write_obj.close() '''def __neg_unicode_func(self, match_obj):
def tokenize(self): neg_uni_char = int(match_obj.group(1)) * -1
"""Main class for handling other methods. Reads in one line \ # sys.stderr.write(str( neg_uni_char))
at a time, uses method self.sub_line to make basic substitutions,\ uni_char = neg_uni_char + 65536
uses ? to process tokens""" return '&#x' + str('%X' % uni_char) + ';'''
self.__create_tokens()
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "tokenize.data")
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)

View File

@ -329,6 +329,7 @@ class FileIconProvider(QFileIconProvider):
'epub' : 'epub', 'epub' : 'epub',
'fb2' : 'fb2', 'fb2' : 'fb2',
'rtf' : 'rtf', 'rtf' : 'rtf',
'odt' : 'odt',
} }
def __init__(self): def __init__(self):