Global overhaul of rtf2xml : RTF fixes (2) -> first tokenize modifications (not completely working without preprocessing)

This commit is contained in:
Sengian 2010-08-09 00:05:51 +02:00
commit adcad1cb60
7 changed files with 256 additions and 160 deletions

View File

@ -0,0 +1,63 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg xmlns:svg="http://www.w3.org/2000/svg" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" version="1.0" width="128" height="128" id="svg2176">
<defs id="defs2178">
<linearGradient x1="406.065" y1="290.50299" x2="406.065" y2="276.29501" id="linearGradient4819" xlink:href="#linearGradient7431" gradientUnits="userSpaceOnUse" gradientTransform="matrix(4.80814, 0, 0, 2.87475, -1569.44, -758.786)" spreadMethod="pad"/>
<linearGradient x1="68.374298" y1="-410.099" x2="67.912201" y2="-478.508" id="linearGradient4817" xlink:href="#linearGradient11367" gradientUnits="userSpaceOnUse" gradientTransform="matrix(1.58223, 0, 0, -0.727268, 275.522, -213.417)" spreadMethod="pad"/>
<linearGradient x1="436.48801" y1="-278.91299" x2="436.51199" y2="-299.88699" id="linearGradient4815" xlink:href="#linearGradient11377" gradientUnits="userSpaceOnUse" gradientTransform="matrix(4.59378, 0, 0, 0.359494, -1920.95, 434.897)" spreadMethod="pad"/>
<linearGradient id="linearGradient11377">
<stop id="stop11379" style="stop-color: rgb(255, 255, 255); stop-opacity: 0;" offset="0"/>
<stop id="stop11385" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.575472;" offset="1"/>
</linearGradient>
<linearGradient id="linearGradient11367">
<stop id="stop11369" style="stop-color: rgb(0, 0, 0); stop-opacity: 0;" offset="0"/>
<stop id="stop11371" style="stop-color: rgb(0, 0, 0); stop-opacity: 0.254717;" offset="0.72131097"/>
<stop id="stop18428" style="stop-color: rgb(0, 0, 0); stop-opacity: 0.12549;" offset="0.91000003"/>
<stop id="stop11375" style="stop-color: rgb(0, 0, 0); stop-opacity: 0;" offset="1"/>
</linearGradient>
<linearGradient id="linearGradient7431">
<stop id="stop7433" style="stop-color: rgb(255, 255, 255); stop-opacity: 0;" offset="0"/>
<stop id="stop7439" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.858491;" offset="0.72000003"/>
<stop id="stop8224" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.707547;" offset="0.89999998"/>
<stop id="stop7435" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.320755;" offset="1"/>
</linearGradient>
<filter id="filter3659">
<feGaussianBlur inkscape:collect="always" stdDeviation="0.25192676" id="feGaussianBlur3661"/>
</filter>
</defs>
<g id="layer1">
<g transform="matrix(1.1475, 0, 0, 1.1475, -368.661, -33.5075)" id="g4500">
<path d="M 326.964,34.4298 L 423.481,34.4298 C 437.856,66.0223 403.767,104.222 423.481,135.402 L 326.964,135.402 L 326.964,34.4298 z" id="path21978" style="fill: rgb(95, 123, 141); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M 326.964,34.4298 L 353.143,34.4298 C 367.518,66.0223 333.429,104.222 353.143,135.402 L 326.964,135.402 L 326.964,34.4298 z" id="path21980" style="fill: rgb(29, 70, 89); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<rect width="101.089" height="7.9108801" x="34.429798" y="326.96399" transform="matrix(0, 1, 1, 0, 0, 0)" id="rect21982" style="fill: url(&quot;#linearGradient4815&quot;) rgb(0, 0, 0); fill-opacity: 1; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<g transform="matrix(3.17412, 0, 0, 3.17412, 1038.99, -354.131)" id="g21984">
<path d="M -218.445,122.416 C -213.917,132.369 -224.656,144.405 -218.445,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -218.445,122.416 z" id="path21986" style="fill: rgb(0, 0, 0); fill-opacity: 0.0798122; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M -217.955,122.416 C -213.426,132.369 -224.166,144.405 -217.955,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -217.955,122.416 z" id="path21988" style="fill: rgb(0, 0, 0); fill-opacity: 0.131455; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M -217.403,122.416 C -212.875,132.369 -223.614,144.405 -217.403,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -217.403,122.416 z" id="path21990" style="fill: rgb(0, 0, 0); fill-opacity: 0.197183; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M -216.852,122.416 C -212.323,132.369 -223.063,144.405 -216.852,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -216.852,122.416 z" id="path21992" style="fill: rgb(0, 0, 0); fill-opacity: 0.267606; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
</g>
<path d="M 326.964,135.402 L 422.488,135.402 C 412.274,118.171 416.819,101.345 421.306,83.5374 L 326.964,83.2034 L 326.964,135.402 z" id="path21994" style="fill: url(&quot;#linearGradient4817&quot;) rgb(0, 0, 0); fill-opacity: 1; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<g transform="matrix(2.41517, 0, 0, 2.41517, 876.456, -197.189)" id="g21996">
<path d="M 602.125,190.59375 C 599.45874,190.67075 596.74504,191.16798 594.53125,192.78125 C 591.5146,192.34561 588.3664,192.55749 585.5,193.6875 C 583.62824,194.43267 582.15635,195.77855 580.96875,197.34375 C 580.95544,197.36301 580.94492,197.38405 580.9375,197.40625 C 580.92091,197.43509 580.91029,197.46697 580.90625,197.5 C 580.91029,197.53303 580.92091,197.56491 580.9375,197.59375 C 580.94492,197.61595 580.95544,197.63699 580.96875,197.65625 C 580.97822,197.66757 580.98868,197.67803 581,197.6875 C 581.02605,197.71524 581.05813,197.73662 581.09375,197.75 C 581.12472,197.75595 581.15653,197.75595 581.1875,197.75 C 581.20825,197.75263 581.22925,197.75263 581.25,197.75 C 584.80749,196.49944 588.39295,195.15225 592.15625,195.5 C 593.28385,195.58867 594.35616,196.00271 595.46875,196.375 C 595.50974,196.38565 595.55276,196.38565 595.59375,196.375 C 595.62678,196.37096 595.65866,196.36034 595.6875,196.34375 C 595.7097,196.33633 595.73074,196.32581 595.75,196.3125 C 598.71379,193.45164 603.00891,192.72955 606.96875,191.90625 C 606.98007,191.89678 606.99053,191.88632 607,191.875 C 607.19563,191.80037 607.32956,191.73576 607.4375,191.625 C 607.49147,191.56962 607.55414,191.50784 607.5625,191.40625 C 607.57086,191.30466 607.51945,191.21518 607.46875,191.15625 C 607.36735,191.03839 607.25573,190.98239 607.125,190.9375 C 606.99427,190.89261 606.8215,190.87546 606.65625,190.84375 C 605.99526,190.71692 605.12704,190.6454 604.8125,190.625 C 604.80209,190.62434 604.79166,190.62434 604.78125,190.625 C 603.91011,190.58739 603.02603,190.56773 602.125,190.59375 z" transform="translate(-806.724, -92.8004)" id="path21998" style="fill: rgb(0, 0, 0); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1pt; stroke-linecap: butt; stroke-linejoin: miter; stroke-opacity: 1;"/>
<path d="M -224.344,103.835 C -219.295,101.9 -214.705,101.331 -211.263,102.86 C -208.45,100.119 -202.237,98.6242 -200.227,98.6199 C -207.528,97.8352 -210.552,99.4967 -212,100.582 C -216.698,100.015 -221.096,100.522 -224.344,103.834" id="path22000" style="fill: rgb(255, 255, 255); fill-opacity: 1; stroke: none;"/>
</g>
<g transform="matrix(2.41517, 0, 0, 2.41517, 876.456, -197.189)" id="g22002">
<path d="M 596.25,27.53125 C 587.9033,27.701471 579.93436,30.011449 573.84375,35.03125 C 565.49276,31.223728 554.44432,30.751141 544.375,32.28125 C 538.97209,33.102263 533.8987,34.480363 529.6875,36.34375 C 525.4884,38.201779 521.99675,40.484175 520.1875,43.8125 C 519.57732,44.883163 519.7128,46.206199 520.46875,47.15625 C 521.22471,48.106278 522.5049,48.470375 523.65625,48.125 C 544.63433,42.131263 561.86554,43.038041 573.6875,52.875 C 574.80806,53.806374 576.46471,53.801852 577.5625,52.84375 C 587.80668,43.812696 604.05857,37.910216 621.5625,38.875 C 622.98852,38.943575 624.2874,37.997938 624.625,36.625 C 624.96264,35.252044 624.25302,33.783013 622.96875,33.1875 C 614.73544,29.461107 605.28688,27.346954 596.25,27.53125 z" transform="matrix(0.292292, -0.0677077, 0.0677077, 0.292292, -381.543, 134.276)" id="path22004" style="fill: rgb(0, 0, 0); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1pt; stroke-linecap: butt; stroke-linejoin: miter; stroke-opacity: 1;"/>
<path d="M -225.696,112.15 C -219.345,108.564 -214.201,107.915 -209.842,110.097 C -206.945,105.454 -199.625,102.766 -197.16,102.691 C -204.796,101.053 -210.086,104.587 -211.05,106.575 C -215.328,105.394 -224.104,108.305 -225.696,112.149" id="path22006" style="fill: rgb(255, 255, 255); fill-opacity: 1; stroke: none;"/>
</g>
<g transform="matrix(1.14159, 0, 0, 1.14159, 265.142, -259.674)" id="g22010">
<path d="M 134.221,257.626 C 146.813,285.3 116.952,318.766 134.221,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 134.221,257.626 z" id="path22012" style="fill: rgb(0, 0, 0); fill-opacity: 0.0798122; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M 135.222,257.626 C 147.814,285.3 117.953,318.766 135.222,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 135.222,257.626 z" id="path22014" style="fill: rgb(0, 0, 0); fill-opacity: 0.131455; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M 136.393,257.626 C 148.985,285.3 119.124,318.766 136.393,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 136.393,257.626 z" id="path22016" style="fill: rgb(0, 0, 0); fill-opacity: 0.197183; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M 137.564,257.626 C 150.156,285.3 120.295,318.766 137.564,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 137.564,257.626 z" id="path22018" style="fill: rgb(0, 0, 0); fill-opacity: 0.267606; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M 133.134,257.626 C 145.726,285.3 115.865,318.766 133.134,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 133.134,257.626 z" id="path22020" style="fill: rgb(0, 0, 0); fill-opacity: 0.0798122; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
</g>
<g transform="matrix(1.14159, 0, 0, 1.14159, -389.722, -484.947)" id="g22103">
<path d="M 653.161,498.44 L 693.751,498.44" id="path21604" style="fill: rgb(96, 123, 142); fill-opacity: 1; fill-rule: nonzero; stroke: rgb(255, 254, 255); stroke-width: 8; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-dashoffset: 0pt; stroke-opacity: 0.698039;"/>
<path d="M 653.161,515.294 L 683.452,515.294" id="path21606" style="fill: rgb(96, 123, 142); fill-opacity: 1; fill-rule: nonzero; stroke: rgb(255, 254, 255); stroke-width: 8; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-dashoffset: 0pt; stroke-opacity: 0.698039;"/>
<path d="M 653.161,532.46 L 693.751,532.46" id="path22101" style="fill: rgb(96, 123, 142); fill-opacity: 1; fill-rule: nonzero; stroke: rgb(255, 254, 255); stroke-width: 8; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-dashoffset: 0pt; stroke-opacity: 0.698039;"/>
</g>
<path d="M 326.964,34.4298 L 423.285,34.4298 C 433.146,54.4709 420.531,82.4058 417.826,102.327 L 326.964,102.327 L 326.964,34.4298 z" id="path22008" style="fill: url(&quot;#linearGradient4819&quot;) rgb(0, 0, 0); fill-opacity: 1; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 12 KiB

View File

@ -1,99 +0,0 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os, optparse
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
import setup.commands as commands
from setup import prints, get_warnings
def check_version_info():
    """Check that the running interpreter is a supported Python 2 release.

    Returns None when the interpreter is Python 2.6 or a later 2.x version,
    otherwise the error message 'calibre requires python >= 2.6'.
    """
    major, minor = sys.version_info[:2]
    if major != 2 or minor <= 5:
        return 'calibre requires python >= 2.6'
    return None
def option_parser():
    """Create the option parser holding the clean-up flags.

    Registers three boolean flags, all defaulting to False:
    -c/--clean, --clean-backups and --clean-all.
    """
    # (option strings, help text) for every store_true flag
    flag_table = (
        (('-c', '--clean'),
         ('Instead of running the command delete all files generated '
          'by the command')),
        (('--clean-backups',),
         'Delete all backup files from the source tree'),
        (('--clean-all',),
         'Delete all machine generated files from the source tree'),
    )
    parser = optparse.OptionParser()
    for option_strings, help_text in flag_table:
        parser.add_option(*option_strings, default=False,
                          action='store_true', help=help_text)
    return parser
def clean_backups():
    """Delete backup and bytecode files below the current directory.

    Walks '.' recursively and removes every file whose name ends with
    '.pyc', '.pyo', '~', '.swp' or '.swo'.
    """
    backup_suffixes = ('.pyc', '.pyo', '~', '.swp', '.swo')
    for dirpath, _, filenames in os.walk('.'):
        for filename in filenames:
            # str.endswith accepts a tuple: true if any suffix matches
            if filename.endswith(backup_suffixes):
                os.remove(os.path.join(dirpath, filename))
def main(args=sys.argv):
if len(args) == 1 or args[1] in ('-h', '--help'):
print 'Usage: python', args[0], 'command', '[options]'
print '\nWhere command is one of:'
print
for x in sorted(commands.__all__):
print '%-20s -'%x,
c = getattr(commands, x)
desc = getattr(c, 'short_description', c.description)
print desc
print '\nTo get help on a particular command, run:'
print '\tpython', args[0], 'command -h'
return 1
command = args[1]
if command not in commands.__all__:
print command, 'is not a recognized command.'
print 'Valid commands:', ', '.join(commands.__all__)
return 1
command = getattr(commands, command)
parser = option_parser()
command.add_all_options(parser)
parser.set_usage('Usage: python setup.py %s [options]\n\n'%args[1]+\
command.description)
opts, args = parser.parse_args(args)
if opts.clean_backups:
clean_backups()
if opts.clean:
prints('Cleaning', args[1])
command.clean()
return 0
if opts.clean_all:
for cmd in commands.__all__:
prints('Cleaning', cmd)
getattr(commands, cmd).clean()
return 0
command.run_all(opts)
warnings = get_warnings()
if warnings:
print
prints('There were', len(warnings), 'warning(s):')
print
for args, kwargs in warnings:
prints('*', *args, **kwargs)
print
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -50,7 +50,7 @@ class RTFInput(InputFormatPlugin):
parser = ParseRtf(
in_file = stream,
out_file = ofile,
#deb_dir = 'I:\\Calibre\\rtfdebug',
deb_dir = 'I:\\Calibre\\rtfdebug',
# Convert symbol fonts to unicode equivalents. Default
# is 1
convert_symbol = 1,
@ -187,16 +187,17 @@ class RTFInput(InputFormatPlugin):
self.log = log
self.log('Converting RTF to XML...')
#Name of the preprocesssed RTF file
fname = self.preprocess(stream.name)
#fname = self.preprocess(stream.name)
fname = stream.name
try:
xml = self.generate_xml(fname)
except RtfInvalidCodeException, e:
raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.\n%s')%e)
'''dataxml = open('dataxml.xml', 'w')
dataxml = open('dataxml.xml', 'w')
dataxml.write(xml)
dataxml.close'''
dataxml.close
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
if d:

View File

@ -228,8 +228,9 @@ class RtfTokenizer():
def tokenize(self):
i = 0
lastDataStart = -1
# parse the data character by character
while i < len(self.rtfData):
# if this starts a group
if isChar(self.rtfData[i], '{'):
if lastDataStart > -1:
self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
@ -237,7 +238,7 @@ class RtfTokenizer():
self.tokens.append(tokenDelimitatorStart())
i = i + 1
continue
# if this ends a group
if isChar(self.rtfData[i], '}'):
if lastDataStart > -1:
self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
@ -245,7 +246,7 @@ class RtfTokenizer():
self.tokens.append(tokenDelimitatorEnd())
i = i + 1
continue
# copy the pending data if there is a control character
if isChar(self.rtfData[i], '\\'):
if i + 1 >= len(self.rtfData):
raise Exception('Error: Control character found at the end of the document.')
@ -254,6 +255,7 @@ class RtfTokenizer():
self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
lastDataStart = -1
# the token starts here
tokenStart = i
i = i + 1

View File

@ -32,7 +32,7 @@ class FixLineEndings:
self.__write_to = tempfile.mktemp()
self.__replace_illegals = replace_illegals
def fix_endings(self):
illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
# always check since I have to get rid of illegal characters
#read
read_obj = open(self.__file, 'r')

View File

@ -16,7 +16,10 @@
# #
#########################################################################
import os, re, tempfile
from calibre.ebooks.rtf2xml import copy
from calibre.utils.mreplace import MReplace
class Tokenize:
"""Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
def __init__(self,
@ -28,20 +31,162 @@ class Tokenize:
self.__file = in_file
self.__bug_handler = bug_handler
self.__copy = copy
self.__special_tokens = [ '_', '~', "'", '{', '}' ]
self.__write_to = tempfile.mktemp()
self.__compile_expressions()
#variables
self.__uc_char = 0
self.__uc_bin = False
self.__uc_value = [1]
def __from_ms_to_utf8(self,match_obj):
uni_char = int(match_obj.group(1))
if uni_char < 0:
uni_char += 65536
return '&#x' + str('%X' % uni_char) + ';'
def __neg_unicode_func(self, match_obj):
neg_uni_char = int(match_obj.group(1)) * -1
# sys.stderr.write(str( neg_uni_char))
uni_char = neg_uni_char + 65536
return '&#x' + str('%X' % uni_char) + ';'
def __sub_line_reg(self,line):
line = line.replace("\\\\", "\\backslash ")
def __reini_utf8_counters(self):
self.__uc_char = 0
self.__uc_bin = False
def __unicode_process(self, token):
#change scope in
if token == '\{':
self.__uc_value.append(self.__uc_value[-1])
#basic error handling
self.__reini_utf8_counters()
return token
#change scope out: evaluate dict and rebuild
elif token == '\}':
#self.__uc_value.pop()
self.__reini_utf8_counters()
return token
#add a uc control
elif token[:3] == '\uc':
self.__uc_value[-1] = int(token[3:])
self.__reini_utf8_counters()
return token
#handle uc skippable char
elif self.__uc_char:
#if token[:1] == "\" and token[:1] == "\"
pass
#go for real \u token
match_obj = self.__utf_exp.match(token)
if match_obj is not None:
#get value and handle negative case
uni_char = int(match_obj.group(1))
uni_len = len(match_obj.group(1)) + 2
if uni_char < 0:
uni_char += 65536
uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
#if not uc0
if self.__uc_value[-1]:
self.__uc_char = self.__uc_value[-1]
#there is only an unicode char
if len(token)<= uni_len:
return uni_char
#an unicode char and something else
#must be after as it is splited on \
elif not self.__uc_value[-1]:
print('not only token uc0 token: ' + uni_char + token[uni_len:])
return uni_char + token[uni_len:]
#if not uc0 and chars
else:
for i in xrange(uni_len, len(token)):
if token[i] == " ":
continue
elif self.__uc_char > 0:
self.__uc_char -= 1
else:
return uni_char + token[i:]
#print('uc: ' + str(self.__uc_value) + 'uni: ' + str(uni_char) + 'token: ' + token)
#default
return token
def __sub_reg_split(self,input_file):
input_file = self.__replace_spchar.mreplace(input_file)
#input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
# this is for older RTF
#line = re.sub(self.__par_exp, '\\par ', line)
input_file = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", input_file)
#split
tokens = re.split(self.__splitexp, input_file)
#remove empty tokens and \n
return filter(lambda x: len(x) > 0 and x != '\n', tokens)
#return filter(lambda x: len(x) > 0, \
#(self.__remove_line.sub('', x) for x in tokens))
def __compile_expressions(self):
    """Prepare the replacement table and the regular expressions.

    Sets self.__replace_spchar (an MReplace built from the table below),
    self.__ms_hex_exp, self.__utf_exp and self.__splitexp.
    """
    replacements = {}
    # control symbols that only get a trailing space appended
    for symbol in ("\\~", "\\;", "\\_", "\\:", "\\-"):
        replacements[symbol] = symbol + " "
    replacements.update({
        "\\\\": "\\backslash ",
        # XML special characters
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        # escaped braces become generic tokens to eliminate special cases
        # and make processing easier
        "\\{": "\\ob ",
        "\\}": "\\cb ",
        # group delimiters get a backslash in front, for the same reason
        "{": "\\{",
        "}": "\\}",
        # this is for older RTF
        # NOTE(review): this key looks like a regular expression (backslash
        # at end of line); whether MReplace treats keys as patterns or as
        # literal text is not visible here - confirm.
        r'\\$': '\\par ',
    })
    self.__replace_spchar = MReplace(replacements)
    # hex escape: backslash, quote, two hexadecimal digits
    self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
    # unicode control word with 3 to 6 digits and an optional trailing
    # space (marked "modify this" by the author)
    self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}")
    # "\n" is part of the split pattern because the whole file is read at once
    self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
def tokenize(self):
    """Tokenize the RTF file, writing one token per line.

    Reads the whole input file, lets self.__sub_reg_split make the basic
    substitutions and split the text into tokens, and writes the tokens
    joined by newlines to the temporary file self.__write_to. When
    self.__copy is set, a copy named "tokenize.data" is made. The temporary
    file is then passed to copy_obj.rename together with the input file name
    (presumably replacing the input file's content - confirm against
    copy.Copy.rename) and finally removed.
    """
    # read; try/finally guarantees the handle is closed if reading fails
    read_obj = open(self.__file, 'r')
    try:
        input_file = read_obj.read()
    finally:
        read_obj.close()

    # process simple replacements and split, giving us a correct list;
    # '' and \n are removed in the process
    tokens = self.__sub_reg_split(input_file)
    # Unicode post-processing is not enabled yet:
    #tokens = map(self.__unicode_process, tokens)
    #tokens = filter(lambda x: len(x) > 0, tokens)

    # write
    write_obj = open(self.__write_to, 'wb')
    try:
        write_obj.write('\n'.join(tokens))
    finally:
        write_obj.close()

    # move and copy
    copy_obj = copy.Copy(bug_handler = self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "tokenize.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
#self.__special_tokens = [ '_', '~', "'", '{', '}' ]
'''line = line.replace("\\\\", "\\backslash ")
line = line.replace("\\~", "\\~ ")
line = line.replace("\\;", "\\; ")
line = line.replace("&", "&amp;")
@ -63,54 +208,37 @@ class Tokenize:
# put a backslash in front of to eliminate special cases and
# make processing easier
line = line.replace("}", "\\}")
line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line)
##line = line.replace("\\backslash", "\\\\")
# this is for older RTF
line = re.sub(self.__par_exp, '\\par ', line)
return line
def __compile_expressions(self):
self.__ms_hex_exp = re.compile(r"\\\'(..)")
self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}")
self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
self.__par_exp = re.compile(r'\\$')
self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
def __create_tokens(self):
self.__compile_expressions()
read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'wb')
line_to_read = "dummy"
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
line = line.replace("\n", "")
line = self.__sub_line_reg(line)
tokens = re.split(self.__splitexp, line)
##print tokens
for token in tokens:
if token != "":
'''
'''if token != "":
write_obj.write(token + "\n")
match_obj = re.search(self.__mixed_exp, token)
if match_obj != None:
first = match_obj.group(1)
second = match_obj.group(2)
write_obj.write(first + "\n")
write_obj.write(second + "\n")
else:
write_obj.write(token + "\n")
"""
match_obj = re.search(self.__mixed_exp, token)
if match_obj != None:
first = match_obj.group(1)
second = match_obj.group(2)
write_obj.write(first + "\n")
write_obj.write(second + "\n")
else:
write_obj.write(token + "\n")
"""
read_obj.close()
write_obj.close()
def tokenize(self):
"""Main class for handling other methods. Reads in one line \
at a time, uses method self.sub_line to make basic substitutions,\
uses ? to process tokens"""
self.__create_tokens()
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "tokenize.data")
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)
'''
'''
for line in read_obj:
#make all replacements
line = self.__sub_reg(line)
#split token and remove empty tokens
tokens = filter(lambda x: len(x) > 0,
re.split(self.__splitexp, line))
if tokens:
write_obj.write('\n'.join(tokens)+'\n')'''
'''def __neg_unicode_func(self, match_obj):
neg_uni_char = int(match_obj.group(1)) * -1
# sys.stderr.write(str( neg_uni_char))
uni_char = neg_uni_char + 65536
return '&#x' + str('%X' % uni_char) + ';'''

View File

@ -329,6 +329,7 @@ class FileIconProvider(QFileIconProvider):
'epub' : 'epub',
'fb2' : 'fb2',
'rtf' : 'rtf',
'odt' : 'odt',
}
def __init__(self):