Global overhaul of rtf2xml : RTF fixes (2) -> first tokenize modifications (not completely working without preprocessing)

This commit is contained in:
Sengian 2010-08-09 00:05:51 +02:00
commit adcad1cb60
7 changed files with 256 additions and 160 deletions

View File

@ -0,0 +1,63 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg xmlns:svg="http://www.w3.org/2000/svg" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" version="1.0" width="128" height="128" id="svg2176">
<defs id="defs2178">
<linearGradient x1="406.065" y1="290.50299" x2="406.065" y2="276.29501" id="linearGradient4819" xlink:href="#linearGradient7431" gradientUnits="userSpaceOnUse" gradientTransform="matrix(4.80814, 0, 0, 2.87475, -1569.44, -758.786)" spreadMethod="pad"/>
<linearGradient x1="68.374298" y1="-410.099" x2="67.912201" y2="-478.508" id="linearGradient4817" xlink:href="#linearGradient11367" gradientUnits="userSpaceOnUse" gradientTransform="matrix(1.58223, 0, 0, -0.727268, 275.522, -213.417)" spreadMethod="pad"/>
<linearGradient x1="436.48801" y1="-278.91299" x2="436.51199" y2="-299.88699" id="linearGradient4815" xlink:href="#linearGradient11377" gradientUnits="userSpaceOnUse" gradientTransform="matrix(4.59378, 0, 0, 0.359494, -1920.95, 434.897)" spreadMethod="pad"/>
<linearGradient id="linearGradient11377">
<stop id="stop11379" style="stop-color: rgb(255, 255, 255); stop-opacity: 0;" offset="0"/>
<stop id="stop11385" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.575472;" offset="1"/>
</linearGradient>
<linearGradient id="linearGradient11367">
<stop id="stop11369" style="stop-color: rgb(0, 0, 0); stop-opacity: 0;" offset="0"/>
<stop id="stop11371" style="stop-color: rgb(0, 0, 0); stop-opacity: 0.254717;" offset="0.72131097"/>
<stop id="stop18428" style="stop-color: rgb(0, 0, 0); stop-opacity: 0.12549;" offset="0.91000003"/>
<stop id="stop11375" style="stop-color: rgb(0, 0, 0); stop-opacity: 0;" offset="1"/>
</linearGradient>
<linearGradient id="linearGradient7431">
<stop id="stop7433" style="stop-color: rgb(255, 255, 255); stop-opacity: 0;" offset="0"/>
<stop id="stop7439" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.858491;" offset="0.72000003"/>
<stop id="stop8224" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.707547;" offset="0.89999998"/>
<stop id="stop7435" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.320755;" offset="1"/>
</linearGradient>
<filter id="filter3659">
<feGaussianBlur inkscape:collect="always" stdDeviation="0.25192676" id="feGaussianBlur3661"/>
</filter>
</defs>
<g id="layer1">
<g transform="matrix(1.1475, 0, 0, 1.1475, -368.661, -33.5075)" id="g4500">
<path d="M 326.964,34.4298 L 423.481,34.4298 C 437.856,66.0223 403.767,104.222 423.481,135.402 L 326.964,135.402 L 326.964,34.4298 z" id="path21978" style="fill: rgb(95, 123, 141); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M 326.964,34.4298 L 353.143,34.4298 C 367.518,66.0223 333.429,104.222 353.143,135.402 L 326.964,135.402 L 326.964,34.4298 z" id="path21980" style="fill: rgb(29, 70, 89); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<rect width="101.089" height="7.9108801" x="34.429798" y="326.96399" transform="matrix(0, 1, 1, 0, 0, 0)" id="rect21982" style="fill: url(&quot;#linearGradient4815&quot;) rgb(0, 0, 0); fill-opacity: 1; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<g transform="matrix(3.17412, 0, 0, 3.17412, 1038.99, -354.131)" id="g21984">
<path d="M -218.445,122.416 C -213.917,132.369 -224.656,144.405 -218.445,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -218.445,122.416 z" id="path21986" style="fill: rgb(0, 0, 0); fill-opacity: 0.0798122; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M -217.955,122.416 C -213.426,132.369 -224.166,144.405 -217.955,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -217.955,122.416 z" id="path21988" style="fill: rgb(0, 0, 0); fill-opacity: 0.131455; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M -217.403,122.416 C -212.875,132.369 -223.614,144.405 -217.403,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -217.403,122.416 z" id="path21990" style="fill: rgb(0, 0, 0); fill-opacity: 0.197183; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M -216.852,122.416 C -212.323,132.369 -223.063,144.405 -216.852,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -216.852,122.416 z" id="path21992" style="fill: rgb(0, 0, 0); fill-opacity: 0.267606; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
</g>
<path d="M 326.964,135.402 L 422.488,135.402 C 412.274,118.171 416.819,101.345 421.306,83.5374 L 326.964,83.2034 L 326.964,135.402 z" id="path21994" style="fill: url(&quot;#linearGradient4817&quot;) rgb(0, 0, 0); fill-opacity: 1; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<g transform="matrix(2.41517, 0, 0, 2.41517, 876.456, -197.189)" id="g21996">
<path d="M 602.125,190.59375 C 599.45874,190.67075 596.74504,191.16798 594.53125,192.78125 C 591.5146,192.34561 588.3664,192.55749 585.5,193.6875 C 583.62824,194.43267 582.15635,195.77855 580.96875,197.34375 C 580.95544,197.36301 580.94492,197.38405 580.9375,197.40625 C 580.92091,197.43509 580.91029,197.46697 580.90625,197.5 C 580.91029,197.53303 580.92091,197.56491 580.9375,197.59375 C 580.94492,197.61595 580.95544,197.63699 580.96875,197.65625 C 580.97822,197.66757 580.98868,197.67803 581,197.6875 C 581.02605,197.71524 581.05813,197.73662 581.09375,197.75 C 581.12472,197.75595 581.15653,197.75595 581.1875,197.75 C 581.20825,197.75263 581.22925,197.75263 581.25,197.75 C 584.80749,196.49944 588.39295,195.15225 592.15625,195.5 C 593.28385,195.58867 594.35616,196.00271 595.46875,196.375 C 595.50974,196.38565 595.55276,196.38565 595.59375,196.375 C 595.62678,196.37096 595.65866,196.36034 595.6875,196.34375 C 595.7097,196.33633 595.73074,196.32581 595.75,196.3125 C 598.71379,193.45164 603.00891,192.72955 606.96875,191.90625 C 606.98007,191.89678 606.99053,191.88632 607,191.875 C 607.19563,191.80037 607.32956,191.73576 607.4375,191.625 C 607.49147,191.56962 607.55414,191.50784 607.5625,191.40625 C 607.57086,191.30466 607.51945,191.21518 607.46875,191.15625 C 607.36735,191.03839 607.25573,190.98239 607.125,190.9375 C 606.99427,190.89261 606.8215,190.87546 606.65625,190.84375 C 605.99526,190.71692 605.12704,190.6454 604.8125,190.625 C 604.80209,190.62434 604.79166,190.62434 604.78125,190.625 C 603.91011,190.58739 603.02603,190.56773 602.125,190.59375 z" transform="translate(-806.724, -92.8004)" id="path21998" style="fill: rgb(0, 0, 0); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1pt; stroke-linecap: butt; stroke-linejoin: miter; stroke-opacity: 1;"/>
<path d="M -224.344,103.835 C -219.295,101.9 -214.705,101.331 -211.263,102.86 C -208.45,100.119 -202.237,98.6242 -200.227,98.6199 C -207.528,97.8352 -210.552,99.4967 -212,100.582 C -216.698,100.015 -221.096,100.522 -224.344,103.834" id="path22000" style="fill: rgb(255, 255, 255); fill-opacity: 1; stroke: none;"/>
</g>
<g transform="matrix(2.41517, 0, 0, 2.41517, 876.456, -197.189)" id="g22002">
<path d="M 596.25,27.53125 C 587.9033,27.701471 579.93436,30.011449 573.84375,35.03125 C 565.49276,31.223728 554.44432,30.751141 544.375,32.28125 C 538.97209,33.102263 533.8987,34.480363 529.6875,36.34375 C 525.4884,38.201779 521.99675,40.484175 520.1875,43.8125 C 519.57732,44.883163 519.7128,46.206199 520.46875,47.15625 C 521.22471,48.106278 522.5049,48.470375 523.65625,48.125 C 544.63433,42.131263 561.86554,43.038041 573.6875,52.875 C 574.80806,53.806374 576.46471,53.801852 577.5625,52.84375 C 587.80668,43.812696 604.05857,37.910216 621.5625,38.875 C 622.98852,38.943575 624.2874,37.997938 624.625,36.625 C 624.96264,35.252044 624.25302,33.783013 622.96875,33.1875 C 614.73544,29.461107 605.28688,27.346954 596.25,27.53125 z" transform="matrix(0.292292, -0.0677077, 0.0677077, 0.292292, -381.543, 134.276)" id="path22004" style="fill: rgb(0, 0, 0); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1pt; stroke-linecap: butt; stroke-linejoin: miter; stroke-opacity: 1;"/>
<path d="M -225.696,112.15 C -219.345,108.564 -214.201,107.915 -209.842,110.097 C -206.945,105.454 -199.625,102.766 -197.16,102.691 C -204.796,101.053 -210.086,104.587 -211.05,106.575 C -215.328,105.394 -224.104,108.305 -225.696,112.149" id="path22006" style="fill: rgb(255, 255, 255); fill-opacity: 1; stroke: none;"/>
</g>
<g transform="matrix(1.14159, 0, 0, 1.14159, 265.142, -259.674)" id="g22010">
<path d="M 134.221,257.626 C 146.813,285.3 116.952,318.766 134.221,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 134.221,257.626 z" id="path22012" style="fill: rgb(0, 0, 0); fill-opacity: 0.0798122; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M 135.222,257.626 C 147.814,285.3 117.953,318.766 135.222,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 135.222,257.626 z" id="path22014" style="fill: rgb(0, 0, 0); fill-opacity: 0.131455; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M 136.393,257.626 C 148.985,285.3 119.124,318.766 136.393,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 136.393,257.626 z" id="path22016" style="fill: rgb(0, 0, 0); fill-opacity: 0.197183; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M 137.564,257.626 C 150.156,285.3 120.295,318.766 137.564,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 137.564,257.626 z" id="path22018" style="fill: rgb(0, 0, 0); fill-opacity: 0.267606; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
<path d="M 133.134,257.626 C 145.726,285.3 115.865,318.766 133.134,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 133.134,257.626 z" id="path22020" style="fill: rgb(0, 0, 0); fill-opacity: 0.0798122; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
</g>
<g transform="matrix(1.14159, 0, 0, 1.14159, -389.722, -484.947)" id="g22103">
<path d="M 653.161,498.44 L 693.751,498.44" id="path21604" style="fill: rgb(96, 123, 142); fill-opacity: 1; fill-rule: nonzero; stroke: rgb(255, 254, 255); stroke-width: 8; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-dashoffset: 0pt; stroke-opacity: 0.698039;"/>
<path d="M 653.161,515.294 L 683.452,515.294" id="path21606" style="fill: rgb(96, 123, 142); fill-opacity: 1; fill-rule: nonzero; stroke: rgb(255, 254, 255); stroke-width: 8; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-dashoffset: 0pt; stroke-opacity: 0.698039;"/>
<path d="M 653.161,532.46 L 693.751,532.46" id="path22101" style="fill: rgb(96, 123, 142); fill-opacity: 1; fill-rule: nonzero; stroke: rgb(255, 254, 255); stroke-width: 8; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-dashoffset: 0pt; stroke-opacity: 0.698039;"/>
</g>
<path d="M 326.964,34.4298 L 423.285,34.4298 C 433.146,54.4709 420.531,82.4058 417.826,102.327 L 326.964,102.327 L 326.964,34.4298 z" id="path22008" style="fill: url(&quot;#linearGradient4819&quot;) rgb(0, 0, 0); fill-opacity: 1; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 12 KiB

View File

@ -1,99 +0,0 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os, optparse
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
import setup.commands as commands
from setup import prints, get_warnings
def check_version_info():
    """Check that the running interpreter is supported.

    Returns ``None`` when the interpreter is acceptable (CPython 2.6+
    in the 2.x series), otherwise a human readable error message.
    """
    major, minor = sys.version_info[:2]
    if major == 2 and minor > 5:
        return None
    return 'calibre requires python >= 2.6'
def option_parser():
    """Build the option parser shared by every setup command.

    Each command later adds its own options on top of these global
    cleaning flags.
    """
    parser = optparse.OptionParser()
    parser.add_option(
        '-c', '--clean', action='store_true', default=False,
        help=('Instead of running the command delete all files generated '
              'by the command'))
    parser.add_option(
        '--clean-backups', action='store_true', default=False,
        help='Delete all backup files from the source tree')
    parser.add_option(
        '--clean-all', action='store_true', default=False,
        help='Delete all machine generated files from the source tree')
    return parser
def clean_backups():
    """Remove byte-compiled and editor backup files from the working tree.

    Walks the current directory recursively and deletes any file whose
    name ends with one of the known backup/compiled suffixes.  Uses a
    single ``str.endswith`` tuple test instead of the original
    per-suffix loop, which also removes the latent risk of calling
    ``os.remove`` twice on the same path.
    """
    suffixes = ('.pyc', '.pyo', '~', '.swp', '.swo')
    for root, _, files in os.walk('.'):
        for name in files:
            if name.endswith(suffixes):
                os.remove(os.path.join(root, name))
def main(args=sys.argv):
    # Entry point: dispatch to the named setup command, honouring the
    # global --clean / --clean-backups / --clean-all flags.
    # Returns a process exit code: 0 on success, 1 on usage error.
    # NOTE: Python 2 syntax (statement `print`) — this file predates py3.
    if len(args) == 1 or args[1] in ('-h', '--help'):
        # No command given (or help requested): list every available command.
        print 'Usage: python', args[0], 'command', '[options]'
        print '\nWhere command is one of:'
        print
        for x in sorted(commands.__all__):
            # Trailing comma keeps the description on the same line.
            print '%-20s -'%x,
            c = getattr(commands, x)
            # Prefer the short description when the command defines one.
            desc = getattr(c, 'short_description', c.description)
            print desc
        print '\nTo get help on a particular command, run:'
        print '\tpython', args[0], 'command -h'
        return 1
    command = args[1]
    if command not in commands.__all__:
        print command, 'is not a recognized command.'
        print 'Valid commands:', ', '.join(commands.__all__)
        return 1
    # Resolve the command name to its command object.
    command = getattr(commands, command)
    parser = option_parser()
    # Let the command contribute its own options before parsing.
    command.add_all_options(parser)
    parser.set_usage('Usage: python setup.py %s [options]\n\n'%args[1]+\
            command.description)
    opts, args = parser.parse_args(args)
    if opts.clean_backups:
        clean_backups()
    if opts.clean:
        # --clean: delete this command's generated files instead of running it.
        prints('Cleaning', args[1])
        command.clean()
        return 0
    if opts.clean_all:
        # --clean-all: clean every registered command, then stop.
        for cmd in commands.__all__:
            prints('Cleaning', cmd)
            getattr(commands, cmd).clean()
        return 0
    command.run_all(opts)
    # Report any warnings collected while the command ran.
    warnings = get_warnings()
    if warnings:
        print
        prints('There were', len(warnings), 'warning(s):')
        print
        for args, kwargs in warnings:
            prints('*', *args, **kwargs)
        print
    return 0
# Run as a script: exit with the code returned by main().
if __name__ == '__main__':
    sys.exit(main())

View File

@ -50,7 +50,7 @@ class RTFInput(InputFormatPlugin):
parser = ParseRtf( parser = ParseRtf(
in_file = stream, in_file = stream,
out_file = ofile, out_file = ofile,
#deb_dir = 'I:\\Calibre\\rtfdebug', deb_dir = 'I:\\Calibre\\rtfdebug',
# Convert symbol fonts to unicode equivalents. Default # Convert symbol fonts to unicode equivalents. Default
# is 1 # is 1
convert_symbol = 1, convert_symbol = 1,
@ -187,16 +187,17 @@ class RTFInput(InputFormatPlugin):
self.log = log self.log = log
self.log('Converting RTF to XML...') self.log('Converting RTF to XML...')
#Name of the preprocesssed RTF file #Name of the preprocesssed RTF file
fname = self.preprocess(stream.name) #fname = self.preprocess(stream.name)
fname = stream.name
try: try:
xml = self.generate_xml(fname) xml = self.generate_xml(fname)
except RtfInvalidCodeException, e: except RtfInvalidCodeException, e:
raise ValueError(_('This RTF file has a feature calibre does not ' raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.\n%s')%e) 'support. Convert it to HTML first and then try it.\n%s')%e)
'''dataxml = open('dataxml.xml', 'w') dataxml = open('dataxml.xml', 'w')
dataxml.write(xml) dataxml.write(xml)
dataxml.close''' dataxml.close
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
if d: if d:

View File

@ -228,8 +228,9 @@ class RtfTokenizer():
def tokenize(self): def tokenize(self):
i = 0 i = 0
lastDataStart = -1 lastDataStart = -1
#on parse caractère par caractère
while i < len(self.rtfData): while i < len(self.rtfData):
#si ça commence un groupe
if isChar(self.rtfData[i], '{'): if isChar(self.rtfData[i], '{'):
if lastDataStart > -1: if lastDataStart > -1:
self.tokens.append(tokenData(self.rtfData[lastDataStart : i])) self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
@ -237,7 +238,7 @@ class RtfTokenizer():
self.tokens.append(tokenDelimitatorStart()) self.tokens.append(tokenDelimitatorStart())
i = i + 1 i = i + 1
continue continue
#si ça finit un groupe
if isChar(self.rtfData[i], '}'): if isChar(self.rtfData[i], '}'):
if lastDataStart > -1: if lastDataStart > -1:
self.tokens.append(tokenData(self.rtfData[lastDataStart : i])) self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
@ -245,7 +246,7 @@ class RtfTokenizer():
self.tokens.append(tokenDelimitatorEnd()) self.tokens.append(tokenDelimitatorEnd())
i = i + 1 i = i + 1
continue continue
#on copie s'il y a un caractère de contrôle
if isChar(self.rtfData[i], '\\'): if isChar(self.rtfData[i], '\\'):
if i + 1 >= len(self.rtfData): if i + 1 >= len(self.rtfData):
raise Exception('Error: Control character found at the end of the document.') raise Exception('Error: Control character found at the end of the document.')
@ -254,6 +255,7 @@ class RtfTokenizer():
self.tokens.append(tokenData(self.rtfData[lastDataStart : i])) self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
lastDataStart = -1 lastDataStart = -1
# le token commence ici
tokenStart = i tokenStart = i
i = i + 1 i = i + 1

View File

@ -32,7 +32,7 @@ class FixLineEndings:
self.__write_to = tempfile.mktemp() self.__write_to = tempfile.mktemp()
self.__replace_illegals = replace_illegals self.__replace_illegals = replace_illegals
def fix_endings(self): def fix_endings(self):
illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13') illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
# always check since I have to get rid of illegal characters # always check since I have to get rid of illegal characters
#read #read
read_obj = open(self.__file, 'r') read_obj = open(self.__file, 'r')

View File

@ -16,7 +16,10 @@
# # # #
######################################################################### #########################################################################
import os, re, tempfile import os, re, tempfile
from calibre.ebooks.rtf2xml import copy from calibre.ebooks.rtf2xml import copy
from calibre.utils.mreplace import MReplace
class Tokenize: class Tokenize:
"""Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script""" """Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
def __init__(self, def __init__(self,
@ -28,20 +31,162 @@ class Tokenize:
self.__file = in_file self.__file = in_file
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
self.__copy = copy self.__copy = copy
self.__special_tokens = [ '_', '~', "'", '{', '}' ]
self.__write_to = tempfile.mktemp() self.__write_to = tempfile.mktemp()
self.__compile_expressions()
#variables
self.__uc_char = 0
self.__uc_bin = False
self.__uc_value = [1]
def __from_ms_to_utf8(self,match_obj): def __from_ms_to_utf8(self,match_obj):
uni_char = int(match_obj.group(1)) uni_char = int(match_obj.group(1))
if uni_char < 0: if uni_char < 0:
uni_char += 65536 uni_char += 65536
return '&#x' + str('%X' % uni_char) + ';' return '&#x' + str('%X' % uni_char) + ';'
def __neg_unicode_func(self, match_obj):
neg_uni_char = int(match_obj.group(1)) * -1 def __reini_utf8_counters(self):
# sys.stderr.write(str( neg_uni_char)) self.__uc_char = 0
uni_char = neg_uni_char + 65536 self.__uc_bin = False
return '&#x' + str('%X' % uni_char) + ';'
def __sub_line_reg(self,line): def __unicode_process(self, token):
line = line.replace("\\\\", "\\backslash ") #change scope in
if token == '\{':
self.__uc_value.append(self.__uc_value[-1])
#basic error handling
self.__reini_utf8_counters()
return token
#change scope out: evaluate dict and rebuild
elif token == '\}':
#self.__uc_value.pop()
self.__reini_utf8_counters()
return token
#add a uc control
elif token[:3] == '\uc':
self.__uc_value[-1] = int(token[3:])
self.__reini_utf8_counters()
return token
#handle uc skippable char
elif self.__uc_char:
#if token[:1] == "\" and token[:1] == "\"
pass
#go for real \u token
match_obj = self.__utf_exp.match(token)
if match_obj is not None:
#get value and handle negative case
uni_char = int(match_obj.group(1))
uni_len = len(match_obj.group(1)) + 2
if uni_char < 0:
uni_char += 65536
uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
#if not uc0
if self.__uc_value[-1]:
self.__uc_char = self.__uc_value[-1]
#there is only an unicode char
if len(token)<= uni_len:
return uni_char
#an unicode char and something else
#must be after as it is splited on \
elif not self.__uc_value[-1]:
print('not only token uc0 token: ' + uni_char + token[uni_len:])
return uni_char + token[uni_len:]
#if not uc0 and chars
else:
for i in xrange(uni_len, len(token)):
if token[i] == " ":
continue
elif self.__uc_char > 0:
self.__uc_char -= 1
else:
return uni_char + token[i:]
#print('uc: ' + str(self.__uc_value) + 'uni: ' + str(uni_char) + 'token: ' + token)
#default
return token
def __sub_reg_split(self,input_file):
input_file = self.__replace_spchar.mreplace(input_file)
#input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
# this is for older RTF
#line = re.sub(self.__par_exp, '\\par ', line)
input_file = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", input_file)
#split
tokens = re.split(self.__splitexp, input_file)
#remove empty tokens and \n
return filter(lambda x: len(x) > 0 and x != '\n', tokens)
#return filter(lambda x: len(x) > 0, \
#(self.__remove_line.sub('', x) for x in tokens))
def __compile_expressions(self):
SIMPLE_RPL = {
"\\\\": "\\backslash ",
"\\~": "\\~ ",
"\\;": "\\; ",
"&": "&amp;",
"<": "&lt;",
">": "&gt;",
"\\~": "\\~ ",
"\\_": "\\_ ",
"\\:": "\\: ",
"\\-": "\\- ",
# turn into a generic token to eliminate special
# cases and make processing easier
"\\{": "\\ob ",
# turn into a generic token to eliminate special
# cases and make processing easier
"\\}": "\\cb ",
# put a backslash in front of to eliminate special cases and
# make processing easier
"{": "\\{",
# put a backslash in front of to eliminate special cases and
# make processing easier
"}": "\\}",
# this is for older RTF
r'\\$': '\\par ',
}
self.__replace_spchar = MReplace(SIMPLE_RPL)
self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}") #modify this
#self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
#add \n in split for whole file reading
#self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
#why keep backslash whereas \is replaced before?
self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
#self.__par_exp = re.compile(r'\\$')
#self.__remove_line = re.compile(r'\n+')
#self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
def tokenize(self):
"""Main class for handling other methods. Reads the file \
, uses method self.sub_reg to make basic substitutions,\
and process tokens by itself"""
#read
read_obj = open(self.__file, 'r')
input_file = read_obj.read()
read_obj.close()
#process simple replacements and split giving us a correct list
#remove '' and \n in the process
tokens = self.__sub_reg_split(input_file)
#correct unicode
#tokens = map(self.__unicode_process, tokens)
#remove empty items created by removing \uc
#tokens = filter(lambda x: len(x) > 0, tokens)
#write
write_obj = open(self.__write_to, 'wb')
write_obj.write('\n'.join(tokens))
write_obj.close()
#Move and copy
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "tokenize.data")
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)
#self.__special_tokens = [ '_', '~', "'", '{', '}' ]
'''line = line.replace("\\\\", "\\backslash ")
line = line.replace("\\~", "\\~ ") line = line.replace("\\~", "\\~ ")
line = line.replace("\\;", "\\; ") line = line.replace("\\;", "\\; ")
line = line.replace("&", "&amp;") line = line.replace("&", "&amp;")
@ -63,54 +208,37 @@ class Tokenize:
# put a backslash in front of to eliminate special cases and # put a backslash in front of to eliminate special cases and
# make processing easier # make processing easier
line = line.replace("}", "\\}") line = line.replace("}", "\\}")
line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line)
##line = line.replace("\\backslash", "\\\\")
# this is for older RTF
line = re.sub(self.__par_exp, '\\par ', line)
return line
def __compile_expressions(self):
self.__ms_hex_exp = re.compile(r"\\\'(..)")
self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}")
self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
self.__par_exp = re.compile(r'\\$')
self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
def __create_tokens(self):
self.__compile_expressions()
read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'wb')
line_to_read = "dummy" line_to_read = "dummy"
while line_to_read: while line_to_read:
line_to_read = read_obj.readline() line_to_read = read_obj.readline()
line = line_to_read line = line_to_read
line = line.replace("\n", "") line = line.replace("\n", "")
line = self.__sub_line_reg(line) '''
tokens = re.split(self.__splitexp, line) '''if token != "":
##print tokens write_obj.write(token + "\n")
for token in tokens:
if token != "": match_obj = re.search(self.__mixed_exp, token)
if match_obj != None:
first = match_obj.group(1)
second = match_obj.group(2)
write_obj.write(first + "\n")
write_obj.write(second + "\n")
else:
write_obj.write(token + "\n") write_obj.write(token + "\n")
""" '''
match_obj = re.search(self.__mixed_exp, token) '''
if match_obj != None: for line in read_obj:
first = match_obj.group(1) #make all replacements
second = match_obj.group(2) line = self.__sub_reg(line)
write_obj.write(first + "\n") #split token and remove empty tokens
write_obj.write(second + "\n") tokens = filter(lambda x: len(x) > 0,
else: re.split(self.__splitexp, line))
write_obj.write(token + "\n") if tokens:
""" write_obj.write('\n'.join(tokens)+'\n')'''
read_obj.close()
write_obj.close() '''def __neg_unicode_func(self, match_obj):
def tokenize(self): neg_uni_char = int(match_obj.group(1)) * -1
"""Main class for handling other methods. Reads in one line \ # sys.stderr.write(str( neg_uni_char))
at a time, uses method self.sub_line to make basic substitutions,\ uni_char = neg_uni_char + 65536
uses ? to process tokens""" return '&#x' + str('%X' % uni_char) + ';'''
self.__create_tokens()
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "tokenize.data")
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)

View File

@ -329,6 +329,7 @@ class FileIconProvider(QFileIconProvider):
'epub' : 'epub', 'epub' : 'epub',
'fb2' : 'fb2', 'fb2' : 'fb2',
'rtf' : 'rtf', 'rtf' : 'rtf',
'odt' : 'odt',
} }
def __init__(self): def __init__(self):