mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Global overhaul of rtf2xml : RTF fixes (2) -> first tokenize modifications (not completely working without preprocessing)
This commit is contained in:
commit
adcad1cb60
63
resources/images/mimetypes/odt.svg
Normal file
63
resources/images/mimetypes/odt.svg
Normal file
@ -0,0 +1,63 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Created with Inkscape (http://www.inkscape.org/) -->
|
||||
<svg xmlns:svg="http://www.w3.org/2000/svg" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" version="1.0" width="128" height="128" id="svg2176">
|
||||
<defs id="defs2178">
|
||||
<linearGradient x1="406.065" y1="290.50299" x2="406.065" y2="276.29501" id="linearGradient4819" xlink:href="#linearGradient7431" gradientUnits="userSpaceOnUse" gradientTransform="matrix(4.80814, 0, 0, 2.87475, -1569.44, -758.786)" spreadMethod="pad"/>
|
||||
<linearGradient x1="68.374298" y1="-410.099" x2="67.912201" y2="-478.508" id="linearGradient4817" xlink:href="#linearGradient11367" gradientUnits="userSpaceOnUse" gradientTransform="matrix(1.58223, 0, 0, -0.727268, 275.522, -213.417)" spreadMethod="pad"/>
|
||||
<linearGradient x1="436.48801" y1="-278.91299" x2="436.51199" y2="-299.88699" id="linearGradient4815" xlink:href="#linearGradient11377" gradientUnits="userSpaceOnUse" gradientTransform="matrix(4.59378, 0, 0, 0.359494, -1920.95, 434.897)" spreadMethod="pad"/>
|
||||
<linearGradient id="linearGradient11377">
|
||||
<stop id="stop11379" style="stop-color: rgb(255, 255, 255); stop-opacity: 0;" offset="0"/>
|
||||
<stop id="stop11385" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.575472;" offset="1"/>
|
||||
</linearGradient>
|
||||
<linearGradient id="linearGradient11367">
|
||||
<stop id="stop11369" style="stop-color: rgb(0, 0, 0); stop-opacity: 0;" offset="0"/>
|
||||
<stop id="stop11371" style="stop-color: rgb(0, 0, 0); stop-opacity: 0.254717;" offset="0.72131097"/>
|
||||
<stop id="stop18428" style="stop-color: rgb(0, 0, 0); stop-opacity: 0.12549;" offset="0.91000003"/>
|
||||
<stop id="stop11375" style="stop-color: rgb(0, 0, 0); stop-opacity: 0;" offset="1"/>
|
||||
</linearGradient>
|
||||
<linearGradient id="linearGradient7431">
|
||||
<stop id="stop7433" style="stop-color: rgb(255, 255, 255); stop-opacity: 0;" offset="0"/>
|
||||
<stop id="stop7439" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.858491;" offset="0.72000003"/>
|
||||
<stop id="stop8224" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.707547;" offset="0.89999998"/>
|
||||
<stop id="stop7435" style="stop-color: rgb(255, 255, 255); stop-opacity: 0.320755;" offset="1"/>
|
||||
</linearGradient>
|
||||
<filter id="filter3659">
|
||||
<feGaussianBlur inkscape:collect="always" stdDeviation="0.25192676" id="feGaussianBlur3661"/>
|
||||
</filter>
|
||||
</defs>
|
||||
<g id="layer1">
|
||||
<g transform="matrix(1.1475, 0, 0, 1.1475, -368.661, -33.5075)" id="g4500">
|
||||
<path d="M 326.964,34.4298 L 423.481,34.4298 C 437.856,66.0223 403.767,104.222 423.481,135.402 L 326.964,135.402 L 326.964,34.4298 z" id="path21978" style="fill: rgb(95, 123, 141); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
|
||||
<path d="M 326.964,34.4298 L 353.143,34.4298 C 367.518,66.0223 333.429,104.222 353.143,135.402 L 326.964,135.402 L 326.964,34.4298 z" id="path21980" style="fill: rgb(29, 70, 89); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
|
||||
<rect width="101.089" height="7.9108801" x="34.429798" y="326.96399" transform="matrix(0, 1, 1, 0, 0, 0)" id="rect21982" style="fill: url(#linearGradient4815) rgb(0, 0, 0); fill-opacity: 1; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
|
||||
<g transform="matrix(3.17412, 0, 0, 3.17412, 1038.99, -354.131)" id="g21984">
|
||||
<path d="M -218.445,122.416 C -213.917,132.369 -224.656,144.405 -218.445,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -218.445,122.416 z" id="path21986" style="fill: rgb(0, 0, 0); fill-opacity: 0.0798122; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
|
||||
<path d="M -217.955,122.416 C -213.426,132.369 -224.166,144.405 -217.955,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -217.955,122.416 z" id="path21988" style="fill: rgb(0, 0, 0); fill-opacity: 0.131455; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
|
||||
<path d="M -217.403,122.416 C -212.875,132.369 -223.614,144.405 -217.403,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -217.403,122.416 z" id="path21990" style="fill: rgb(0, 0, 0); fill-opacity: 0.197183; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
|
||||
<path d="M -216.852,122.416 C -212.323,132.369 -223.063,144.405 -216.852,154.228 L -216.32,154.228 C -222.531,144.405 -211.792,132.369 -216.32,122.416 L -216.852,122.416 z" id="path21992" style="fill: rgb(0, 0, 0); fill-opacity: 0.267606; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
|
||||
</g>
|
||||
<path d="M 326.964,135.402 L 422.488,135.402 C 412.274,118.171 416.819,101.345 421.306,83.5374 L 326.964,83.2034 L 326.964,135.402 z" id="path21994" style="fill: url(#linearGradient4817) rgb(0, 0, 0); fill-opacity: 1; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
|
||||
<g transform="matrix(2.41517, 0, 0, 2.41517, 876.456, -197.189)" id="g21996">
|
||||
<path d="M 602.125,190.59375 C 599.45874,190.67075 596.74504,191.16798 594.53125,192.78125 C 591.5146,192.34561 588.3664,192.55749 585.5,193.6875 C 583.62824,194.43267 582.15635,195.77855 580.96875,197.34375 C 580.95544,197.36301 580.94492,197.38405 580.9375,197.40625 C 580.92091,197.43509 580.91029,197.46697 580.90625,197.5 C 580.91029,197.53303 580.92091,197.56491 580.9375,197.59375 C 580.94492,197.61595 580.95544,197.63699 580.96875,197.65625 C 580.97822,197.66757 580.98868,197.67803 581,197.6875 C 581.02605,197.71524 581.05813,197.73662 581.09375,197.75 C 581.12472,197.75595 581.15653,197.75595 581.1875,197.75 C 581.20825,197.75263 581.22925,197.75263 581.25,197.75 C 584.80749,196.49944 588.39295,195.15225 592.15625,195.5 C 593.28385,195.58867 594.35616,196.00271 595.46875,196.375 C 595.50974,196.38565 595.55276,196.38565 595.59375,196.375 C 595.62678,196.37096 595.65866,196.36034 595.6875,196.34375 C 595.7097,196.33633 595.73074,196.32581 595.75,196.3125 C 598.71379,193.45164 603.00891,192.72955 606.96875,191.90625 C 606.98007,191.89678 606.99053,191.88632 607,191.875 C 607.19563,191.80037 607.32956,191.73576 607.4375,191.625 C 607.49147,191.56962 607.55414,191.50784 607.5625,191.40625 C 607.57086,191.30466 607.51945,191.21518 607.46875,191.15625 C 607.36735,191.03839 607.25573,190.98239 607.125,190.9375 C 606.99427,190.89261 606.8215,190.87546 606.65625,190.84375 C 605.99526,190.71692 605.12704,190.6454 604.8125,190.625 C 604.80209,190.62434 604.79166,190.62434 604.78125,190.625 C 603.91011,190.58739 603.02603,190.56773 602.125,190.59375 z" transform="translate(-806.724, -92.8004)" id="path21998" style="fill: rgb(0, 0, 0); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1pt; stroke-linecap: butt; stroke-linejoin: miter; stroke-opacity: 1;"/>
|
||||
<path d="M -224.344,103.835 C -219.295,101.9 -214.705,101.331 -211.263,102.86 C -208.45,100.119 -202.237,98.6242 -200.227,98.6199 C -207.528,97.8352 -210.552,99.4967 -212,100.582 C -216.698,100.015 -221.096,100.522 -224.344,103.834" id="path22000" style="fill: rgb(255, 255, 255); fill-opacity: 1; stroke: none;"/>
|
||||
</g>
|
||||
<g transform="matrix(2.41517, 0, 0, 2.41517, 876.456, -197.189)" id="g22002">
|
||||
<path d="M 596.25,27.53125 C 587.9033,27.701471 579.93436,30.011449 573.84375,35.03125 C 565.49276,31.223728 554.44432,30.751141 544.375,32.28125 C 538.97209,33.102263 533.8987,34.480363 529.6875,36.34375 C 525.4884,38.201779 521.99675,40.484175 520.1875,43.8125 C 519.57732,44.883163 519.7128,46.206199 520.46875,47.15625 C 521.22471,48.106278 522.5049,48.470375 523.65625,48.125 C 544.63433,42.131263 561.86554,43.038041 573.6875,52.875 C 574.80806,53.806374 576.46471,53.801852 577.5625,52.84375 C 587.80668,43.812696 604.05857,37.910216 621.5625,38.875 C 622.98852,38.943575 624.2874,37.997938 624.625,36.625 C 624.96264,35.252044 624.25302,33.783013 622.96875,33.1875 C 614.73544,29.461107 605.28688,27.346954 596.25,27.53125 z" transform="matrix(0.292292, -0.0677077, 0.0677077, 0.292292, -381.543, 134.276)" id="path22004" style="fill: rgb(0, 0, 0); fill-opacity: 1; fill-rule: evenodd; stroke: none; stroke-width: 1pt; stroke-linecap: butt; stroke-linejoin: miter; stroke-opacity: 1;"/>
|
||||
<path d="M -225.696,112.15 C -219.345,108.564 -214.201,107.915 -209.842,110.097 C -206.945,105.454 -199.625,102.766 -197.16,102.691 C -204.796,101.053 -210.086,104.587 -211.05,106.575 C -215.328,105.394 -224.104,108.305 -225.696,112.149" id="path22006" style="fill: rgb(255, 255, 255); fill-opacity: 1; stroke: none;"/>
|
||||
</g>
|
||||
<g transform="matrix(1.14159, 0, 0, 1.14159, 265.142, -259.674)" id="g22010">
|
||||
<path d="M 134.221,257.626 C 146.813,285.3 116.952,318.766 134.221,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 134.221,257.626 z" id="path22012" style="fill: rgb(0, 0, 0); fill-opacity: 0.0798122; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
|
||||
<path d="M 135.222,257.626 C 147.814,285.3 117.953,318.766 135.222,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 135.222,257.626 z" id="path22014" style="fill: rgb(0, 0, 0); fill-opacity: 0.131455; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
|
||||
<path d="M 136.393,257.626 C 148.985,285.3 119.124,318.766 136.393,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 136.393,257.626 z" id="path22016" style="fill: rgb(0, 0, 0); fill-opacity: 0.197183; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
|
||||
<path d="M 137.564,257.626 C 150.156,285.3 120.295,318.766 137.564,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 137.564,257.626 z" id="path22018" style="fill: rgb(0, 0, 0); fill-opacity: 0.267606; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
|
||||
<path d="M 133.134,257.626 C 145.726,285.3 115.865,318.766 133.134,346.078 L 138.68,346.078 C 121.411,318.766 151.272,285.3 138.68,257.626 L 133.134,257.626 z" id="path22020" style="fill: rgb(0, 0, 0); fill-opacity: 0.0798122; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
|
||||
</g>
|
||||
<g transform="matrix(1.14159, 0, 0, 1.14159, -389.722, -484.947)" id="g22103">
|
||||
<path d="M 653.161,498.44 L 693.751,498.44" id="path21604" style="fill: rgb(96, 123, 142); fill-opacity: 1; fill-rule: nonzero; stroke: rgb(255, 254, 255); stroke-width: 8; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-dashoffset: 0pt; stroke-opacity: 0.698039;"/>
|
||||
<path d="M 653.161,515.294 L 683.452,515.294" id="path21606" style="fill: rgb(96, 123, 142); fill-opacity: 1; fill-rule: nonzero; stroke: rgb(255, 254, 255); stroke-width: 8; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-dashoffset: 0pt; stroke-opacity: 0.698039;"/>
|
||||
<path d="M 653.161,532.46 L 693.751,532.46" id="path22101" style="fill: rgb(96, 123, 142); fill-opacity: 1; fill-rule: nonzero; stroke: rgb(255, 254, 255); stroke-width: 8; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-dashoffset: 0pt; stroke-opacity: 0.698039;"/>
|
||||
</g>
|
||||
<path d="M 326.964,34.4298 L 423.285,34.4298 C 433.146,54.4709 420.531,82.4058 417.826,102.327 L 326.964,102.327 L 326.964,34.4298 z" id="path22008" style="fill: url(#linearGradient4819) rgb(0, 0, 0); fill-opacity: 1; fill-rule: nonzero; stroke: none; stroke-width: 1.25; stroke-linecap: round; stroke-linejoin: round; stroke-miterlimit: 4; stroke-opacity: 1;"/>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 12 KiB |
99
setup.py
99
setup.py
@ -1,99 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import with_statement
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import sys, os, optparse
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
|
||||
|
||||
import setup.commands as commands
|
||||
from setup import prints, get_warnings
|
||||
|
||||
def check_version_info(vi=None):
    """Check that the interpreter is a Python 2 release of at least 2.6.

    :param vi: Version tuple ``(major, minor, ...)`` to check. Defaults to
        ``sys.version_info`` of the running interpreter.
    :return: ``None`` when the version is acceptable, otherwise a
        human-readable error message.
    """
    if vi is None:
        vi = sys.version_info
    # Only the 2.x line passes: any other major version is rejected as well.
    if vi[0] == 2 and vi[1] > 5:
        return None
    return 'calibre requires python >= 2.6'
|
||||
|
||||
def option_parser():
    """Build the parser holding the clean-up options common to all commands.

    The options are ``-c/--clean``, ``--clean-backups`` and ``--clean-all``;
    each is a boolean flag that defaults to ``False``.

    :return: A new ``optparse.OptionParser`` instance.
    """
    parser = optparse.OptionParser()
    parser.add_option('-c', '--clean', default=False, action='store_true',
            help=('Instead of running the command delete all files generated '
                'by the command'))
    parser.add_option('--clean-backups', default=False, action='store_true',
            help='Delete all backup files from the source tree')
    parser.add_option('--clean-all', default=False, action='store_true',
            help='Delete all machine generated files from the source tree')
    return parser
|
||||
|
||||
def clean_backups(top='.'):
    """Delete compiled-Python and editor backup files below a directory.

    Removes every file whose name ends with ``.pyc``, ``.pyo``, ``~``,
    ``.swp`` or ``.swo``. Directories themselves are never removed.

    :param top: Root of the tree to walk. Defaults to the current working
        directory.
    :raises OSError: If a matching file cannot be removed.
    """
    suffixes = ('.pyc', '.pyo', '~', '.swp', '.swo')
    for root, _, files in os.walk(top):
        for name in files:
            # A tuple lets endswith() test every suffix in one call, so a
            # file is removed at most once.
            if name.endswith(suffixes):
                os.remove(os.path.join(root, name))
|
||||
|
||||
|
||||
def main(args=sys.argv):
    """Run one setup command selected on the command line.

    ``args[0]`` is the program name and ``args[1]`` the command name, which
    must be listed in ``commands.__all__``.

    :param args: Full argument vector. Defaults to ``sys.argv``.
    :return: ``1`` when usage help was printed or the command is unknown,
        ``0`` otherwise.
    """
    if len(args) == 1 or args[1] in ('-h', '--help'):
        print('Usage: python %s command [options]' % args[0])
        print('\nWhere command is one of:')
        print('')
        for x in sorted(commands.__all__):
            c = getattr(commands, x)
            desc = getattr(c, 'short_description', c.description)
            print('%-20s - %s' % (x, desc))

        print('\nTo get help on a particular command, run:')
        print('\tpython %s command -h' % args[0])
        return 1

    command = args[1]
    if command not in commands.__all__:
        print('%s is not a recognized command.' % command)
        print('Valid commands: ' + ', '.join(commands.__all__))
        return 1

    command = getattr(commands, command)

    parser = option_parser()
    command.add_all_options(parser)
    parser.set_usage('Usage: python setup.py %s [options]\n\n' % args[1] +
            command.description)

    opts, args = parser.parse_args(args)

    # --clean-backups does not end the run: the command itself (or one of
    # the clean modes below) still executes afterwards.
    if opts.clean_backups:
        clean_backups()

    if opts.clean:
        prints('Cleaning', args[1])
        command.clean()
        return 0

    if opts.clean_all:
        for cmd in commands.__all__:
            prints('Cleaning', cmd)
            getattr(commands, cmd).clean()
        return 0

    command.run_all(opts)

    warnings = get_warnings()
    if warnings:
        print('')
        prints('There were', len(warnings), 'warning(s):')
        print('')
        # Separate names so the loop does not rebind ``args``.
        for warn_args, warn_kwargs in warnings:
            prints('*', *warn_args, **warn_kwargs)
            print('')

    return 0
|
||||
|
||||
if __name__ == '__main__':
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|
@ -50,7 +50,7 @@ class RTFInput(InputFormatPlugin):
|
||||
parser = ParseRtf(
|
||||
in_file = stream,
|
||||
out_file = ofile,
|
||||
#deb_dir = 'I:\\Calibre\\rtfdebug',
|
||||
deb_dir = 'I:\\Calibre\\rtfdebug',
|
||||
# Convert symbol fonts to unicode equivalents. Default
|
||||
# is 1
|
||||
convert_symbol = 1,
|
||||
@ -187,16 +187,17 @@ class RTFInput(InputFormatPlugin):
|
||||
self.log = log
|
||||
self.log('Converting RTF to XML...')
|
||||
#Name of the preprocessed RTF file
|
||||
fname = self.preprocess(stream.name)
|
||||
#fname = self.preprocess(stream.name)
|
||||
fname = stream.name
|
||||
try:
|
||||
xml = self.generate_xml(fname)
|
||||
except RtfInvalidCodeException, e:
|
||||
raise ValueError(_('This RTF file has a feature calibre does not '
|
||||
'support. Convert it to HTML first and then try it.\n%s')%e)
|
||||
|
||||
'''dataxml = open('dataxml.xml', 'w')
|
||||
dataxml = open('dataxml.xml', 'w')
|
||||
dataxml.write(xml)
|
||||
dataxml.close'''
|
||||
dataxml.close
|
||||
|
||||
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
|
||||
if d:
|
||||
|
@ -228,8 +228,9 @@ class RtfTokenizer():
|
||||
def tokenize(self):
|
||||
i = 0
|
||||
lastDataStart = -1
|
||||
#parse character by character
|
||||
while i < len(self.rtfData):
|
||||
|
||||
#if this starts a group
|
||||
if isChar(self.rtfData[i], '{'):
|
||||
if lastDataStart > -1:
|
||||
self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
|
||||
@ -237,7 +238,7 @@ class RtfTokenizer():
|
||||
self.tokens.append(tokenDelimitatorStart())
|
||||
i = i + 1
|
||||
continue
|
||||
|
||||
#if this ends a group
|
||||
if isChar(self.rtfData[i], '}'):
|
||||
if lastDataStart > -1:
|
||||
self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
|
||||
@ -245,7 +246,7 @@ class RtfTokenizer():
|
||||
self.tokens.append(tokenDelimitatorEnd())
|
||||
i = i + 1
|
||||
continue
|
||||
|
||||
#copy if there is a control character
|
||||
if isChar(self.rtfData[i], '\\'):
|
||||
if i + 1 >= len(self.rtfData):
|
||||
raise Exception('Error: Control character found at the end of the document.')
|
||||
@ -254,6 +255,7 @@ class RtfTokenizer():
|
||||
self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
|
||||
lastDataStart = -1
|
||||
|
||||
# the token starts here
|
||||
tokenStart = i
|
||||
i = i + 1
|
||||
|
||||
|
@ -32,7 +32,7 @@ class FixLineEndings:
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__replace_illegals = replace_illegals
|
||||
def fix_endings(self):
|
||||
illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
|
||||
illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
|
||||
# always check since I have to get rid of illegal characters
|
||||
#read
|
||||
read_obj = open(self.__file, 'r')
|
||||
|
@ -16,7 +16,10 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import os, re, tempfile
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.utils.mreplace import MReplace
|
||||
|
||||
class Tokenize:
|
||||
"""Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
|
||||
def __init__(self,
|
||||
@ -28,20 +31,162 @@ class Tokenize:
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__special_tokens = [ '_', '~', "'", '{', '}' ]
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__compile_expressions()
|
||||
#variables
|
||||
self.__uc_char = 0
|
||||
self.__uc_bin = False
|
||||
self.__uc_value = [1]
|
||||
|
||||
def __from_ms_to_utf8(self,match_obj):
|
||||
uni_char = int(match_obj.group(1))
|
||||
if uni_char < 0:
|
||||
uni_char += 65536
|
||||
return '&#x' + str('%X' % uni_char) + ';'
|
||||
def __neg_unicode_func(self, match_obj):
|
||||
neg_uni_char = int(match_obj.group(1)) * -1
|
||||
# sys.stderr.write(str( neg_uni_char))
|
||||
uni_char = neg_uni_char + 65536
|
||||
return '&#x' + str('%X' % uni_char) + ';'
|
||||
def __sub_line_reg(self,line):
|
||||
line = line.replace("\\\\", "\\backslash ")
|
||||
|
||||
def __reini_utf8_counters(self):
|
||||
self.__uc_char = 0
|
||||
self.__uc_bin = False
|
||||
|
||||
def __unicode_process(self, token):
|
||||
#change scope in
|
||||
if token == '\{':
|
||||
self.__uc_value.append(self.__uc_value[-1])
|
||||
#basic error handling
|
||||
self.__reini_utf8_counters()
|
||||
return token
|
||||
#change scope out: evaluate dict and rebuild
|
||||
elif token == '\}':
|
||||
#self.__uc_value.pop()
|
||||
self.__reini_utf8_counters()
|
||||
return token
|
||||
#add a uc control
|
||||
elif token[:3] == '\uc':
|
||||
self.__uc_value[-1] = int(token[3:])
|
||||
self.__reini_utf8_counters()
|
||||
return token
|
||||
#handle uc skippable char
|
||||
elif self.__uc_char:
|
||||
#if token[:1] == "\" and token[:1] == "\"
|
||||
pass
|
||||
#go for real \u token
|
||||
match_obj = self.__utf_exp.match(token)
|
||||
if match_obj is not None:
|
||||
#get value and handle negative case
|
||||
uni_char = int(match_obj.group(1))
|
||||
uni_len = len(match_obj.group(1)) + 2
|
||||
if uni_char < 0:
|
||||
uni_char += 65536
|
||||
uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
|
||||
#if not uc0
|
||||
if self.__uc_value[-1]:
|
||||
self.__uc_char = self.__uc_value[-1]
|
||||
#there is only an unicode char
|
||||
if len(token)<= uni_len:
|
||||
return uni_char
|
||||
#an unicode char and something else
|
||||
#must be after as it is splited on \
|
||||
elif not self.__uc_value[-1]:
|
||||
print('not only token uc0 token: ' + uni_char + token[uni_len:])
|
||||
return uni_char + token[uni_len:]
|
||||
#if not uc0 and chars
|
||||
else:
|
||||
for i in xrange(uni_len, len(token)):
|
||||
if token[i] == " ":
|
||||
continue
|
||||
elif self.__uc_char > 0:
|
||||
self.__uc_char -= 1
|
||||
else:
|
||||
return uni_char + token[i:]
|
||||
#print('uc: ' + str(self.__uc_value) + 'uni: ' + str(uni_char) + 'token: ' + token)
|
||||
#default
|
||||
return token
|
||||
|
||||
def __sub_reg_split(self,input_file):
|
||||
input_file = self.__replace_spchar.mreplace(input_file)
|
||||
#input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
|
||||
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
|
||||
# this is for older RTF
|
||||
#line = re.sub(self.__par_exp, '\\par ', line)
|
||||
input_file = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", input_file)
|
||||
#split
|
||||
tokens = re.split(self.__splitexp, input_file)
|
||||
#remove empty tokens and \n
|
||||
return filter(lambda x: len(x) > 0 and x != '\n', tokens)
|
||||
#return filter(lambda x: len(x) > 0, \
|
||||
#(self.__remove_line.sub('', x) for x in tokens))
|
||||
|
||||
|
||||
def __compile_expressions(self):
    """Build the replacement table and compile the regexes used for tokenizing.

    Sets ``self.__replace_spchar`` (an ``MReplace`` over the table below),
    ``self.__ms_hex_exp``, ``self.__utf_exp`` and ``self.__splitexp``.
    """
    SIMPLE_RPL = {
        "\\\\": "\\backslash ",
        "\\~": "\\~ ",
        "\\;": "\\; ",
        # Escape the XML special characters. These values had been turned
        # into no-ops ("&" -> "&") by entity decoding.
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        "\\_": "\\_ ",
        "\\:": "\\: ",
        "\\-": "\\- ",
        # turn into a generic token to eliminate special
        # cases and make processing easier
        "\\{": "\\ob ",
        "\\}": "\\cb ",
        # put a backslash in front of to eliminate special cases and
        # make processing easier
        "{": "\\{",
        "}": "\\}",
        # this is for older RTF
        # NOTE(review): this key is written as a regex (backslash at end of
        # line); confirm that MReplace treats keys as patterns and not as
        # literal text.
        r'\\$': '\\par ',
    }
    self.__replace_spchar = MReplace(SIMPLE_RPL)
    # A backslash, an apostrophe and exactly two hex digits.
    self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
    # Unicode escape: optional sign, 3 to 6 digits, at most one trailing
    # space. (Marked "modify this" by the author.)
    self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}")
    # Split on escaped braces, newlines (the file is read whole) and control
    # words with at most one trailing blank.
    # Author's open question: why keep the backslash here when it is
    # already replaced before the split?
    self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
|
||||
|
||||
def tokenize(self):
    """Tokenize the input file in place, one token per line.

    Reads the whole file, applies the basic substitutions and the split via
    ``self.__sub_reg_split``, writes the tokens joined by newlines to the
    temporary file, and then moves the result over the input file. When
    ``self.__copy`` is set, the result is also copied to ``tokenize.data``.
    """
    # read
    with open(self.__file, 'r') as read_obj:
        input_file = read_obj.read()

    # process simple replacements and split giving us a correct list
    # remove '' and \n in the process
    tokens = self.__sub_reg_split(input_file)
    # TODO: unicode correction (self.__unicode_process) is not applied yet.

    # write
    with open(self.__write_to, 'wb') as write_obj:
        write_obj.write('\n'.join(tokens))

    # Move and copy
    copy_obj = copy.Copy(bug_handler = self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "tokenize.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
|
||||
|
||||
#self.__special_tokens = [ '_', '~', "'", '{', '}' ]
|
||||
'''line = line.replace("\\\\", "\\backslash ")
|
||||
line = line.replace("\\~", "\\~ ")
|
||||
line = line.replace("\\;", "\\; ")
|
||||
line = line.replace("&", "&")
|
||||
@ -63,54 +208,37 @@ class Tokenize:
|
||||
# put a backslash in front of to eliminate special cases and
|
||||
# make processing easier
|
||||
line = line.replace("}", "\\}")
|
||||
line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
|
||||
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
|
||||
line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line)
|
||||
##line = line.replace("\\backslash", "\\\\")
|
||||
# this is for older RTF
|
||||
line = re.sub(self.__par_exp, '\\par ', line)
|
||||
return line
|
||||
def __compile_expressions(self):
|
||||
self.__ms_hex_exp = re.compile(r"\\\'(..)")
|
||||
self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}")
|
||||
self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
|
||||
self.__par_exp = re.compile(r'\\$')
|
||||
self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
|
||||
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
|
||||
def __create_tokens(self):
|
||||
self.__compile_expressions()
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__write_to, 'wb')
|
||||
|
||||
line_to_read = "dummy"
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
line = line.replace("\n", "")
|
||||
line = self.__sub_line_reg(line)
|
||||
tokens = re.split(self.__splitexp, line)
|
||||
##print tokens
|
||||
for token in tokens:
|
||||
if token != "":
|
||||
'''
|
||||
'''if token != "":
|
||||
write_obj.write(token + "\n")
|
||||
|
||||
match_obj = re.search(self.__mixed_exp, token)
|
||||
if match_obj != None:
|
||||
first = match_obj.group(1)
|
||||
second = match_obj.group(2)
|
||||
write_obj.write(first + "\n")
|
||||
write_obj.write(second + "\n")
|
||||
else:
|
||||
write_obj.write(token + "\n")
|
||||
"""
|
||||
match_obj = re.search(self.__mixed_exp, token)
|
||||
if match_obj != None:
|
||||
first = match_obj.group(1)
|
||||
second = match_obj.group(2)
|
||||
write_obj.write(first + "\n")
|
||||
write_obj.write(second + "\n")
|
||||
else:
|
||||
write_obj.write(token + "\n")
|
||||
"""
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
def tokenize(self):
|
||||
"""Main class for handling other methods. Reads in one line \
|
||||
at a time, uses method self.sub_line to make basic substitutions,\
|
||||
uses ? to process tokens"""
|
||||
self.__create_tokens()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "tokenize.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
'''
|
||||
'''
|
||||
for line in read_obj:
|
||||
#make all replacements
|
||||
line = self.__sub_reg(line)
|
||||
#split token and remove empty tokens
|
||||
tokens = filter(lambda x: len(x) > 0,
|
||||
re.split(self.__splitexp, line))
|
||||
if tokens:
|
||||
write_obj.write('\n'.join(tokens)+'\n')'''
|
||||
|
||||
'''def __neg_unicode_func(self, match_obj):
|
||||
neg_uni_char = int(match_obj.group(1)) * -1
|
||||
# sys.stderr.write(str( neg_uni_char))
|
||||
uni_char = neg_uni_char + 65536
|
||||
return '&#x' + str('%X' % uni_char) + ';'''
|
@ -329,6 +329,7 @@ class FileIconProvider(QFileIconProvider):
|
||||
'epub' : 'epub',
|
||||
'fb2' : 'fb2',
|
||||
'rtf' : 'rtf',
|
||||
'odt' : 'odt',
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
|
Loading…
x
Reference in New Issue
Block a user