RTF Input: Fix regression in 0.7.40 that broke conversion of some old style RTF files

This commit is contained in:
Kovid Goyal 2011-01-15 15:46:19 -07:00
parent 4e93c30cd0
commit f6d72fbe0b
7 changed files with 153 additions and 95 deletions

View File

@@ -286,7 +286,6 @@ class RTFInput(InputFormatPlugin):
try: try:
xml = self.generate_xml(stream.name) xml = self.generate_xml(stream.name)
except RtfInvalidCodeException, e: except RtfInvalidCodeException, e:
raise
raise ValueError(_('This RTF file has a feature calibre does not ' raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.\n%s')%e) 'support. Convert it to HTML first and then try it.\n%s')%e)

View File

@@ -226,10 +226,6 @@ class ParseRtf:
try: try:
return_value = process_tokens_obj.process_tokens() return_value = process_tokens_obj.process_tokens()
except InvalidRtfException, msg: except InvalidRtfException, msg:
try:
os.remove(self.__temp_file)
except OSError:
pass
#Check to see if the file is correctly encoded #Check to see if the file is correctly encoded
encode_obj = default_encoding.DefaultEncoding( encode_obj = default_encoding.DefaultEncoding(
in_file = self.__temp_file, in_file = self.__temp_file,
@@ -241,13 +237,16 @@ class ParseRtf:
check_encoding_obj = check_encoding.CheckEncoding( check_encoding_obj = check_encoding.CheckEncoding(
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
) )
enc = encode_obj.get_codepage() enc = 'cp' + encode_obj.get_codepage()
if enc != 'mac_roman': msg = 'Exception in token processing'
enc = 'cp' + enc
if check_encoding_obj.check_encoding(self.__file, enc): if check_encoding_obj.check_encoding(self.__file, enc):
file_name = self.__file if isinstance(self.__file, str) \ file_name = self.__file if isinstance(self.__file, str) \
else self.__file.encode('utf-8') else self.__file.encode('utf-8')
msg = 'File %s does not appear to be correctly encoded.\n' % file_name msg = 'File %s does not appear to be correctly encoded.\n' % file_name
try:
os.remove(self.__temp_file)
except OSError:
pass
raise InvalidRtfException, msg raise InvalidRtfException, msg
delete_info_obj = delete_info.DeleteInfo( delete_info_obj = delete_info.DeleteInfo(
in_file = self.__temp_file, in_file = self.__temp_file,

View File

@@ -74,9 +74,6 @@ class DefaultEncoding:
if not self.__datafetched: if not self.__datafetched:
self._encoding() self._encoding()
self.__datafetched = True self.__datafetched = True
if self.__platform == 'Macintosh':
code_page = self.__code_page
else:
code_page = 'ansicpg' + self.__code_page code_page = 'ansicpg' + self.__code_page
return self.__platform, code_page, self.__default_num return self.__platform, code_page, self.__default_num
@@ -94,49 +91,60 @@ class DefaultEncoding:
def _encoding(self): def _encoding(self):
with open(self.__file, 'r') as read_obj: with open(self.__file, 'r') as read_obj:
cpfound = False
if not self.__fetchraw: if not self.__fetchraw:
for line in read_obj: for line in read_obj:
self.__token_info = line[:16] self.__token_info = line[:16]
if self.__token_info == 'mi<mk<rtfhed-end': if self.__token_info == 'mi<mk<rtfhed-end':
break break
if self.__token_info == 'cw<ri<ansi-codpg':
#cw<ri<ansi-codpg<nu<10000
self.__code_page = line[20:-1] if int(line[20:-1]) \
else '1252'
if self.__token_info == 'cw<ri<macintosh_': if self.__token_info == 'cw<ri<macintosh_':
self.__platform = 'Macintosh' self.__platform = 'Macintosh'
self.__code_page = 'mac_roman'
elif self.__token_info == 'cw<ri<pc________': elif self.__token_info == 'cw<ri<pc________':
self.__platform = 'IBMPC' self.__platform = 'IBMPC'
self.__code_page = '437'
elif self.__token_info == 'cw<ri<pca_______': elif self.__token_info == 'cw<ri<pca_______':
self.__platform = 'OS/2' self.__platform = 'OS/2'
self.__code_page = '850' if self.__token_info == 'cw<ri<ansi-codpg' \
and int(line[20:-1]):
self.__code_page = line[20:-1]
if self.__token_info == 'cw<ri<deflt-font': if self.__token_info == 'cw<ri<deflt-font':
self.__default_num = line[20:-1] self.__default_num = line[20:-1]
cpfound = True
#cw<ri<deflt-font<nu<0 #cw<ri<deflt-font<nu<0
if self.__platform != 'Windows' and \
not cpfound:
if self.__platform == 'Macintosh':
self.__code_page = '10000'
elif self.__platform == 'IBMPC':
self.__code_page = '437'
elif self.__platform == 'OS/2':
self.__code_page = '850'
else: else:
fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+') fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+') fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
for line in read_obj: for line in read_obj:
if fenc.search(line):
enc = fenc.search(line).group(1)
if fenccp.search(line): if fenccp.search(line):
cp = fenccp.search(line).group(1) cp = fenccp.search(line).group(1)
if not int(cp): if not int(cp):
self.__code_page = cp self.__code_page = cp
cpfound = True
break break
if fenc.search(line): if self.__platform != 'Windows' and \
enc = fenc.search(line).group(1) not cpfound:
if enc == 'mac': if enc == 'mac':
self.__code_page = 'mac_roman' self.__code_page = '10000'
elif enc == 'pc': elif enc == 'pc':
self.__code_page = '437' self.__code_page = '437'
elif enc == 'pca': elif enc == 'pca':
self.__code_page = '850' self.__code_page = '850'
# if __name__ == '__main__': if __name__ == '__main__':
# encode_obj = DefaultEncoding( import sys
# in_file = sys.argv[1], encode_obj = DefaultEncoding(
# bug_handler = Exception, in_file = sys.argv[1],
# check_raw = True, bug_handler = Exception,
# ) check_raw = True,
# print encode_obj.get_codepage() )
print encode_obj.get_codepage()

View File

@@ -20,7 +20,7 @@ import sys, os, tempfile
from calibre.ebooks.rtf2xml import copy from calibre.ebooks.rtf2xml import copy
class DeleteInfo: class DeleteInfo:
"""Delelet unecessary destination groups""" """Delete unecessary destination groups"""
def __init__(self, def __init__(self,
in_file , in_file ,
bug_handler, bug_handler,
@@ -31,17 +31,14 @@ class DeleteInfo:
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
self.__copy = copy self.__copy = copy
self.__write_to = tempfile.mktemp() self.__write_to = tempfile.mktemp()
self.__run_level = run_level
self.__initiate_allow()
self.__bracket_count= 0 self.__bracket_count= 0
self.__ob_count = 0 self.__ob_count = 0
self.__cb_count = 0 self.__cb_count = 0
# self.__after_asterisk = False
# self.__delete = 0
self.__initiate_allow()
self.__ob = 0 self.__ob = 0
self.__write_cb = False self.__write_cb = False
self.__run_level = run_level
self.__found_delete = False self.__found_delete = False
# self.__list = False
def __initiate_allow(self): def __initiate_allow(self):
""" """
@@ -57,6 +54,8 @@ class DeleteInfo:
'cw<an<annotation', 'cw<an<annotation',
'cw<cm<comment___', 'cw<cm<comment___',
'cw<it<lovr-table', 'cw<it<lovr-table',
# info table
'cw<di<company___',
# 'cw<ls<list______', # 'cw<ls<list______',
) )
self.__not_allowable = ( self.__not_allowable = (
@@ -116,7 +115,6 @@ class DeleteInfo:
""" """
# Test for {\*}, in which case don't enter # Test for {\*}, in which case don't enter
# delete state # delete state
# self.__after_asterisk = False # only enter this function once
self.__found_delete = True self.__found_delete = True
if self.__token_info == 'cb<nu<clos-brack': if self.__token_info == 'cb<nu<clos-brack':
if self.__delete_count == self.__cb_count: if self.__delete_count == self.__cb_count:
@@ -128,7 +126,7 @@ class DeleteInfo:
# not sure what happens here! # not sure what happens here!
# believe I have a '{\*} # believe I have a '{\*}
if self.__run_level > 3: if self.__run_level > 3:
msg = 'flag problem\n' msg = 'Flag problem\n'
raise self.__bug_handler, msg raise self.__bug_handler, msg
return True return True
elif self.__token_info in self.__allowable : elif self.__token_info in self.__allowable :
@@ -173,8 +171,8 @@ class DeleteInfo:
Return True for all control words. Return True for all control words.
Return False otherwise. Return False otherwise.
""" """
if self.__delete_count == self.__cb_count and self.__token_info ==\ if self.__delete_count == self.__cb_count and \
'cb<nu<clos-brack': self.__token_info == 'cb<nu<clos-brack':
self.__state = 'default' self.__state = 'default'
if self.__write_cb: if self.__write_cb:
self.__write_cb = False self.__write_cb = False
@@ -186,32 +184,24 @@ class DeleteInfo:
return False return False
def delete_info(self): def delete_info(self):
"""Main method for handling other methods. Read one line in at """Main method for handling other methods. Read one line at
a time, and determine whether to print the line based on the state.""" a time, and determine whether to print the line based on the state."""
with open(self.__file, 'r') as read_obj: with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as self.__write_obj: with open(self.__write_to, 'w') as self.__write_obj:
for line in read_obj: for line in read_obj:
#ob<nu<open-brack<0001 #ob<nu<open-brack<0001
to_print = True
self.__token_info = line[:16] self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack': if self.__token_info == 'ob<nu<open-brack':
self.__ob_count = line[-5:-1] self.__ob_count = line[-5:-1]
if self.__token_info == 'cb<nu<clos-brack': if self.__token_info == 'cb<nu<clos-brack':
self.__cb_count = line[-5:-1] self.__cb_count = line[-5:-1]
# Get action to perform
action = self.__state_dict.get(self.__state) action = self.__state_dict.get(self.__state)
if not action: if not action:
sys.stderr.write(_('No action in dictionary state is "%s" \n') sys.stderr.write('No action in dictionary state is "%s" \n'
% self.__state) % self.__state)
to_print = action(line) # Print if allowed by action
# if self.__after_asterisk: if action(line):
# to_print = self.__asterisk_func(line)
# elif self.__list:
# self.__in_list_func(line)
# elif self.__delete:
# to_print = self.__delete_func(line)
# else:
# to_print = self.__default_func(line)
if to_print:
self.__write_obj.write(line) self.__write_obj.write(line)
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:

View File

@@ -15,8 +15,10 @@
# # # #
# # # #
######################################################################### #########################################################################
import sys, os, tempfile import sys, os, tempfile, re
from calibre.ebooks.rtf2xml import copy from calibre.ebooks.rtf2xml import copy
class Info: class Info:
""" """
Make tags for document-information Make tags for document-information
@@ -42,12 +44,14 @@ class Info:
self.__copy = copy self.__copy = copy
self.__run_level = run_level self.__run_level = run_level
self.__write_to = tempfile.mktemp() self.__write_to = tempfile.mktemp()
def __initiate_values(self): def __initiate_values(self):
""" """
Initiate all values. Initiate all values.
""" """
self.__text_string = '' self.__text_string = ''
self.__state = 'before_info_table' self.__state = 'before_info_table'
self.rmspace = re.compile(r'\s+')
self.__state_dict = { self.__state_dict = {
'before_info_table': self.__before_info_table_func, 'before_info_table': self.__before_info_table_func,
'after_info_table': self.__after_info_table_func, 'after_info_table': self.__after_info_table_func,
@@ -58,27 +62,49 @@ class Info:
self.__info_table_dict = { self.__info_table_dict = {
'cw<di<title_____' : (self.__found_tag_with_text_func, 'title'), 'cw<di<title_____' : (self.__found_tag_with_text_func, 'title'),
'cw<di<author____' : (self.__found_tag_with_text_func, 'author'), 'cw<di<author____' : (self.__found_tag_with_text_func, 'author'),
'cw<di<operator__' : (self.__found_tag_with_text_func, 'operator'),
'cw<di<manager___' : (self.__found_tag_with_text_func, 'manager'),
'cw<di<company___' : (self.__found_tag_with_text_func, 'company'),
'cw<di<keywords__' : (self.__found_tag_with_text_func, 'keywords'), 'cw<di<keywords__' : (self.__found_tag_with_text_func, 'keywords'),
'cw<di<category__' : (self.__found_tag_with_text_func, 'category'),
'cw<di<doc-notes_' : (self.__found_tag_with_text_func, 'doc-notes'), 'cw<di<doc-notes_' : (self.__found_tag_with_text_func, 'doc-notes'),
'cw<di<subject___' : (self.__found_tag_with_text_func, 'subject'), 'cw<di<subject___' : (self.__found_tag_with_text_func, 'subject'),
'cw<di<operator__' : (self.__found_tag_with_text_func, 'operator'), 'cw<di<linkbase__' : (self.__found_tag_with_text_func, 'hyperlink-base'),
'cw<di<create-tim' : (self.__found_tag_with_tokens_func, 'creation-time'), 'cw<di<create-tim' : (self.__found_tag_with_tokens_func, 'creation-time'),
'cw<di<revis-time' : (self.__found_tag_with_tokens_func, 'revision-time'), 'cw<di<revis-time' : (self.__found_tag_with_tokens_func, 'revision-time'),
'cw<di<edit-time_' : (self.__single_field_func, 'editing-time'), 'cw<di<edit-time_' : (self.__found_tag_with_tokens_func, 'editing-time'),
'cw<di<print-time' : (self.__found_tag_with_tokens_func, 'printing-time'),
'cw<di<backuptime' : (self.__found_tag_with_tokens_func, 'backup-time'),
'cw<di<num-of-wor' : (self.__single_field_func, 'number-of-words'), 'cw<di<num-of-wor' : (self.__single_field_func, 'number-of-words'),
'cw<di<num-of-chr' : (self.__single_field_func, 'number-of-characters'), 'cw<di<num-of-chr' : (self.__single_field_func, 'number-of-characters'),
'cw<di<numofchrws' : (self.__single_field_func, 'number-of-characters-without-space'),
'cw<di<num-of-pag' : (self.__single_field_func, 'number-of-pages'), 'cw<di<num-of-pag' : (self.__single_field_func, 'number-of-pages'),
'cw<di<version___' : (self.__single_field_func, 'version'),
'cw<di<intern-ver' : (self.__single_field_func, 'internal-version-number'),
'cw<di<internalID' : (self.__single_field_func, 'internal-id-number'),
} }
self.__token_dict = { self.__token_dict = {
'year______' : 'year', 'year______' : 'year',
'month_____' : 'month', 'month_____' : 'month',
'day_______' : 'day', 'day_______' : 'day',
'minute____' : 'minute', 'minute____' : 'minute',
'second____' : 'second',
'revis-time' : 'revision-time', 'revis-time' : 'revision-time',
'create-tim' : 'creation-time',
'edit-time_' : 'editing-time',
'print-time' : 'printing-time',
'backuptime' : 'backup-time',
'num-of-wor' : 'number-of-words', 'num-of-wor' : 'number-of-words',
'num-of-chr' : 'number-of-characters', 'num-of-chr' : 'number-of-characters',
'numofchrws' : 'number-of-characters-without-space',
'num-of-pag' : 'number-of-pages', 'num-of-pag' : 'number-of-pages',
'version___' : 'version',
'intern-ver' : 'internal-version-number',
'internalID' : 'internal-id-number',
} }
def __before_info_table_func(self, line): def __before_info_table_func(self, line):
""" """
Required: Required:
@@ -92,6 +118,7 @@ class Info:
if self.__token_info == 'mi<mk<doc-in-beg': if self.__token_info == 'mi<mk<doc-in-beg':
self.__state = 'in_info_table' self.__state = 'in_info_table'
self.__write_obj.write(line) self.__write_obj.write(line)
def __in_info_table_func(self, line): def __in_info_table_func(self, line):
""" """
Requires: Requires:
@@ -112,6 +139,7 @@ class Info:
action(line, tag) action(line, tag)
else: else:
self.__write_obj.write(line) self.__write_obj.write(line)
def __found_tag_with_text_func(self, line, tag): def __found_tag_with_text_func(self, line, tag):
""" """
Requires: Requires:
@@ -126,6 +154,7 @@ class Info:
""" """
self.__tag = tag self.__tag = tag
self.__state = 'collect_text' self.__state = 'collect_text'
def __collect_text_func(self, line): def __collect_text_func(self, line):
""" """
Requires: Requires:
@@ -139,6 +168,8 @@ class Info:
""" """
if self.__token_info == 'mi<mk<docinf-end': if self.__token_info == 'mi<mk<docinf-end':
self.__state = 'in_info_table' self.__state = 'in_info_table'
#Don't print empty tags
if len(self.rmspace.sub('',self.__text_string)):
self.__write_obj.write( self.__write_obj.write(
'mi<tg<open______<%s\n' 'mi<tg<open______<%s\n'
'tx<nu<__________<%s\n' 'tx<nu<__________<%s\n'
@@ -147,6 +178,7 @@ class Info:
self.__text_string = '' self.__text_string = ''
elif line[0:2] == 'tx': elif line[0:2] == 'tx':
self.__text_string += line[17:-1] self.__text_string += line[17:-1]
def __found_tag_with_tokens_func(self, line, tag): def __found_tag_with_tokens_func(self, line, tag):
""" """
Requires: Requires:
@@ -163,6 +195,7 @@ class Info:
self.__state = 'collect_tokens' self.__state = 'collect_tokens'
self.__text_string = 'mi<tg<empty-att_<%s' % tag self.__text_string = 'mi<tg<empty-att_<%s' % tag
#mi<tg<empty-att_<page-definition<margin>33\n #mi<tg<empty-att_<page-definition<margin>33\n
def __collect_tokens_func(self, line): def __collect_tokens_func(self, line):
""" """
Requires: Requires:
@@ -194,18 +227,19 @@ class Info:
att = line[6:16] att = line[6:16]
value = line[20:-1] value = line[20:-1]
att_changed = self.__token_dict.get(att) att_changed = self.__token_dict.get(att)
if att_changed == None: if att_changed is None:
if self.__run_level > 3: if self.__run_level > 3:
msg = 'no dictionary match for %s\n' % att msg = 'No dictionary match for %s\n' % att
raise self.__bug_handler, msg raise self.__bug_handler, msg
else: else:
self.__text_string += '<%s>%s' % (att_changed, value) self.__text_string += '<%s>%s' % (att_changed, value)
def __single_field_func(self, line, tag): def __single_field_func(self, line, tag):
value = line[20:-1] value = line[20:-1]
self.__write_obj.write( self.__write_obj.write(
'mi<tg<empty-att_<%s' 'mi<tg<empty-att_<%s<%s>%s\n' % (tag, tag, value)
'<%s>%s\n' % (tag, tag, value)
) )
def __after_info_table_func(self, line): def __after_info_table_func(self, line):
""" """
Requires: Requires:
@@ -217,6 +251,7 @@ class Info:
the file. the file.
""" """
self.__write_obj.write(line) self.__write_obj.write(line)
def fix_info(self): def fix_info(self):
""" """
Requires: Requires:
@@ -234,20 +269,15 @@ class Info:
information table, simply write the line to the output file. information table, simply write the line to the output file.
""" """
self.__initiate_values() self.__initiate_values()
read_obj = open(self.__file, 'r') with open(self.__file, 'r') as read_obj:
self.__write_obj = open(self.__write_to, 'w') with open(self.__write_to, 'wb') as self.__write_obj:
line_to_read = 1 for line in read_obj:
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
self.__token_info = line[:16] self.__token_info = line[:16]
action = self.__state_dict.get(self.__state) action = self.__state_dict.get(self.__state)
if action == None: if action is None:
sys.stderr.write('no no matching state in module styles.py\n') sys.stderr.write('No matching state in module styles.py\n')
sys.stderr.write(self.__state + '\n') sys.stderr.write(self.__state + '\n')
action(line) action(line)
read_obj.close()
self.__write_obj.close()
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "info.data") copy_obj.copy_file(self.__write_to, "info.data")

View File

@@ -70,7 +70,7 @@ class ProcessTokens:
';' : ('mc', ';', self.ms_sub_func), ';' : ('mc', ';', self.ms_sub_func),
# this must be wrong # this must be wrong
'-' : ('mc', '-', self.ms_sub_func), '-' : ('mc', '-', self.ms_sub_func),
'line' : ('mi', 'hardline-break', self.hardline_func), #calibre 'line' : ('mi', 'hardline-break', self.direct_conv_func), #calibre
# misc => ml # misc => ml
'*' : ('ml', 'asterisk__', self.default_func), '*' : ('ml', 'asterisk__', self.default_func),
':' : ('ml', 'colon_____', self.default_func), ':' : ('ml', 'colon_____', self.default_func),
@@ -78,7 +78,6 @@ class ProcessTokens:
'backslash' : ('nu', '\\', self.text_func), 'backslash' : ('nu', '\\', self.text_func),
'ob' : ('nu', '{', self.text_func), 'ob' : ('nu', '{', self.text_func),
'cb' : ('nu', '}', self.text_func), 'cb' : ('nu', '}', self.text_func),
#'line' : ('nu', ' ', self.text_func), calibre
# paragraph formatting => pf # paragraph formatting => pf
'page' : ('pf', 'page-break', self.default_func), 'page' : ('pf', 'page-break', self.default_func),
'par' : ('pf', 'par-end___', self.default_func), 'par' : ('pf', 'par-end___', self.default_func),
@@ -231,11 +230,15 @@ class ProcessTokens:
'trhdr' : ('tb', 'row-header', self.default_func), 'trhdr' : ('tb', 'row-header', self.default_func),
# preamble => pr # preamble => pr
# document information => di # document information => di
# TODO integrate \userprops
'info' : ('di', 'doc-info__', self.default_func), 'info' : ('di', 'doc-info__', self.default_func),
'title' : ('di', 'title_____', self.default_func),
'author' : ('di', 'author____', self.default_func), 'author' : ('di', 'author____', self.default_func),
'operator' : ('di', 'operator__', self.default_func), 'operator' : ('di', 'operator__', self.default_func),
'title' : ('di', 'title_____', self.default_func), 'manager' : ('di', 'manager___', self.default_func),
'company' : ('di', 'company___', self.default_func),
'keywords' : ('di', 'keywords__', self.default_func), 'keywords' : ('di', 'keywords__', self.default_func),
'category' : ('di', 'category__', self.default_func),
'doccomm' : ('di', 'doc-notes_', self.default_func), 'doccomm' : ('di', 'doc-notes_', self.default_func),
'comment' : ('di', 'doc-notes_', self.default_func), 'comment' : ('di', 'doc-notes_', self.default_func),
'subject' : ('di', 'subject___', self.default_func), 'subject' : ('di', 'subject___', self.default_func),
@@ -244,11 +247,19 @@ class ProcessTokens:
'mo' : ('di', 'month_____', self.default_func), 'mo' : ('di', 'month_____', self.default_func),
'dy' : ('di', 'day_______', self.default_func), 'dy' : ('di', 'day_______', self.default_func),
'min' : ('di', 'minute____', self.default_func), 'min' : ('di', 'minute____', self.default_func),
'sec' : ('di', 'second____', self.default_func),
'revtim' : ('di', 'revis-time', self.default_func), 'revtim' : ('di', 'revis-time', self.default_func),
'edmins' : ('di', 'edit-time_', self.default_func),
'printim' : ('di', 'print-time', self.default_func),
'buptim' : ('di', 'backuptime', self.default_func),
'nofwords' : ('di', 'num-of-wor', self.default_func), 'nofwords' : ('di', 'num-of-wor', self.default_func),
'nofchars' : ('di', 'num-of-chr', self.default_func), 'nofchars' : ('di', 'num-of-chr', self.default_func),
'nofcharsws' : ('di', 'numofchrws', self.default_func),
'nofpages' : ('di', 'num-of-pag', self.default_func), 'nofpages' : ('di', 'num-of-pag', self.default_func),
'edmins' : ('di', 'edit-time_', self.default_func), 'version' : ('di', 'version___', self.default_func),
'vern' : ('di', 'intern-ver', self.default_func),
'hlinkbase' : ('di', 'linkbase__', self.default_func),
'id' : ('di', 'internalID', self.default_func),
# headers and footers => hf # headers and footers => hf
'headerf' : ('hf', 'head-first', self.default_func), 'headerf' : ('hf', 'head-first', self.default_func),
'headerl' : ('hf', 'head-left_', self.default_func), 'headerl' : ('hf', 'head-left_', self.default_func),
@@ -605,7 +616,7 @@ class ProcessTokens:
def ms_sub_func(self, pre, token, num): def ms_sub_func(self, pre, token, num):
return 'tx<mc<__________<%s\n' % token return 'tx<mc<__________<%s\n' % token
def hardline_func(self, pre, token, num): def direct_conv_func(self, pre, token, num):
return 'mi<tg<empty_____<%s\n' % token return 'mi<tg<empty_____<%s\n' % token
def default_func(self, pre, token, num): def default_func(self, pre, token, num):

View File

@@ -27,11 +27,13 @@ class Tokenize:
bug_handler, bug_handler,
copy = None, copy = None,
run_level = 1, run_level = 1,
# out_file = None,
): ):
self.__file = in_file self.__file = in_file
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
self.__copy = copy self.__copy = copy
self.__write_to = tempfile.mktemp() self.__write_to = tempfile.mktemp()
# self.__out_file = out_file
self.__compile_expressions() self.__compile_expressions()
#variables #variables
self.__uc_char = 0 self.__uc_char = 0
@@ -113,6 +115,8 @@ class Tokenize:
def __sub_reg_split(self,input_file): def __sub_reg_split(self,input_file):
input_file = self.__replace_spchar.mreplace(input_file) input_file = self.__replace_spchar.mreplace(input_file)
# this is for older RTF
input_file = self.__par_exp.sub('\n\\par \n', input_file)
input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file) input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file) input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
#remove \n in bin data #remove \n in bin data
@@ -153,8 +157,6 @@ class Tokenize:
# put a backslash in front of to eliminate special cases and # put a backslash in front of to eliminate special cases and
# make processing easier # make processing easier
"}": "\\}", "}": "\\}",
# this is for older RTF
r'\\$': '\\par ',
} }
self.__replace_spchar = MReplace(SIMPLE_RPL) self.__replace_spchar = MReplace(SIMPLE_RPL)
#add ;? in case of char following \u #add ;? in case of char following \u
@@ -168,10 +170,12 @@ class Tokenize:
#why keep backslash whereas \is replaced before? #why keep backslash whereas \is replaced before?
#remove \n from endline char #remove \n from endline char
self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)") self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
#this is for old RTF
self.__par_exp = re.compile(r'\\\n+')
# self.__par_exp = re.compile(r'\\$')
#self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}") #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
#self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})") #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
#self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)") #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
#self.__par_exp = re.compile(r'\\$')
#self.__remove_line = re.compile(r'\n+') #self.__remove_line = re.compile(r'\n+')
#self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)") #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)") ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
@@ -199,7 +203,24 @@ class Tokenize:
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "tokenize.data") copy_obj.copy_file(self.__write_to, "tokenize.data")
# if self.__out_file:
# self.__file = self.__out_file
copy_obj.rename(self.__write_to, self.__file) copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to) os.remove(self.__write_to)
#self.__special_tokens = [ '_', '~', "'", '{', '}' ] #self.__special_tokens = [ '_', '~', "'", '{', '}' ]
# import sys
# def main(args=sys.argv):
# if len(args) < 1:
# print 'No file'
# return
# file = 'data_tokens.txt'
# if len(args) == 3:
# file = args[2]
# to = Tokenize(args[1], Exception, out_file = file)
# to.tokenize()
# if __name__ == '__main__':
# sys.exit(main())