mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
RTF Input: Fix regression in 0.7.40 that broke conversion of some old style RTF files
This commit is contained in:
parent
4e93c30cd0
commit
f6d72fbe0b
@ -286,7 +286,6 @@ class RTFInput(InputFormatPlugin):
|
|||||||
try:
|
try:
|
||||||
xml = self.generate_xml(stream.name)
|
xml = self.generate_xml(stream.name)
|
||||||
except RtfInvalidCodeException, e:
|
except RtfInvalidCodeException, e:
|
||||||
raise
|
|
||||||
raise ValueError(_('This RTF file has a feature calibre does not '
|
raise ValueError(_('This RTF file has a feature calibre does not '
|
||||||
'support. Convert it to HTML first and then try it.\n%s')%e)
|
'support. Convert it to HTML first and then try it.\n%s')%e)
|
||||||
|
|
||||||
|
@ -226,10 +226,6 @@ class ParseRtf:
|
|||||||
try:
|
try:
|
||||||
return_value = process_tokens_obj.process_tokens()
|
return_value = process_tokens_obj.process_tokens()
|
||||||
except InvalidRtfException, msg:
|
except InvalidRtfException, msg:
|
||||||
try:
|
|
||||||
os.remove(self.__temp_file)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
#Check to see if the file is correctly encoded
|
#Check to see if the file is correctly encoded
|
||||||
encode_obj = default_encoding.DefaultEncoding(
|
encode_obj = default_encoding.DefaultEncoding(
|
||||||
in_file = self.__temp_file,
|
in_file = self.__temp_file,
|
||||||
@ -241,14 +237,17 @@ class ParseRtf:
|
|||||||
check_encoding_obj = check_encoding.CheckEncoding(
|
check_encoding_obj = check_encoding.CheckEncoding(
|
||||||
bug_handler = RtfInvalidCodeException,
|
bug_handler = RtfInvalidCodeException,
|
||||||
)
|
)
|
||||||
enc = encode_obj.get_codepage()
|
enc = 'cp' + encode_obj.get_codepage()
|
||||||
if enc != 'mac_roman':
|
msg = 'Exception in token processing'
|
||||||
enc = 'cp' + enc
|
|
||||||
if check_encoding_obj.check_encoding(self.__file, enc):
|
if check_encoding_obj.check_encoding(self.__file, enc):
|
||||||
file_name = self.__file if isinstance(self.__file, str) \
|
file_name = self.__file if isinstance(self.__file, str) \
|
||||||
else self.__file.encode('utf-8')
|
else self.__file.encode('utf-8')
|
||||||
msg = 'File %s does not appear to be correctly encoded.\n' % file_name
|
msg = 'File %s does not appear to be correctly encoded.\n' % file_name
|
||||||
raise InvalidRtfException, msg
|
try:
|
||||||
|
os.remove(self.__temp_file)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
raise InvalidRtfException, msg
|
||||||
delete_info_obj = delete_info.DeleteInfo(
|
delete_info_obj = delete_info.DeleteInfo(
|
||||||
in_file = self.__temp_file,
|
in_file = self.__temp_file,
|
||||||
copy = self.__copy,
|
copy = self.__copy,
|
||||||
|
@ -74,9 +74,6 @@ class DefaultEncoding:
|
|||||||
if not self.__datafetched:
|
if not self.__datafetched:
|
||||||
self._encoding()
|
self._encoding()
|
||||||
self.__datafetched = True
|
self.__datafetched = True
|
||||||
if self.__platform == 'Macintosh':
|
|
||||||
code_page = self.__code_page
|
|
||||||
else:
|
|
||||||
code_page = 'ansicpg' + self.__code_page
|
code_page = 'ansicpg' + self.__code_page
|
||||||
return self.__platform, code_page, self.__default_num
|
return self.__platform, code_page, self.__default_num
|
||||||
|
|
||||||
@ -94,49 +91,60 @@ class DefaultEncoding:
|
|||||||
|
|
||||||
def _encoding(self):
|
def _encoding(self):
|
||||||
with open(self.__file, 'r') as read_obj:
|
with open(self.__file, 'r') as read_obj:
|
||||||
|
cpfound = False
|
||||||
if not self.__fetchraw:
|
if not self.__fetchraw:
|
||||||
for line in read_obj:
|
for line in read_obj:
|
||||||
self.__token_info = line[:16]
|
self.__token_info = line[:16]
|
||||||
if self.__token_info == 'mi<mk<rtfhed-end':
|
if self.__token_info == 'mi<mk<rtfhed-end':
|
||||||
break
|
break
|
||||||
if self.__token_info == 'cw<ri<ansi-codpg':
|
|
||||||
#cw<ri<ansi-codpg<nu<10000
|
|
||||||
self.__code_page = line[20:-1] if int(line[20:-1]) \
|
|
||||||
else '1252'
|
|
||||||
if self.__token_info == 'cw<ri<macintosh_':
|
if self.__token_info == 'cw<ri<macintosh_':
|
||||||
self.__platform = 'Macintosh'
|
self.__platform = 'Macintosh'
|
||||||
self.__code_page = 'mac_roman'
|
|
||||||
elif self.__token_info == 'cw<ri<pc________':
|
elif self.__token_info == 'cw<ri<pc________':
|
||||||
self.__platform = 'IBMPC'
|
self.__platform = 'IBMPC'
|
||||||
self.__code_page = '437'
|
|
||||||
elif self.__token_info == 'cw<ri<pca_______':
|
elif self.__token_info == 'cw<ri<pca_______':
|
||||||
self.__platform = 'OS/2'
|
self.__platform = 'OS/2'
|
||||||
self.__code_page = '850'
|
if self.__token_info == 'cw<ri<ansi-codpg' \
|
||||||
|
and int(line[20:-1]):
|
||||||
|
self.__code_page = line[20:-1]
|
||||||
if self.__token_info == 'cw<ri<deflt-font':
|
if self.__token_info == 'cw<ri<deflt-font':
|
||||||
self.__default_num = line[20:-1]
|
self.__default_num = line[20:-1]
|
||||||
|
cpfound = True
|
||||||
#cw<ri<deflt-font<nu<0
|
#cw<ri<deflt-font<nu<0
|
||||||
|
if self.__platform != 'Windows' and \
|
||||||
|
not cpfound:
|
||||||
|
if self.__platform == 'Macintosh':
|
||||||
|
self.__code_page = '10000'
|
||||||
|
elif self.__platform == 'IBMPC':
|
||||||
|
self.__code_page = '437'
|
||||||
|
elif self.__platform == 'OS/2':
|
||||||
|
self.__code_page = '850'
|
||||||
else:
|
else:
|
||||||
fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
|
fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
|
||||||
fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
|
fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
|
||||||
|
|
||||||
for line in read_obj:
|
for line in read_obj:
|
||||||
|
if fenc.search(line):
|
||||||
|
enc = fenc.search(line).group(1)
|
||||||
if fenccp.search(line):
|
if fenccp.search(line):
|
||||||
cp = fenccp.search(line).group(1)
|
cp = fenccp.search(line).group(1)
|
||||||
if not int(cp):
|
if not int(cp):
|
||||||
self.__code_page = cp
|
self.__code_page = cp
|
||||||
|
cpfound = True
|
||||||
break
|
break
|
||||||
if fenc.search(line):
|
if self.__platform != 'Windows' and \
|
||||||
enc = fenc.search(line).group(1)
|
not cpfound:
|
||||||
if enc == 'mac':
|
if enc == 'mac':
|
||||||
self.__code_page = 'mac_roman'
|
self.__code_page = '10000'
|
||||||
elif enc == 'pc':
|
elif enc == 'pc':
|
||||||
self.__code_page = '437'
|
self.__code_page = '437'
|
||||||
elif enc == 'pca':
|
elif enc == 'pca':
|
||||||
self.__code_page = '850'
|
self.__code_page = '850'
|
||||||
|
|
||||||
# if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# encode_obj = DefaultEncoding(
|
import sys
|
||||||
# in_file = sys.argv[1],
|
encode_obj = DefaultEncoding(
|
||||||
# bug_handler = Exception,
|
in_file = sys.argv[1],
|
||||||
# check_raw = True,
|
bug_handler = Exception,
|
||||||
# )
|
check_raw = True,
|
||||||
# print encode_obj.get_codepage()
|
)
|
||||||
|
print encode_obj.get_codepage()
|
||||||
|
@ -20,7 +20,7 @@ import sys, os, tempfile
|
|||||||
from calibre.ebooks.rtf2xml import copy
|
from calibre.ebooks.rtf2xml import copy
|
||||||
|
|
||||||
class DeleteInfo:
|
class DeleteInfo:
|
||||||
"""Delelet unecessary destination groups"""
|
"""Delete unecessary destination groups"""
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
in_file ,
|
in_file ,
|
||||||
bug_handler,
|
bug_handler,
|
||||||
@ -31,17 +31,14 @@ class DeleteInfo:
|
|||||||
self.__bug_handler = bug_handler
|
self.__bug_handler = bug_handler
|
||||||
self.__copy = copy
|
self.__copy = copy
|
||||||
self.__write_to = tempfile.mktemp()
|
self.__write_to = tempfile.mktemp()
|
||||||
|
self.__run_level = run_level
|
||||||
|
self.__initiate_allow()
|
||||||
self.__bracket_count= 0
|
self.__bracket_count= 0
|
||||||
self.__ob_count = 0
|
self.__ob_count = 0
|
||||||
self.__cb_count = 0
|
self.__cb_count = 0
|
||||||
# self.__after_asterisk = False
|
|
||||||
# self.__delete = 0
|
|
||||||
self.__initiate_allow()
|
|
||||||
self.__ob = 0
|
self.__ob = 0
|
||||||
self.__write_cb = False
|
self.__write_cb = False
|
||||||
self.__run_level = run_level
|
|
||||||
self.__found_delete = False
|
self.__found_delete = False
|
||||||
# self.__list = False
|
|
||||||
|
|
||||||
def __initiate_allow(self):
|
def __initiate_allow(self):
|
||||||
"""
|
"""
|
||||||
@ -57,6 +54,8 @@ class DeleteInfo:
|
|||||||
'cw<an<annotation',
|
'cw<an<annotation',
|
||||||
'cw<cm<comment___',
|
'cw<cm<comment___',
|
||||||
'cw<it<lovr-table',
|
'cw<it<lovr-table',
|
||||||
|
# info table
|
||||||
|
'cw<di<company___',
|
||||||
# 'cw<ls<list______',
|
# 'cw<ls<list______',
|
||||||
)
|
)
|
||||||
self.__not_allowable = (
|
self.__not_allowable = (
|
||||||
@ -116,7 +115,6 @@ class DeleteInfo:
|
|||||||
"""
|
"""
|
||||||
# Test for {\*}, in which case don't enter
|
# Test for {\*}, in which case don't enter
|
||||||
# delete state
|
# delete state
|
||||||
# self.__after_asterisk = False # only enter this function once
|
|
||||||
self.__found_delete = True
|
self.__found_delete = True
|
||||||
if self.__token_info == 'cb<nu<clos-brack':
|
if self.__token_info == 'cb<nu<clos-brack':
|
||||||
if self.__delete_count == self.__cb_count:
|
if self.__delete_count == self.__cb_count:
|
||||||
@ -128,7 +126,7 @@ class DeleteInfo:
|
|||||||
# not sure what happens here!
|
# not sure what happens here!
|
||||||
# believe I have a '{\*}
|
# believe I have a '{\*}
|
||||||
if self.__run_level > 3:
|
if self.__run_level > 3:
|
||||||
msg = 'flag problem\n'
|
msg = 'Flag problem\n'
|
||||||
raise self.__bug_handler, msg
|
raise self.__bug_handler, msg
|
||||||
return True
|
return True
|
||||||
elif self.__token_info in self.__allowable :
|
elif self.__token_info in self.__allowable :
|
||||||
@ -173,8 +171,8 @@ class DeleteInfo:
|
|||||||
Return True for all control words.
|
Return True for all control words.
|
||||||
Return False otherwise.
|
Return False otherwise.
|
||||||
"""
|
"""
|
||||||
if self.__delete_count == self.__cb_count and self.__token_info ==\
|
if self.__delete_count == self.__cb_count and \
|
||||||
'cb<nu<clos-brack':
|
self.__token_info == 'cb<nu<clos-brack':
|
||||||
self.__state = 'default'
|
self.__state = 'default'
|
||||||
if self.__write_cb:
|
if self.__write_cb:
|
||||||
self.__write_cb = False
|
self.__write_cb = False
|
||||||
@ -186,32 +184,24 @@ class DeleteInfo:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def delete_info(self):
|
def delete_info(self):
|
||||||
"""Main method for handling other methods. Read one line in at
|
"""Main method for handling other methods. Read one line at
|
||||||
a time, and determine whether to print the line based on the state."""
|
a time, and determine whether to print the line based on the state."""
|
||||||
with open(self.__file, 'r') as read_obj:
|
with open(self.__file, 'r') as read_obj:
|
||||||
with open(self.__write_to, 'w') as self.__write_obj:
|
with open(self.__write_to, 'w') as self.__write_obj:
|
||||||
for line in read_obj:
|
for line in read_obj:
|
||||||
#ob<nu<open-brack<0001
|
#ob<nu<open-brack<0001
|
||||||
to_print = True
|
|
||||||
self.__token_info = line[:16]
|
self.__token_info = line[:16]
|
||||||
if self.__token_info == 'ob<nu<open-brack':
|
if self.__token_info == 'ob<nu<open-brack':
|
||||||
self.__ob_count = line[-5:-1]
|
self.__ob_count = line[-5:-1]
|
||||||
if self.__token_info == 'cb<nu<clos-brack':
|
if self.__token_info == 'cb<nu<clos-brack':
|
||||||
self.__cb_count = line[-5:-1]
|
self.__cb_count = line[-5:-1]
|
||||||
|
# Get action to perform
|
||||||
action = self.__state_dict.get(self.__state)
|
action = self.__state_dict.get(self.__state)
|
||||||
if not action:
|
if not action:
|
||||||
sys.stderr.write(_('No action in dictionary state is "%s" \n')
|
sys.stderr.write('No action in dictionary state is "%s" \n'
|
||||||
% self.__state)
|
% self.__state)
|
||||||
to_print = action(line)
|
# Print if allowed by action
|
||||||
# if self.__after_asterisk:
|
if action(line):
|
||||||
# to_print = self.__asterisk_func(line)
|
|
||||||
# elif self.__list:
|
|
||||||
# self.__in_list_func(line)
|
|
||||||
# elif self.__delete:
|
|
||||||
# to_print = self.__delete_func(line)
|
|
||||||
# else:
|
|
||||||
# to_print = self.__default_func(line)
|
|
||||||
if to_print:
|
|
||||||
self.__write_obj.write(line)
|
self.__write_obj.write(line)
|
||||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||||
if self.__copy:
|
if self.__copy:
|
||||||
|
@ -15,8 +15,10 @@
|
|||||||
# #
|
# #
|
||||||
# #
|
# #
|
||||||
#########################################################################
|
#########################################################################
|
||||||
import sys, os, tempfile
|
import sys, os, tempfile, re
|
||||||
|
|
||||||
from calibre.ebooks.rtf2xml import copy
|
from calibre.ebooks.rtf2xml import copy
|
||||||
|
|
||||||
class Info:
|
class Info:
|
||||||
"""
|
"""
|
||||||
Make tags for document-information
|
Make tags for document-information
|
||||||
@ -42,12 +44,14 @@ class Info:
|
|||||||
self.__copy = copy
|
self.__copy = copy
|
||||||
self.__run_level = run_level
|
self.__run_level = run_level
|
||||||
self.__write_to = tempfile.mktemp()
|
self.__write_to = tempfile.mktemp()
|
||||||
|
|
||||||
def __initiate_values(self):
|
def __initiate_values(self):
|
||||||
"""
|
"""
|
||||||
Initiate all values.
|
Initiate all values.
|
||||||
"""
|
"""
|
||||||
self.__text_string = ''
|
self.__text_string = ''
|
||||||
self.__state = 'before_info_table'
|
self.__state = 'before_info_table'
|
||||||
|
self.rmspace = re.compile(r'\s+')
|
||||||
self.__state_dict = {
|
self.__state_dict = {
|
||||||
'before_info_table': self.__before_info_table_func,
|
'before_info_table': self.__before_info_table_func,
|
||||||
'after_info_table': self.__after_info_table_func,
|
'after_info_table': self.__after_info_table_func,
|
||||||
@ -58,27 +62,49 @@ class Info:
|
|||||||
self.__info_table_dict = {
|
self.__info_table_dict = {
|
||||||
'cw<di<title_____' : (self.__found_tag_with_text_func, 'title'),
|
'cw<di<title_____' : (self.__found_tag_with_text_func, 'title'),
|
||||||
'cw<di<author____' : (self.__found_tag_with_text_func, 'author'),
|
'cw<di<author____' : (self.__found_tag_with_text_func, 'author'),
|
||||||
|
'cw<di<operator__' : (self.__found_tag_with_text_func, 'operator'),
|
||||||
|
'cw<di<manager___' : (self.__found_tag_with_text_func, 'manager'),
|
||||||
|
'cw<di<company___' : (self.__found_tag_with_text_func, 'company'),
|
||||||
'cw<di<keywords__' : (self.__found_tag_with_text_func, 'keywords'),
|
'cw<di<keywords__' : (self.__found_tag_with_text_func, 'keywords'),
|
||||||
|
'cw<di<category__' : (self.__found_tag_with_text_func, 'category'),
|
||||||
'cw<di<doc-notes_' : (self.__found_tag_with_text_func, 'doc-notes'),
|
'cw<di<doc-notes_' : (self.__found_tag_with_text_func, 'doc-notes'),
|
||||||
'cw<di<subject___' : (self.__found_tag_with_text_func, 'subject'),
|
'cw<di<subject___' : (self.__found_tag_with_text_func, 'subject'),
|
||||||
'cw<di<operator__' : (self.__found_tag_with_text_func, 'operator'),
|
'cw<di<linkbase__' : (self.__found_tag_with_text_func, 'hyperlink-base'),
|
||||||
|
|
||||||
'cw<di<create-tim' : (self.__found_tag_with_tokens_func, 'creation-time'),
|
'cw<di<create-tim' : (self.__found_tag_with_tokens_func, 'creation-time'),
|
||||||
'cw<di<revis-time' : (self.__found_tag_with_tokens_func, 'revision-time'),
|
'cw<di<revis-time' : (self.__found_tag_with_tokens_func, 'revision-time'),
|
||||||
'cw<di<edit-time_' : (self.__single_field_func, 'editing-time'),
|
'cw<di<edit-time_' : (self.__found_tag_with_tokens_func, 'editing-time'),
|
||||||
|
'cw<di<print-time' : (self.__found_tag_with_tokens_func, 'printing-time'),
|
||||||
|
'cw<di<backuptime' : (self.__found_tag_with_tokens_func, 'backup-time'),
|
||||||
|
|
||||||
'cw<di<num-of-wor' : (self.__single_field_func, 'number-of-words'),
|
'cw<di<num-of-wor' : (self.__single_field_func, 'number-of-words'),
|
||||||
'cw<di<num-of-chr' : (self.__single_field_func, 'number-of-characters'),
|
'cw<di<num-of-chr' : (self.__single_field_func, 'number-of-characters'),
|
||||||
|
'cw<di<numofchrws' : (self.__single_field_func, 'number-of-characters-without-space'),
|
||||||
'cw<di<num-of-pag' : (self.__single_field_func, 'number-of-pages'),
|
'cw<di<num-of-pag' : (self.__single_field_func, 'number-of-pages'),
|
||||||
|
'cw<di<version___' : (self.__single_field_func, 'version'),
|
||||||
|
'cw<di<intern-ver' : (self.__single_field_func, 'internal-version-number'),
|
||||||
|
'cw<di<internalID' : (self.__single_field_func, 'internal-id-number'),
|
||||||
}
|
}
|
||||||
self.__token_dict = {
|
self.__token_dict = {
|
||||||
'year______' : 'year',
|
'year______' : 'year',
|
||||||
'month_____' : 'month',
|
'month_____' : 'month',
|
||||||
'day_______' : 'day',
|
'day_______' : 'day',
|
||||||
'minute____' : 'minute',
|
'minute____' : 'minute',
|
||||||
|
'second____' : 'second',
|
||||||
'revis-time' : 'revision-time',
|
'revis-time' : 'revision-time',
|
||||||
|
'create-tim' : 'creation-time',
|
||||||
|
'edit-time_' : 'editing-time',
|
||||||
|
'print-time' : 'printing-time',
|
||||||
|
'backuptime' : 'backup-time',
|
||||||
'num-of-wor' : 'number-of-words',
|
'num-of-wor' : 'number-of-words',
|
||||||
'num-of-chr' : 'number-of-characters',
|
'num-of-chr' : 'number-of-characters',
|
||||||
|
'numofchrws' : 'number-of-characters-without-space',
|
||||||
'num-of-pag' : 'number-of-pages',
|
'num-of-pag' : 'number-of-pages',
|
||||||
|
'version___' : 'version',
|
||||||
|
'intern-ver' : 'internal-version-number',
|
||||||
|
'internalID' : 'internal-id-number',
|
||||||
}
|
}
|
||||||
|
|
||||||
def __before_info_table_func(self, line):
|
def __before_info_table_func(self, line):
|
||||||
"""
|
"""
|
||||||
Required:
|
Required:
|
||||||
@ -92,6 +118,7 @@ class Info:
|
|||||||
if self.__token_info == 'mi<mk<doc-in-beg':
|
if self.__token_info == 'mi<mk<doc-in-beg':
|
||||||
self.__state = 'in_info_table'
|
self.__state = 'in_info_table'
|
||||||
self.__write_obj.write(line)
|
self.__write_obj.write(line)
|
||||||
|
|
||||||
def __in_info_table_func(self, line):
|
def __in_info_table_func(self, line):
|
||||||
"""
|
"""
|
||||||
Requires:
|
Requires:
|
||||||
@ -112,6 +139,7 @@ class Info:
|
|||||||
action(line, tag)
|
action(line, tag)
|
||||||
else:
|
else:
|
||||||
self.__write_obj.write(line)
|
self.__write_obj.write(line)
|
||||||
|
|
||||||
def __found_tag_with_text_func(self, line, tag):
|
def __found_tag_with_text_func(self, line, tag):
|
||||||
"""
|
"""
|
||||||
Requires:
|
Requires:
|
||||||
@ -126,6 +154,7 @@ class Info:
|
|||||||
"""
|
"""
|
||||||
self.__tag = tag
|
self.__tag = tag
|
||||||
self.__state = 'collect_text'
|
self.__state = 'collect_text'
|
||||||
|
|
||||||
def __collect_text_func(self, line):
|
def __collect_text_func(self, line):
|
||||||
"""
|
"""
|
||||||
Requires:
|
Requires:
|
||||||
@ -139,14 +168,17 @@ class Info:
|
|||||||
"""
|
"""
|
||||||
if self.__token_info == 'mi<mk<docinf-end':
|
if self.__token_info == 'mi<mk<docinf-end':
|
||||||
self.__state = 'in_info_table'
|
self.__state = 'in_info_table'
|
||||||
self.__write_obj.write(
|
#Don't print empty tags
|
||||||
'mi<tg<open______<%s\n'
|
if len(self.rmspace.sub('',self.__text_string)):
|
||||||
'tx<nu<__________<%s\n'
|
self.__write_obj.write(
|
||||||
'mi<tg<close_____<%s\n' % (self.__tag, self.__text_string, self.__tag)
|
'mi<tg<open______<%s\n'
|
||||||
)
|
'tx<nu<__________<%s\n'
|
||||||
|
'mi<tg<close_____<%s\n' % (self.__tag, self.__text_string, self.__tag)
|
||||||
|
)
|
||||||
self.__text_string = ''
|
self.__text_string = ''
|
||||||
elif line[0:2] == 'tx':
|
elif line[0:2] == 'tx':
|
||||||
self.__text_string += line[17:-1]
|
self.__text_string += line[17:-1]
|
||||||
|
|
||||||
def __found_tag_with_tokens_func(self, line, tag):
|
def __found_tag_with_tokens_func(self, line, tag):
|
||||||
"""
|
"""
|
||||||
Requires:
|
Requires:
|
||||||
@ -163,6 +195,7 @@ class Info:
|
|||||||
self.__state = 'collect_tokens'
|
self.__state = 'collect_tokens'
|
||||||
self.__text_string = 'mi<tg<empty-att_<%s' % tag
|
self.__text_string = 'mi<tg<empty-att_<%s' % tag
|
||||||
#mi<tg<empty-att_<page-definition<margin>33\n
|
#mi<tg<empty-att_<page-definition<margin>33\n
|
||||||
|
|
||||||
def __collect_tokens_func(self, line):
|
def __collect_tokens_func(self, line):
|
||||||
"""
|
"""
|
||||||
Requires:
|
Requires:
|
||||||
@ -194,18 +227,19 @@ class Info:
|
|||||||
att = line[6:16]
|
att = line[6:16]
|
||||||
value = line[20:-1]
|
value = line[20:-1]
|
||||||
att_changed = self.__token_dict.get(att)
|
att_changed = self.__token_dict.get(att)
|
||||||
if att_changed == None:
|
if att_changed is None:
|
||||||
if self.__run_level > 3:
|
if self.__run_level > 3:
|
||||||
msg = 'no dictionary match for %s\n' % att
|
msg = 'No dictionary match for %s\n' % att
|
||||||
raise self.__bug_handler, msg
|
raise self.__bug_handler, msg
|
||||||
else:
|
else:
|
||||||
self.__text_string += '<%s>%s' % (att_changed, value)
|
self.__text_string += '<%s>%s' % (att_changed, value)
|
||||||
|
|
||||||
def __single_field_func(self, line, tag):
|
def __single_field_func(self, line, tag):
|
||||||
value = line[20:-1]
|
value = line[20:-1]
|
||||||
self.__write_obj.write(
|
self.__write_obj.write(
|
||||||
'mi<tg<empty-att_<%s'
|
'mi<tg<empty-att_<%s<%s>%s\n' % (tag, tag, value)
|
||||||
'<%s>%s\n' % (tag, tag, value)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def __after_info_table_func(self, line):
|
def __after_info_table_func(self, line):
|
||||||
"""
|
"""
|
||||||
Requires:
|
Requires:
|
||||||
@ -217,6 +251,7 @@ class Info:
|
|||||||
the file.
|
the file.
|
||||||
"""
|
"""
|
||||||
self.__write_obj.write(line)
|
self.__write_obj.write(line)
|
||||||
|
|
||||||
def fix_info(self):
|
def fix_info(self):
|
||||||
"""
|
"""
|
||||||
Requires:
|
Requires:
|
||||||
@ -234,20 +269,15 @@ class Info:
|
|||||||
information table, simply write the line to the output file.
|
information table, simply write the line to the output file.
|
||||||
"""
|
"""
|
||||||
self.__initiate_values()
|
self.__initiate_values()
|
||||||
read_obj = open(self.__file, 'r')
|
with open(self.__file, 'r') as read_obj:
|
||||||
self.__write_obj = open(self.__write_to, 'w')
|
with open(self.__write_to, 'wb') as self.__write_obj:
|
||||||
line_to_read = 1
|
for line in read_obj:
|
||||||
while line_to_read:
|
self.__token_info = line[:16]
|
||||||
line_to_read = read_obj.readline()
|
action = self.__state_dict.get(self.__state)
|
||||||
line = line_to_read
|
if action is None:
|
||||||
self.__token_info = line[:16]
|
sys.stderr.write('No matching state in module styles.py\n')
|
||||||
action = self.__state_dict.get(self.__state)
|
sys.stderr.write(self.__state + '\n')
|
||||||
if action == None:
|
action(line)
|
||||||
sys.stderr.write('no no matching state in module styles.py\n')
|
|
||||||
sys.stderr.write(self.__state + '\n')
|
|
||||||
action(line)
|
|
||||||
read_obj.close()
|
|
||||||
self.__write_obj.close()
|
|
||||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||||
if self.__copy:
|
if self.__copy:
|
||||||
copy_obj.copy_file(self.__write_to, "info.data")
|
copy_obj.copy_file(self.__write_to, "info.data")
|
||||||
|
@ -70,7 +70,7 @@ class ProcessTokens:
|
|||||||
';' : ('mc', ';', self.ms_sub_func),
|
';' : ('mc', ';', self.ms_sub_func),
|
||||||
# this must be wrong
|
# this must be wrong
|
||||||
'-' : ('mc', '-', self.ms_sub_func),
|
'-' : ('mc', '-', self.ms_sub_func),
|
||||||
'line' : ('mi', 'hardline-break', self.hardline_func), #calibre
|
'line' : ('mi', 'hardline-break', self.direct_conv_func), #calibre
|
||||||
# misc => ml
|
# misc => ml
|
||||||
'*' : ('ml', 'asterisk__', self.default_func),
|
'*' : ('ml', 'asterisk__', self.default_func),
|
||||||
':' : ('ml', 'colon_____', self.default_func),
|
':' : ('ml', 'colon_____', self.default_func),
|
||||||
@ -78,7 +78,6 @@ class ProcessTokens:
|
|||||||
'backslash' : ('nu', '\\', self.text_func),
|
'backslash' : ('nu', '\\', self.text_func),
|
||||||
'ob' : ('nu', '{', self.text_func),
|
'ob' : ('nu', '{', self.text_func),
|
||||||
'cb' : ('nu', '}', self.text_func),
|
'cb' : ('nu', '}', self.text_func),
|
||||||
#'line' : ('nu', ' ', self.text_func), calibre
|
|
||||||
# paragraph formatting => pf
|
# paragraph formatting => pf
|
||||||
'page' : ('pf', 'page-break', self.default_func),
|
'page' : ('pf', 'page-break', self.default_func),
|
||||||
'par' : ('pf', 'par-end___', self.default_func),
|
'par' : ('pf', 'par-end___', self.default_func),
|
||||||
@ -231,11 +230,15 @@ class ProcessTokens:
|
|||||||
'trhdr' : ('tb', 'row-header', self.default_func),
|
'trhdr' : ('tb', 'row-header', self.default_func),
|
||||||
# preamble => pr
|
# preamble => pr
|
||||||
# document information => di
|
# document information => di
|
||||||
|
# TODO integrate \userprops
|
||||||
'info' : ('di', 'doc-info__', self.default_func),
|
'info' : ('di', 'doc-info__', self.default_func),
|
||||||
|
'title' : ('di', 'title_____', self.default_func),
|
||||||
'author' : ('di', 'author____', self.default_func),
|
'author' : ('di', 'author____', self.default_func),
|
||||||
'operator' : ('di', 'operator__', self.default_func),
|
'operator' : ('di', 'operator__', self.default_func),
|
||||||
'title' : ('di', 'title_____', self.default_func),
|
'manager' : ('di', 'manager___', self.default_func),
|
||||||
|
'company' : ('di', 'company___', self.default_func),
|
||||||
'keywords' : ('di', 'keywords__', self.default_func),
|
'keywords' : ('di', 'keywords__', self.default_func),
|
||||||
|
'category' : ('di', 'category__', self.default_func),
|
||||||
'doccomm' : ('di', 'doc-notes_', self.default_func),
|
'doccomm' : ('di', 'doc-notes_', self.default_func),
|
||||||
'comment' : ('di', 'doc-notes_', self.default_func),
|
'comment' : ('di', 'doc-notes_', self.default_func),
|
||||||
'subject' : ('di', 'subject___', self.default_func),
|
'subject' : ('di', 'subject___', self.default_func),
|
||||||
@ -244,11 +247,19 @@ class ProcessTokens:
|
|||||||
'mo' : ('di', 'month_____', self.default_func),
|
'mo' : ('di', 'month_____', self.default_func),
|
||||||
'dy' : ('di', 'day_______', self.default_func),
|
'dy' : ('di', 'day_______', self.default_func),
|
||||||
'min' : ('di', 'minute____', self.default_func),
|
'min' : ('di', 'minute____', self.default_func),
|
||||||
|
'sec' : ('di', 'second____', self.default_func),
|
||||||
'revtim' : ('di', 'revis-time', self.default_func),
|
'revtim' : ('di', 'revis-time', self.default_func),
|
||||||
|
'edmins' : ('di', 'edit-time_', self.default_func),
|
||||||
|
'printim' : ('di', 'print-time', self.default_func),
|
||||||
|
'buptim' : ('di', 'backuptime', self.default_func),
|
||||||
'nofwords' : ('di', 'num-of-wor', self.default_func),
|
'nofwords' : ('di', 'num-of-wor', self.default_func),
|
||||||
'nofchars' : ('di', 'num-of-chr', self.default_func),
|
'nofchars' : ('di', 'num-of-chr', self.default_func),
|
||||||
|
'nofcharsws' : ('di', 'numofchrws', self.default_func),
|
||||||
'nofpages' : ('di', 'num-of-pag', self.default_func),
|
'nofpages' : ('di', 'num-of-pag', self.default_func),
|
||||||
'edmins' : ('di', 'edit-time_', self.default_func),
|
'version' : ('di', 'version___', self.default_func),
|
||||||
|
'vern' : ('di', 'intern-ver', self.default_func),
|
||||||
|
'hlinkbase' : ('di', 'linkbase__', self.default_func),
|
||||||
|
'id' : ('di', 'internalID', self.default_func),
|
||||||
# headers and footers => hf
|
# headers and footers => hf
|
||||||
'headerf' : ('hf', 'head-first', self.default_func),
|
'headerf' : ('hf', 'head-first', self.default_func),
|
||||||
'headerl' : ('hf', 'head-left_', self.default_func),
|
'headerl' : ('hf', 'head-left_', self.default_func),
|
||||||
@ -605,7 +616,7 @@ class ProcessTokens:
|
|||||||
def ms_sub_func(self, pre, token, num):
|
def ms_sub_func(self, pre, token, num):
|
||||||
return 'tx<mc<__________<%s\n' % token
|
return 'tx<mc<__________<%s\n' % token
|
||||||
|
|
||||||
def hardline_func(self, pre, token, num):
|
def direct_conv_func(self, pre, token, num):
|
||||||
return 'mi<tg<empty_____<%s\n' % token
|
return 'mi<tg<empty_____<%s\n' % token
|
||||||
|
|
||||||
def default_func(self, pre, token, num):
|
def default_func(self, pre, token, num):
|
||||||
|
@ -27,11 +27,13 @@ class Tokenize:
|
|||||||
bug_handler,
|
bug_handler,
|
||||||
copy = None,
|
copy = None,
|
||||||
run_level = 1,
|
run_level = 1,
|
||||||
):
|
# out_file = None,
|
||||||
|
):
|
||||||
self.__file = in_file
|
self.__file = in_file
|
||||||
self.__bug_handler = bug_handler
|
self.__bug_handler = bug_handler
|
||||||
self.__copy = copy
|
self.__copy = copy
|
||||||
self.__write_to = tempfile.mktemp()
|
self.__write_to = tempfile.mktemp()
|
||||||
|
# self.__out_file = out_file
|
||||||
self.__compile_expressions()
|
self.__compile_expressions()
|
||||||
#variables
|
#variables
|
||||||
self.__uc_char = 0
|
self.__uc_char = 0
|
||||||
@ -113,6 +115,8 @@ class Tokenize:
|
|||||||
|
|
||||||
def __sub_reg_split(self,input_file):
|
def __sub_reg_split(self,input_file):
|
||||||
input_file = self.__replace_spchar.mreplace(input_file)
|
input_file = self.__replace_spchar.mreplace(input_file)
|
||||||
|
# this is for older RTF
|
||||||
|
input_file = self.__par_exp.sub('\n\\par \n', input_file)
|
||||||
input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
|
input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
|
||||||
input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
|
input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
|
||||||
#remove \n in bin data
|
#remove \n in bin data
|
||||||
@ -127,7 +131,7 @@ class Tokenize:
|
|||||||
# this is for older RTF
|
# this is for older RTF
|
||||||
#line = re.sub(self.__par_exp, '\\par ', line)
|
#line = re.sub(self.__par_exp, '\\par ', line)
|
||||||
#return filter(lambda x: len(x) > 0, \
|
#return filter(lambda x: len(x) > 0, \
|
||||||
#(self.__remove_line.sub('', x) for x in tokens))
|
#(self.__remove_line.sub('', x) for x in tokens))
|
||||||
|
|
||||||
def __compile_expressions(self):
|
def __compile_expressions(self):
|
||||||
SIMPLE_RPL = {
|
SIMPLE_RPL = {
|
||||||
@ -153,8 +157,6 @@ class Tokenize:
|
|||||||
# put a backslash in front of to eliminate special cases and
|
# put a backslash in front of to eliminate special cases and
|
||||||
# make processing easier
|
# make processing easier
|
||||||
"}": "\\}",
|
"}": "\\}",
|
||||||
# this is for older RTF
|
|
||||||
r'\\$': '\\par ',
|
|
||||||
}
|
}
|
||||||
self.__replace_spchar = MReplace(SIMPLE_RPL)
|
self.__replace_spchar = MReplace(SIMPLE_RPL)
|
||||||
#add ;? in case of char following \u
|
#add ;? in case of char following \u
|
||||||
@ -168,10 +170,12 @@ class Tokenize:
|
|||||||
#why keep backslash whereas \is replaced before?
|
#why keep backslash whereas \is replaced before?
|
||||||
#remove \n from endline char
|
#remove \n from endline char
|
||||||
self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
|
self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
|
||||||
|
#this is for old RTF
|
||||||
|
self.__par_exp = re.compile(r'\\\n+')
|
||||||
|
# self.__par_exp = re.compile(r'\\$')
|
||||||
#self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
|
#self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
|
||||||
#self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
|
#self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
|
||||||
#self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
|
#self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
|
||||||
#self.__par_exp = re.compile(r'\\$')
|
|
||||||
#self.__remove_line = re.compile(r'\n+')
|
#self.__remove_line = re.compile(r'\n+')
|
||||||
#self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
|
#self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
|
||||||
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
|
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
|
||||||
@ -199,7 +203,24 @@ class Tokenize:
|
|||||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||||
if self.__copy:
|
if self.__copy:
|
||||||
copy_obj.copy_file(self.__write_to, "tokenize.data")
|
copy_obj.copy_file(self.__write_to, "tokenize.data")
|
||||||
|
# if self.__out_file:
|
||||||
|
# self.__file = self.__out_file
|
||||||
copy_obj.rename(self.__write_to, self.__file)
|
copy_obj.rename(self.__write_to, self.__file)
|
||||||
os.remove(self.__write_to)
|
os.remove(self.__write_to)
|
||||||
|
|
||||||
#self.__special_tokens = [ '_', '~', "'", '{', '}' ]
|
#self.__special_tokens = [ '_', '~', "'", '{', '}' ]
|
||||||
|
|
||||||
|
# import sys
|
||||||
|
# def main(args=sys.argv):
|
||||||
|
# if len(args) < 1:
|
||||||
|
# print 'No file'
|
||||||
|
# return
|
||||||
|
# file = 'data_tokens.txt'
|
||||||
|
# if len(args) == 3:
|
||||||
|
# file = args[2]
|
||||||
|
# to = Tokenize(args[1], Exception, out_file = file)
|
||||||
|
# to.tokenize()
|
||||||
|
|
||||||
|
|
||||||
|
# if __name__ == '__main__':
|
||||||
|
# sys.exit(main())
|
Loading…
x
Reference in New Issue
Block a user