Global overhaul of rtf2xml: RTFfixes (4) -> minor corrections in line endings and check brackets; move the encoding check first to eliminate non-ASCII RTF

This commit is contained in:
Sengian 2010-08-12 17:16:37 +02:00
parent 7c70914ad3
commit b9ed0c6b3d
4 changed files with 33 additions and 29 deletions

View File

@ -133,7 +133,6 @@ class ParseRtf:
self.__temp_dir = out_dir self.__temp_dir = out_dir
self.__dtd_path = dtd self.__dtd_path = dtd
self.__check_file(in_file,"file_to_parse") self.__check_file(in_file,"file_to_parse")
self.__check_ascii(in_file)
self.__char_data = char_data self.__char_data = char_data
self.__debug_dir = deb_dir self.__debug_dir = deb_dir
self.__check_dir(self.__temp_dir) self.__check_dir(self.__temp_dir)
@ -152,6 +151,7 @@ class ParseRtf:
self.__group_borders = group_borders self.__group_borders = group_borders
self.__empty_paragraphs = empty_paragraphs self.__empty_paragraphs = empty_paragraphs
self.__no_dtd = no_dtd self.__no_dtd = no_dtd
def __check_file(self, the_file, type): def __check_file(self, the_file, type):
"""Check to see if files exist""" """Check to see if files exist"""
if hasattr(the_file, 'read'): return if hasattr(the_file, 'read'): return
@ -164,6 +164,7 @@ class ParseRtf:
else: else:
msg = "\nThe file '%s' cannot be found" % the_file msg = "\nThe file '%s' cannot be found" % the_file
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
def __check_dir(self, the_dir): def __check_dir(self, the_dir):
"""Check to see if directory exists""" """Check to see if directory exists"""
if not the_dir : if not the_dir :
@ -173,15 +174,7 @@ class ParseRtf:
msg = "\n%s is not a directory" % the_dir msg = "\n%s is not a directory" % the_dir
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
return 1 return 1
def __check_ascii(self, the_file):
"""Check to see if the file is correct ascii"""
try:
test = codecs.open(the_file, 'r', 'ascii', 'strict')
test.close()
except UnicodeError:
msg = "\n%s is not a correct ascii file" % the_file
raise RtfInvalidCodeException, msg
return 1
def parse_rtf(self): def parse_rtf(self):
""" """
Parse the file by calling on other classes. Parse the file by calling on other classes.
@ -192,6 +185,18 @@ class ParseRtf:
depending on the value of 'output' when the instance was created. depending on the value of 'output' when the instance was created.
""" """
self.__temp_file = self.__make_temp_file(self.__file) self.__temp_file = self.__make_temp_file(self.__file)
#Check to see if the file is correct ascii first
check_encoding_obj = check_encoding.CheckEncoding(
bug_handler = RtfInvalidCodeException,
)
if check_encoding_obj.check_encoding(self.__file):
try:
os.remove(self.__temp_file)
except OSError:
pass
sys.stderr.write('File "%s" does not appear to be ascii.\n' \
% self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
raise InvalidRtfException
# if the self.__deb_dir is true, then create a copy object, # if the self.__deb_dir is true, then create a copy object,
# set the directory to write to, remove files, and copy # set the directory to write to, remove files, and copy
# the new temporary file to this directory # the new temporary file to this directory
@ -214,7 +219,7 @@ class ParseRtf:
in_file = self.__temp_file, in_file = self.__temp_file,
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
copy = self.__copy, copy = self.__copy,
#run_level = self.__run_level, run_level = self.__run_level,
replace_illegals = self.__replace_illegals, replace_illegals = self.__replace_illegals,
) )
line_obj.fix_endings() line_obj.fix_endings()
@ -223,8 +228,8 @@ class ParseRtf:
tokenize_obj = tokenize.Tokenize( tokenize_obj = tokenize.Tokenize(
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
in_file = self.__temp_file, in_file = self.__temp_file,
copy = self.__copy,) copy = self.__copy,
#run_level = self.__run_level,) run_level = self.__run_level)
tokenize_obj.tokenize() tokenize_obj.tokenize()
process_tokens_obj = process_tokens.ProcessTokens( process_tokens_obj = process_tokens.ProcessTokens(
in_file = self.__temp_file, in_file = self.__temp_file,
@ -240,10 +245,6 @@ class ParseRtf:
os.remove(self.__temp_file) os.remove(self.__temp_file)
except OSError: except OSError:
pass pass
check_encoding_obj = check_encoding.CheckEncoding(
bug_handler = RtfInvalidCodeException,
)
check_encoding_obj.check_encoding(self.__file)
sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8')) sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
raise InvalidRtfException, msg raise InvalidRtfException, msg
delete_info_obj = delete_info.DeleteInfo( delete_info_obj = delete_info.DeleteInfo(
@ -548,8 +549,7 @@ class ParseRtf:
"""Make a temporary file to parse""" """Make a temporary file to parse"""
write_file="rtf_write_file" write_file="rtf_write_file"
read_obj = file if hasattr(file, 'read') else open(file,'r') read_obj = file if hasattr(file, 'read') else open(file,'r')
write_obj = open(write_file, 'w') write_obj = open(write_file, 'wb')
for line in read_obj: write_obj.write(read_obj.read())
write_obj.write(line)
write_obj.close() write_obj.close()
return write_file return write_file

View File

@ -30,7 +30,6 @@ class CheckBrackets:
self.__bracket_count += 1 self.__bracket_count += 1
def close_brack(self, line): def close_brack(self, line):
num = line[-5:-1] num = line[-5:-1]
##self.__open_bracket_num.append(num)
try: try:
last_num = self.__open_bracket_num.pop() last_num = self.__open_bracket_num.pop()
except: except:

View File

@ -14,12 +14,11 @@ class CheckEncoding:
sys.stderr.write(str(msg) + '\n') sys.stderr.write(str(msg) + '\n')
def check_encoding(self, path, encoding='us-ascii'): def check_encoding(self, path, encoding='us-ascii'):
read_obj = open(path, 'r') read_obj = open(path, 'r')
line_to_read = 1 input_file = read_obj.read()
read_obj.close()
line_num = 0 line_num = 0
while line_to_read: for line in input_file:
line_num += 1 line_num += 1
line_to_read = read_obj.readline()
line = line_to_read
try: try:
line.decode(encoding) line.decode(encoding)
except UnicodeError: except UnicodeError:
@ -27,6 +26,9 @@ class CheckEncoding:
self.__get_position_error(line, encoding, line_num) self.__get_position_error(line, encoding, line_num)
else: else:
sys.stderr.write('line: %d has bad encoding\n'%line_num) sys.stderr.write('line: %d has bad encoding\n'%line_num)
return True
return False
if __name__ == '__main__': if __name__ == '__main__':
check_encoding_obj = CheckEncoding() check_encoding_obj = CheckEncoding()
check_encoding_obj.check_encoding(sys.argv[1]) check_encoding_obj.check_encoding(sys.argv[1])

View File

@ -23,7 +23,7 @@ class FixLineEndings:
bug_handler, bug_handler,
in_file = None, in_file = None,
copy = None, copy = None,
#run_level = 1, calibre why keep it? run_level = 1,
replace_illegals = 1, replace_illegals = 1,
): ):
self.__file = in_file self.__file = in_file
@ -32,8 +32,11 @@ class FixLineEndings:
self.__write_to = tempfile.mktemp() self.__write_to = tempfile.mktemp()
self.__replace_illegals = replace_illegals self.__replace_illegals = replace_illegals
def fix_endings(self): def fix_endings(self):
illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13') #remove ASCII invalid chars : 0 to 8 and 11-14 to 24
#always check since I have to get rid of illegal characters #always check since I have to get rid of illegal characters
chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
illegal_regx = re.compile(u'|'.join(map(unichr, chars)))
#illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
#read #read
read_obj = open(self.__file, 'r') read_obj = open(self.__file, 'r')
input_file = read_obj.read() input_file = read_obj.read()
@ -42,7 +45,7 @@ class FixLineEndings:
input_file = input_file.replace ('\r\n', '\n') input_file = input_file.replace ('\r\n', '\n')
input_file = input_file.replace ('\r', '\n') input_file = input_file.replace ('\r', '\n')
if self.__replace_illegals: if self.__replace_illegals:
input_file = re.sub(illegal_regx, '', input_file) input_file = illegal_regx.sub('', input_file)
#write #write
write_obj = open(self.__write_to, 'wb') write_obj = open(self.__write_to, 'wb')
write_obj.write(input_file) write_obj.write(input_file)