mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Global overhaul of rtf2xml: RTFfixes (4) -> minor corrections in line endings and check brackets; move the encoding check first to eliminate non-ASCII RTF
This commit is contained in:
parent
7c70914ad3
commit
b9ed0c6b3d
@ -133,7 +133,6 @@ class ParseRtf:
|
|||||||
self.__temp_dir = out_dir
|
self.__temp_dir = out_dir
|
||||||
self.__dtd_path = dtd
|
self.__dtd_path = dtd
|
||||||
self.__check_file(in_file,"file_to_parse")
|
self.__check_file(in_file,"file_to_parse")
|
||||||
self.__check_ascii(in_file)
|
|
||||||
self.__char_data = char_data
|
self.__char_data = char_data
|
||||||
self.__debug_dir = deb_dir
|
self.__debug_dir = deb_dir
|
||||||
self.__check_dir(self.__temp_dir)
|
self.__check_dir(self.__temp_dir)
|
||||||
@ -152,6 +151,7 @@ class ParseRtf:
|
|||||||
self.__group_borders = group_borders
|
self.__group_borders = group_borders
|
||||||
self.__empty_paragraphs = empty_paragraphs
|
self.__empty_paragraphs = empty_paragraphs
|
||||||
self.__no_dtd = no_dtd
|
self.__no_dtd = no_dtd
|
||||||
|
|
||||||
def __check_file(self, the_file, type):
|
def __check_file(self, the_file, type):
|
||||||
"""Check to see if files exist"""
|
"""Check to see if files exist"""
|
||||||
if hasattr(the_file, 'read'): return
|
if hasattr(the_file, 'read'): return
|
||||||
@ -164,6 +164,7 @@ class ParseRtf:
|
|||||||
else:
|
else:
|
||||||
msg = "\nThe file '%s' cannot be found" % the_file
|
msg = "\nThe file '%s' cannot be found" % the_file
|
||||||
raise RtfInvalidCodeException, msg
|
raise RtfInvalidCodeException, msg
|
||||||
|
|
||||||
def __check_dir(self, the_dir):
|
def __check_dir(self, the_dir):
|
||||||
"""Check to see if directory exists"""
|
"""Check to see if directory exists"""
|
||||||
if not the_dir :
|
if not the_dir :
|
||||||
@ -173,15 +174,7 @@ class ParseRtf:
|
|||||||
msg = "\n%s is not a directory" % the_dir
|
msg = "\n%s is not a directory" % the_dir
|
||||||
raise RtfInvalidCodeException, msg
|
raise RtfInvalidCodeException, msg
|
||||||
return 1
|
return 1
|
||||||
def __check_ascii(self, the_file):
|
|
||||||
"""Check to see if the file is correct ascii"""
|
|
||||||
try:
|
|
||||||
test = codecs.open(the_file, 'r', 'ascii', 'strict')
|
|
||||||
test.close()
|
|
||||||
except UnicodeError:
|
|
||||||
msg = "\n%s is not a correct ascii file" % the_file
|
|
||||||
raise RtfInvalidCodeException, msg
|
|
||||||
return 1
|
|
||||||
def parse_rtf(self):
|
def parse_rtf(self):
|
||||||
"""
|
"""
|
||||||
Parse the file by calling on other classes.
|
Parse the file by calling on other classes.
|
||||||
@ -192,6 +185,18 @@ class ParseRtf:
|
|||||||
depending on the value of 'output' when the instance was created.
|
depending on the value of 'output' when the instance was created.
|
||||||
"""
|
"""
|
||||||
self.__temp_file = self.__make_temp_file(self.__file)
|
self.__temp_file = self.__make_temp_file(self.__file)
|
||||||
|
#Check to see if the file is correct ascii first
|
||||||
|
check_encoding_obj = check_encoding.CheckEncoding(
|
||||||
|
bug_handler = RtfInvalidCodeException,
|
||||||
|
)
|
||||||
|
if check_encoding_obj.check_encoding(self.__file):
|
||||||
|
try:
|
||||||
|
os.remove(self.__temp_file)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
sys.stderr.write('File "%s" does not appear to be ascii.\n' \
|
||||||
|
% self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
|
||||||
|
raise InvalidRtfException
|
||||||
# if the self.__deb_dir is true, then create a copy object,
|
# if the self.__deb_dir is true, then create a copy object,
|
||||||
# set the directory to write to, remove files, and copy
|
# set the directory to write to, remove files, and copy
|
||||||
# the new temporary file to this directory
|
# the new temporary file to this directory
|
||||||
@ -214,7 +219,7 @@ class ParseRtf:
|
|||||||
in_file = self.__temp_file,
|
in_file = self.__temp_file,
|
||||||
bug_handler = RtfInvalidCodeException,
|
bug_handler = RtfInvalidCodeException,
|
||||||
copy = self.__copy,
|
copy = self.__copy,
|
||||||
#run_level = self.__run_level,
|
run_level = self.__run_level,
|
||||||
replace_illegals = self.__replace_illegals,
|
replace_illegals = self.__replace_illegals,
|
||||||
)
|
)
|
||||||
line_obj.fix_endings()
|
line_obj.fix_endings()
|
||||||
@ -223,8 +228,8 @@ class ParseRtf:
|
|||||||
tokenize_obj = tokenize.Tokenize(
|
tokenize_obj = tokenize.Tokenize(
|
||||||
bug_handler = RtfInvalidCodeException,
|
bug_handler = RtfInvalidCodeException,
|
||||||
in_file = self.__temp_file,
|
in_file = self.__temp_file,
|
||||||
copy = self.__copy,)
|
copy = self.__copy,
|
||||||
#run_level = self.__run_level,)
|
run_level = self.__run_level)
|
||||||
tokenize_obj.tokenize()
|
tokenize_obj.tokenize()
|
||||||
process_tokens_obj = process_tokens.ProcessTokens(
|
process_tokens_obj = process_tokens.ProcessTokens(
|
||||||
in_file = self.__temp_file,
|
in_file = self.__temp_file,
|
||||||
@ -240,10 +245,6 @@ class ParseRtf:
|
|||||||
os.remove(self.__temp_file)
|
os.remove(self.__temp_file)
|
||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
check_encoding_obj = check_encoding.CheckEncoding(
|
|
||||||
bug_handler = RtfInvalidCodeException,
|
|
||||||
)
|
|
||||||
check_encoding_obj.check_encoding(self.__file)
|
|
||||||
sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
|
sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
|
||||||
raise InvalidRtfException, msg
|
raise InvalidRtfException, msg
|
||||||
delete_info_obj = delete_info.DeleteInfo(
|
delete_info_obj = delete_info.DeleteInfo(
|
||||||
@ -548,8 +549,7 @@ class ParseRtf:
|
|||||||
"""Make a temporary file to parse"""
|
"""Make a temporary file to parse"""
|
||||||
write_file="rtf_write_file"
|
write_file="rtf_write_file"
|
||||||
read_obj = file if hasattr(file, 'read') else open(file,'r')
|
read_obj = file if hasattr(file, 'read') else open(file,'r')
|
||||||
write_obj = open(write_file, 'w')
|
write_obj = open(write_file, 'wb')
|
||||||
for line in read_obj:
|
write_obj.write(read_obj.read())
|
||||||
write_obj.write(line)
|
|
||||||
write_obj.close()
|
write_obj.close()
|
||||||
return write_file
|
return write_file
|
@ -30,7 +30,6 @@ class CheckBrackets:
|
|||||||
self.__bracket_count += 1
|
self.__bracket_count += 1
|
||||||
def close_brack(self, line):
|
def close_brack(self, line):
|
||||||
num = line[-5:-1]
|
num = line[-5:-1]
|
||||||
##self.__open_bracket_num.append(num)
|
|
||||||
try:
|
try:
|
||||||
last_num = self.__open_bracket_num.pop()
|
last_num = self.__open_bracket_num.pop()
|
||||||
except:
|
except:
|
||||||
|
@ -14,12 +14,11 @@ class CheckEncoding:
|
|||||||
sys.stderr.write(str(msg) + '\n')
|
sys.stderr.write(str(msg) + '\n')
|
||||||
def check_encoding(self, path, encoding='us-ascii'):
|
def check_encoding(self, path, encoding='us-ascii'):
|
||||||
read_obj = open(path, 'r')
|
read_obj = open(path, 'r')
|
||||||
line_to_read = 1
|
input_file = read_obj.read()
|
||||||
|
read_obj.close()
|
||||||
line_num = 0
|
line_num = 0
|
||||||
while line_to_read:
|
for line in input_file:
|
||||||
line_num += 1
|
line_num += 1
|
||||||
line_to_read = read_obj.readline()
|
|
||||||
line = line_to_read
|
|
||||||
try:
|
try:
|
||||||
line.decode(encoding)
|
line.decode(encoding)
|
||||||
except UnicodeError:
|
except UnicodeError:
|
||||||
@ -27,6 +26,9 @@ class CheckEncoding:
|
|||||||
self.__get_position_error(line, encoding, line_num)
|
self.__get_position_error(line, encoding, line_num)
|
||||||
else:
|
else:
|
||||||
sys.stderr.write('line: %d has bad encoding\n'%line_num)
|
sys.stderr.write('line: %d has bad encoding\n'%line_num)
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
check_encoding_obj = CheckEncoding()
|
check_encoding_obj = CheckEncoding()
|
||||||
check_encoding_obj.check_encoding(sys.argv[1])
|
check_encoding_obj.check_encoding(sys.argv[1])
|
||||||
|
@ -23,7 +23,7 @@ class FixLineEndings:
|
|||||||
bug_handler,
|
bug_handler,
|
||||||
in_file = None,
|
in_file = None,
|
||||||
copy = None,
|
copy = None,
|
||||||
#run_level = 1, calibre why keep it?
|
run_level = 1,
|
||||||
replace_illegals = 1,
|
replace_illegals = 1,
|
||||||
):
|
):
|
||||||
self.__file = in_file
|
self.__file = in_file
|
||||||
@ -32,8 +32,11 @@ class FixLineEndings:
|
|||||||
self.__write_to = tempfile.mktemp()
|
self.__write_to = tempfile.mktemp()
|
||||||
self.__replace_illegals = replace_illegals
|
self.__replace_illegals = replace_illegals
|
||||||
def fix_endings(self):
|
def fix_endings(self):
|
||||||
illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
|
#remove invalid ASCII control chars: 0x00-0x08, 0x0B and 0x0E-0x18 (NOTE: range(8) below stops at 0x07, so 0x08 is not actually removed)
|
||||||
# always check since I have to get rid of illegal characters
|
#always check since I have to get rid of illegal characters
|
||||||
|
chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
|
||||||
|
illegal_regx = re.compile(u'|'.join(map(unichr, chars)))
|
||||||
|
#illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
|
||||||
#read
|
#read
|
||||||
read_obj = open(self.__file, 'r')
|
read_obj = open(self.__file, 'r')
|
||||||
input_file = read_obj.read()
|
input_file = read_obj.read()
|
||||||
@ -42,7 +45,7 @@ class FixLineEndings:
|
|||||||
input_file = input_file.replace ('\r\n', '\n')
|
input_file = input_file.replace ('\r\n', '\n')
|
||||||
input_file = input_file.replace ('\r', '\n')
|
input_file = input_file.replace ('\r', '\n')
|
||||||
if self.__replace_illegals:
|
if self.__replace_illegals:
|
||||||
input_file = re.sub(illegal_regx, '', input_file)
|
input_file = illegal_regx.sub('', input_file)
|
||||||
#write
|
#write
|
||||||
write_obj = open(self.__write_to, 'wb')
|
write_obj = open(self.__write_to, 'wb')
|
||||||
write_obj.write(input_file)
|
write_obj.write(input_file)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user