mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Allow the encoding check to look directly in the RTF file & improve the code that checks for invalid characters
This commit is contained in:
parent
36845e1c9d
commit
bbaecb4007
@@ -230,14 +230,21 @@ class ParseRtf:
|
||||
os.remove(self.__temp_file)
|
||||
except OSError:
|
||||
pass
|
||||
#Check to see if the file is correctly encoded
|
||||
#Check to see if the file is correctly encoded
|
||||
encode_obj = default_encoding.DefaultEncoding(
|
||||
in_file = self.__temp_file,
|
||||
run_level = self.__run_level,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
check_raw = True,
|
||||
)
|
||||
platform, code_page, default_font_num = encode_obj.find_default_encoding()
|
||||
check_encoding_obj = check_encoding.CheckEncoding(
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
if check_encoding_obj.check_encoding(self.__file, 'cp1252') and \
|
||||
check_encoding_obj.check_encoding(self.__file, 'cp437') and \
|
||||
check_encoding_obj.check_encoding(self.__file, 'cp850') and \
|
||||
check_encoding_obj.check_encoding(self.__file, 'mac_roman'):
|
||||
enc = encode_obj.get_codepage()
|
||||
if enc != 'mac_roman':
|
||||
enc = 'cp' + enc
|
||||
if check_encoding_obj.check_encoding(self.__file, enc):
|
||||
file_name = self.__file if isinstance(self.__file, str) \
|
||||
else self.__file.encode('utf-8')
|
||||
msg = _('File %s does not appear to be correctly encoded.\n') % file_name
|
||||
|
@@ -55,18 +55,20 @@ Codepages as to RTF 1.9.1:
|
||||
57010 Gujarati
|
||||
57011 Punjabi
|
||||
'''
|
||||
import re
|
||||
|
||||
class DefaultEncoding:
|
||||
"""
|
||||
Find the default encoding for the doc
|
||||
"""
|
||||
def __init__(self, in_file, bug_handler, run_level = 1,):
|
||||
def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False):
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__platform = 'Windows'
|
||||
self.__default_num = 'not-defined'
|
||||
self.__code_page = '1252'
|
||||
self.__datafetched = False
|
||||
self.__fetchraw = check_raw
|
||||
|
||||
def find_default_encoding(self):
|
||||
if not self.__datafetched:
|
||||
@@ -92,27 +94,48 @@ class DefaultEncoding:
|
||||
|
||||
def _encoding(self):
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'mi<mk<rtfhed-end':
|
||||
break
|
||||
if self.__token_info == 'cw<ri<ansi-codpg':
|
||||
#cw<ri<ansi-codpg<nu<10000
|
||||
self.__code_page = line[20:-1] if line[20:-1] \
|
||||
else '1252'
|
||||
if self.__token_info == 'cw<ri<macintosh_':
|
||||
self.__platform = 'Macintosh'
|
||||
elif self.__token_info == 'cw<ri<pc________':
|
||||
self.__platform = 'IBMPC'
|
||||
elif self.__token_info == 'cw<ri<pca_______':
|
||||
self.__platform = 'OS/2'
|
||||
if self.__token_info == 'cw<ri<deflt-font':
|
||||
self.__default_num = line[20:-1]
|
||||
#cw<ri<deflt-font<nu<0
|
||||
if self.__platform == 'Macintosh':
|
||||
self.__code_page = 'mac_roman'
|
||||
elif self.__platform == 'IBMPC':
|
||||
self.__code_page = '437'
|
||||
elif self.__platform == 'OS/2':
|
||||
self.__code_page = '850'
|
||||
if not self.__fetchraw:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'mi<mk<rtfhed-end':
|
||||
break
|
||||
if self.__token_info == 'cw<ri<ansi-codpg':
|
||||
#cw<ri<ansi-codpg<nu<10000
|
||||
self.__code_page = line[20:-1] if line[20:-1] \
|
||||
else '1252'
|
||||
if self.__token_info == 'cw<ri<macintosh_':
|
||||
self.__platform = 'Macintosh'
|
||||
self.__code_page = 'mac_roman'
|
||||
elif self.__token_info == 'cw<ri<pc________':
|
||||
self.__platform = 'IBMPC'
|
||||
self.__code_page = '437'
|
||||
elif self.__token_info == 'cw<ri<pca_______':
|
||||
self.__platform = 'OS/2'
|
||||
self.__code_page = '850'
|
||||
if self.__token_info == 'cw<ri<deflt-font':
|
||||
self.__default_num = line[20:-1]
|
||||
#cw<ri<deflt-font<nu<0
|
||||
else:
|
||||
fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
|
||||
fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
|
||||
for line in read_obj:
|
||||
if fenccp.search(line):
|
||||
self.__code_page = fenccp.search(line).group(1)
|
||||
break
|
||||
if fenc.search(line):
|
||||
enc = fenc.search(line).group(1)
|
||||
if enc == 'mac':
|
||||
self.__code_page = 'mac_roman'
|
||||
elif enc == 'pc':
|
||||
self.__code_page = '437'
|
||||
elif enc == 'pca':
|
||||
self.__code_page = '850'
|
||||
|
||||
# if __name__ == '__main__':
|
||||
# from calibre.ebooks.rtf2xml import default_encoding
|
||||
# encode_obj = default_encoding.DefaultEncoding(
|
||||
# in_file = sys.argv[1],
|
||||
# bug_handler = Exception,
|
||||
# check_raw = True,
|
||||
# )
|
||||
# print encode_obj.get_codepage()
|
||||
|
Loading…
x
Reference in New Issue
Block a user