mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Allow check encoding to look directly in rtf & improve code checking for invalid chars
This commit is contained in:
parent
36845e1c9d
commit
bbaecb4007
@ -230,14 +230,21 @@ class ParseRtf:
|
|||||||
os.remove(self.__temp_file)
|
os.remove(self.__temp_file)
|
||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
#Check to see if the file is correctly encoded
|
#Check to see if the file is correctly encoded
|
||||||
|
encode_obj = default_encoding.DefaultEncoding(
|
||||||
|
in_file = self.__temp_file,
|
||||||
|
run_level = self.__run_level,
|
||||||
|
bug_handler = RtfInvalidCodeException,
|
||||||
|
check_raw = True,
|
||||||
|
)
|
||||||
|
platform, code_page, default_font_num = encode_obj.find_default_encoding()
|
||||||
check_encoding_obj = check_encoding.CheckEncoding(
|
check_encoding_obj = check_encoding.CheckEncoding(
|
||||||
bug_handler = RtfInvalidCodeException,
|
bug_handler = RtfInvalidCodeException,
|
||||||
)
|
)
|
||||||
if check_encoding_obj.check_encoding(self.__file, 'cp1252') and \
|
enc = encode_obj.get_codepage()
|
||||||
check_encoding_obj.check_encoding(self.__file, 'cp437') and \
|
if enc != 'mac_roman':
|
||||||
check_encoding_obj.check_encoding(self.__file, 'cp850') and \
|
enc = 'cp' + enc
|
||||||
check_encoding_obj.check_encoding(self.__file, 'mac_roman'):
|
if check_encoding_obj.check_encoding(self.__file, enc):
|
||||||
file_name = self.__file if isinstance(self.__file, str) \
|
file_name = self.__file if isinstance(self.__file, str) \
|
||||||
else self.__file.encode('utf-8')
|
else self.__file.encode('utf-8')
|
||||||
msg = _('File %s does not appear to be correctly encoded.\n') % file_name
|
msg = _('File %s does not appear to be correctly encoded.\n') % file_name
|
||||||
|
@ -55,18 +55,20 @@ Codepages as to RTF 1.9.1:
|
|||||||
57010 Gujarati
|
57010 Gujarati
|
||||||
57011 Punjabi
|
57011 Punjabi
|
||||||
'''
|
'''
|
||||||
|
import re
|
||||||
|
|
||||||
class DefaultEncoding:
|
class DefaultEncoding:
|
||||||
"""
|
"""
|
||||||
Find the default encoding for the doc
|
Find the default encoding for the doc
|
||||||
"""
|
"""
|
||||||
def __init__(self, in_file, bug_handler, run_level = 1,):
|
def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False):
|
||||||
self.__file = in_file
|
self.__file = in_file
|
||||||
self.__bug_handler = bug_handler
|
self.__bug_handler = bug_handler
|
||||||
self.__platform = 'Windows'
|
self.__platform = 'Windows'
|
||||||
self.__default_num = 'not-defined'
|
self.__default_num = 'not-defined'
|
||||||
self.__code_page = '1252'
|
self.__code_page = '1252'
|
||||||
self.__datafetched = False
|
self.__datafetched = False
|
||||||
|
self.__fetchraw = check_raw
|
||||||
|
|
||||||
def find_default_encoding(self):
|
def find_default_encoding(self):
|
||||||
if not self.__datafetched:
|
if not self.__datafetched:
|
||||||
@ -92,27 +94,48 @@ class DefaultEncoding:
|
|||||||
|
|
||||||
def _encoding(self):
|
def _encoding(self):
|
||||||
with open(self.__file, 'r') as read_obj:
|
with open(self.__file, 'r') as read_obj:
|
||||||
for line in read_obj:
|
if not self.__fetchraw:
|
||||||
self.__token_info = line[:16]
|
for line in read_obj:
|
||||||
if self.__token_info == 'mi<mk<rtfhed-end':
|
self.__token_info = line[:16]
|
||||||
break
|
if self.__token_info == 'mi<mk<rtfhed-end':
|
||||||
if self.__token_info == 'cw<ri<ansi-codpg':
|
break
|
||||||
#cw<ri<ansi-codpg<nu<10000
|
if self.__token_info == 'cw<ri<ansi-codpg':
|
||||||
self.__code_page = line[20:-1] if line[20:-1] \
|
#cw<ri<ansi-codpg<nu<10000
|
||||||
else '1252'
|
self.__code_page = line[20:-1] if line[20:-1] \
|
||||||
if self.__token_info == 'cw<ri<macintosh_':
|
else '1252'
|
||||||
self.__platform = 'Macintosh'
|
if self.__token_info == 'cw<ri<macintosh_':
|
||||||
elif self.__token_info == 'cw<ri<pc________':
|
self.__platform = 'Macintosh'
|
||||||
self.__platform = 'IBMPC'
|
self.__code_page = 'mac_roman'
|
||||||
elif self.__token_info == 'cw<ri<pca_______':
|
elif self.__token_info == 'cw<ri<pc________':
|
||||||
self.__platform = 'OS/2'
|
self.__platform = 'IBMPC'
|
||||||
if self.__token_info == 'cw<ri<deflt-font':
|
self.__code_page = '437'
|
||||||
self.__default_num = line[20:-1]
|
elif self.__token_info == 'cw<ri<pca_______':
|
||||||
#cw<ri<deflt-font<nu<0
|
self.__platform = 'OS/2'
|
||||||
if self.__platform == 'Macintosh':
|
self.__code_page = '850'
|
||||||
self.__code_page = 'mac_roman'
|
if self.__token_info == 'cw<ri<deflt-font':
|
||||||
elif self.__platform == 'IBMPC':
|
self.__default_num = line[20:-1]
|
||||||
self.__code_page = '437'
|
#cw<ri<deflt-font<nu<0
|
||||||
elif self.__platform == 'OS/2':
|
else:
|
||||||
self.__code_page = '850'
|
fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
|
||||||
|
fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
|
||||||
|
for line in read_obj:
|
||||||
|
if fenccp.search(line):
|
||||||
|
self.__code_page = fenccp.search(line).group(1)
|
||||||
|
break
|
||||||
|
if fenc.search(line):
|
||||||
|
enc = fenc.search(line).group(1)
|
||||||
|
if enc == 'mac':
|
||||||
|
self.__code_page = 'mac_roman'
|
||||||
|
elif enc == 'pc':
|
||||||
|
self.__code_page = '437'
|
||||||
|
elif enc == 'pca':
|
||||||
|
self.__code_page = '850'
|
||||||
|
|
||||||
|
# if __name__ == '__main__':
|
||||||
|
# from calibre.ebooks.rtf2xml import default_encoding
|
||||||
|
# encode_obj = default_encoding.DefaultEncoding(
|
||||||
|
# in_file = sys.argv[1],
|
||||||
|
# bug_handler = Exception,
|
||||||
|
# check_raw = True,
|
||||||
|
# )
|
||||||
|
# print encode_obj.get_codepage()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user