mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Handle non ascii charset in RTF if declared as codepage
This commit is contained in:
parent
b2187360ec
commit
ac07ff853e
@ -326,7 +326,6 @@ class ParseRtf:
|
|||||||
invalid_rtf_handler = InvalidRtfException,
|
invalid_rtf_handler = InvalidRtfException,
|
||||||
)
|
)
|
||||||
hex2utf_obj.convert_hex_2_utf8()
|
hex2utf_obj.convert_hex_2_utf8()
|
||||||
# raise RtfInvalidCodeException, 'stop'
|
|
||||||
self.__bracket_match('hex_2_utf_preamble')
|
self.__bracket_match('hex_2_utf_preamble')
|
||||||
fonts_obj = fonts.Fonts(
|
fonts_obj = fonts.Fonts(
|
||||||
in_file = self.__temp_file,
|
in_file = self.__temp_file,
|
||||||
@ -523,6 +522,7 @@ class ParseRtf:
|
|||||||
indent = self.__indent,
|
indent = self.__indent,
|
||||||
run_level = self.__run_level,
|
run_level = self.__run_level,
|
||||||
no_dtd = self.__no_dtd,
|
no_dtd = self.__no_dtd,
|
||||||
|
encoding = encode_obj.get_codepage(),
|
||||||
bug_handler = RtfInvalidCodeException,
|
bug_handler = RtfInvalidCodeException,
|
||||||
)
|
)
|
||||||
tags_obj.convert_to_tags()
|
tags_obj.convert_to_tags()
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
class CheckEncoding:
|
class CheckEncoding:
|
||||||
|
|
||||||
def __init__(self, bug_handler):
|
def __init__(self, bug_handler):
|
||||||
|
@ -1,6 +1,9 @@
|
|||||||
import os, tempfile
|
import os, tempfile
|
||||||
from calibre.ebooks.rtf2xml import copy
|
|
||||||
|
from calibre.ebooks.rtf2xml import copy, check_encoding
|
||||||
|
|
||||||
public_dtd = 'rtf2xml1.0.dtd'
|
public_dtd = 'rtf2xml1.0.dtd'
|
||||||
|
|
||||||
class ConvertToTags:
|
class ConvertToTags:
|
||||||
"""
|
"""
|
||||||
Convert file to XML
|
Convert file to XML
|
||||||
@ -10,6 +13,7 @@ class ConvertToTags:
|
|||||||
bug_handler,
|
bug_handler,
|
||||||
dtd_path,
|
dtd_path,
|
||||||
no_dtd,
|
no_dtd,
|
||||||
|
encoding,
|
||||||
indent = None,
|
indent = None,
|
||||||
copy = None,
|
copy = None,
|
||||||
run_level = 1,
|
run_level = 1,
|
||||||
@ -29,9 +33,14 @@ class ConvertToTags:
|
|||||||
self.__copy = copy
|
self.__copy = copy
|
||||||
self.__dtd_path = dtd_path
|
self.__dtd_path = dtd_path
|
||||||
self.__no_dtd = no_dtd
|
self.__no_dtd = no_dtd
|
||||||
|
if encoding != 'mac_roman':
|
||||||
|
self.__encoding = 'cp' + encoding
|
||||||
|
else:
|
||||||
|
self.__encoding = 'mac_roman'
|
||||||
self.__indent = indent
|
self.__indent = indent
|
||||||
self.__run_level = run_level
|
self.__run_level = run_level
|
||||||
self.__write_to = tempfile.mktemp()
|
self.__write_to = tempfile.mktemp()
|
||||||
|
|
||||||
def __initiate_values(self):
|
def __initiate_values(self):
|
||||||
"""
|
"""
|
||||||
Set values, including those for the dictionary.
|
Set values, including those for the dictionary.
|
||||||
@ -61,6 +70,7 @@ class ConvertToTags:
|
|||||||
'tx<ut<__________' : self.__text_func,
|
'tx<ut<__________' : self.__text_func,
|
||||||
'mi<tg<empty_____' : self.__empty_func,
|
'mi<tg<empty_____' : self.__empty_func,
|
||||||
}
|
}
|
||||||
|
|
||||||
def __open_func(self, line):
|
def __open_func(self, line):
|
||||||
"""
|
"""
|
||||||
Print the opening tag and newlines when needed.
|
Print the opening tag and newlines when needed.
|
||||||
@ -73,6 +83,7 @@ class ConvertToTags:
|
|||||||
if info in self.__two_new_line:
|
if info in self.__two_new_line:
|
||||||
self.__write_extra_new_line()
|
self.__write_extra_new_line()
|
||||||
self.__write_obj.write('<%s>' % info)
|
self.__write_obj.write('<%s>' % info)
|
||||||
|
|
||||||
def __empty_func(self, line):
|
def __empty_func(self, line):
|
||||||
"""
|
"""
|
||||||
Print out empty tag and newlines when needed.
|
Print out empty tag and newlines when needed.
|
||||||
@ -85,6 +96,7 @@ class ConvertToTags:
|
|||||||
self.__write_new_line()
|
self.__write_new_line()
|
||||||
if info in self.__two_new_line:
|
if info in self.__two_new_line:
|
||||||
self.__write_extra_new_line()
|
self.__write_extra_new_line()
|
||||||
|
|
||||||
def __open_att_func(self, line):
|
def __open_att_func(self, line):
|
||||||
"""
|
"""
|
||||||
Process lines for open tags that have attributes.
|
Process lines for open tags that have attributes.
|
||||||
@ -119,6 +131,7 @@ class ConvertToTags:
|
|||||||
self.__write_new_line()
|
self.__write_new_line()
|
||||||
if element_name in self.__two_new_line:
|
if element_name in self.__two_new_line:
|
||||||
self.__write_extra_new_line()
|
self.__write_extra_new_line()
|
||||||
|
|
||||||
def __empty_att_func(self, line):
|
def __empty_att_func(self, line):
|
||||||
"""
|
"""
|
||||||
Same as the __open_att_func, except a '/' is placed at the end of the tag.
|
Same as the __open_att_func, except a '/' is placed at the end of the tag.
|
||||||
@ -143,6 +156,7 @@ class ConvertToTags:
|
|||||||
self.__write_new_line()
|
self.__write_new_line()
|
||||||
if element_name in self.__two_new_line:
|
if element_name in self.__two_new_line:
|
||||||
self.__write_extra_new_line()
|
self.__write_extra_new_line()
|
||||||
|
|
||||||
def __close_func(self, line):
|
def __close_func(self, line):
|
||||||
"""
|
"""
|
||||||
Print out the closed tag and new lines, if appropriate.
|
Print out the closed tag and new lines, if appropriate.
|
||||||
@ -156,6 +170,7 @@ class ConvertToTags:
|
|||||||
self.__write_new_line()
|
self.__write_new_line()
|
||||||
if info in self.__two_new_line:
|
if info in self.__two_new_line:
|
||||||
self.__write_extra_new_line()
|
self.__write_extra_new_line()
|
||||||
|
|
||||||
def __text_func(self, line):
|
def __text_func(self, line):
|
||||||
"""
|
"""
|
||||||
Simply print out the information between [17:-1]
|
Simply print out the information between [17:-1]
|
||||||
@ -163,6 +178,7 @@ class ConvertToTags:
|
|||||||
#tx<nu<__________<Normal;
|
#tx<nu<__________<Normal;
|
||||||
# change this!
|
# change this!
|
||||||
self.__write_obj.write(line[17:-1])
|
self.__write_obj.write(line[17:-1])
|
||||||
|
|
||||||
def __write_extra_new_line(self):
|
def __write_extra_new_line(self):
|
||||||
"""
|
"""
|
||||||
Print out extra new lines if the new lines have not exceeded two. If
|
Print out extra new lines if the new lines have not exceeded two. If
|
||||||
@ -172,8 +188,10 @@ class ConvertToTags:
|
|||||||
return
|
return
|
||||||
if self.__new_line < 2:
|
if self.__new_line < 2:
|
||||||
self.__write_obj.write('\n')
|
self.__write_obj.write('\n')
|
||||||
|
|
||||||
def __default_func(self, line):
|
def __default_func(self, line):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def __write_new_line(self):
|
def __write_new_line(self):
|
||||||
"""
|
"""
|
||||||
Print out a new line if a new line has not already been printed out.
|
Print out a new line if a new line has not already been printed out.
|
||||||
@ -183,11 +201,22 @@ class ConvertToTags:
|
|||||||
if not self.__new_line:
|
if not self.__new_line:
|
||||||
self.__write_obj.write('\n')
|
self.__write_obj.write('\n')
|
||||||
self.__new_line += 1
|
self.__new_line += 1
|
||||||
|
|
||||||
def __write_dec(self):
|
def __write_dec(self):
|
||||||
"""
|
"""
|
||||||
Write the XML declaration at the top of the document.
|
Write the XML declaration at the top of the document.
|
||||||
"""
|
"""
|
||||||
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
|
#keep maximum compatibility with previous version
|
||||||
|
check_encoding_obj = check_encoding.CheckEncoding(
|
||||||
|
bug_handler = self.__bug_handler,
|
||||||
|
)
|
||||||
|
if not check_encoding_obj.check_encoding(self.__file):
|
||||||
|
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
|
||||||
|
elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
|
||||||
|
self.__write_obj.write('<?xml version="1.0" encoding="%s" ?>' % self.__encoding)
|
||||||
|
else:
|
||||||
|
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
|
||||||
|
sys.stderr.write(_('Bad RTF encoding, revert to US-ASCII chars and hope for the best'))
|
||||||
self.__new_line = 0
|
self.__new_line = 0
|
||||||
self.__write_new_line()
|
self.__write_new_line()
|
||||||
if self.__no_dtd:
|
if self.__no_dtd:
|
||||||
@ -207,6 +236,7 @@ class ConvertToTags:
|
|||||||
)
|
)
|
||||||
self.__new_line = 0
|
self.__new_line = 0
|
||||||
self.__write_new_line()
|
self.__write_new_line()
|
||||||
|
|
||||||
def convert_to_tags(self):
|
def convert_to_tags(self):
|
||||||
"""
|
"""
|
||||||
Read in the file one line at a time. Get the important info, between
|
Read in the file one line at a time. Get the important info, between
|
||||||
@ -222,18 +252,14 @@ class ConvertToTags:
|
|||||||
an empty tag function.
|
an empty tag function.
|
||||||
"""
|
"""
|
||||||
self.__initiate_values()
|
self.__initiate_values()
|
||||||
read_obj = open(self.__file, 'r')
|
|
||||||
self.__write_obj = open(self.__write_to, 'w')
|
self.__write_obj = open(self.__write_to, 'w')
|
||||||
self.__write_dec()
|
self.__write_dec()
|
||||||
line_to_read = 1
|
with open(self.__file, 'r') as read_obj:
|
||||||
while line_to_read:
|
for line in read_obj:
|
||||||
line_to_read = read_obj.readline()
|
self.__token_info = line[:16]
|
||||||
line = line_to_read
|
action = self.__state_dict.get(self.__token_info)
|
||||||
self.__token_info = line[:16]
|
if action is not None:
|
||||||
action = self.__state_dict.get(self.__token_info)
|
action(line)
|
||||||
if action != None:
|
|
||||||
action(line)
|
|
||||||
read_obj.close()
|
|
||||||
self.__write_obj.close()
|
self.__write_obj.close()
|
||||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||||
if self.__copy:
|
if self.__copy:
|
||||||
|
@ -132,8 +132,7 @@ class DefaultEncoding:
|
|||||||
self.__code_page = '850'
|
self.__code_page = '850'
|
||||||
|
|
||||||
# if __name__ == '__main__':
|
# if __name__ == '__main__':
|
||||||
# from calibre.ebooks.rtf2xml import default_encoding
|
# encode_obj = DefaultEncoding(
|
||||||
# encode_obj = default_encoding.DefaultEncoding(
|
|
||||||
# in_file = sys.argv[1],
|
# in_file = sys.argv[1],
|
||||||
# bug_handler = Exception,
|
# bug_handler = Exception,
|
||||||
# check_raw = True,
|
# check_raw = True,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user