RTF Input: Use input encoding setting for files with no codepage

RTF Input: When converting RTF files with no codepage, use the input
encoding setting as the codepage. Fixes #1163572 [RTF charset ansicpg0 handling](https://bugs.launchpad.net/calibre/+bug/1163572)

Merge branch 'RTF-changes' of https://github.com/sengian/calibre
This commit is contained in:
Kovid Goyal 2013-08-12 09:08:40 +05:30
commit 437746f139
9 changed files with 65 additions and 23 deletions

View File

@ -96,8 +96,13 @@ class RTFInput(InputFormatPlugin):
# Write or do not write paragraphs. Default is 0. # Write or do not write paragraphs. Default is 0.
empty_paragraphs = 1, empty_paragraphs = 1,
#debug # Debug
deb_dir = debug_dir, deb_dir = debug_dir,
# Default encoding
default_encoding = getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252',
# Run level
run_level = run_lev, run_level = run_lev,
) )
parser.parse_rtf() parser.parse_rtf()

View File

@ -1,5 +1,6 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
""" """
Edit metadata in RTF files. Edit metadata in RTF files.
""" """
@ -61,7 +62,7 @@ def detect_codepage(stream):
if match is not None: if match is not None:
num = match.group(1) num = match.group(1)
if num == '0': if num == '0':
num = '1250' num = '1252'
codec = 'cp'+num codec = 'cp'+num
try: try:
codecs.lookup(codec) codecs.lookup(codec)
@ -82,7 +83,9 @@ def decode(raw, codec):
return raw return raw
def get_metadata(stream): def get_metadata(stream):
""" Return metadata as a L{MetaInfo} object """ """
Return metadata as a L{MetaInfo} object
"""
stream.seek(0) stream.seek(0)
if stream.read(5) != r'{\rtf': if stream.read(5) != r'{\rtf':
return MetaInformation(_('Unknown')) return MetaInformation(_('Unknown'))

View File

@ -58,6 +58,8 @@ def Handle_Main():
group_borders = 1, group_borders = 1,
# Write or do not write paragraphs. Default is 0. # Write or do not write paragraphs. Default is 0.
empty_paragraphs = 0, empty_paragraphs = 0,
# Allow a custom default encoding to be used as a fallback
default_encoding = 'cp1252',
) )
try: try:
parse_obj.parse_rtf() parse_obj.parse_rtf()
@ -101,6 +103,7 @@ class ParseRtf:
empty_paragraphs = 1, empty_paragraphs = 1,
no_dtd = 0, no_dtd = 0,
char_data = '', char_data = '',
default_encoding = 'cp1252',
): ):
""" """
@ -144,6 +147,7 @@ class ParseRtf:
self.__group_borders = group_borders self.__group_borders = group_borders
self.__empty_paragraphs = empty_paragraphs self.__empty_paragraphs = empty_paragraphs
self.__no_dtd = no_dtd self.__no_dtd = no_dtd
self.__default_encoding = default_encoding
def __check_file(self, the_file, type): def __check_file(self, the_file, type):
"""Check to see if files exist""" """Check to see if files exist"""
@ -227,14 +231,15 @@ class ParseRtf:
run_level = self.__run_level, run_level = self.__run_level,
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
check_raw = True, check_raw = True,
default_encoding = self.__default_encoding,
) )
platform, code_page, default_font_num = encode_obj.find_default_encoding() platform, code_page, default_font_num = encode_obj.find_default_encoding()
check_encoding_obj = check_encoding.CheckEncoding( check_encoding_obj = check_encoding.CheckEncoding(
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
) )
enc = encode_obj.get_codepage() enc = encode_obj.get_codepage()
if enc != 'mac_roman': #TODO: check whether prefixing 'cp' is a good idea or whether a dict should be used for the conversion
enc = 'cp' + enc enc = 'cp' + enc
msg = '%s\nException in token processing' % str(msg) msg = '%s\nException in token processing' % str(msg)
if check_encoding_obj.check_encoding(self.__file, enc): if check_encoding_obj.check_encoding(self.__file, enc):
file_name = self.__file if isinstance(self.__file, str) \ file_name = self.__file if isinstance(self.__file, str) \
@ -308,6 +313,7 @@ class ParseRtf:
in_file = self.__temp_file, in_file = self.__temp_file,
run_level = self.__run_level, run_level = self.__run_level,
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
default_encoding = self.__default_encoding,
) )
platform, code_page, default_font_num = encode_obj.find_default_encoding() platform, code_page, default_font_num = encode_obj.find_default_encoding()
hex2utf_obj = hex_2_utf8.Hex2Utf8( hex2utf_obj = hex_2_utf8.Hex2Utf8(

View File

@ -14872,7 +14872,8 @@ LATIN SMALL LETTER U WITH DIAERESIS:'FC:252:&#x00FC;
LATIN SMALL LETTER Z WITH DOT ABOVE:'FD:380:&#x017C; LATIN SMALL LETTER Z WITH DOT ABOVE:'FD:380:&#x017C;
LATIN SMALL LETTER Z WITH CARON:'FE:382:&#x017E; LATIN SMALL LETTER Z WITH CARON:'FE:382:&#x017E;
</ansicpg1257> </ansicpg1257>
<mac_roman> #mac_roman
<ansicpg10000>
LATIN CAPITAL LETTER A WITH DIAERESIS:'80:196:&#x00C4; LATIN CAPITAL LETTER A WITH DIAERESIS:'80:196:&#x00C4;
LATIN CAPITAL LETTER A WITH RING ABOVE:'81:197:&#x00C5; LATIN CAPITAL LETTER A WITH RING ABOVE:'81:197:&#x00C5;
LATIN CAPITAL LETTER C WITH CEDILLA:'82:199:&#x00C7; LATIN CAPITAL LETTER C WITH CEDILLA:'82:199:&#x00C7;
@ -15001,7 +15002,7 @@ CEDILLA:'FC:184:&#x00B8;
DOUBLE ACUTE ACCENT:'FD:733:&#x02DD; DOUBLE ACUTE ACCENT:'FD:733:&#x02DD;
OGONEK:'FE:731:&#x02DB; OGONEK:'FE:731:&#x02DB;
CARON:'FF:711:&#x02C7; CARON:'FF:711:&#x02C7;
</mac_roman> </ansicpg10000>
<caps_hex> <caps_hex>
LATIN SMALL LETTER A:'61:97:'41 LATIN SMALL LETTER A:'61:97:'41
LATIN SMALL LETTER B:'62:98:'42 LATIN SMALL LETTER B:'62:98:'42

View File

@ -13,8 +13,7 @@ class CheckEncoding:
try: try:
char.decode(encoding) char.decode(encoding)
except UnicodeError, msg: except UnicodeError, msg:
sys.stderr.write('line: %s char: %s\n' % (line_num, char_position)) sys.stderr.write('line: %s char: %s\n%s\n' % (line_num, char_position, str(msg)))
sys.stderr.write(str(msg) + '\n')
def check_encoding(self, path, encoding='us-ascii', verbose=True): def check_encoding(self, path, encoding='us-ascii', verbose=True):
line_num = 0 line_num = 0

View File

@ -36,8 +36,8 @@ class ConvertToTags:
self.__dtd_path = dtd_path self.__dtd_path = dtd_path
self.__no_dtd = no_dtd self.__no_dtd = no_dtd
self.__encoding = 'cp' + encoding self.__encoding = 'cp' + encoding
if encoding == 'mac_roman': # if encoding == 'mac_roman':
self.__encoding = 'mac_roman' # self.__encoding = 'mac_roman'
self.__indent = indent self.__indent = indent
self.__run_level = run_level self.__run_level = run_level
self.__write_to = better_mktemp() self.__write_to = better_mktemp()

View File

@ -61,12 +61,41 @@ class DefaultEncoding:
""" """
Find the default encoding for the doc Find the default encoding for the doc
""" """
def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False):
#Note: not all of these encodings are actually supported by rtf2xml
# See http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx
# and src\calibre\gui2\widgets.py for the input list in calibre
ENCODINGS = {
# Special cases
'cp1252':'1252',
'utf-8':'1252',
'ascii':'1252',
# Normal cases
'big5':'950',
'cp1250':'1250',
'cp1251':'1251',
'cp1253':'1253',
'cp1254':'1254',
'cp1255':'1255',
'cp1256':'1256',
'shift_jis':'932',
'gb2312':'936',
#Not in RTF 1.9.1 codepage specification
'hz':'52936',
'iso8859_5':'28595',
'iso2022_jp':'50222',
'iso2022_kr':'50225',
'euc_jp':'51932',
'euc_kr':'51949',
'gb18030':'54936',
}
def __init__(self, in_file, bug_handler, default_encoding, run_level = 1, check_raw = False):
self.__file = in_file self.__file = in_file
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
self.__platform = 'Windows' self.__platform = 'Windows'
self.__default_num = 'not-defined' self.__default_num = 'not-defined'
self.__code_page = '1252' self.__code_page = self.ENCODINGS.get(default_encoding, '1252')
self.__datafetched = False self.__datafetched = False
self.__fetchraw = check_raw self.__fetchraw = check_raw
@ -75,16 +104,16 @@ class DefaultEncoding:
self._encoding() self._encoding()
self.__datafetched = True self.__datafetched = True
code_page = 'ansicpg' + self.__code_page code_page = 'ansicpg' + self.__code_page
if self.__code_page == '10000': # if self.__code_page == '10000':
self.__code_page = 'mac_roman' # self.__code_page = 'mac_roman'
return self.__platform, code_page, self.__default_num return self.__platform, code_page, self.__default_num
def get_codepage(self): def get_codepage(self):
if not self.__datafetched: if not self.__datafetched:
self._encoding() self._encoding()
self.__datafetched = True self.__datafetched = True
if self.__code_page == '10000': # if self.__code_page == '10000':
self.__code_page = 'mac_roman' # self.__code_page = 'mac_roman'
return self.__code_page return self.__code_page
def get_platform(self): def get_platform(self):
@ -148,6 +177,7 @@ if __name__ == '__main__':
import sys import sys
encode_obj = DefaultEncoding( encode_obj = DefaultEncoding(
in_file = sys.argv[1], in_file = sys.argv[1],
default_encoding = sys.argv[2],
bug_handler = Exception, bug_handler = Exception,
check_raw = True, check_raw = True,
) )

View File

@ -34,10 +34,8 @@ class GetCharMap:
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
def get_char_map(self, map): def get_char_map(self, map):
if map == 'ansicpg0': # if map == 'ansicpg10000':
map = 'ansicpg1250' # map = 'mac_roman'
if map == 'ansicpg10000':
map = 'mac_roman'
found_map = False found_map = False
map_dict = {} map_dict = {}
self.__char_file.seek(0) self.__char_file.seek(0)

View File

@ -27,8 +27,8 @@ class Hex2Utf8:
default_char_map, default_char_map,
bug_handler, bug_handler,
invalid_rtf_handler, invalid_rtf_handler,
copy=None, copy= None,
temp_dir=None, temp_dir= None,
symbol = None, symbol = None,
wingdings = None, wingdings = None,
caps = None, caps = None,