RTF Input: Use input encoding setting for files with no codepage

RTF Input: When converting RTF files with no codepage, use the input
encoding setting as the codepage. Fixes #1163572 [RTF charset ansicpg0 handling](https://bugs.launchpad.net/calibre/+bug/1163572)

Merge branch 'RTF-changes' of https://github.com/sengian/calibre
This commit is contained in:
Kovid Goyal 2013-08-12 09:08:40 +05:30
commit 437746f139
9 changed files with 65 additions and 23 deletions

View File

@ -96,8 +96,13 @@ class RTFInput(InputFormatPlugin):
# Write or do not write paragraphs. Default is 0.
empty_paragraphs = 1,
#debug
# Debug
deb_dir = debug_dir,
# Default encoding
default_encoding = getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252',
# Run level
run_level = run_lev,
)
parser.parse_rtf()

View File

@ -1,5 +1,6 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
"""
Edit metadata in RTF files.
"""
@ -61,7 +62,7 @@ def detect_codepage(stream):
if match is not None:
num = match.group(1)
if num == '0':
num = '1250'
num = '1252'
codec = 'cp'+num
try:
codecs.lookup(codec)
@ -82,7 +83,9 @@ def decode(raw, codec):
return raw
def get_metadata(stream):
""" Return metadata as a L{MetaInfo} object """
"""
Return metadata as a L{MetaInfo} object
"""
stream.seek(0)
if stream.read(5) != r'{\rtf':
return MetaInformation(_('Unknown'))

View File

@ -58,6 +58,8 @@ def Handle_Main():
group_borders = 1,
# Write or do not write paragraphs. Default is 0.
empty_paragraphs = 0,
# Allow a custom default encoding to be used as a fallback
default_encoding = 'cp1252',
)
try:
parse_obj.parse_rtf()
@ -101,6 +103,7 @@ class ParseRtf:
empty_paragraphs = 1,
no_dtd = 0,
char_data = '',
default_encoding = 'cp1252',
):
"""
@ -144,6 +147,7 @@ class ParseRtf:
self.__group_borders = group_borders
self.__empty_paragraphs = empty_paragraphs
self.__no_dtd = no_dtd
self.__default_encoding = default_encoding
def __check_file(self, the_file, type):
"""Check to see if files exist"""
@ -227,14 +231,15 @@ class ParseRtf:
run_level = self.__run_level,
bug_handler = RtfInvalidCodeException,
check_raw = True,
default_encoding = self.__default_encoding,
)
platform, code_page, default_font_num = encode_obj.find_default_encoding()
check_encoding_obj = check_encoding.CheckEncoding(
bug_handler = RtfInvalidCodeException,
)
enc = encode_obj.get_codepage()
if enc != 'mac_roman':
enc = 'cp' + enc
#TODO: check whether prefixing 'cp' is a good idea or whether a dict should be used to convert
enc = 'cp' + enc
msg = '%s\nException in token processing' % str(msg)
if check_encoding_obj.check_encoding(self.__file, enc):
file_name = self.__file if isinstance(self.__file, str) \
@ -308,6 +313,7 @@ class ParseRtf:
in_file = self.__temp_file,
run_level = self.__run_level,
bug_handler = RtfInvalidCodeException,
default_encoding = self.__default_encoding,
)
platform, code_page, default_font_num = encode_obj.find_default_encoding()
hex2utf_obj = hex_2_utf8.Hex2Utf8(

View File

@ -14872,7 +14872,8 @@ LATIN SMALL LETTER U WITH DIAERESIS:'FC:252:&#x00FC;
LATIN SMALL LETTER Z WITH DOT ABOVE:'FD:380:&#x017C;
LATIN SMALL LETTER Z WITH CARON:'FE:382:&#x017E;
</ansicpg1257>
<mac_roman>
#mac_roman
<ansicpg10000>
LATIN CAPITAL LETTER A WITH DIAERESIS:'80:196:&#x00C4;
LATIN CAPITAL LETTER A WITH RING ABOVE:'81:197:&#x00C5;
LATIN CAPITAL LETTER C WITH CEDILLA:'82:199:&#x00C7;
@ -15001,7 +15002,7 @@ CEDILLA:'FC:184:&#x00B8;
DOUBLE ACUTE ACCENT:'FD:733:&#x02DD;
OGONEK:'FE:731:&#x02DB;
CARON:'FF:711:&#x02C7;
</mac_roman>
</ansicpg10000>
<caps_hex>
LATIN SMALL LETTER A:'61:97:'41
LATIN SMALL LETTER B:'62:98:'42

View File

@ -13,8 +13,7 @@ class CheckEncoding:
try:
char.decode(encoding)
except UnicodeError, msg:
sys.stderr.write('line: %s char: %s\n' % (line_num, char_position))
sys.stderr.write(str(msg) + '\n')
sys.stderr.write('line: %s char: %s\n%s\n' % (line_num, char_position, str(msg)))
def check_encoding(self, path, encoding='us-ascii', verbose=True):
line_num = 0

View File

@ -36,8 +36,8 @@ class ConvertToTags:
self.__dtd_path = dtd_path
self.__no_dtd = no_dtd
self.__encoding = 'cp' + encoding
if encoding == 'mac_roman':
self.__encoding = 'mac_roman'
# if encoding == 'mac_roman':
# self.__encoding = 'mac_roman'
self.__indent = indent
self.__run_level = run_level
self.__write_to = better_mktemp()

View File

@ -61,12 +61,41 @@ class DefaultEncoding:
"""
Find the default encoding for the doc
"""
def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False):
#Note: not all of these encodings are actually supported by rtf2xml
# See http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx
# and src\calibre\gui2\widgets.py for the input list in calibre
ENCODINGS = {
# Special cases
'cp1252':'1252',
'utf-8':'1252',
'ascii':'1252',
# Normal cases
'big5':'950',
'cp1250':'1250',
'cp1251':'1251',
'cp1253':'1253',
'cp1254':'1254',
'cp1255':'1255',
'cp1256':'1256',
'shift_jis':'932',
'gb2312':'936',
#Not in RTF 1.9.1 codepage specification
'hz':'52936',
'iso8859_5':'28595',
'iso2022_jp':'50222',
'iso2022_kr':'50225',
'euc_jp':'51932',
'euc_kr':'51949',
'gb18030':'54936',
}
def __init__(self, in_file, bug_handler, default_encoding, run_level = 1, check_raw = False):
self.__file = in_file
self.__bug_handler = bug_handler
self.__platform = 'Windows'
self.__default_num = 'not-defined'
self.__code_page = '1252'
self.__code_page = self.ENCODINGS.get(default_encoding, '1252')
self.__datafetched = False
self.__fetchraw = check_raw
@ -75,16 +104,16 @@ class DefaultEncoding:
self._encoding()
self.__datafetched = True
code_page = 'ansicpg' + self.__code_page
if self.__code_page == '10000':
self.__code_page = 'mac_roman'
# if self.__code_page == '10000':
# self.__code_page = 'mac_roman'
return self.__platform, code_page, self.__default_num
def get_codepage(self):
if not self.__datafetched:
self._encoding()
self.__datafetched = True
if self.__code_page == '10000':
self.__code_page = 'mac_roman'
# if self.__code_page == '10000':
# self.__code_page = 'mac_roman'
return self.__code_page
def get_platform(self):
@ -148,6 +177,7 @@ if __name__ == '__main__':
import sys
encode_obj = DefaultEncoding(
in_file = sys.argv[1],
default_encoding = sys.argv[2],
bug_handler = Exception,
check_raw = True,
)

View File

@ -34,10 +34,8 @@ class GetCharMap:
self.__bug_handler = bug_handler
def get_char_map(self, map):
if map == 'ansicpg0':
map = 'ansicpg1250'
if map == 'ansicpg10000':
map = 'mac_roman'
# if map == 'ansicpg10000':
# map = 'mac_roman'
found_map = False
map_dict = {}
self.__char_file.seek(0)

View File

@ -27,8 +27,8 @@ class Hex2Utf8:
default_char_map,
bug_handler,
invalid_rtf_handler,
copy=None,
temp_dir=None,
copy= None,
temp_dir= None,
symbol = None,
wingdings = None,
caps = None,