mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
RTF Input: Use input encoding setting for files with no codepage
RTF Input: When converting RTF files with no codepage, use the input encoding setting as the codepage. Fixes #1163572 [RTF charset ansicpg0 handling](https://bugs.launchpad.net/calibre/+bug/1163572) Merge branch 'RTF-changes' of https://github.com/sengian/calibre
This commit is contained in:
commit
437746f139
@ -96,8 +96,13 @@ class RTFInput(InputFormatPlugin):
|
||||
# Write or do not write paragraphs. Default is 0.
|
||||
empty_paragraphs = 1,
|
||||
|
||||
#debug
|
||||
# Debug
|
||||
deb_dir = debug_dir,
|
||||
|
||||
# Default encoding
|
||||
default_encoding = getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252',
|
||||
|
||||
# Run level
|
||||
run_level = run_lev,
|
||||
)
|
||||
parser.parse_rtf()
|
||||
|
@ -1,5 +1,6 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
"""
|
||||
Edit metadata in RTF files.
|
||||
"""
|
||||
@ -61,7 +62,7 @@ def detect_codepage(stream):
|
||||
if match is not None:
|
||||
num = match.group(1)
|
||||
if num == '0':
|
||||
num = '1250'
|
||||
num = '1252'
|
||||
codec = 'cp'+num
|
||||
try:
|
||||
codecs.lookup(codec)
|
||||
@ -82,7 +83,9 @@ def decode(raw, codec):
|
||||
return raw
|
||||
|
||||
def get_metadata(stream):
|
||||
""" Return metadata as a L{MetaInfo} object """
|
||||
"""
|
||||
Return metadata as a L{MetaInfo} object
|
||||
"""
|
||||
stream.seek(0)
|
||||
if stream.read(5) != r'{\rtf':
|
||||
return MetaInformation(_('Unknown'))
|
||||
|
@ -58,6 +58,8 @@ def Handle_Main():
|
||||
group_borders = 1,
|
||||
# Write or do not write paragraphs. Default is 0.
|
||||
empty_paragraphs = 0,
|
||||
# Allow a custom default encoding to be used as a fallback
|
||||
default_encoding = 'cp1252',
|
||||
)
|
||||
try:
|
||||
parse_obj.parse_rtf()
|
||||
@ -101,6 +103,7 @@ class ParseRtf:
|
||||
empty_paragraphs = 1,
|
||||
no_dtd = 0,
|
||||
char_data = '',
|
||||
default_encoding = 'cp1252',
|
||||
):
|
||||
|
||||
"""
|
||||
@ -144,6 +147,7 @@ class ParseRtf:
|
||||
self.__group_borders = group_borders
|
||||
self.__empty_paragraphs = empty_paragraphs
|
||||
self.__no_dtd = no_dtd
|
||||
self.__default_encoding = default_encoding
|
||||
|
||||
def __check_file(self, the_file, type):
|
||||
"""Check to see if files exist"""
|
||||
@ -227,14 +231,15 @@ class ParseRtf:
|
||||
run_level = self.__run_level,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
check_raw = True,
|
||||
default_encoding = self.__default_encoding,
|
||||
)
|
||||
platform, code_page, default_font_num = encode_obj.find_default_encoding()
|
||||
check_encoding_obj = check_encoding.CheckEncoding(
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
enc = encode_obj.get_codepage()
|
||||
if enc != 'mac_roman':
|
||||
enc = 'cp' + enc
|
||||
# TODO: check whether prefixing with 'cp' is a good idea or whether a dict should be used to convert
|
||||
enc = 'cp' + enc
|
||||
msg = '%s\nException in token processing' % str(msg)
|
||||
if check_encoding_obj.check_encoding(self.__file, enc):
|
||||
file_name = self.__file if isinstance(self.__file, str) \
|
||||
@ -308,6 +313,7 @@ class ParseRtf:
|
||||
in_file = self.__temp_file,
|
||||
run_level = self.__run_level,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
default_encoding = self.__default_encoding,
|
||||
)
|
||||
platform, code_page, default_font_num = encode_obj.find_default_encoding()
|
||||
hex2utf_obj = hex_2_utf8.Hex2Utf8(
|
||||
|
@ -14872,7 +14872,8 @@ LATIN SMALL LETTER U WITH DIAERESIS:'FC:252:ü
|
||||
LATIN SMALL LETTER Z WITH DOT ABOVE:'FD:380:ż
|
||||
LATIN SMALL LETTER Z WITH CARON:'FE:382:ž
|
||||
</ansicpg1257>
|
||||
<mac_roman>
|
||||
#mac_roman
|
||||
<ansicpg10000>
|
||||
LATIN CAPITAL LETTER A WITH DIAERESIS:'80:196:Ä
|
||||
LATIN CAPITAL LETTER A WITH RING ABOVE:'81:197:Å
|
||||
LATIN CAPITAL LETTER C WITH CEDILLA:'82:199:Ç
|
||||
@ -15001,7 +15002,7 @@ CEDILLA:'FC:184:¸
|
||||
DOUBLE ACUTE ACCENT:'FD:733:˝
|
||||
OGONEK:'FE:731:˛
|
||||
CARON:'FF:711:ˇ
|
||||
</mac_roman>
|
||||
</ansicpg10000>
|
||||
<caps_hex>
|
||||
LATIN SMALL LETTER A:'61:97:'41
|
||||
LATIN SMALL LETTER B:'62:98:'42
|
||||
|
@ -13,8 +13,7 @@ class CheckEncoding:
|
||||
try:
|
||||
char.decode(encoding)
|
||||
except UnicodeError, msg:
|
||||
sys.stderr.write('line: %s char: %s\n' % (line_num, char_position))
|
||||
sys.stderr.write(str(msg) + '\n')
|
||||
sys.stderr.write('line: %s char: %s\n%s\n' % (line_num, char_position, str(msg)))
|
||||
|
||||
def check_encoding(self, path, encoding='us-ascii', verbose=True):
|
||||
line_num = 0
|
||||
|
@ -36,8 +36,8 @@ class ConvertToTags:
|
||||
self.__dtd_path = dtd_path
|
||||
self.__no_dtd = no_dtd
|
||||
self.__encoding = 'cp' + encoding
|
||||
if encoding == 'mac_roman':
|
||||
self.__encoding = 'mac_roman'
|
||||
# if encoding == 'mac_roman':
|
||||
# self.__encoding = 'mac_roman'
|
||||
self.__indent = indent
|
||||
self.__run_level = run_level
|
||||
self.__write_to = better_mktemp()
|
||||
|
@ -61,12 +61,41 @@ class DefaultEncoding:
|
||||
"""
|
||||
Find the default encoding for the doc
|
||||
"""
|
||||
def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False):
|
||||
|
||||
# Note: not all of these encodings are actually supported by rtf2xml
|
||||
# See http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx
|
||||
# and src\calibre\gui2\widgets.py for the input list in calibre
|
||||
ENCODINGS = {
|
||||
# Special cases
|
||||
'cp1252':'1252',
|
||||
'utf-8':'1252',
|
||||
'ascii':'1252',
|
||||
# Normal cases
|
||||
'big5':'950',
|
||||
'cp1250':'1250',
|
||||
'cp1251':'1251',
|
||||
'cp1253':'1253',
|
||||
'cp1254':'1254',
|
||||
'cp1255':'1255',
|
||||
'cp1256':'1256',
|
||||
'shift_jis':'932',
|
||||
'gb2312':'936',
|
||||
#Not in RTF 1.9.1 codepage specification
|
||||
'hz':'52936',
|
||||
'iso8859_5':'28595',
|
||||
'iso2022_jp':'50222',
|
||||
'iso2022_kr':'50225',
|
||||
'euc_jp':'51932',
|
||||
'euc_kr':'51949',
|
||||
'gb18030':'54936',
|
||||
}
|
||||
|
||||
def __init__(self, in_file, bug_handler, default_encoding, run_level = 1, check_raw = False):
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__platform = 'Windows'
|
||||
self.__default_num = 'not-defined'
|
||||
self.__code_page = '1252'
|
||||
self.__code_page = self.ENCODINGS.get(default_encoding, '1252')
|
||||
self.__datafetched = False
|
||||
self.__fetchraw = check_raw
|
||||
|
||||
@ -75,16 +104,16 @@ class DefaultEncoding:
|
||||
self._encoding()
|
||||
self.__datafetched = True
|
||||
code_page = 'ansicpg' + self.__code_page
|
||||
if self.__code_page == '10000':
|
||||
self.__code_page = 'mac_roman'
|
||||
# if self.__code_page == '10000':
|
||||
# self.__code_page = 'mac_roman'
|
||||
return self.__platform, code_page, self.__default_num
|
||||
|
||||
def get_codepage(self):
|
||||
if not self.__datafetched:
|
||||
self._encoding()
|
||||
self.__datafetched = True
|
||||
if self.__code_page == '10000':
|
||||
self.__code_page = 'mac_roman'
|
||||
# if self.__code_page == '10000':
|
||||
# self.__code_page = 'mac_roman'
|
||||
return self.__code_page
|
||||
|
||||
def get_platform(self):
|
||||
@ -148,6 +177,7 @@ if __name__ == '__main__':
|
||||
import sys
|
||||
encode_obj = DefaultEncoding(
|
||||
in_file = sys.argv[1],
|
||||
default_encoding = sys.argv[2],
|
||||
bug_handler = Exception,
|
||||
check_raw = True,
|
||||
)
|
||||
|
@ -34,10 +34,8 @@ class GetCharMap:
|
||||
self.__bug_handler = bug_handler
|
||||
|
||||
def get_char_map(self, map):
|
||||
if map == 'ansicpg0':
|
||||
map = 'ansicpg1250'
|
||||
if map == 'ansicpg10000':
|
||||
map = 'mac_roman'
|
||||
# if map == 'ansicpg10000':
|
||||
# map = 'mac_roman'
|
||||
found_map = False
|
||||
map_dict = {}
|
||||
self.__char_file.seek(0)
|
||||
|
@ -27,8 +27,8 @@ class Hex2Utf8:
|
||||
default_char_map,
|
||||
bug_handler,
|
||||
invalid_rtf_handler,
|
||||
copy=None,
|
||||
temp_dir=None,
|
||||
copy= None,
|
||||
temp_dir= None,
|
||||
symbol = None,
|
||||
wingdings = None,
|
||||
caps = None,
|
||||
|
Loading…
x
Reference in New Issue
Block a user