diff --git a/src/calibre/ebooks/conversion/plugins/rtf_input.py b/src/calibre/ebooks/conversion/plugins/rtf_input.py index 45d7f16608..325f07cfe8 100644 --- a/src/calibre/ebooks/conversion/plugins/rtf_input.py +++ b/src/calibre/ebooks/conversion/plugins/rtf_input.py @@ -96,8 +96,13 @@ class RTFInput(InputFormatPlugin): # Write or do not write paragraphs. Default is 0. empty_paragraphs = 1, - #debug + # Debug deb_dir = debug_dir, + + # Default encoding + default_encoding = getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252', + + # Run level run_level = run_lev, ) parser.parse_rtf() diff --git a/src/calibre/ebooks/metadata/rtf.py b/src/calibre/ebooks/metadata/rtf.py index 5abbfb73f8..0eae93a8d3 100644 --- a/src/calibre/ebooks/metadata/rtf.py +++ b/src/calibre/ebooks/metadata/rtf.py @@ -1,5 +1,6 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' + """ Edit metadata in RTF files. """ @@ -61,7 +62,7 @@ def detect_codepage(stream): if match is not None: num = match.group(1) if num == '0': - num = '1250' + num = '1252' codec = 'cp'+num try: codecs.lookup(codec) @@ -82,7 +83,9 @@ def decode(raw, codec): return raw def get_metadata(stream): - """ Return metadata as a L{MetaInfo} object """ + """ + Return metadata as a L{MetaInfo} object + """ stream.seek(0) if stream.read(5) != r'{\rtf': return MetaInformation(_('Unknown')) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 30305b27a7..9c1ae0d00e 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -58,6 +58,8 @@ def Handle_Main(): group_borders = 1, # Write or do not write paragraphs. Default is 0. empty_paragraphs = 0, + # Allow to use a custom default encoding as fallback + default_encoding = 'cp1252', ) try: parse_obj.parse_rtf() @@ -101,6 +103,7 @@ class ParseRtf: empty_paragraphs = 1, no_dtd = 0, char_data = '', + default_encoding = 'cp1252', ): """ @@ -144,6 +147,7 @@ class ParseRtf: self.__group_borders = group_borders self.__empty_paragraphs = empty_paragraphs self.__no_dtd = no_dtd + self.__default_encoding = default_encoding def __check_file(self, the_file, type): """Check to see if files exist""" @@ -227,14 +231,15 @@ class ParseRtf: run_level = self.__run_level, bug_handler = RtfInvalidCodeException, check_raw = True, + default_encoding = self.__default_encoding, ) platform, code_page, default_font_num = encode_obj.find_default_encoding() check_encoding_obj = check_encoding.CheckEncoding( bug_handler = RtfInvalidCodeException, ) enc = encode_obj.get_codepage() - if enc != 'mac_roman': - enc = 'cp' + enc + #TODO: to check if cp is a good idea or if I should use a dict to convert + enc = 'cp' + enc msg = '%s\nException in token processing' % str(msg) if check_encoding_obj.check_encoding(self.__file, enc): file_name = self.__file if isinstance(self.__file, str) \ @@ -308,6 +313,7 @@ class ParseRtf: in_file = self.__temp_file, run_level = self.__run_level, bug_handler = RtfInvalidCodeException, + default_encoding = self.__default_encoding, ) platform, code_page, default_font_num = encode_obj.find_default_encoding() hex2utf_obj = hex_2_utf8.Hex2Utf8( diff --git a/src/calibre/ebooks/rtf2xml/char_set.py b/src/calibre/ebooks/rtf2xml/char_set.py index bfa18d9971..7f0d1d0d13 100755 --- a/src/calibre/ebooks/rtf2xml/char_set.py +++ b/src/calibre/ebooks/rtf2xml/char_set.py @@ -14872,7 +14872,8 @@ LATIN SMALL LETTER U WITH DIAERESIS:'FC:252:ü LATIN SMALL LETTER Z WITH DOT ABOVE:'FD:380:ż LATIN SMALL LETTER Z WITH CARON:'FE:382:ž - +#mac_roman + LATIN CAPITAL LETTER A WITH DIAERESIS:'80:196:Ä LATIN CAPITAL LETTER A WITH RING ABOVE:'81:197:Å LATIN CAPITAL LETTER C WITH CEDILLA:'82:199:Ç @@ -15001,7 +15002,7 @@ CEDILLA:'FC:184:¸ DOUBLE ACUTE ACCENT:'FD:733:˝ OGONEK:'FE:731:˛ CARON:'FF:711:ˇ - + LATIN SMALL LETTER A:'61:97:'41 LATIN SMALL LETTER B:'62:98:'42 diff --git a/src/calibre/ebooks/rtf2xml/check_encoding.py b/src/calibre/ebooks/rtf2xml/check_encoding.py index 0f52320aea..f424e9ff12 100755 --- a/src/calibre/ebooks/rtf2xml/check_encoding.py +++ b/src/calibre/ebooks/rtf2xml/check_encoding.py @@ -13,8 +13,7 @@ class CheckEncoding: try: char.decode(encoding) except UnicodeError, msg: - sys.stderr.write('line: %s char: %s\n' % (line_num, char_position)) - sys.stderr.write(str(msg) + '\n') + sys.stderr.write('line: %s char: %s\n%s\n' % (line_num, char_position, str(msg))) def check_encoding(self, path, encoding='us-ascii', verbose=True): line_num = 0 diff --git a/src/calibre/ebooks/rtf2xml/convert_to_tags.py b/src/calibre/ebooks/rtf2xml/convert_to_tags.py index 32e2bc69d7..d6af7688c7 100755 --- a/src/calibre/ebooks/rtf2xml/convert_to_tags.py +++ b/src/calibre/ebooks/rtf2xml/convert_to_tags.py @@ -36,8 +36,8 @@ class ConvertToTags: self.__dtd_path = dtd_path self.__no_dtd = no_dtd self.__encoding = 'cp' + encoding - if encoding == 'mac_roman': - self.__encoding = 'mac_roman' + # if encoding == 'mac_roman': + # self.__encoding = 'mac_roman' self.__indent = indent self.__run_level = run_level self.__write_to = better_mktemp() diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index c0a43db800..0da26d43da 100755 --- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -61,12 +61,41 @@ class DefaultEncoding: """ Find the default encoding for the doc """ - def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False): + + #Note: not all those encoding are really supported by rtf2xml + # See http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx + # and src\calibre\gui2\widgets.py for the input list in calibre + ENCODINGS = { + # Special cases + 'cp1252':'1252', + 'utf-8':'1252', + 'ascii':'1252', + # Normal cases + 'big5':'950', + 'cp1250':'1250', + 'cp1251':'1251', + 'cp1253':'1253', + 'cp1254':'1254', + 'cp1255':'1255', + 'cp1256':'1256', + 'shift_jis':'932', + 'gb2312':'936', + #Not in RTF 1.9.1 codepage specification + 'hz':'52936', + 'iso8859_5':'28595', + 'iso2022_jp':'50222', + 'iso2022_kr':'50225', + 'euc_jp':'51932', + 'euc_kr':'51949', + 'gb18030':'54936', + } + + def __init__(self, in_file, bug_handler, default_encoding, run_level = 1, check_raw = False): self.__file = in_file self.__bug_handler = bug_handler self.__platform = 'Windows' self.__default_num = 'not-defined' - self.__code_page = '1252' + self.__code_page = self.ENCODINGS.get(default_encoding, '1252') self.__datafetched = False self.__fetchraw = check_raw @@ -75,16 +104,16 @@ class DefaultEncoding: self._encoding() self.__datafetched = True code_page = 'ansicpg' + self.__code_page - if self.__code_page == '10000': - self.__code_page = 'mac_roman' + # if self.__code_page == '10000': + # self.__code_page = 'mac_roman' return self.__platform, code_page, self.__default_num def get_codepage(self): if not self.__datafetched: self._encoding() self.__datafetched = True - if self.__code_page == '10000': - self.__code_page = 'mac_roman' + # if self.__code_page == '10000': + # self.__code_page = 'mac_roman' return self.__code_page def get_platform(self): @@ -148,6 +177,7 @@ if __name__ == '__main__': import sys encode_obj = DefaultEncoding( in_file = sys.argv[1], + default_encoding = sys.argv[2], bug_handler = Exception, check_raw = True, ) diff --git a/src/calibre/ebooks/rtf2xml/get_char_map.py b/src/calibre/ebooks/rtf2xml/get_char_map.py index 00a612ffb3..96e6aa4787 100755 --- a/src/calibre/ebooks/rtf2xml/get_char_map.py +++ b/src/calibre/ebooks/rtf2xml/get_char_map.py @@ -34,10 +34,8 @@ class GetCharMap: self.__bug_handler = bug_handler def get_char_map(self, map): - if map == 'ansicpg0': - map = 'ansicpg1250' - if map == 'ansicpg10000': - map = 'mac_roman' + # if map == 'ansicpg10000': + # map = 'mac_roman' found_map = False map_dict = {} self.__char_file.seek(0) diff --git a/src/calibre/ebooks/rtf2xml/hex_2_utf8.py b/src/calibre/ebooks/rtf2xml/hex_2_utf8.py index 2c0f4ffdff..3a8773b0d6 100755 --- a/src/calibre/ebooks/rtf2xml/hex_2_utf8.py +++ b/src/calibre/ebooks/rtf2xml/hex_2_utf8.py @@ -27,8 +27,8 @@ class Hex2Utf8: default_char_map, bug_handler, invalid_rtf_handler, - copy=None, - temp_dir=None, + copy= None, + temp_dir= None, symbol = None, wingdings = None, caps = None,