RTF Input: Use input encoding setting for files with no codepage

RTF Input: When converting RTF files with no codepage, use the input encoding setting as the codepage. Fixes #1163572 [RTF charset ansicpg0 handling](https://bugs.launchpad.net/calibre/+bug/1163572). Merge branch 'RTF-changes' of https://github.com/sengian/calibre
2025-07-09 03:04:10 -04:00 · 2013-08-12 09:08:40 +05:30 · 2013-08-12 09:08:40 +05:30 · 437746f139
commit 437746f139
parent f22c2f4c7f 5170076301
9 changed files with 65 additions and 23 deletions
--- a/src/calibre/ebooks/conversion/plugins/rtf_input.py
+++ b/src/calibre/ebooks/conversion/plugins/rtf_input.py
@ -96,8 +96,13 @@ class RTFInput(InputFormatPlugin):
            # Write or do not write paragraphs. Default is 0.
            empty_paragraphs = 1,

-            #debug
+            # Debug
            deb_dir = debug_dir,
+
+            # Default encoding
+            default_encoding = getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252',
+
+            # Run level
            run_level = run_lev,
        )
        parser.parse_rtf()
--- a/src/calibre/ebooks/metadata/rtf.py
+++ b/src/calibre/ebooks/metadata/rtf.py
@ -1,5 +1,6 @@
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+
 """
 Edit metadata in RTF files.
 """
@ -61,7 +62,7 @@ def detect_codepage(stream):
    if match is not None:
        num = match.group(1)
        if num == '0':
-            num = '1250'
+            num = '1252'
        codec = 'cp'+num
        try:
            codecs.lookup(codec)
@ -82,7 +83,9 @@ def decode(raw, codec):
    return raw

 def get_metadata(stream):
-    """ Return metadata as a L{MetaInfo} object """
+    """
+    Return metadata as a L{MetaInfo} object
+    """
    stream.seek(0)
    if stream.read(5) != r'{\rtf':
        return MetaInformation(_('Unknown'))
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@ -58,6 +58,8 @@ def Handle_Main():
            group_borders = 1,
            # Write or do not write paragraphs. Default is 0.
            empty_paragraphs = 0,
+            # Allow a custom default encoding to be used as a fallback
+            default_encoding = 'cp1252',
    )
    try:
        parse_obj.parse_rtf()
@ -101,6 +103,7 @@ class ParseRtf:
                empty_paragraphs = 1,
                no_dtd = 0,
                char_data = '',
+                default_encoding = 'cp1252',
                ):

        """
@ -144,6 +147,7 @@ class ParseRtf:
        self.__group_borders = group_borders
        self.__empty_paragraphs = empty_paragraphs
        self.__no_dtd = no_dtd
+        self.__default_encoding = default_encoding

    def __check_file(self, the_file, type):
        """Check to see if files exist"""
@ -227,14 +231,15 @@ class ParseRtf:
            run_level = self.__run_level,
            bug_handler = RtfInvalidCodeException,
            check_raw = True,
+            default_encoding = self.__default_encoding,
            )
            platform, code_page, default_font_num = encode_obj.find_default_encoding()
            check_encoding_obj = check_encoding.CheckEncoding(
                    bug_handler = RtfInvalidCodeException,
                        )
            enc = encode_obj.get_codepage()
-            if enc != 'mac_roman':
-                enc = 'cp' + enc
+            # TODO: check whether prefixing 'cp' is a good idea, or whether a dict should be used to convert
+            enc = 'cp' + enc
            msg = '%s\nException in token processing' % str(msg)
            if check_encoding_obj.check_encoding(self.__file, enc):
                file_name = self.__file if isinstance(self.__file, str) \
@ -308,6 +313,7 @@ class ParseRtf:
            in_file = self.__temp_file,
            run_level = self.__run_level,
            bug_handler = RtfInvalidCodeException,
+            default_encoding = self.__default_encoding,
            )
        platform, code_page, default_font_num = encode_obj.find_default_encoding()
        hex2utf_obj = hex_2_utf8.Hex2Utf8(
--- a/src/calibre/ebooks/rtf2xml/char_set.py
+++ b/src/calibre/ebooks/rtf2xml/char_set.py
@ -14872,7 +14872,8 @@ LATIN SMALL LETTER U WITH DIAERESIS:'FC:252:&#x00FC;
 LATIN SMALL LETTER Z WITH DOT ABOVE:'FD:380:&#x017C;
 LATIN SMALL LETTER Z WITH CARON:'FE:382:&#x017E;
 </ansicpg1257>
-<mac_roman>
+#mac_roman
+<ansicpg10000>
 LATIN CAPITAL LETTER A WITH DIAERESIS:'80:196:&#x00C4;
 LATIN CAPITAL LETTER A WITH RING ABOVE:'81:197:&#x00C5;
 LATIN CAPITAL LETTER C WITH CEDILLA:'82:199:&#x00C7;
@ -15001,7 +15002,7 @@ CEDILLA:'FC:184:&#x00B8;
 DOUBLE ACUTE ACCENT:'FD:733:&#x02DD;
 OGONEK:'FE:731:&#x02DB;
 CARON:'FF:711:&#x02C7;
-</mac_roman>
+</ansicpg10000>
 <caps_hex>
 LATIN SMALL LETTER A:'61:97:'41
 LATIN SMALL LETTER B:'62:98:'42
--- a/src/calibre/ebooks/rtf2xml/check_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/check_encoding.py
@ -13,8 +13,7 @@ class CheckEncoding:
            try:
                char.decode(encoding)
            except UnicodeError, msg:
-                sys.stderr.write('line: %s char: %s\n' %  (line_num, char_position))
-                sys.stderr.write(str(msg) + '\n')
+                sys.stderr.write('line: %s char: %s\n%s\n' %  (line_num, char_position, str(msg)))

    def check_encoding(self, path, encoding='us-ascii', verbose=True):
        line_num = 0
--- a/src/calibre/ebooks/rtf2xml/convert_to_tags.py
+++ b/src/calibre/ebooks/rtf2xml/convert_to_tags.py
@ -36,8 +36,8 @@ class ConvertToTags:
        self.__dtd_path = dtd_path
        self.__no_dtd = no_dtd
        self.__encoding = 'cp' + encoding
-        if encoding == 'mac_roman':
-            self.__encoding = 'mac_roman'
+        # if encoding == 'mac_roman':
+            # self.__encoding = 'mac_roman'
        self.__indent = indent
        self.__run_level = run_level
        self.__write_to = better_mktemp()
--- a/src/calibre/ebooks/rtf2xml/default_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/default_encoding.py
@ -61,12 +61,41 @@ class DefaultEncoding:
    """
    Find the default encoding for the doc
    """
-    def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False):
+
+    # Note: not all of these encodings are actually supported by rtf2xml
+    # See http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx
+    # and src\calibre\gui2\widgets.py for the input list in calibre
+    ENCODINGS = {
+                # Special cases
+                'cp1252':'1252',
+                'utf-8':'1252',
+                'ascii':'1252',
+                # Normal cases
+                'big5':'950',
+                'cp1250':'1250',
+                'cp1251':'1251',
+                'cp1253':'1253',
+                'cp1254':'1254',
+                'cp1255':'1255',
+                'cp1256':'1256',
+                'shift_jis':'932',
+                'gb2312':'936',
+                #Not in RTF 1.9.1 codepage specification
+                'hz':'52936',
+                'iso8859_5':'28595',
+                'iso2022_jp':'50222',
+                'iso2022_kr':'50225',
+                'euc_jp':'51932',
+                'euc_kr':'51949',
+                'gb18030':'54936',
+                }
+
+    def __init__(self, in_file, bug_handler, default_encoding, run_level = 1, check_raw = False):
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__platform = 'Windows'
        self.__default_num = 'not-defined'
-        self.__code_page = '1252'
+        self.__code_page = self.ENCODINGS.get(default_encoding, '1252')
        self.__datafetched = False
        self.__fetchraw = check_raw

@ -75,16 +104,16 @@ class DefaultEncoding:
            self._encoding()
            self.__datafetched = True
            code_page = 'ansicpg' + self.__code_page
-            if self.__code_page == '10000':
-                self.__code_page = 'mac_roman'
+            # if self.__code_page == '10000':
+                # self.__code_page = 'mac_roman'
        return self.__platform, code_page, self.__default_num

    def get_codepage(self):
        if not self.__datafetched:
            self._encoding()
            self.__datafetched = True
-            if self.__code_page == '10000':
-                self.__code_page = 'mac_roman'
+            # if self.__code_page == '10000':
+                # self.__code_page = 'mac_roman'
        return self.__code_page

    def get_platform(self):
@ -148,6 +177,7 @@ if __name__ == '__main__':
    import sys
    encode_obj = DefaultEncoding(
            in_file = sys.argv[1],
+            default_encoding = sys.argv[2],
            bug_handler = Exception,
            check_raw = True,
            )
--- a/src/calibre/ebooks/rtf2xml/get_char_map.py
+++ b/src/calibre/ebooks/rtf2xml/get_char_map.py
@ -34,10 +34,8 @@ class GetCharMap:
        self.__bug_handler = bug_handler

    def get_char_map(self, map):
-        if map == 'ansicpg0':
-            map = 'ansicpg1250'
-        if map == 'ansicpg10000':
-            map = 'mac_roman'
+        # if map == 'ansicpg10000':
+            # map = 'mac_roman'
        found_map = False
        map_dict = {}
        self.__char_file.seek(0)
--- a/src/calibre/ebooks/rtf2xml/hex_2_utf8.py
+++ b/src/calibre/ebooks/rtf2xml/hex_2_utf8.py
@ -27,8 +27,8 @@ class Hex2Utf8:
            default_char_map,
            bug_handler,
            invalid_rtf_handler,
-            copy=None,
-            temp_dir=None,
+            copy= None,
+            temp_dir= None,
            symbol = None,
            wingdings = None,
            caps = None,