RTF Input: Fix regression in 0.7.40 that broke conversion of some old-style RTF files

2025-07-09 03:04:10 -04:00 · 2011-01-15 15:46:19 -07:00 · 2011-01-15 15:46:19 -07:00 · f6d72fbe0b
commit f6d72fbe0b
parent 4e93c30cd0
7 changed files with 153 additions and 95 deletions
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@ -286,7 +286,6 @@ class RTFInput(InputFormatPlugin):
        try:
            xml = self.generate_xml(stream.name)
        except RtfInvalidCodeException, e:
-            raise
            raise ValueError(_('This RTF file has a feature calibre does not '
            'support. Convert it to HTML first and then try it.\n%s')%e)

--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@ -226,10 +226,6 @@ class ParseRtf:
        try:
            return_value = process_tokens_obj.process_tokens()
        except InvalidRtfException, msg:
-            try:
-                os.remove(self.__temp_file)
-            except OSError:
-                pass
            #Check to see if the file is correctly encoded
            encode_obj = default_encoding.DefaultEncoding(
            in_file = self.__temp_file,
@ -241,14 +237,17 @@ class ParseRtf:
            check_encoding_obj = check_encoding.CheckEncoding(
                    bug_handler = RtfInvalidCodeException,
                        )
-            enc = encode_obj.get_codepage()
-            if enc != 'mac_roman':
-                enc = 'cp' + enc
+            enc = 'cp' + encode_obj.get_codepage()
+            msg = 'Exception in token processing'
            if check_encoding_obj.check_encoding(self.__file, enc):
                file_name = self.__file if isinstance(self.__file, str) \
                                    else self.__file.encode('utf-8')
                msg = 'File %s does not appear to be correctly encoded.\n' % file_name
-                raise InvalidRtfException, msg
+            try:
+                os.remove(self.__temp_file)
+            except OSError:
+                pass
+            raise InvalidRtfException, msg
        delete_info_obj = delete_info.DeleteInfo(
            in_file = self.__temp_file,
            copy = self.__copy,
--- a/src/calibre/ebooks/rtf2xml/default_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/default_encoding.py
@ -74,9 +74,6 @@ class DefaultEncoding:
        if not self.__datafetched:
            self._encoding()
            self.__datafetched = True
-        if self.__platform == 'Macintosh':
-            code_page = self.__code_page
-        else:
            code_page = 'ansicpg' + self.__code_page
        return self.__platform, code_page, self.__default_num

@ -94,49 +91,60 @@ class DefaultEncoding:

    def _encoding(self):
        with open(self.__file, 'r') as read_obj:
+            cpfound = False
            if not self.__fetchraw:
                for line in read_obj:
                    self.__token_info = line[:16]
                    if self.__token_info == 'mi<mk<rtfhed-end':
                        break
-                    if self.__token_info == 'cw<ri<ansi-codpg':
-                        #cw<ri<ansi-codpg<nu<10000
-                        self.__code_page = line[20:-1] if int(line[20:-1]) \
-                                            else '1252'
                    if self.__token_info == 'cw<ri<macintosh_':
                        self.__platform = 'Macintosh'
-                        self.__code_page = 'mac_roman'
                    elif self.__token_info == 'cw<ri<pc________':
                        self.__platform = 'IBMPC'
-                        self.__code_page = '437'
                    elif self.__token_info == 'cw<ri<pca_______':
                        self.__platform = 'OS/2'
-                        self.__code_page = '850'
+                    if self.__token_info == 'cw<ri<ansi-codpg' \
+                        and int(line[20:-1]):
+                            self.__code_page = line[20:-1]
                    if self.__token_info == 'cw<ri<deflt-font':
                        self.__default_num = line[20:-1]
+                        cpfound = True
                        #cw<ri<deflt-font<nu<0
+                if self.__platform != 'Windows' and \
+                        not cpfound:
+                    if self.__platform == 'Macintosh':
+                       self.__code_page = '10000'
+                    elif self.__platform == 'IBMPC':
+                        self.__code_page = '437'
+                    elif self.__platform == 'OS/2':
+                        self.__code_page = '850'
            else:
                fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
                fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
+
                for line in read_obj:
+                    if fenc.search(line):
+                        enc = fenc.search(line).group(1)
                    if fenccp.search(line):
                        cp = fenccp.search(line).group(1)
                        if not int(cp):
                            self.__code_page = cp
+                        cpfound = True
                        break
-                    if fenc.search(line):
-                        enc = fenc.search(line).group(1)
-                        if enc == 'mac':
-                            self.__code_page = 'mac_roman'
-                        elif enc == 'pc':
-                            self.__code_page = '437'
-                        elif enc == 'pca':
-                            self.__code_page = '850'
+                if self.__platform != 'Windows' and \
+                        not cpfound:
+                    if enc == 'mac':
+                        self.__code_page = '10000'
+                    elif enc == 'pc':
+                        self.__code_page = '437'
+                    elif enc == 'pca':
+                        self.__code_page = '850'

-# if __name__ == '__main__':
-    # encode_obj = DefaultEncoding(
-            # in_file = sys.argv[1],
-            # bug_handler = Exception,
-            # check_raw = True,
-            # )
-    # print encode_obj.get_codepage()
+if __name__ == '__main__':
+    import sys
+    encode_obj = DefaultEncoding(
+            in_file = sys.argv[1],
+            bug_handler = Exception,
+            check_raw = True,
+            )
+    print encode_obj.get_codepage()
--- a/src/calibre/ebooks/rtf2xml/delete_info.py
+++ b/src/calibre/ebooks/rtf2xml/delete_info.py
@ -20,7 +20,7 @@ import sys, os, tempfile
 from calibre.ebooks.rtf2xml import copy

 class DeleteInfo:
-    """Delelet unecessary destination groups"""
+    """Delete unnecessary destination groups"""
    def __init__(self,
            in_file ,
            bug_handler,
@ -31,17 +31,14 @@ class DeleteInfo:
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = tempfile.mktemp()
+        self.__run_level = run_level
+        self.__initiate_allow()
        self.__bracket_count= 0
        self.__ob_count = 0
        self.__cb_count = 0
-        # self.__after_asterisk = False
-        # self.__delete = 0
-        self.__initiate_allow()
        self.__ob = 0
        self.__write_cb = False
-        self.__run_level = run_level
        self.__found_delete = False
-        # self.__list = False

    def __initiate_allow(self):
        """
@ -57,6 +54,8 @@ class DeleteInfo:
                            'cw<an<annotation',
                            'cw<cm<comment___',
                            'cw<it<lovr-table',
+                            # info table
+                            'cw<di<company___',
                            # 'cw<ls<list______',
                        )
        self.__not_allowable = (
@ -116,7 +115,6 @@ class DeleteInfo:
        """
        # Test for {\*}, in which case don't enter
        # delete state
-        # self.__after_asterisk = False # only enter this function once
        self.__found_delete = True
        if self.__token_info == 'cb<nu<clos-brack':
            if self.__delete_count == self.__cb_count:
@ -128,7 +126,7 @@ class DeleteInfo:
                # not sure what happens here!
                # believe I have a '{\*}
                if self.__run_level > 3:
-                    msg = 'flag problem\n'
+                    msg = 'Flag problem\n'
                    raise self.__bug_handler, msg
                return True
        elif self.__token_info in self.__allowable :
@ -173,8 +171,8 @@ class DeleteInfo:
        Return True for all control words.
        Return False otherwise.
        """
-        if self.__delete_count == self.__cb_count and self.__token_info ==\
-            'cb<nu<clos-brack':
+        if self.__delete_count == self.__cb_count and \
+                self.__token_info == 'cb<nu<clos-brack':
            self.__state = 'default'
            if self.__write_cb:
                self.__write_cb = False
@ -186,32 +184,24 @@ class DeleteInfo:
            return False

    def delete_info(self):
-        """Main method for handling other methods. Read one line in at
+        """Main method for handling other methods. Read one line at
        a time, and determine whether to print the line based on the state."""
        with open(self.__file, 'r') as read_obj:
            with open(self.__write_to, 'w') as self.__write_obj:
                for line in read_obj:
                    #ob<nu<open-brack<0001
-                    to_print = True
                    self.__token_info = line[:16]
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
+                    # Get action to perform
                    action = self.__state_dict.get(self.__state)
                    if not action:
-                        sys.stderr.write(_('No action in dictionary state is "%s" \n')
+                        sys.stderr.write('No action in dictionary state is "%s" \n'
                                % self.__state)
-                    to_print = action(line)
-                    # if self.__after_asterisk:
-                        # to_print = self.__asterisk_func(line)
-                    # elif self.__list:
-                        # self.__in_list_func(line)
-                    # elif self.__delete:
-                        # to_print = self.__delete_func(line)
-                    # else:
-                        # to_print = self.__default_func(line)
-                    if to_print:
+                    # Print if allowed by action
+                    if action(line):
                        self.__write_obj.write(line)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
--- a/src/calibre/ebooks/rtf2xml/info.py
+++ b/src/calibre/ebooks/rtf2xml/info.py
@ -15,8 +15,10 @@
 #                                                                       #
 #                                                                       #
 #########################################################################
-import sys, os, tempfile
+import sys, os, tempfile, re
+
 from calibre.ebooks.rtf2xml import copy
+
 class Info:
    """
    Make tags for document-information
@ -42,12 +44,14 @@ class Info:
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = tempfile.mktemp()
+
    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__text_string = ''
        self.__state = 'before_info_table'
+        self.rmspace = re.compile(r'\s+')
        self.__state_dict = {
        'before_info_table': self.__before_info_table_func,
        'after_info_table': self.__after_info_table_func,
@ -58,27 +62,49 @@ class Info:
        self.__info_table_dict = {
        'cw<di<title_____'  : (self.__found_tag_with_text_func, 'title'),
        'cw<di<author____'  : (self.__found_tag_with_text_func, 'author'),
+        'cw<di<operator__'  : (self.__found_tag_with_text_func, 'operator'),
+        'cw<di<manager___'  : (self.__found_tag_with_text_func, 'manager'),
+        'cw<di<company___'  : (self.__found_tag_with_text_func, 'company'),
        'cw<di<keywords__'  : (self.__found_tag_with_text_func, 'keywords'),
+        'cw<di<category__'  : (self.__found_tag_with_text_func, 'category'),
        'cw<di<doc-notes_'  : (self.__found_tag_with_text_func, 'doc-notes'),
        'cw<di<subject___'  : (self.__found_tag_with_text_func, 'subject'),
-        'cw<di<operator__'  : (self.__found_tag_with_text_func, 'operator'),
+        'cw<di<linkbase__'  : (self.__found_tag_with_text_func, 'hyperlink-base'),
+        
        'cw<di<create-tim'  : (self.__found_tag_with_tokens_func, 'creation-time'),
-        'cw<di<revis-time'  :  (self.__found_tag_with_tokens_func, 'revision-time'),
-        'cw<di<edit-time_'  : (self.__single_field_func, 'editing-time'),
+        'cw<di<revis-time'  : (self.__found_tag_with_tokens_func, 'revision-time'),
+        'cw<di<edit-time_'  : (self.__found_tag_with_tokens_func, 'editing-time'),
+        'cw<di<print-time'  : (self.__found_tag_with_tokens_func, 'printing-time'),
+        'cw<di<backuptime'  : (self.__found_tag_with_tokens_func, 'backup-time'),
+        
        'cw<di<num-of-wor'  : (self.__single_field_func, 'number-of-words'),
        'cw<di<num-of-chr'  : (self.__single_field_func, 'number-of-characters'),
+        'cw<di<numofchrws'  : (self.__single_field_func, 'number-of-characters-without-space'),
        'cw<di<num-of-pag'  : (self.__single_field_func, 'number-of-pages'),
+        'cw<di<version___'  : (self.__single_field_func, 'version'),
+        'cw<di<intern-ver'  : (self.__single_field_func, 'internal-version-number'),
+        'cw<di<internalID'  : (self.__single_field_func, 'internal-id-number'),
        }
        self.__token_dict = {
        'year______'        : 'year',
        'month_____'        : 'month',
        'day_______'        : 'day',
        'minute____'        : 'minute',
+        'second____'        : 'second',
        'revis-time'        : 'revision-time',
+        'create-tim'        : 'creation-time',
+        'edit-time_'        : 'editing-time',
+        'print-time'        : 'printing-time',
+        'backuptime'        : 'backup-time',
        'num-of-wor'        : 'number-of-words',
        'num-of-chr'        : 'number-of-characters',
+        'numofchrws'        : 'number-of-characters-without-space',
        'num-of-pag'        : 'number-of-pages',
+        'version___'        : 'version',
+        'intern-ver'        : 'internal-version-number',
+        'internalID'        : 'internal-id-number',
        }
+
    def __before_info_table_func(self, line):
        """
        Required:
@ -92,6 +118,7 @@ class Info:
        if self.__token_info == 'mi<mk<doc-in-beg':
            self.__state = 'in_info_table'
        self.__write_obj.write(line)
+
    def __in_info_table_func(self, line):
        """
        Requires:
@ -112,6 +139,7 @@ class Info:
                action(line, tag)
            else:
                self.__write_obj.write(line)
+
    def __found_tag_with_text_func(self, line, tag):
        """
        Requires:
@ -126,6 +154,7 @@ class Info:
        """
        self.__tag = tag
        self.__state = 'collect_text'
+
    def __collect_text_func(self, line):
        """
        Requires:
@ -139,14 +168,17 @@ class Info:
        """
        if self.__token_info == 'mi<mk<docinf-end':
            self.__state = 'in_info_table'
-            self.__write_obj.write(
-                'mi<tg<open______<%s\n'
-                'tx<nu<__________<%s\n'
-                'mi<tg<close_____<%s\n' % (self.__tag, self.__text_string, self.__tag)
-            )
+            #Don't print empty tags
+            if len(self.rmspace.sub('',self.__text_string)):
+                self.__write_obj.write(
+                    'mi<tg<open______<%s\n'
+                    'tx<nu<__________<%s\n'
+                    'mi<tg<close_____<%s\n' % (self.__tag, self.__text_string, self.__tag)
+                )
            self.__text_string = ''
        elif line[0:2] == 'tx':
            self.__text_string += line[17:-1]
+
    def __found_tag_with_tokens_func(self, line, tag):
        """
        Requires:
@ -163,6 +195,7 @@ class Info:
        self.__state = 'collect_tokens'
        self.__text_string = 'mi<tg<empty-att_<%s' % tag
        #mi<tg<empty-att_<page-definition<margin>33\n
+
    def __collect_tokens_func(self, line):
        """
        Requires:
@ -194,18 +227,19 @@ class Info:
            att = line[6:16]
            value = line[20:-1]
            att_changed = self.__token_dict.get(att)
-            if att_changed == None:
+            if att_changed is None:
                if self.__run_level > 3:
-                    msg = 'no dictionary match for %s\n' % att
+                    msg = 'No dictionary match for %s\n' % att
                    raise self.__bug_handler, msg
            else:
                self.__text_string += '<%s>%s' % (att_changed, value)
+
    def __single_field_func(self, line, tag):
        value = line[20:-1]
        self.__write_obj.write(
-        'mi<tg<empty-att_<%s'
-        '<%s>%s\n' % (tag, tag, value)
+        'mi<tg<empty-att_<%s<%s>%s\n' % (tag, tag, value)
        )
+
    def __after_info_table_func(self, line):
        """
        Requires:
@ -217,6 +251,7 @@ class Info:
            the file.
        """
        self.__write_obj.write(line)
+
    def fix_info(self):
        """
        Requires:
@ -234,20 +269,15 @@ class Info:
            information table, simply write the line to the output file.
        """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('no no matching state in module styles.py\n')
-                sys.stderr.write(self.__state + '\n')
-            action(line)
-        read_obj.close()
-        self.__write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'wb') as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write('No matching state in module styles.py\n')
+                        sys.stderr.write(self.__state + '\n')
+                    action(line)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "info.data")
--- a/src/calibre/ebooks/rtf2xml/process_tokens.py
+++ b/src/calibre/ebooks/rtf2xml/process_tokens.py
@ -70,7 +70,7 @@ class ProcessTokens:
        ';'                  :	('mc', ';', self.ms_sub_func),
        # this must be wrong
        '-'                  :	('mc', '-', self.ms_sub_func),
-        'line'               :  ('mi', 'hardline-break', self.hardline_func), #calibre
+        'line'               :  ('mi', 'hardline-break', self.direct_conv_func), #calibre
        # misc => ml
        '*'                  :	('ml', 'asterisk__', self.default_func),
        ':'                  :	('ml', 'colon_____', self.default_func),
@ -78,7 +78,6 @@ class ProcessTokens:
        'backslash'          :	('nu', '\\', self.text_func),
        'ob'                 :	('nu', '{', self.text_func),
        'cb'                 :	('nu', '}', self.text_func),
-        #'line'               :  ('nu', ' ', self.text_func), calibre
        # paragraph formatting => pf
        'page'               :  ('pf', 'page-break', self.default_func),
        'par'                :	('pf', 'par-end___', self.default_func),
@ -231,11 +230,15 @@ class ProcessTokens:
        'trhdr'              :  ('tb', 'row-header', self.default_func),
        # preamble => pr
        # document information => di
+        # TODO integrate \userprops
        'info'               :	('di', 'doc-info__', self.default_func),
+        'title'              :	('di', 'title_____', self.default_func),
        'author'             :	('di', 'author____', self.default_func),
        'operator'           :	('di', 'operator__', self.default_func),
-        'title'              :	('di', 'title_____', self.default_func),
+        'manager'            :	('di', 'manager___', self.default_func),
+        'company'            :	('di', 'company___', self.default_func),
        'keywords'           :  ('di', 'keywords__', self.default_func),
+        'category'           :  ('di', 'category__', self.default_func),
        'doccomm'            :  ('di', 'doc-notes_', self.default_func),
        'comment'            :  ('di', 'doc-notes_', self.default_func),
        'subject'            :  ('di', 'subject___', self.default_func),
@ -244,11 +247,19 @@ class ProcessTokens:
        'mo'                 :	('di', 'month_____', self.default_func),
        'dy'                 :	('di', 'day_______', self.default_func),
        'min'                :	('di', 'minute____', self.default_func),
+        'sec'                :	('di', 'second____', self.default_func),
        'revtim'             :	('di', 'revis-time', self.default_func),
+        'edmins'             :	('di', 'edit-time_', self.default_func),
+        'printim'            :	('di', 'print-time', self.default_func),
+        'buptim'             :	('di', 'backuptime', self.default_func),
        'nofwords'           :	('di', 'num-of-wor', self.default_func),
        'nofchars'           :	('di', 'num-of-chr', self.default_func),
+        'nofcharsws'         :	('di', 'numofchrws', self.default_func),
        'nofpages'           :	('di', 'num-of-pag', self.default_func),
-        'edmins'             :	('di', 'edit-time_', self.default_func),
+        'version'            :	('di', 'version___', self.default_func),
+        'vern'               :	('di', 'intern-ver', self.default_func),
+        'hlinkbase'          :	('di', 'linkbase__', self.default_func),
+        'id'                 :	('di', 'internalID', self.default_func),
        # headers and footers => hf
        'headerf'            :	('hf', 'head-first', self.default_func),
        'headerl'            :	('hf', 'head-left_', self.default_func),
@ -605,7 +616,7 @@ class ProcessTokens:
    def ms_sub_func(self, pre, token, num):
        return 'tx<mc<__________<%s\n' % token

-    def hardline_func(self, pre, token, num):
+    def direct_conv_func(self, pre, token, num):
        return 'mi<tg<empty_____<%s\n' % token

    def default_func(self, pre, token, num):
--- a/src/calibre/ebooks/rtf2xml/tokenize.py
+++ b/src/calibre/ebooks/rtf2xml/tokenize.py
@ -27,11 +27,13 @@ class Tokenize:
            bug_handler,
            copy = None,
            run_level = 1,
-    ):
+            # out_file = None,
+        ):
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = tempfile.mktemp()
+        # self.__out_file = out_file
        self.__compile_expressions()
        #variables
        self.__uc_char = 0
@ -113,6 +115,8 @@ class Tokenize:

    def __sub_reg_split(self,input_file):
        input_file = self.__replace_spchar.mreplace(input_file)
+        # this is for older RTF
+        input_file = self.__par_exp.sub('\n\\par \n', input_file)
        input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
        input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
        #remove \n in bin data
@ -153,8 +157,6 @@ class Tokenize:
            # put a backslash in front of to eliminate special cases and
            # make processing easier
            "}": "\\}",
-            # this is for older RTF
-            r'\\$': '\\par ',
            }
        self.__replace_spchar = MReplace(SIMPLE_RPL)
        #add ;? in case of char following \u
@ -168,10 +170,12 @@ class Tokenize:
        #why keep backslash whereas \is replaced before?
        #remove \n from endline char
        self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
+        #this is for old RTF
+        self.__par_exp = re.compile(r'\\\n+')
+        # self.__par_exp = re.compile(r'\\$')
        #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
        #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
        #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
-        #self.__par_exp = re.compile(r'\\$')
        #self.__remove_line = re.compile(r'\n+')
        #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
        ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
@ -199,7 +203,24 @@ class Tokenize:
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "tokenize.data")
+        # if self.__out_file:
+            # self.__file = self.__out_file
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        
        #self.__special_tokens = [ '_', '~', "'", '{', '}' ]
+
+# import sys
+# def main(args=sys.argv):
+    # if len(args) < 1:
+        # print 'No file'
+        # return
+    # file = 'data_tokens.txt'
+    # if len(args) == 3:
+        # file = args[2]
+    # to = Tokenize(args[1], Exception, out_file = file)
+    # to.tokenize()
+
+
+# if __name__ == '__main__':
+    # sys.exit(main())