Handle improper \*\csN in body without braces

2025-07-09 03:04:10 -04:00 · 2011-01-16 00:47:01 +01:00 · 2011-01-16 00:47:01 +01:00 · fc42efda42
commit fc42efda42
parent 2e033022b7
4 changed files with 39 additions and 24 deletions
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@ -375,7 +375,7 @@ class ParseRtf:
        old_rtf = old_rtf_obj.check_if_old_rtf()
        if old_rtf:
            if self.__run_level > 5:
-                msg = 'older RTF\n'
+                msg = 'Older RTF\n'
                msg += 'self.__run_level is "%s"\n' % self.__run_level
                raise RtfInvalidCodeException, msg
            if self.__run_level > 1:
--- a/src/calibre/ebooks/rtf2xml/delete_info.py
+++ b/src/calibre/ebooks/rtf2xml/delete_info.py
@ -48,6 +48,7 @@ class DeleteInfo:
                            'cw<it<listtable_',
                            'cw<it<revi-table',
                            'cw<ls<list-lev-d',
+                            # Field allowed
                            'cw<fd<field-inst',
                            'cw<an<book-mk-st',
                            'cw<an<book-mk-en',
@ -86,7 +87,7 @@ class DeleteInfo:
            self.__ob = line
            return False
        else:
-            # write previous bracket, since didn't fine asterisk
+            # write previous bracket, since didn't find asterisk
            if self.__ob:
                self.__write_obj.write(self.__ob)
                self.__ob = 0
@ -109,7 +110,7 @@ class DeleteInfo:
        If you find that you are in a delete group, and the previous
        token in not an open bracket (self.__ob = 0), that means
        that the delete group is nested inside another acceptable
-        detination group. In this case, you have alrady written
+        destination group. In this case, you have already written
        the open bracket, so you will need to write the closed one
        as well.
        """
--- a/src/calibre/ebooks/rtf2xml/fields_small.py
+++ b/src/calibre/ebooks/rtf2xml/fields_small.py
@ -15,8 +15,10 @@
 #                                                                       #
 #                                                                       #
 #########################################################################
-import sys, os, tempfile,   re
+import sys, os, tempfile, re
+
 from calibre.ebooks.rtf2xml import field_strings, copy
+
 class FieldsSmall:
    """
 =================
@ -24,7 +26,7 @@ Purpose
 =================
 Write tags for bookmarks, index and toc entry fields in a tokenized file.
 This module does not handle toc or index tables.  (This module won't be any
-use to use to you unless you use it as part of the other modules.)
+use to you unless you use it as part of the other modules.)
 -----------
 Method
 -----------
@ -55,6 +57,7 @@ file.
        self.__copy = copy
        self.__write_to = tempfile.mktemp()
        self.__run_level = run_level
+
    def __initiate_values(self):
        """
        Initiate all values.
@ -81,6 +84,7 @@ file.
        tx = 'tx<nu<__________<(.*?)'
        reg_st = ob + bk_st + tx + cb
        self.__book_start = re.compile(r'%s' % reg_st)
+
    def __before_body_func(self, line):
        """
        Requires:
@ -94,6 +98,7 @@ file.
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'body'
        self.__write_obj.write(line)
+
    def __body_func(self, line):
        """
        Requires:
@ -110,6 +115,7 @@ file.
            action(line, tag)
        else:
            self.__write_obj.write(line)
+
    def __found_bookmark_func(self, line, tag):
        """
        Requires:
@ -125,6 +131,7 @@ file.
        self.__cb_count = 0
        self.__state = 'bookmark'
        self.__type_of_bookmark = tag
+
    def __bookmark_func(self, line):
        """
        Requires:
@ -153,6 +160,7 @@ file.
            self.__write_obj.write(line)
        elif line[0:2] == 'tx':
            self.__text_string += line[17:-1]
+
    def __parse_index_func(self, my_string):
        """
        Requires:
@ -201,6 +209,7 @@ file.
            my_changed_string += '<sub-entry>%s' % sub_entry
        my_changed_string += '\n'
        return my_changed_string
+
    def __index_see_func(self, my_string):
        in_see = 0
        bracket_count = 0
@ -226,6 +235,7 @@ file.
                    in_see = 1
                changed_string += '%s\n' % line
        return changed_string, see_string
+
    def __index_bookmark_func(self, my_string):
        """
        Requries:
@ -262,6 +272,7 @@ file.
                    in_bookmark = 1
                index_string += '%s\n' % line
        return index_string, bookmark_string
+
    def __index__format_func(self, my_string):
        italics = 0
        bold =0
@ -273,6 +284,7 @@ file.
            if token_info == 'cw<in<index-ital':
                italics = 1
        return italics, bold
+
    def __parse_toc_func(self, my_string):
        """
        Requires:
@ -308,6 +320,7 @@ file.
        my_changed_string += '<main-entry>%s' % main_entry
        my_changed_string += '\n'
        return my_changed_string
+
    def __parse_bookmark_for_toc(self, my_string):
        """
        Requires:
@ -353,6 +366,7 @@ file.
                    in_bookmark = 1
                toc_string += '%s\n' % line
        return toc_string, book_start_string, book_end_string
+
    def __parse_bookmark_func(self, my_string, type):
        """
        Requires:
@ -367,6 +381,7 @@ file.
        my_changed_string = ('mi<tg<empty-att_<field<type>%s'
        '<number>%s<update>none\n' % (type, my_string))
        return my_changed_string
+
    def __found_toc_index_func(self, line, tag):
        """
        Requires:
@ -382,6 +397,7 @@ file.
        self.__cb_count = 0
        self.__state = 'toc_index'
        self.__tag = tag
+
    def __toc_index_func(self, line):
        """
        Requires:
@ -409,6 +425,7 @@ file.
            self.__write_obj.write(line)
        else:
            self.__text_string += line
+
    def fix_fields(self):
        """
        Requires:
@ -423,24 +440,19 @@ file.
           bookmark.
        """
        self.__initiate_values()
-        read_obj = open(self.__file)
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = '1'
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            if self.__token_info == 'ob<nu<open-brack':
-                self.__ob_count = line[-5:-1]
-            if self.__token_info == 'cb<nu<clos-brack':
-                self.__cb_count = line[-5:-1]
-            action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('no no matching state in module fields_small.py\n')
-                sys.stderr.write(self.__state + '\n')
-            action(line)
-        read_obj.close()
-        self.__write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    if self.__token_info == 'ob<nu<open-brack':
+                        self.__ob_count = line[-5:-1]
+                    if self.__token_info == 'cb<nu<clos-brack':
+                        self.__cb_count = line[-5:-1]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write('No matching state in module fields_small.py\n')
+                        sys.stderr.write(self.__state + '\n')
+                    action(line)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "fields_small.data")
--- a/src/calibre/ebooks/rtf2xml/tokenize.py
+++ b/src/calibre/ebooks/rtf2xml/tokenize.py
@ -115,8 +115,8 @@ class Tokenize:

    def __sub_reg_split(self,input_file):
        input_file = self.__replace_spchar.mreplace(input_file)
-        # this is for older RTF
        input_file = self.__par_exp.sub('\n\\par \n', input_file)
+        input_file = self.__cs_ast.sub("\g<1>", input_file)
        input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
        input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
        #remove \n in bin data
@ -172,6 +172,8 @@ class Tokenize:
        self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
        #this is for old RTF
        self.__par_exp = re.compile(r'\\\n+')
+        #handle improper \cs char-style preceded by \* but not enclosed in {
+        self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)')
        # self.__par_exp = re.compile(r'\\$')
        #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
        #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")