calibre/src/libprs500/ebooks/rtf2xml/fields_small.py

#########################################################################
#                                                                       #
#                                                                       #
#   copyright 2002 Paul Henry Tremblay                                  #
#                                                                       #
#   This program is distributed in the hope that it will be useful,     #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
#   General Public License for more details.                            #
#                                                                       #
#   You should have received a copy of the GNU General Public License   #
#   along with this program; if not, write to the Free Software         #
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA            #
#   02111-1307 USA                                                      #
#                                                                       #
#                                                                       #
#########################################################################
import sys, os, tempfile,   re
from libprs500.ebooks.rtf2xml import field_strings, copy
class FieldsSmall:
    """
=================
Purpose
=================
Write tags for bookmarks, index and toc entry fields in a tokenized file.
This module does not handle toc or index tables.  (This module won't be of
any use to you unless you use it as part of the other modules.)
-----------
Method
-----------
Look for the beginning of a bookmark, index, or toc entry. When such a token
is found, store the opening bracket count in a variable. Collect all the text
until the closing bracket entry is found. Send the string to the module
field_strings to process it. Write the processed string to the output
file.
    """
    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file' -- the file to parse
            'bug_handler' -- stored and handed on to the helper objects
            this class creates
        Optional:
            'copy' -- whether to make a copy of result for debugging
            'run_level' -- stored on the instance
        Returns:
            nothing
        Logic:
            Store the arguments and reserve a temporary file to which
            fix_fields writes its output.
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        # mkstemp creates the file itself, so no other process can claim
        # the name between picking it and opening it (mktemp only returns
        # a name). Only the path is needed later, so the descriptor is
        # closed right away.
        handle, self.__write_to = tempfile.mkstemp()
        os.close(handle)
        self.__run_level = run_level
    def __initiate_values(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            Set up everything fix_fields needs before it reads the first
            line: the field_strings object, the starting state, the empty
            text buffer, the marker line written before each field, the
            two dispatch tables, and a compiled regular expression for a
            bookmark start group.
        """
        self.__string_obj = field_strings.FieldStrings(bug_handler = self.__bug_handler)
        self.__marker = 'mi<mk<inline-fld\n'
        self.__text_string = ''
        self.__state = 'before_body'
        # state name --> method that handles a line while in that state
        self.__state_dict = dict([
            ('before_body', self.__before_body_func),
            ('body', self.__body_func),
            ('bookmark', self.__bookmark_func),
            ('toc_index', self.__toc_index_func),
        ])
        # token --> (method to call when the token is met, tag handed to it)
        on_bookmark = self.__found_bookmark_func
        on_entry = self.__found_toc_index_func
        self.__body_dict = dict([
            ('cw<an<book-mk-st', (on_bookmark, 'start')),
            ('cw<an<book-mk-en', (on_bookmark, 'end')),
            ('cw<an<toc_______', (on_entry, 'toc')),
            ('cw<an<index-mark', (on_entry, 'index')),
        ])
        # open bracket, bookmark start token, text token, close bracket
        pattern_pieces = (
            'ob<nu<open-brack.....',
            'cw<an<book-mk-st<nu<true',
            'tx<nu<__________<(.*?)',
            'cb<nu<clos-brack',
        )
        self.__book_start = re.compile(''.join(pattern_pieces))
    def __before_body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Every line seen before the body is copied to the output as is.
            The line that opens the body also switches the state to
            'body', so the lines after it are handled by the body function.
        """
        body_starts_here = (self.__token_info == 'mi<mk<body-open_')
        if body_starts_here:
            self.__state = 'body'
        self.__write_obj.write(line)
    def __body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Handle a line in the body of the document. If the current token
            starts a bookmark, index or toc entry, hand the line and the
            tag registered for that token to the matching method.
            Otherwise copy the line to the output.
        """
        found = self.__body_dict.get(self.__token_info, (None, None))
        handler = found[0]
        if not handler:
            # an ordinary line; pass it through untouched
            self.__write_obj.write(line)
            return
        handler(line, found[1])
    def __found_bookmark_func(self, line, tag):
        """
        Requires:
            line --the line to parse
            tag --the kind of bookmark ('start' or 'end')
        Returns:
            nothing
        Logic:
            Called when a bookmark token is found. The current opening
            bracket count is remembered as the beginning bracket count,
            the closing bracket count is reset, the kind of bookmark is
            stored, and the state is changed to 'bookmark'. The line itself
            is not written out.
        """
        opening_count = self.__ob_count
        self.__beg_bracket_count = opening_count
        self.__cb_count = 0
        self.__type_of_bookmark = tag
        self.__state = 'bookmark'
    def __bookmark_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Handle a line inside a bookmark. Until the closing bracket
            count equals the bracket count stored when the bookmark began,
            the text of each text line is added to the collected string
            and every other line is dropped. When the counts match, go back
            to the 'body' state, write the marker line, the field
            instruction built from the collected text, and then the
            current line, and empty the collected string.
        """
        if self.__beg_bracket_count != self.__cb_count:
            # still inside the bookmark group: keep only the text
            if line.startswith('tx'):
                self.__text_string = self.__text_string + line[17:-1]
            return
        self.__state = 'body'
        field_type = 'bookmark-%s'  % self.__type_of_bookmark
        field_string = self.__parse_bookmark_func(self.__text_string, field_type)
        out = self.__write_obj
        out.write(self.__marker)
        out.write(field_string)
        self.__text_string = ''
        out.write(line)
    def __parse_index_func(self, my_string):
        """
        Requires:
            my_string --string to parse (the collected tokens of the index
            entry, one per line)
        Returns:
            A string for an index entry field instruction.
        Logic:
            First pull the "see" text and the bookmark text out of the
            string, and find out whether the entry is marked italic or
            bold. Each of these that is present becomes an attribute of
            the field.
            Then read the remaining string one token at a time. Text found
            before the special colon token belongs to the main entry; text
            found after it belongs to the sub entry. All other tokens
            (paragraph endings included) are ignored, so the entry never
            contains paragraphs.
        """
        remaining, see_text = self.__index_see_func(my_string)
        remaining, bookmark_text = self.__index_bookmark_func(remaining)
        italics, bold = self.__index__format_func(remaining)
        pieces = ['mi<tg<empty-att_<field<type>index-entry', '<update>static']
        if see_text:
            pieces.append('<additional-text>%s' % see_text)
        if bookmark_text:
            pieces.append('<bookmark>%s' % bookmark_text)
        if italics:
            pieces.append('<italics>true')
        if bold:
            pieces.append('<bold>true')
        main_parts = []
        sub_parts = []
        # text goes to the main entry until the colon token is seen
        target = main_parts
        has_sub = False
        for token_line in remaining.split('\n'):
            token = token_line[:16]
            if token == 'cw<ml<colon_____':
                has_sub = True
                target = sub_parts
            elif token[:2] == 'tx':
                target.append(token_line[17:])
        pieces.append('<main-entry>%s' % ''.join(main_parts))
        if has_sub:
            pieces.append('<sub-entry>%s' % ''.join(sub_parts))
        pieces.append('\n')
        return ''.join(pieces)
    def __index_see_func(self, my_string):
        """
        Requires:
            my_string -- string of all the tokens in the index entry, one
            token per line
        Returns:
            changed_string -- my_string minus the lines that make up the
            "see" text (each kept line is followed by a newline)
            see_string -- the text of the "see" reference (empty if none)
        Logic:
            Keep a running bracket count. When the index-see token is
            found, note the count at which the group it sits in will
            close. Until that closing bracket is reached, collect the text
            tokens into see_string and drop every other line, the closing
            bracket included. All lines outside the "see" text are passed
            through unchanged.
        """
        in_see = 0
        bracket_count = 0
        see_parts = []
        kept_lines = []
        # Only consulted while in_see is set, and it is always assigned a
        # real count together with in_see, so a plain None placeholder is
        # enough (sys.maxint does not exist on newer Pythons).
        end_bracket_count = None
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            elif token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_see:
                if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
                    in_see = 0
                elif token_info == 'tx<nu<__________':
                    see_parts.append(line[17:])
            else:
                if token_info == 'cw<in<index-see_':
                    # the "see" text ends when the enclosing group closes
                    end_bracket_count = bracket_count - 1
                    in_see = 1
                kept_lines.append('%s\n' % line)
        return ''.join(kept_lines), ''.join(see_parts)
    def __index_bookmark_func(self, my_string):
        """
        Requires:
            my_string -- string of all the tokens in the index entry, one
            token per line
        Returns:
            index_string -- my_string minus the text of the bookmark
            (each kept line is followed by a newline)
            bookmark_string -- the text string of the bookmark
        Logic:
            Keep a running bracket count. When the place token
            (cw<an<place_____) is found, note the count at which the group
            it sits in will close. Until that closing bracket is reached,
            text tokens are collected into bookmark_string; every other
            line, the closing bracket included, stays in index_string.
        """
        in_bookmark = 0
        bracket_count = 0
        bookmark_parts = []
        index_lines = []
        # Only consulted while in_bookmark is set, and it is always
        # assigned a real count together with in_bookmark, so a plain None
        # placeholder is enough (sys.maxint does not exist on newer
        # Pythons).
        end_bracket_count = None
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            elif token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_bookmark:
                if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
                    in_bookmark = 0
                    index_lines.append('%s\n' % line)
                elif token_info == 'tx<nu<__________':
                    bookmark_parts.append(line[17:])
                else:
                    index_lines.append('%s\n' % line)
            else:
                if token_info == 'cw<an<place_____':
                    # the bookmark ends when the enclosing group closes
                    end_bracket_count = bracket_count - 1
                    in_bookmark = 1
                index_lines.append('%s\n' % line)
        return ''.join(index_lines), ''.join(bookmark_parts)
    def __index__format_func(self, my_string):
        """
        Requires:
            my_string -- string of all the tokens in the index entry, one
            token per line
        Returns:
            italics -- 1 if an index-ital token is present, otherwise 0
            bold -- 1 if an index-bold token is present, otherwise 0
        Logic:
            Gather the first sixteen characters of every line and check
            whether the two formatting tokens are among them.
        """
        tokens_seen = set([entry[:16] for entry in my_string.split('\n')])
        italics = int('cw<in<index-ital' in tokens_seen)
        bold = int('cw<in<index-bold' in tokens_seen)
        return italics, bold
    def __parse_toc_func(self, my_string):
        """
        Requires:
            my_string -- all the string in the toc entry (the collected
            tokens, one per line)
        Returns:
            A string for a toc entry field instruction.
        Logic:
            Take the bookmark start and end text out of the string first;
            each one that is present becomes an attribute of the field.
            Then read the remaining string one token at a time: text
            tokens make up the main entry, the toc-level token gives the
            level, and the toc-sup-nu token turns on number suppression.
            All other tokens are ignored.
        """
        remaining, book_start, book_end = self.__parse_bookmark_for_toc(my_string)
        pieces = ['mi<tg<empty-att_<field<type>toc-entry', '<update>static']
        if book_start:
            pieces.append('<bookmark-start>%s' % book_start)
        if book_end:
            pieces.append('<bookmark-end>%s' % book_end)
        level = 0
        suppress_number = False
        entry_parts = []
        for token_line in remaining.split('\n'):
            token = token_line[:16]
            if token[:2] == 'tx':
                entry_parts.append(token_line[17:])
            elif token == 'cw<tc<toc-level_':
                # the value follows the token and its '<nu<' separator
                level = token_line[20:]
            elif token == 'cw<tc<toc-sup-nu':
                suppress_number = True
        if level:
            pieces.append('<toc-level>%s' % level)
        if suppress_number:
            pieces.append('<toc-suppress-number>true')
        pieces.append('<main-entry>%s' % ''.join(entry_parts))
        pieces.append('\n')
        return ''.join(pieces)
    def __parse_bookmark_for_toc(self, my_string):
        """
        Requires:
            my_string --string of toc, with new lines
        Returns:
            toc_string -- my_string minus the text of the bookmarks (each
            kept line is followed by a newline)
            book_start_string -- the text of the bookmark starts
            book_end_string -- the text of the bookmark ends
        Logic:
            Keep a running bracket count. When a bookmark start or
            bookmark end token is found, remember which of the two it is
            and note the count at which the group it sits in will close.
            Until that closing bracket is reached, text tokens are
            collected into the matching bookmark string; every other line,
            the closing bracket included, stays in toc_string.
        """
        in_bookmark = 0
        bracket_count = 0
        start_parts = []
        end_parts = []
        toc_lines = []
        # book_type and end_bracket_count are only consulted while
        # in_bookmark is set, and both are always assigned real values
        # together with in_bookmark, so None placeholders are enough
        # (sys.maxint does not exist on newer Pythons).
        book_type = None
        end_bracket_count = None
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            elif token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_bookmark:
                if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
                    in_bookmark = 0
                    toc_lines.append('%s\n' % line)
                elif token_info == 'tx<nu<__________':
                    if book_type == 'start':
                        start_parts.append(line[17:])
                    elif book_type == 'end':
                        end_parts.append(line[17:])
                else:
                    toc_lines.append('%s\n' % line)
            else:
                if token_info == 'cw<an<book-mk-st':
                    book_type = 'start'
                elif token_info == 'cw<an<book-mk-en':
                    book_type = 'end'
                if token_info in ('cw<an<book-mk-st', 'cw<an<book-mk-en'):
                    # the bookmark ends when the enclosing group closes
                    end_bracket_count = bracket_count - 1
                    in_bookmark = 1
                toc_lines.append('%s\n' % line)
        return ''.join(toc_lines), ''.join(start_parts), ''.join(end_parts)
    def __parse_bookmark_func(self, my_string, type):
        """
        Requires:
            my_string --the complete text string of the bookmark
            type --type of field (bookmark-start or bookmark-end)
        Returns:
            A string formatted for a field instruction.
        Logic:
            The type becomes the type attribute of the field, the text
            string becomes its number attribute, and the update attribute
            is always 'none'. The result ends with a newline.
        """
        type_part = 'mi<tg<empty-att_<field<type>%s' % (type,)
        number_part = '<number>%s' % (my_string,)
        return type_part + number_part + '<update>none\n'
    def __found_toc_index_func(self, line, tag):
        """
        Requires:
            line --the line to parse
            tag --the kind of entry ('toc' or 'index')
        Returns:
            nothing
        Logic:
            Called when a toc or index entry token is found. The current
            opening bracket count is remembered as the beginning bracket
            count, the closing bracket count is reset, the kind of entry
            is stored, and the state is changed to 'toc_index'. The line
            itself is not written out.
        """
        opening_count = self.__ob_count
        self.__beg_bracket_count = opening_count
        self.__cb_count = 0
        self.__tag = tag
        self.__state = 'toc_index'
    def __toc_index_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Handle a line inside a toc or index entry. Until the closing
            bracket count equals the bracket count stored when the entry
            began, every line is added whole to the collected string.
            When the counts match, go back to the 'body' state, turn the
            collected string into a field instruction with the index or
            the toc parser (depending on the stored tag), write the marker
            line, the field instruction and then the current line, and
            empty the collected string.
        """
        if self.__beg_bracket_count != self.__cb_count:
            # still inside the entry: keep collecting
            self.__text_string += line
            return
        self.__state = 'body'
        entry_kind = self.__tag
        if entry_kind == 'index':
            parser = self.__parse_index_func
        elif entry_kind == 'toc':
            parser = self.__parse_toc_func
        field_string = parser(self.__text_string)
        out = self.__write_obj
        out.write(self.__marker)
        out.write(field_string)
        self.__text_string = ''
        out.write(line)
    def fix_fields(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is before the body, look for the
            beginning of the body.
            The other states are body, toc_index (for toc and index
            entries) and bookmark.
            The output goes to the temporary file. Once all lines are
            processed, the temporary file is handed to the copy object
            (copy_file if a debugging copy was asked for, then rename onto
            the original file) and finally removed.
            Both files are closed even if processing a line fails.
        """
        self.__initiate_values()
        read_obj = open(self.__file)
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                line_to_read = '1'
                while line_to_read:
                    line_to_read = read_obj.readline()
                    line = line_to_read
                    self.__token_info = line[:16]
                    # The count is taken from the four characters before
                    # the last character of the line (presumably the line
                    # ending -- confirm against the tokenizer output).
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        # Report the unknown state first; the call below
                        # then fails on the missing handler.
                        sys.stderr.write('no no matching state in module fields_small.py\n')
                        sys.stderr.write(self.__state + '\n')
                    action(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "fields_small.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)