# calibre/src/libprs500/ebooks/rtf2xml/paragraphs.py

#########################################################################
#                                                                       #
#                                                                       #
#   copyright 2002 Paul Henry Tremblay                                  #
#                                                                       #
#   This program is distributed in the hope that it will be useful,     #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
#   General Public License for more details.                            #
#                                                                       #
#   You should have received a copy of the GNU General Public License   #
#   along with this program; if not, write to the Free Software         #
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA            #
#   02111-1307 USA                                                      #
#                                                                       #
#                                                                       #
#########################################################################
import sys, os, tempfile
from libprs500.ebooks.rtf2xml import copy
class Paragraphs:
    r"""
    Write paragraph tags for a tokenized file.

    (This module won't be of any use to you unless you use it as part of
    the other modules.)

    Method
    ------
    RTF does not tell you when a paragraph begins. It only tells you when
    the paragraph ends.

    In order to make paragraphs out of this limited info, the parser starts
    before the body of the document and, once the body opens, assumes it is
    not in a paragraph. It looks for clues to begin a paragraph. Text starts
    a paragraph; so does an inline field, a picture, or a paragraph-begin
    marker. If an end of paragraph marker (\par) is found while outside a
    paragraph, this indicates a blank paragraph.

    Once a paragraph is found, the state changes to 'paragraph'. In this
    state, clues are looked for that end the paragraph. The end of paragraph
    marker (\par) ends a paragraph. So do the end of a footnote or heading,
    the end of a field-block, the start of a block field, the end of a table
    cell, the start or end of a section, and the end of the body. A
    paragraph definition (\pard) inside a paragraph does not end it; it is
    replaced by a marker line.
    """
    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            write_empty_para = 1,
            run_level = 1,
            ):
        """
        Required:
            'in_file' -- path of the tokenized file to parse (it is
            overwritten with the result by make_paragraphs)
            'bug_handler' -- passed on to copy.Copy
        Optional:
            'copy' -- whether to make a copy of the result for debugging
            'write_empty_para' -- whether an end of paragraph marker found
            outside a paragraph produces an empty paragraph tag
            'run_level' -- stored on the instance; not read anywhere else
            in this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_empty_para = write_empty_para
        self.__run_level = run_level
        # The temporary output file is created by make_paragraphs, so that
        # it can be created securely and no name is reserved needlessly.
        self.__write_to = None
    def __initiate_values(self):
        """
        Initiate all values: the starting state, the marker lines, and the
        dispatch tables that map a state or a token to its handler.
        """
        self.__state = 'before_body'
        self.__start_marker =  'mi<mk<para-start\n' # outside para tags
        self.__start2_marker = 'mi<mk<par-start_\n' # inside para tags
        self.__end2_marker =   'mi<mk<par-end___\n' # inside para tags
        self.__end_marker =    'mi<mk<para-end__\n' # outside para tags
        # state name -> handler called for every line read in that state
        self.__state_dict = {
        'before_body'       : self.__before_body_func,
        'not_paragraph'     : self.__not_paragraph_func,
        'paragraph'         : self.__paragraph_func,
        }
        # tokens that matter while inside a paragraph
        self.__paragraph_dict = {
        'cw<pf<par-end___'      : self.__close_para_func,   # end of paragraph
        'mi<mk<headi_-end'      : self.__close_para_func,   # end of header or footer
        'mi<mk<fldbk-end_'      : self.__close_para_func,   # end of field-block
        'mi<mk<body-close'      : self.__close_para_func,   # end of body
        'mi<mk<sect-close'      : self.__close_para_func,   # end of section
        'mi<mk<sect-start'      : self.__close_para_func,   # start of section
        'mi<mk<foot___clo'      : self.__close_para_func,   # end of footnote
        'cw<tb<cell______'      : self.__close_para_func,   # end of cell
        'mi<mk<par-in-fld'      : self.__close_para_func,   # start of block field
        # a paragraph definition does NOT close the paragraph
        'cw<pf<par-def___'      : self.__bogus_para__def_func,
        }
        # tokens that matter while outside a paragraph
        self.__not_paragraph_dict = {
        'tx<nu<__________'      : self.__start_para_func,
        'tx<hx<__________'      : self.__start_para_func,
        'tx<ut<__________'      : self.__start_para_func,
        'tx<mc<__________'      : self.__start_para_func,
        'mi<mk<inline-fld'      : self.__start_para_func,
        'mi<mk<para-beg__'      : self.__start_para_func,
        'cw<pf<par-end___'      : self.__empty_para_func,
        'mi<mk<pict-start'      : self.__start_para_func,
        'cw<pf<page-break'      : self.__empty_pgbk_func,    # page break
        }
    def __before_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines before the start of the
            body. Every line is written unchanged. Once the body starts,
            the state is switched to 'not_paragraph'.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'not_paragraph'
        self.__write_obj.write(line)
    def __not_paragraph_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines that are outside of a
            paragraph. It looks for clues that start a paragraph (or that
            stand for an empty paragraph or a page break) and lets the
            matching handler write its tags. The line itself is always
            written afterwards, so it ends up after any tags.
        """
        action = self.__not_paragraph_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)
    def __paragraph_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines that are in a paragraph.
            If the token has a handler, the handler decides what is
            written (the line is not written here). Any other line is
            written unchanged.
        """
        action = self.__paragraph_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)
    def __start_para_func(self, line):
        """
        Requires:
            line -- line to parse (not written here; the caller writes it)
        Returns:
            nothing
        Logic:
            This function writes the beginning tags for a paragraph and
            changes the state to 'paragraph'.
        """
        self.__write_obj.write(self.__start_marker) # marker for later parsing
        self.__write_obj.write('mi<tg<open______<para\n')
        self.__write_obj.write(self.__start2_marker)
        self.__state = 'paragraph'
    def __empty_para_func(self, line):
        """
        Requires:
            line -- line to parse (not written here; the caller writes it)
        Returns:
            nothing
        Logic:
            This function writes the empty tag for a paragraph, wrapped in
            the outside-paragraph markers. It does not write anything if
            self.__write_empty_para is false. The state is not changed.
        """
        if self.__write_empty_para:
            self.__write_obj.write(self.__start_marker) # marker for later parsing
            self.__write_obj.write('mi<tg<empty_____<para\n')
            self.__write_obj.write(self.__end_marker)   # marker for later parsing
    def __empty_pgbk_func(self, line):
        """
        Requires:
            line -- line to parse (not written here; the caller writes it)
        Returns:
            nothing
        Logic:
            This function writes the empty tag for a page break.
        """
        self.__write_obj.write('mi<tg<empty_____<page-break\n')
    def __close_para_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            This function writes the end tags for a paragraph, then the
            line that ended it, and changes the state to 'not_paragraph'.
        """
        self.__write_obj.write(self.__end2_marker) # marker for later parser
        self.__write_obj.write('mi<tg<close_____<para\n')
        self.__write_obj.write(self.__end_marker) # marker for later parser
        self.__write_obj.write(line)
        self.__state = 'not_paragraph'
    def __bogus_para__def_func(self, line):
        r"""
        Requires:
            line -- line to parse (dropped from the output)
        Returns:
            nothing
        Logic:
            If a \pard occurs in a paragraph, it is ignored: a marker line
            is written in its place and the paragraph stays open.
        """
        self.__write_obj.write('mi<mk<bogus-pard\n')
    def __parse_lines(self, read_obj):
        """
        Requires:
            read_obj -- open file object to read the tokenized lines from
        Returns:
            nothing
        Logic:
            Feed every line to the handler of the current state. The first
            16 characters of a line are its token.
        """
        for line in read_obj:
            self.__token_info = line[:16]
            action = self.__state_dict.get(self.__state)
            if action is None:
                sys.stderr.write('no matching state in module paragraphs.py\n')
                sys.stderr.write(self.__state + '\n')
                # TypeError is kept for backward compatibility: it is what
                # calling the missing handler used to raise.
                raise TypeError(
                    'no handler for state %r in paragraphs.py' % (self.__state,))
            action(line)
    def make_paragraphs(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based
            on the state. If the state is before the body, look for the
            beginning of the body.
            When the body is found, change the state to 'not_paragraph'.
            The only other state is 'paragraph'.
            The result is written to a temporary file, which is then copied
            over the original file by copy.Copy and removed. The temporary
            file is removed even if any step fails.
        """
        self.__initiate_values()
        # mkstemp creates the file itself, so no other process can grab the
        # name between choosing it and opening it (unlike mktemp).
        handle, self.__write_to = tempfile.mkstemp()
        try:
            self.__write_obj = os.fdopen(handle, 'w')
            try:
                read_obj = open(self.__file, 'r')
                try:
                    self.__parse_lines(read_obj)
                finally:
                    read_obj.close()
            finally:
                self.__write_obj.close()
            copy_obj = copy.Copy(bug_handler = self.__bug_handler)
            if self.__copy:
                copy_obj.copy_file(self.__write_to, "paragraphs.data")
            copy_obj.rename(self.__write_to, self.__file)
        finally:
            os.remove(self.__write_to)