calibre/src/libprs500/ebooks/rtf2xml/fields_small.py
2007-12-27 22:11:26 +00:00

449 lines
17 KiB
Python
Executable File

#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile, re
from libprs500.ebooks.rtf2xml import field_strings, copy
class FieldsSmall:
    """
    =================
    Purpose
    =================
    Write tags for bookmarks, index and toc entry fields in a tokenized file.
    This module does not handle toc or index tables. (This module won't be of
    any use to you unless you use it as part of the other modules.)
    -----------
    Method
    -----------
    Look for the beginning of a bookmark, index, or toc entry. When such a
    token is found, store the opening bracket count in a variable. Collect
    all the text until the matching closing bracket is found. Format the
    collected text as a single field tag and write it to the output file.
    """

    # The first 16 characters of a tokenized line identify the token.
    _OPEN_BRACKET = 'ob<nu<open-brack'
    _CLOSE_BRACKET = 'cb<nu<clos-brack'
    _TEXT = 'tx<nu<__________'

    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--passed on to the helper objects this class creates
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'-- stored on the instance; not used by this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        # mkstemp creates the file atomically (mktemp only returns a name and
        # is open to a race); only the path is needed, so close the handle.
        handle, self.__write_to = tempfile.mkstemp()
        os.close(handle)
        self.__run_level = run_level

    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__string_obj = field_strings.FieldStrings(bug_handler = self.__bug_handler)
        self.__state = 'before_body'
        self.__text_string = ''
        self.__marker = 'mi<mk<inline-fld\n'
        # state name -> handler called with every line read in that state
        self.__state_dict = {
            'before_body'   : self.__before_body_func,
            'body'          : self.__body_func,
            'bookmark'      : self.__bookmark_func,
            'toc_index'     : self.__toc_index_func,
        }
        # token -> (handler, tag) for tokens that start a field in the body
        self.__body_dict = {
            'cw<an<book-mk-st'  : (self.__found_bookmark_func, 'start'),
            'cw<an<book-mk-en'  : (self.__found_bookmark_func, 'end'),
            'cw<an<toc_______'  : (self.__found_toc_index_func, 'toc'),
            'cw<an<index-mark'  : (self.__found_toc_index_func, 'index'),
        }
        ob = 'ob<nu<open-brack.....'
        cb = 'cb<nu<clos-brack'
        bk_st = 'cw<an<book-mk-st<nu<true'
        tx = 'tx<nu<__________<(.*?)'
        reg_st = ob + bk_st + tx + cb
        # NOTE(review): this pattern is compiled but not used anywhere in
        # this class.
        self.__book_start = re.compile(r'%s' % reg_st)

    def __before_body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of the body. When found, change the state
            to body. Always print out the line.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'body'
        self.__write_obj.write(line)

    def __body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines in the body of the document.
            If the token starts a bookmark, index or toc entry, hand the line
            to the matching handler (which does not write it); otherwise
            write the line out unchanged.
        """
        action, tag = \
            self.__body_dict.get(self.__token_info, (None, None))
        if action:
            action(line, tag)
        else:
            self.__write_obj.write(line)

    def __found_bookmark_func(self, line, tag):
        """
        Requires:
            line --the line to parse
            tag --'start' or 'end'
        Returns:
            nothing
        Logic:
            This function is called when a bookmark is found. The current
            opening bracket count is stored in the beginning bracket count.
            The state is changed to 'bookmark'.
        """
        self.__beg_bracket_count = self.__ob_count
        # Reset so a stale closing count cannot match the new group.
        self.__cb_count = 0
        self.__state = 'bookmark'
        self.__type_of_bookmark = tag

    def __bookmark_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines within a bookmark. It adds the
            text of each text line to a string until the end of the bookmark
            is found. It then formats the string as a field tag and writes
            the marker, the tag and the closing line.
        """
        if self.__beg_bracket_count == self.__cb_count:
            self.__state = 'body'
            field_type = 'bookmark-%s' % self.__type_of_bookmark
            my_string = self.__parse_bookmark_func(
                self.__text_string, field_type)
            self.__write_obj.write(self.__marker)
            self.__write_obj.write(my_string)
            self.__text_string = ''
            self.__write_obj.write(line)
        elif line[0:2] == 'tx':
            # Drop the 17-character token prefix and the trailing newline.
            self.__text_string += line[17:-1]

    def __parse_index_func(self, my_string):
        """
        Requires:
            my_string --string to parse
        Returns:
            A string for an index entry field.
        Logic:
            Eliminate paragraph endings, and divide the entry into a main
            entry and (if it exists) a sub entry.
            Split the string by newlines. Read one token at a time. If the
            token is a special colon, end the main entry and start the sub
            entry. Only text tokens contribute to the entries; every other
            token (paragraph endings included) is ignored, since paragraphs
            are not wanted within index entries.
        """
        my_string, see_string = self.__index_see_func(my_string)
        my_string, bookmark_string = self.__index_bookmark_func(my_string)
        italics, bold = self.__index__format_func(my_string)
        pieces = [
            'mi<tg<empty-att_<field<type>index-entry',
            '<update>static',
        ]
        if see_string:
            pieces.append('<additional-text>%s' % see_string)
        if bookmark_string:
            pieces.append('<bookmark>%s' % bookmark_string)
        if italics:
            pieces.append('<italics>true')
        if bold:
            pieces.append('<bold>true')
        found_sub = 0
        main_parts = []
        sub_parts = []
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'cw<ml<colon_____':
                found_sub = 1
            elif token_info[0:2] == 'tx':
                if found_sub:
                    sub_parts.append(line[17:])
                else:
                    main_parts.append(line[17:])
        pieces.append('<main-entry>%s' % ''.join(main_parts))
        if found_sub:
            pieces.append('<sub-entry>%s' % ''.join(sub_parts))
        pieces.append('\n')
        return ''.join(pieces)

    def __index_see_func(self, my_string):
        """
        Requires:
            my_string --all the lines of the index entry
        Returns:
            changed_string --the entry minus the lines that follow the "see"
                token inside its bracket group (the "see" token line itself
                is kept; the group's closing bracket is dropped)
            see_string --the joined text of that group
        """
        in_see = 0
        bracket_count = 0
        # Placeholder only: it is compared solely while in_see is set, and
        # it is assigned a real value at the moment in_see is set.
        end_bracket_count = None
        see_parts = []
        kept = []
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == self._OPEN_BRACKET:
                bracket_count += 1
            elif token_info == self._CLOSE_BRACKET:
                bracket_count -= 1
            if in_see:
                if (token_info == self._CLOSE_BRACKET
                        and bracket_count == end_bracket_count):
                    in_see = 0
                elif token_info == self._TEXT:
                    see_parts.append(line[17:])
            else:
                if token_info == 'cw<in<index-see_':
                    # The group ends when the depth drops below the current one.
                    end_bracket_count = bracket_count - 1
                    in_see = 1
                kept.append(line)
        changed_string = ''.join(['%s\n' % item for item in kept])
        return changed_string, ''.join(see_parts)

    def __index_bookmark_func(self, my_string):
        """
        Requires:
            my_string -- all the lines of the index entry
        Returns:
            index_string -- the entry minus the text lines of the bookmark
            bookmark_string -- the text string of the bookmark
        """
        # cw<an<place_____<nu<true
        in_bookmark = 0
        bracket_count = 0
        # Placeholder only; see the comparison below.
        end_bracket_count = None
        bookmark_parts = []
        kept = []
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == self._OPEN_BRACKET:
                bracket_count += 1
            elif token_info == self._CLOSE_BRACKET:
                bracket_count -= 1
            if in_bookmark:
                if (token_info == self._CLOSE_BRACKET
                        and bracket_count == end_bracket_count):
                    in_bookmark = 0
                    kept.append(line)
                elif token_info == self._TEXT:
                    bookmark_parts.append(line[17:])
                else:
                    kept.append(line)
            else:
                if token_info == 'cw<an<place_____':
                    end_bracket_count = bracket_count - 1
                    in_bookmark = 1
                kept.append(line)
        index_string = ''.join(['%s\n' % item for item in kept])
        return index_string, ''.join(bookmark_parts)

    def __index__format_func(self, my_string):
        """
        Requires:
            my_string -- all the lines of the index entry
        Returns:
            italics, bold -- each 1 if the matching token occurs, else 0
        """
        italics = 0
        bold = 0
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'cw<in<index-bold':
                bold = 1
            if token_info == 'cw<in<index-ital':
                italics = 1
        return italics, bold

    def __parse_toc_func(self, my_string):
        """
        Requires:
            my_string -- all the lines of the toc entry
        Returns:
            A string for a toc entry field.
        Logic:
            Pull out any bookmark start/end names, then read one token at a
            time: text tokens make up the main entry, and the toc level and
            suppress-number tokens become attributes.
        """
        toc_level = 0
        toc_suppress = 0
        my_string, book_start_string, book_end_string =\
            self.__parse_bookmark_for_toc(my_string)
        pieces = [
            'mi<tg<empty-att_<field<type>toc-entry',
            '<update>static',
        ]
        if book_start_string:
            pieces.append('<bookmark-start>%s' % book_start_string)
        if book_end_string:
            pieces.append('<bookmark-end>%s' % book_end_string)
        main_parts = []
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info[0:2] == 'tx':
                main_parts.append(line[17:])
            if token_info == 'cw<tc<toc-level_':
                # Everything after the 20-character 'token<nu<' prefix.
                toc_level = line[20:]
            if token_info == 'cw<tc<toc-sup-nu':
                toc_suppress = 1
        if toc_level:
            pieces.append('<toc-level>%s' % toc_level)
        if toc_suppress:
            pieces.append('<toc-suppress-number>true')
        pieces.append('<main-entry>%s' % ''.join(main_parts))
        pieces.append('\n')
        return ''.join(pieces)

    def __parse_bookmark_for_toc(self, my_string):
        """
        Requires:
            my_string --string of toc, with new lines
        Returns:
            toc_string -- string minus the text lines of the bookmarks
            book_start_string -- text of bookmark-start groups
            book_end_string -- text of bookmark-end groups
        """
        in_bookmark = 0
        bracket_count = 0
        # Placeholder only; compared solely while in_bookmark is set.
        end_bracket_count = None
        start_parts = []
        end_parts = []
        book_type = 0
        kept = []
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == self._OPEN_BRACKET:
                bracket_count += 1
            elif token_info == self._CLOSE_BRACKET:
                bracket_count -= 1
            if in_bookmark:
                if (token_info == self._CLOSE_BRACKET
                        and bracket_count == end_bracket_count):
                    in_bookmark = 0
                    kept.append(line)
                elif token_info == self._TEXT:
                    if book_type == 'start':
                        start_parts.append(line[17:])
                    elif book_type == 'end':
                        end_parts.append(line[17:])
                else:
                    kept.append(line)
            else:
                if token_info == 'cw<an<book-mk-st':
                    book_type = 'start'
                    end_bracket_count = bracket_count - 1
                    in_bookmark = 1
                elif token_info == 'cw<an<book-mk-en':
                    book_type = 'end'
                    end_bracket_count = bracket_count - 1
                    in_bookmark = 1
                kept.append(line)
        toc_string = ''.join(['%s\n' % item for item in kept])
        return toc_string, ''.join(start_parts), ''.join(end_parts)

    def __parse_bookmark_func(self, my_string, type):
        """
        Requires:
            my_string --string to parse
            type --type of string
        Returns:
            A string formatted for a field instruction.
        Logic:
            The type is the name (either bookmark-end or bookmark-start). The
            id is the complete text string.
        """
        my_changed_string = ('mi<tg<empty-att_<field<type>%s'
        '<number>%s<update>none\n' % (type, my_string))
        return my_changed_string

    def __found_toc_index_func(self, line, tag):
        """
        Requires:
            line --the line to parse
            tag --'toc' or 'index'
        Returns:
            nothing
        Logic:
            This function is called when a toc or index entry is found. The
            current opening bracket count is stored in the beginning bracket
            count. The state is changed to 'toc_index'.
        """
        self.__beg_bracket_count = self.__ob_count
        # Reset so a stale closing count cannot match the new group.
        self.__cb_count = 0
        self.__state = 'toc_index'
        self.__tag = tag

    def __toc_index_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines within a toc or index entry. It
            adds each line to a string until the end of the entry is found.
            It then formats the string as a field tag and writes the marker,
            the tag and the closing line.
        """
        if self.__beg_bracket_count == self.__cb_count:
            self.__state = 'body'
            parsers = {
                'index' : self.__parse_index_func,
                'toc'   : self.__parse_toc_func,
            }
            my_string = parsers[self.__tag](self.__text_string)
            self.__write_obj.write(self.__marker)
            self.__write_obj.write(my_string)
            self.__text_string = ''
            self.__write_obj.write(line)
        else:
            self.__text_string += line

    def fix_fields(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based
            on the state. If the state is before the body, look for the
            beginning of the body. In the body, look for the start of a
            field. The other two states are toc_index (for toc and index
            entries) and bookmark.
            When done, copy the result over the original file and remove the
            temporary file.
        """
        self.__initiate_values()
        read_obj = open(self.__file)
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                for line in read_obj:
                    self.__token_info = line[:16]
                    # The bracket counts are the four characters before the
                    # trailing newline (presumably the bracket number).
                    if self.__token_info == self._OPEN_BRACKET:
                        self.__ob_count = line[-5:-1]
                    if self.__token_info == self._CLOSE_BRACKET:
                        self.__cb_count = line[-5:-1]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        # Report the unknown state; there is no handler to
                        # call, so skip the line instead of calling None.
                        sys.stderr.write('no no matching state in module fields_small.py\n')
                        sys.stderr.write(self.__state + '\n')
                        continue
                    action(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "fields_small.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)