mirror of
https://github.com/kovidgoyal/calibre.git
synced 2026-01-01 09:40:21 -05:00
449 lines
17 KiB
Python
Executable File
449 lines
17 KiB
Python
Executable File
#########################################################################
|
|
# #
|
|
# #
|
|
# copyright 2002 Paul Henry Tremblay #
|
|
# #
|
|
# This program is distributed in the hope that it will be useful, #
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
|
# General Public License for more details. #
|
|
# #
|
|
# You should have received a copy of the GNU General Public License #
|
|
# along with this program; if not, write to the Free Software #
|
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
|
# 02111-1307 USA #
|
|
# #
|
|
# #
|
|
#########################################################################
|
|
import sys, os, tempfile, re
|
|
from libprs500.ebooks.rtf2xml import field_strings, copy
|
|
class FieldsSmall:
    """
    =================
    Purpose
    =================
    Write tags for bookmarks, index and toc entry fields in a tokenized file.
    This module does not handle toc or index tables. (This module won't be of
    any use to you unless you use it as part of the other modules.)
    -----------
    Method
    -----------
    Look for the beginning of a bookmark, index, or toc entry. When such a
    token is found, store the opening bracket count in a variable. Collect
    all the text until the closing bracket entry is found. Convert the
    collected text into a single field tag and write that tag to the output
    file.
    """

    def __init__(self,
                 in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,
                 ):
        """
        Required:
            'in_file'--path of the tokenized file to parse (it is rewritten
            in place by fix_fields)
            'bug_handler'--handed on to the helper objects this class creates
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'-- stored on the instance; not otherwise used in this
            class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        # The temporary output file is created (securely) by fix_fields, so
        # constructing an instance never touches the file system.
        self.__write_to = None
        self.__run_level = run_level

    def __initiate_values(self):
        """
        Initiate all values: the parser state, the text accumulator, the
        marker written before every generated field tag, and the two
        dispatch tables (state -> handler, body token -> handler and tag).
        """
        self.__string_obj = field_strings.FieldStrings(
            bug_handler=self.__bug_handler)
        self.__state = 'before_body'
        self.__text_string = ''
        self.__marker = 'mi<mk<inline-fld\n'
        self.__state_dict = {
            'before_body': self.__before_body_func,
            'body': self.__body_func,
            'bookmark': self.__bookmark_func,
            'toc_index': self.__toc_index_func,
        }
        self.__body_dict = {
            'cw<an<book-mk-st': (self.__found_bookmark_func, 'start'),
            'cw<an<book-mk-en': (self.__found_bookmark_func, 'end'),
            'cw<an<toc_______': (self.__found_toc_index_func, 'toc'),
            'cw<an<index-mark': (self.__found_toc_index_func, 'index'),
        }
        ob = 'ob<nu<open-brack.....'
        cb = 'cb<nu<clos-brack'
        bk_st = 'cw<an<book-mk-st<nu<true'
        tx = 'tx<nu<__________<(.*?)'
        reg_st = ob + bk_st + tx + cb
        # NOTE(review): this pattern is compiled but not used anywhere in
        # this class; kept so the attribute continues to exist.
        self.__book_start = re.compile(r'%s' % reg_st)

    def __before_body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of the body. When found, change the state
            to body. Always print out the line.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'body'
        self.__write_obj.write(line)

    def __body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines in the body of the document.
            Look for a bookmark, index or toc entry and take the appropriate
            action. The token that starts a field is not written out; every
            other line is copied through unchanged.
        """
        action, tag = self.__body_dict.get(self.__token_info, (None, None))
        if action:
            action(line, tag)
        else:
            self.__write_obj.write(line)

    def __found_bookmark_func(self, line, tag):
        """
        Requires:
            line --the line to parse
            tag --'start' or 'end', the kind of bookmark that was found
        Returns:
            nothing
        Logic:
            This function is called when a bookmark is found. The opening
            bracket count is stored in the beginning bracket count. The
            state is changed to 'bookmark.'
        """
        self.__beg_bracket_count = self.__ob_count
        self.__cb_count = 0
        self.__state = 'bookmark'
        self.__type_of_bookmark = tag

    def __bookmark_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines within a bookmark. It adds the
            text of each text token to a string until the closing bracket
            that matches the opening one is found. It then formats the
            string as a field tag and prints out the marker, the tag and
            the closing line.
        """
        if self.__beg_bracket_count == self.__cb_count:
            self.__state = 'body'
            field_type = 'bookmark-%s' % self.__type_of_bookmark
            # A disabled earlier version passed the text to
            # self.__string_obj.process_string(); the tag is now built
            # locally instead.
            my_string = self.__parse_bookmark_func(
                self.__text_string, field_type)
            self.__write_obj.write(self.__marker)
            self.__write_obj.write(my_string)
            self.__text_string = ''
            self.__write_obj.write(line)
        elif line[0:2] == 'tx':
            # skip the 16 character token info plus '<', drop the newline
            self.__text_string += line[17:-1]

    def __parse_index_func(self, my_string):
        """
        Requires:
            my_string --string to parse
        Returns:
            A string for an index instruction field.
        Logic:
            I want to eliminate paragraph endings, and I want to divide the
            entry into a main entry and (if it exists) a sub entry.
            First pull out the "see" text, the bookmark text and the
            bold/italics flags. Then split the string by newlines and read
            one token at a time. If the token is a special colon, end the
            main entry and start the sub entry. Only text tokens contribute
            to the entries, so paragraph endings and other control words
            are ignored.
        """
        my_string, see_string = self.__index_see_func(my_string)
        my_string, bookmark_string = self.__index_bookmark_func(my_string)
        italics, bold = self.__index__format_func(my_string)
        parts = ['mi<tg<empty-att_<field<type>index-entry', '<update>static']
        if see_string:
            parts.append('<additional-text>%s' % see_string)
        if bookmark_string:
            parts.append('<bookmark>%s' % bookmark_string)
        if italics:
            parts.append('<italics>true')
        if bold:
            parts.append('<bold>true')
        found_sub = False
        main_entry = []
        sub_entry = []
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'cw<ml<colon_____':
                found_sub = True
            elif token_info[0:2] == 'tx':
                if found_sub:
                    sub_entry.append(line[17:])
                else:
                    main_entry.append(line[17:])
        parts.append('<main-entry>%s' % ''.join(main_entry))
        if found_sub:
            parts.append('<sub-entry>%s' % ''.join(sub_entry))
        parts.append('\n')
        return ''.join(parts)

    def __index_see_func(self, my_string):
        """
        Requires:
            my_string -- all the tokens of the index entry, one per line
        Returns:
            changed_string -- the string minus the "see" group contents
            see_string -- the text found inside the "see" group
        Logic:
            Track bracket depth. After the index-see token, collect text
            tokens (and drop everything else) until the bracket that
            encloses the token closes.
        """
        in_see = False
        bracket_count = 0
        see_parts = []
        kept_lines = []
        # Only compared while in_see is true, and it is always assigned
        # before in_see is set, so no numeric sentinel is needed
        # (sys.maxint does not exist on Python 3).
        end_bracket_count = None
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            if token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_see:
                if (bracket_count == end_bracket_count
                        and token_info == 'cb<nu<clos-brack'):
                    in_see = False
                elif token_info == 'tx<nu<__________':
                    see_parts.append(line[17:])
            else:
                if token_info == 'cw<in<index-see_':
                    end_bracket_count = bracket_count - 1
                    in_see = True
                kept_lines.append('%s\n' % line)
        return ''.join(kept_lines), ''.join(see_parts)

    def __index_bookmark_func(self, my_string):
        """
        Requires:
            my_string -- all the tokens of the index entry, one per line
        Returns:
            index_string -- string minus the bookmark text
            bookmark_string -- the text string of the bookmark
        Logic:
            Track bracket depth. After the place token, move text tokens
            into the bookmark string (other tokens stay in the index
            string) until the bracket that encloses the token closes.
        """
        # cw<an<place_____<nu<true
        in_bookmark = False
        bracket_count = 0
        bookmark_parts = []
        index_lines = []
        # see __index_see_func for why None is a safe initial value
        end_bracket_count = None
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            if token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_bookmark:
                if (bracket_count == end_bracket_count
                        and token_info == 'cb<nu<clos-brack'):
                    in_bookmark = False
                    index_lines.append('%s\n' % line)
                elif token_info == 'tx<nu<__________':
                    bookmark_parts.append(line[17:])
                else:
                    index_lines.append('%s\n' % line)
            else:
                if token_info == 'cw<an<place_____':
                    end_bracket_count = bracket_count - 1
                    in_bookmark = True
                index_lines.append('%s\n' % line)
        return ''.join(index_lines), ''.join(bookmark_parts)

    def __index__format_func(self, my_string):
        """
        Requires:
            my_string -- all the tokens of the index entry, one per line
        Returns:
            italics, bold -- 1 if the matching index format token occurs
            anywhere in the string, otherwise 0
        """
        italics = 0
        bold = 0
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'cw<in<index-bold':
                bold = 1
            if token_info == 'cw<in<index-ital':
                italics = 1
        return italics, bold

    def __parse_toc_func(self, my_string):
        """
        Requires:
            my_string -- all the string in the toc
        Returns:
            modified string: a single toc-entry field tag
        Logic:
            Pull out any bookmark text first. Then join the text of all
            text tokens into the main entry, and note the toc level and
            the suppress-number flag if their tokens are present.
        """
        toc_level = 0
        toc_suppress = 0
        my_string, book_start_string, book_end_string = \
            self.__parse_bookmark_for_toc(my_string)
        parts = ['mi<tg<empty-att_<field<type>toc-entry', '<update>static']
        if book_start_string:
            parts.append('<bookmark-start>%s' % book_start_string)
        if book_end_string:
            parts.append('<bookmark-end>%s' % book_end_string)
        main_entry = []
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info[0:2] == 'tx':
                main_entry.append(line[17:])
            if token_info == 'cw<tc<toc-level_':
                # everything after the 16 character token info and '<nu<'
                toc_level = line[20:]
            if token_info == 'cw<tc<toc-sup-nu':
                toc_suppress = 1
        if toc_level:
            parts.append('<toc-level>%s' % toc_level)
        if toc_suppress:
            parts.append('<toc-suppress-number>true')
        parts.append('<main-entry>%s' % ''.join(main_entry))
        parts.append('\n')
        return ''.join(parts)

    def __parse_bookmark_for_toc(self, my_string):
        """
        Requires:
            my_string --string of toc, with new lines
        Returns:
            toc_string -- string minus the bookmark text
            book_start_string -- text of the bookmark-start groups
            book_end_string -- text of the bookmark-end groups
        Logic:
            Track bracket depth. After a bookmark start or end token, move
            text tokens into the matching bookmark string (other tokens
            stay in the toc string) until the bracket that encloses the
            token closes.
        """
        in_bookmark = False
        bracket_count = 0
        book_start_parts = []
        book_end_parts = []
        book_type = 0
        toc_lines = []
        # see __index_see_func for why None is a safe initial value
        end_bracket_count = None
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            if token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_bookmark:
                if (bracket_count == end_bracket_count
                        and token_info == 'cb<nu<clos-brack'):
                    in_bookmark = False
                    toc_lines.append('%s\n' % line)
                elif token_info == 'tx<nu<__________':
                    if book_type == 'start':
                        book_start_parts.append(line[17:])
                    elif book_type == 'end':
                        book_end_parts.append(line[17:])
                else:
                    toc_lines.append('%s\n' % line)
            else:
                if token_info in ('cw<an<book-mk-st', 'cw<an<book-mk-en'):
                    if token_info == 'cw<an<book-mk-st':
                        book_type = 'start'
                    else:
                        book_type = 'end'
                    end_bracket_count = bracket_count - 1
                    in_bookmark = True
                toc_lines.append('%s\n' % line)
        return (''.join(toc_lines), ''.join(book_start_parts),
                ''.join(book_end_parts))

    def __parse_bookmark_func(self, my_string, type):
        """
        Requires:
            my_string --string to parse
            type --type of string
        Returns:
            A string formatted for a field instruction.
        Logic:
            The type is the name (either bookmark-end or bookmark-start).
            The id is the complete text string.
        """
        my_changed_string = ('mi<tg<empty-att_<field<type>%s'
                             '<number>%s<update>none\n' % (type, my_string))
        return my_changed_string

    def __found_toc_index_func(self, line, tag):
        """
        Requires:
            line --the line to parse
            tag --'toc' or 'index', the kind of entry that was found
        Returns:
            nothing
        Logic:
            This function is called when a toc or index entry is found. The
            opening bracket count is stored in the beginning bracket count.
            The state is changed to 'toc_index.'
        """
        self.__beg_bracket_count = self.__ob_count
        self.__cb_count = 0
        self.__state = 'toc_index'
        self.__tag = tag

    def __toc_index_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines within a toc or index entry. It
            adds each line to a string until the closing bracket that
            matches the opening one is found. It then turns the string into
            a field tag and prints out the marker, the tag and the closing
            line.
        """
        if self.__beg_bracket_count == self.__cb_count:
            self.__state = 'body'
            entry_kind = self.__tag
            if entry_kind == 'index':
                my_string = self.__parse_index_func(self.__text_string)
            elif entry_kind == 'toc':
                my_string = self.__parse_toc_func(self.__text_string)
            else:
                # Only 'toc' and 'index' are registered in the body table;
                # fail clearly instead of with an unbound local.
                raise ValueError(
                    'unexpected entry type %r in module fields_small.py'
                    % (entry_kind,))
            self.__write_obj.write(self.__marker)
            self.__write_obj.write(my_string)
            self.__text_string = ''
            self.__write_obj.write(line)
        else:
            self.__text_string += line

    def __process_line(self, line):
        """
        Requires:
            line --one line (token) of the input file
        Returns:
            nothing
        Logic:
            Record the token info and the latest open/close bracket
            counts, then hand the line to the handler for the current
            state.
        """
        self.__token_info = line[:16]
        if self.__token_info == 'ob<nu<open-brack':
            # the four characters before the trailing newline
            self.__ob_count = line[-5:-1]
        if self.__token_info == 'cb<nu<clos-brack':
            self.__cb_count = line[-5:-1]
        action = self.__state_dict.get(self.__state)
        if action is None:
            sys.stderr.write('no no matching state in module fields_small.py\n')
            sys.stderr.write(self.__state + '\n')
            raise ValueError(
                'no matching state %r in module fields_small.py'
                % (self.__state,))
        action(line)

    def fix_fields(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based
            on the state. If the state is before the body, look for the
            beginning of the body.
            The other two states are toc_index (for toc and index entries)
            and bookmark.
            The result is written to a temporary file, which then replaces
            the original file. Both files are closed and the temporary file
            is removed even if processing fails.
        """
        self.__initiate_values()
        # mkstemp creates the file itself, so no other process can claim
        # the name between choosing it and opening it (as with mktemp).
        fd, self.__write_to = tempfile.mkstemp()
        try:
            self.__write_obj = os.fdopen(fd, 'w')
            try:
                read_obj = open(self.__file)
                try:
                    for line in read_obj:
                        self.__process_line(line)
                finally:
                    read_obj.close()
            finally:
                self.__write_obj.close()
            copy_obj = copy.Copy(bug_handler=self.__bug_handler)
            if self.__copy:
                copy_obj.copy_file(self.__write_to, "fields_small.data")
            copy_obj.rename(self.__write_to, self.__file)
        finally:
            if os.path.exists(self.__write_to):
                os.remove(self.__write_to)
|