Move rtf2xml into libprs500 tree and clean it up

This commit is contained in:
Kovid Goyal 2007-12-27 22:11:26 +00:00
parent 7227fe1c4f
commit e1899e9f1c
52 changed files with 16757 additions and 2 deletions

View File

@ -122,7 +122,7 @@ def main(args=sys.argv, logger=None):
def generate_xml(rtfpath):
from rtf2xml.ParseRtf import ParseRtf
from libprs500.ebooks.rtf2xml.ParseRtf import ParseRtf
tdir = tempfile.mkdtemp(prefix=__appname__+'_')
ofile = os.path.join(tdir, 'index.xml')
cwd = os.getcwdu()

View File

@ -0,0 +1,563 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
# $Revision: 1.41 $
# $Date: 2006/03/24 23:50:07 $
import sys,os
from libprs500.ebooks.rtf2xml import headings_to_sections, \
line_endings, footnote, fields_small, default_encoding, \
make_lists, preamble_div, header, colors, group_borders, \
check_encoding, add_brackets, table, combine_borders, \
fields_large, process_tokens, hex_2_utf8, tokenize, \
delete_info, sections, check_brackets, styles, \
paragraph_def, convert_to_tags, output, copy, \
list_numbers, info, pict, table_info, fonts, paragraphs, \
body_styles, preamble_rest, group_styles, \
inline, correct_unicode
from libprs500.ebooks.rtf2xml.old_rtf import OldRtf
"""
Here is an example script using the ParseRTF module directly
#!/usr/bin/env python
def Handle_Main():
# Handles options and creates a parse object
parse_obj =ParseRtf.ParseRtf(
in_file = 'in.rtf',
# All values from here on are optional
# determine the output file
out_file = 'out.xml',
# determine the run level. The default is 1.
run_level = 3,
# The name of a debug directory, if you are running at
# run level 3 or higher.
debug = 'debug_dir',
# Convert RTF caps to real caps.
# Default is 1.
convert_caps = 1,
# Indent resulting XML.
# Default is 0 (no indent).
indent = 1,
# Form lists from RTF. Default is 1.
form_lists = 1,
# Convert headings to sections. Default is 0.
headings_to_sections = 1,
# Group paragraphs with the same style name. Default is 1.
group_styles = 1,
# Group borders. Default is 1.
group_borders = 1,
# Write or do not write paragraphs. Default is 0.
empty_paragraphs = 0,
)
try:
parse_obj.parse_rtf()
except ParseRtf.InvalidRtfException, msg:
sys.stderr.write(msg)
except ParseRtf.RtfInvalidCodeException, msg:
sys.stderr.write(msg)
"""
class InvalidRtfException(Exception):
    """
    Raised when the input does not appear to be valid RTF.
    """


class RtfInvalidCodeException(Exception):
    """
    Raised for bad arguments and for internal errors (bugs) in the converter.
    """


class ParseRtf:
    """
    Main class for controlling the rest of the parsing.

    An instance copies the input file to a temporary working file and then
    runs every conversion stage of the rtf2xml package over that working
    file, one after the other (see parse_rtf).
    """

    def __init__(self,
                 in_file,
                 out_file='',
                 out_dir=None,
                 dtd='',
                 debug=0,
                 deb_dir=None,
                 convert_symbol=None,
                 convert_wingdings=None,
                 convert_zapf=None,
                 convert_caps=None,
                 run_level=1,
                 indent=None,
                 replace_illegals=1,
                 form_lists=1,
                 headings_to_sections=1,
                 group_styles=1,
                 group_borders=1,
                 empty_paragraphs=1,
                 no_dtd=0,
                 char_data='',
                 ):
        """
        Requires:
            'in_file' --path of the RTF file to parse. It must exist.
        Optional:
            'out_file' --passed on to the pict and output stages.
            'out_dir' --passed on to the output stage. If given, it must be
            an existing directory.
            'dtd' --path to the dtd, passed on to the tag-writing stage.
            'debug' --debug directory. If given, it must be an existing
            directory; the original file is copied into it and each stage
            is told to keep a copy of its result.
            'deb_dir' --alternative spelling of 'debug'; only used when
            'debug' is not set.
            'convert_symbol', 'convert_wingdings', 'convert_zapf',
            'convert_caps' --passed on to the hex-to-utf8 stage.
            'run_level' --values above 2 check the brackets after most
            stages; values above 1 print extra warnings.
            'indent', 'no_dtd' --passed on to the tag-writing stage.
            'replace_illegals' --passed on to the line-endings stage.
            'form_lists', 'headings_to_sections', 'group_styles',
            'group_borders' --switch the corresponding stage on or off.
            'empty_paragraphs' --passed on to the paragraph stage.
            'char_data' --file containing character maps.
        Returns: Nothing
        Raises:
            RtfInvalidCodeException if in_file is missing or a directory
            argument is not a directory.
        """
        self.__file = in_file
        self.__out_file = out_file
        self.__out_dir = out_dir
        self.__temp_dir = out_dir
        self.__dtd_path = dtd
        self.__check_file(in_file, "file_to_parse")
        self.__char_data = char_data
        # 'deb_dir' used to be accepted but ignored; honour it as a
        # fallback so that existing callers passing 'debug' are unaffected.
        self.__debug_dir = debug or deb_dir
        self.__check_dir(self.__temp_dir)
        # 1 when a usable debug directory was given, otherwise None.
        self.__copy = self.__check_dir(self.__debug_dir)
        self.__convert_caps = convert_caps
        self.__convert_symbol = convert_symbol
        self.__convert_wingdings = convert_wingdings
        self.__convert_zapf = convert_zapf
        self.__run_level = run_level
        self.__exit_level = 0
        self.__indent = indent
        self.__replace_illegals = replace_illegals
        self.__form_lists = form_lists
        self.__headings_to_sections = headings_to_sections
        self.__group_styles = group_styles
        self.__group_borders = group_borders
        self.__empty_paragraphs = empty_paragraphs
        self.__no_dtd = no_dtd

    def __check_file(self, the_file, type):
        """
        Check to see if files exist.
        Raises RtfInvalidCodeException if the_file is None or does not
        exist. 'type' only selects the wording of the message.
        """
        if the_file is None:
            if type == "file_to_parse":
                msg = "You must provide a file for the script to work"
            else:
                # This branch used to fail on an unbound local instead of
                # raising the intended exception.
                msg = "No file was provided for '%s'" % type
            raise RtfInvalidCodeException(msg)
        if not os.path.exists(the_file):
            msg = "The file '%s' cannot be found" % the_file
            raise RtfInvalidCodeException(msg)

    def __check_dir(self, the_dir):
        """
        Check to see if directory exists.
        Returns None when no directory was given and 1 when the directory
        exists; raises RtfInvalidCodeException otherwise.
        """
        if not the_dir:
            return
        if not os.path.isdir(the_dir):
            msg = "%s is not a directory" % the_dir
            raise RtfInvalidCodeException(msg)
        return 1

    def parse_rtf(self):
        """
        Parse the file by calling on other classes.
        Requires:
            Nothing
        Returns:
            The exit level: the highest return code seen while converting
            (0 if none was reported). The XML itself is written by the
            output stage.
        Raises:
            InvalidRtfException if the token stage rejects the input.
            RtfInvalidCodeException for internal errors.
        """
        self.__temp_file = self.__make_temp_file(self.__file)
        try:
            return self.__run_stages()
        finally:
            # The working file has a fixed name in the current directory,
            # so remove it whether the conversion succeeded or not.
            try:
                os.remove(self.__temp_file)
            except OSError:
                pass

    def __run_stages(self):
        """
        Run every conversion stage, in order, over the temporary file and
        return the exit level.
        """
        temp_file = self.__temp_file
        bug = RtfInvalidCodeException
        # Keyword arguments taken by nearly every stage constructor.
        stage = dict(
            in_file=temp_file,
            bug_handler=bug,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        # If there is a debug directory, create a copy object, set the
        # directory to write to, remove files, and copy the new temporary
        # file to this directory.
        if self.__debug_dir:
            copy_obj = copy.Copy(bug_handler=bug)
            copy_obj.set_dir(self.__debug_dir)
            copy_obj.remove_files()
            copy_obj.copy_file(temp_file, "original_file")
        # new as of 2005-08-02. Do I want this?
        if self.__debug_dir or self.__run_level > 2:
            self.__check_brack_obj = check_brackets.CheckBrackets(
                file=temp_file,
                bug_handler=bug,
            )
        # convert Macintosh line endings to Unix line endings
        line_obj = line_endings.FixLineEndings(
            replace_illegals=self.__replace_illegals, **stage)
        self.__return_code(line_obj.fix_endings())
        tokenize_obj = tokenize.Tokenize(**stage)
        tokenize_obj.tokenize()
        process_tokens_obj = process_tokens.ProcessTokens(
            exception_handler=InvalidRtfException, **stage)
        try:
            process_tokens_obj.process_tokens()
        except InvalidRtfException:
            # Report where the input stops being plain ASCII, then let the
            # original exception propagate. The working file is removed by
            # parse_rtf.
            check_encoding_obj = check_encoding.CheckEncoding(
                bug_handler=bug)
            check_encoding_obj.check_encoding(self.__file)
            sys.stderr.write(
                'File "%s" does not appear to be RTF.\n' % self.__file)
            raise
        delete_info_obj = delete_info.DeleteInfo(**stage)
        # found destination means {\*\destination
        # if found, the RTF should be newer RTF
        found_destination = delete_info_obj.delete_info()
        self.__bracket_match('delete_data_info')
        # put picts in a separate file
        pict_obj = pict.Pict(
            orig_file=self.__file, out_file=self.__out_file, **stage)
        pict_obj.process_pict()
        self.__bracket_match('pict_data_info')
        correct_uni_obj = correct_unicode.CorrectUnicode(
            exception_handler=InvalidRtfException, **stage)
        correct_uni_obj.correct_unicode()
        self.__bracket_match('correct_unicode_info')
        combine_obj = combine_borders.CombineBorders(**stage)
        combine_obj.combine_borders()
        self.__bracket_match('combine_borders_info')
        # Footnotes and headers are split out here and joined back in
        # near the end of the run.
        footnote_obj = footnote.Footnote(**stage)
        footnote_obj.separate_footnotes()
        self.__bracket_match('separate_footnotes_info')
        header_obj = header.Header(**stage)
        header_obj.separate_headers()
        self.__bracket_match('separate_headers_info')
        list_numbers_obj = list_numbers.ListNumbers(**stage)
        list_numbers_obj.fix_list_numbers()
        self.__bracket_match('list_number_info')
        preamble_div_obj = preamble_div.PreambleDiv(**stage)
        list_of_lists = preamble_div_obj.make_preamble_divisions()
        self.__bracket_match('make_preamble_divisions')
        encode_obj = default_encoding.DefaultEncoding(
            in_file=temp_file,
            run_level=self.__run_level,
            bug_handler=bug,
        )
        platform, code_page, default_font_num = \
            encode_obj.find_default_encoding()
        # This object is used twice: now for the preamble, later (after
        # update_values) for the body.
        hex2utf_obj = hex_2_utf8.Hex2Utf8(
            area_to_convert='preamble',
            char_file=self.__char_data,
            default_char_map=code_page,
            invalid_rtf_handler=InvalidRtfException,
            **stage)
        hex2utf_obj.convert_hex_2_utf8()
        self.__bracket_match('hex_2_utf_preamble')
        fonts_obj = fonts.Fonts(default_font_num=default_font_num, **stage)
        special_font_dict = fonts_obj.convert_fonts()
        self.__bracket_match('fonts_info')
        color_obj = colors.Colors(**stage)
        color_obj.convert_colors()
        self.__bracket_match('colors_info')
        style_obj = styles.Styles(**stage)
        style_obj.convert_styles()
        self.__bracket_match('styles_info')
        info_obj = info.Info(**stage)
        info_obj.fix_info()
        default_font = special_font_dict.get('default-font')
        preamble_rest_obj = preamble_rest.Preamble(
            file=temp_file,
            copy=self.__copy,
            bug_handler=bug,
            platform=platform,
            default_font=default_font,
            code_page=code_page,
        )
        preamble_rest_obj.fix_preamble()
        self.__bracket_match('preamble_rest_info')
        old_rtf_obj = OldRtf(
            in_file=temp_file,
            bug_handler=bug,
            run_level=self.__run_level,
        )
        # RTF can actually have destination groups and old RTF.
        # BAH!
        old_rtf = old_rtf_obj.check_if_old_rtf()
        if old_rtf:
            if self.__run_level > 5:
                msg = 'older RTF\n'
                msg += 'self.__run_level is "%s"\n' % self.__run_level
                raise RtfInvalidCodeException(msg)
            if self.__run_level > 1:
                sys.stderr.write('File could be older RTF...\n')
            if found_destination:
                if self.__run_level > 1:
                    sys.stderr.write(
                        'File also has newer RTF.\n'
                        'Will do the best to convert.\n'
                    )
            add_brackets_obj = add_brackets.AddBrackets(**stage)
            add_brackets_obj.add_brackets()
        fields_small_obj = fields_small.FieldsSmall(**stage)
        fields_small_obj.fix_fields()
        self.__bracket_match('fix_small_fields_info')
        fields_large_obj = fields_large.FieldsLarge(**stage)
        fields_large_obj.fix_fields()
        self.__bracket_match('fix_large_fields_info')
        sections_obj = sections.Sections(**stage)
        sections_obj.make_sections()
        self.__bracket_match('sections_info')
        paragraphs_obj = paragraphs.Paragraphs(
            write_empty_para=self.__empty_paragraphs, **stage)
        paragraphs_obj.make_paragraphs()
        self.__bracket_match('paragraphs_info')
        # NOTE(review): unlike the .get() lookup above, this one raises
        # KeyError when the font stage reported no 'default-font'. Kept
        # as it was; confirm whether None would be acceptable here.
        default_font = special_font_dict['default-font']
        paragraph_def_obj = paragraph_def.ParagraphDef(
            default_font=default_font, **stage)
        list_of_styles = paragraph_def_obj.make_paragraph_def()
        body_styles_obj = body_styles.BodyStyles(
            list_of_styles=list_of_styles, **stage)
        body_styles_obj.insert_info()
        self.__bracket_match('body_styles_info')
        self.__bracket_match('paragraph_def_info')
        table_obj = table.Table(**stage)
        table_data = table_obj.make_table()
        self.__bracket_match('table_info')
        table_info_obj = table_info.TableInfo(table_data=table_data, **stage)
        table_info_obj.insert_info()
        self.__bracket_match('table__data_info')
        if self.__form_lists:
            make_list_obj = make_lists.MakeLists(
                headings_to_sections=self.__headings_to_sections,
                list_of_lists=list_of_lists,
                **stage)
            make_list_obj.make_lists()
            self.__bracket_match('form_lists_info')
        if self.__headings_to_sections:
            headings_to_sections_obj = \
                headings_to_sections.HeadingsToSections(**stage)
            headings_to_sections_obj.make_sections()
            self.__bracket_match('headings_to_sections_info')
        if self.__group_styles:
            group_styles_obj = group_styles.GroupStyles(wrap=1, **stage)
            group_styles_obj.group_styles()
            self.__bracket_match('group_styles_info')
        if self.__group_borders:
            group_borders_obj = group_borders.GroupBorders(wrap=1, **stage)
            group_borders_obj.group_borders()
            self.__bracket_match('group_borders_info')
        inline_obj = inline.Inline(**stage)
        inline_obj.form_tags()
        self.__bracket_match('inline_info')
        hex2utf_obj.update_values(
            file=temp_file,
            area_to_convert='body',
            copy=self.__copy,
            char_file=self.__char_data,
            convert_caps=self.__convert_caps,
            convert_symbol=self.__convert_symbol,
            convert_wingdings=self.__convert_wingdings,
            convert_zapf=self.__convert_zapf,
            symbol=1,
            wingdings=1,
            dingbats=1,
        )
        hex2utf_obj.convert_hex_2_utf8()
        header_obj.join_headers()
        footnote_obj.join_footnotes()
        tags_obj = convert_to_tags.ConvertToTags(
            dtd_path=self.__dtd_path,
            indent=self.__indent,
            no_dtd=self.__no_dtd,
            **stage)
        tags_obj.convert_to_tags()
        output_obj = output.Output(
            file=temp_file,
            orig_file=self.__file,
            output_dir=self.__out_dir,
            out_file=self.__out_file,
        )
        output_obj.output()
        return self.__exit_level

    def __bracket_match(self, file_name):
        """
        At run levels above 2, check that the brackets in the working file
        still match and raise RtfInvalidCodeException if they do not.
        'file_name' is a label for the stage that has just run.
        """
        if self.__run_level > 2:
            good_br, msg = self.__check_brack_obj.check_brackets()
            if not good_br:
                # The message used to be appended to itself before the
                # stage label was added.
                msg = "%s in file '%s'\n" % (msg, file_name)
                raise RtfInvalidCodeException(msg)

    def __return_code(self, num):
        """Keep the highest return code seen so far as the exit level."""
        if num is None:
            return
        level = int(num)
        if level > self.__exit_level:
            self.__exit_level = level

    def __make_temp_file(self, file):
        """
        Make a temporary file to parse.
        Copies 'file' to "rtf_write_file" in the current directory and
        returns that name.
        """
        write_file = "rtf_write_file"
        # Binary mode: the copy must be byte-for-byte, whatever the platform.
        with open(file, 'rb') as read_obj:
            with open(write_file, 'wb') as write_obj:
                while True:
                    chunk = read_obj.read(1000)
                    if not chunk:
                        break
                    write_obj.write(chunk)
        return write_file
"""
mi<tg<open______<style-sheet\n
mi<tg<close_____<style-sheet\n
mi<tg<open-att__<footnote<num>1\n
mi<tg<empty-att_<page-definition<margin>33\n
mi<tg<empty_____<para\n
"""

View File

@ -0,0 +1,3 @@
'''
modules for rtf2xml
'''

View File

@ -0,0 +1,205 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile
from libprs500.ebooks.rtf2xml import copy, check_brackets
# note to self. This is the first module in which I use tempfile. A good idea?
"""
"""
class AddBrackets:
    """
    Add brackets for old RTF.
    Logic:
        Once the body has started, runs of character-formatting tokens
        (those listed in the accept set) are held back instead of being
        written. When the next other token arrives, the held values are
        merged into the current inline settings, any group opened earlier
        by this class is closed, and a new 'ob<nu<open-brack<0003' group
        listing every setting that is not 'false' is written in front of
        that token. Groups that are already bracketed in the input are
        copied through untouched.
    """

    def __init__(self, in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,
                 ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--exception class raised for internal errors
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--values above 0 print a warning when the result
            has to be discarded
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        # The scratch file is created in add_brackets, when it is needed.
        self.__write_to = None
        self.__run_level = run_level

    def __initiate_values(self):
        """
        Set up the state table and reset all per-run values.
        """
        self.__state_dict = {
            'before_body': self.__before_body_func,
            'in_body': self.__in_body_func,
            'after_control_word': self.__after_control_word_func,
            'in_ignore': self.__ignore_func,
        }
        self.__state = 'before_body'
        self.__inline = {}
        self.__temp_group = []
        self.__open_bracket = 0
        self.__found_brackets = 0
        self.__token_info = ''
        self.__ob_count = None
        self.__cb_count = None
        self.__ignore_count = None
        # Tokens that are collected into a bracketed group. A set, because
        # it is consulted for every body token.
        self.__accept = frozenset([
            'cw<ci<annotation',
            'cw<ci<blue______',
            'cw<ci<bold______',
            'cw<ci<caps______',
            'cw<ci<char-style',
            'cw<ci<dbl-strike',
            'cw<ci<emboss____',
            'cw<ci<engrave___',
            'cw<ci<font-color',
            'cw<ci<font-down_',
            'cw<ci<font-size_',
            'cw<ci<font-style',
            'cw<ci<font-up___',
            'cw<ci<footnot-mk',
            'cw<ci<green_____',
            'cw<ci<hidden____',
            'cw<ci<italics___',
            'cw<ci<outline___',
            'cw<ci<red_______',
            'cw<ci<shadow____',
            'cw<ci<small-caps',
            'cw<ci<strike-thr',
            'cw<ci<subscript_',
            'cw<ci<superscrip',
            'cw<ci<underlined',
            'cw<ul<underlined',
        ])

    def __before_body_func(self, line):
        """
        Copy lines through until the body-open marker is seen.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'in_body'
        self.__write_obj.write(line)

    def __in_body_func(self, line):
        """
        Handle a body line that does not follow a held-back token.
        """
        if line == 'cb<nu<clos-brack<0001\n' and self.__open_bracket:
            # Close the group this class opened before the bracket
            # numbered 0001 is closed.
            self.__write_obj.write('cb<nu<clos-brack<0003\n')
            self.__write_obj.write(line)
        elif self.__token_info == 'ob<nu<open-brack':
            self.__found_brackets = 1
            self.__state = 'in_ignore'
            self.__ignore_count = self.__ob_count
            self.__write_obj.write(line)
        elif self.__token_info in self.__accept:
            self.__temp_group.append(line)
            self.__state = 'after_control_word'
        else:
            self.__write_obj.write(line)

    def __after_control_word_func(self, line):
        """
        Keep collecting accepted tokens; on anything else, write the
        pending group and then the line itself.
        """
        if self.__token_info in self.__accept:
            self.__temp_group.append(line)
            return
        self.__change_permanent_group()
        self.__write_group()
        self.__write_obj.write(line)
        if self.__token_info == 'ob<nu<open-brack':
            self.__state = 'in_ignore'
            self.__ignore_count = self.__ob_count
        else:
            self.__state = 'in_body'

    def __write_group(self):
        """
        Close the group opened earlier (if any) and open a new one that
        lists every inline setting whose value is not 'false'.
        """
        if self.__open_bracket:
            self.__write_obj.write('cb<nu<clos-brack<0003\n')
            self.__open_bracket = 0
        settings = []
        for the_key in self.__inline:
            value = self.__inline[the_key]
            if value != 'false':
                settings.append('%s<nu<%s\n' % (the_key, value))
        inline_string = ''.join(settings)
        if inline_string:
            self.__write_obj.write('ob<nu<open-brack<0003\n')
            self.__write_obj.write(inline_string)
            self.__open_bracket = 1
        self.__temp_group = []

    def __change_permanent_group(self):
        """
        use temp group to change permanent group
        """
        for line in self.__temp_group:
            token_info = line[:16]
            if token_info in self.__accept:
                # the value follows the 16 character token and '<nu<'
                self.__inline[token_info] = line[20:-1]

    def __ignore_func(self, line):
        """
        Don't add any brackets while inside of brackets RTF has already
        added.
        """
        self.__write_obj.write(line)
        if self.__token_info == 'cb<nu<clos-brack' and \
                self.__cb_count == self.__ignore_count:
            self.__state = 'in_body'

    def __check_brackets(self, in_file):
        """
        Return 1 if the brackets in in_file do not match, otherwise None.
        """
        self.__check_brack_obj = check_brackets.CheckBrackets(file=in_file)
        good_br = self.__check_brack_obj.check_brackets()[0]
        if not good_br:
            return 1

    def __process_line(self, line):
        """
        Record the token and bracket number of one line and hand the line
        to the function for the current state.
        """
        self.__token_info = line[:16]
        if self.__token_info == 'ob<nu<open-brack':
            self.__ob_count = line[-5:-1]
        if self.__token_info == 'cb<nu<clos-brack':
            self.__cb_count = line[-5:-1]
        action = self.__state_dict.get(self.__state)
        if action is None:
            # Used to print this and then fail trying to call None.
            msg = 'No matching state in module add_brackets.py\n'
            msg += self.__state + '\n'
            raise self.__bug_handler(msg)
        action(line)

    def add_brackets(self):
        """
        Rewrite the file with the added brackets. The result replaces the
        original only if its brackets match; otherwise the original is
        left alone and, at run levels above 0, a warning is printed.
        """
        self.__initiate_values()
        # mkstemp creates the file itself, so the name cannot be taken
        # by someone else between choosing it and opening it.
        handle, self.__write_to = tempfile.mkstemp()
        os.close(handle)
        try:
            with open(self.__file, 'r') as read_obj:
                with open(self.__write_to, 'w') as write_obj:
                    self.__write_obj = write_obj
                    line = 'dummy'
                    while line:
                        line = read_obj.readline()
                        # The empty string read at the end of the file is
                        # processed too: it flushes a group that is still
                        # being held back.
                        self.__process_line(line)
            bad_brackets = self.__check_brackets(self.__write_to)
            if not bad_brackets:
                copy_obj = copy.Copy(bug_handler=self.__bug_handler)
                if self.__copy:
                    copy_obj.copy_file(self.__write_to, "add_brackets.data")
                copy_obj.rename(self.__write_to, self.__file)
            elif self.__run_level > 0:
                sys.stderr.write(
                    'Sorry, but this files has a mix of old and new RTF.\n'
                    'Some characteristics cannot be converted.\n')
        finally:
            if os.path.exists(self.__write_to):
                os.remove(self.__write_to)

View File

@ -0,0 +1,81 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import os, tempfile
from libprs500.ebooks.rtf2xml import copy
"""
Simply write the list of strings after style table
"""
class BodyStyles:
    """
    Write the list of style strings into the file, directly in front of
    the line that closes the style table.
    """

    def __init__(self,
                 in_file,
                 list_of_styles,
                 bug_handler,
                 copy=None,
                 run_level=1,):
        """
        Required:
            'in_file'--file to parse
            'list_of_styles'--list of strings to insert
            'bug_handler'--exception class raised for internal errors
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--values above 3 turn an empty list of styles
            into an error
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__list_of_styles = list_of_styles
        self.__run_level = run_level
        # The scratch file is created in insert_info, when it is needed.
        self.__write_to = None

    def __expand_line(self, line):
        """
        Return the text to write for one input line: the line itself,
        preceded by the styles-in-body group if the line closes the
        style table.
        """
        if line != 'mi<tg<close_____<style-table\n':
            return line
        if len(self.__list_of_styles) > 0:
            return ''.join(
                ['mi<tg<open______<styles-in-body\n']
                + list(self.__list_of_styles)
                + ['mi<tg<close_____<styles-in-body\n', line]
            )
        # this shouldn't happen!
        if self.__run_level > 3:
            msg = 'Not enough data for each table\n'
            raise self.__bug_handler(msg)
        return line

    def insert_info(self):
        """
        Rewrite the file with the styles inserted, keeping a copy named
        "body_styles.data" if copies were asked for.
        """
        # mkstemp creates the file itself, so the name cannot be taken
        # by someone else between choosing it and opening it.
        handle, self.__write_to = tempfile.mkstemp()
        os.close(handle)
        try:
            with open(self.__file, 'r') as read_obj:
                with open(self.__write_to, 'w') as write_obj:
                    self.__write_obj = write_obj
                    for line in read_obj:
                        write_obj.write(self.__expand_line(line))
            copy_obj = copy.Copy(bug_handler=self.__bug_handler)
            if self.__copy:
                copy_obj.copy_file(self.__write_to, "body_styles.data")
            copy_obj.rename(self.__write_to, self.__file)
        finally:
            if os.path.exists(self.__write_to):
                os.remove(self.__write_to)

View File

@ -0,0 +1,189 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys
class BorderParse:
    """
    Parse a border line and return a dictionary of attributes and values
    """

    # Style names in order of precedence, each with the value written to
    # the '<border>-style' attribute. When a border lists several styles,
    # the first name found here wins.
    __style_priority = (
        ('shadowed-border', 'shadowed'),
        # The style table yields 'engrave' and 'tripple'. These two entries
        # used to test for 'engraved' and 'tripple-border' and so could
        # never match.
        ('engrave', 'engrave'),
        ('emboss', 'emboss'),
        ('striped', 'striped'),
        ('thin-thick-thin-small', 'thin-thick-thin-small'),
        ('thick-thin-large', 'thick-thin-large'),
        ('thin-thick-thin-medium', 'thin-thick-thin-medium'),
        ('thin-thick-medium', 'thin-thick-medium'),
        ('thick-thin-medium', 'thick-thin-medium'),
        ('thick-thin-small', 'thick-thin-small'),
        ('double-wavy', 'double-wavy'),
        ('dot-dot-dash', 'dot-dot-dash'),
        ('dot-dash', 'dot-dash'),
        ('dotted-border', 'dotted'),
        ('wavy', 'wavy'),
        ('dash-small', 'dash-small'),
        ('dashed', 'dashed'),
        ('frame', 'frame'),
        ('inset', 'inset'),
        ('outset', 'outset'),
        ('tripple', 'tripple'),
        ('double-border', 'double'),
        ('double-thickness-border', 'double-thickness'),
        ('hairline', 'hairline'),
        ('single', 'single'),
    )

    def __init__(self):
        # cw<bd<bor-t-r-hi<nu<true
        self.__border_dict = {
            'bor-t-r-hi': 'border-table-row-horizontal-inside',
            'bor-t-r-vi': 'border-table-row-vertical-inside',
            'bor-t-r-to': 'border-table-row-top',
            'bor-t-r-le': 'border-table-row-left',
            'bor-t-r-bo': 'border-table-row-bottom',
            'bor-t-r-ri': 'border-table-row-right',
            'bor-cel-bo': 'border-cell-bottom',
            'bor-cel-to': 'border-cell-top',
            'bor-cel-le': 'border-cell-left',
            'bor-cel-ri': 'border-cell-right',
            'bor-par-bo': 'border-paragraph-bottom',
            'bor-par-to': 'border-paragraph-top',
            'bor-par-le': 'border-paragraph-left',
            'bor-par-ri': 'border-paragraph-right',
            'bor-par-bx': 'border-paragraph-box',
            'bor-for-ev': 'border-for-every-paragraph',
            'bor-outsid': 'border-outside',
            'bor-none__': 'border',
            # border type => bt
            'bdr-li-wid': 'line-width',
            'bdr-sp-wid': 'padding',
            'bdr-color_': 'color',
        }
        self.__border_style_dict = {
            'bdr-single': 'single',
            'bdr-doubtb': 'double-thickness-border',
            'bdr-shadow': 'shadowed-border',
            'bdr-double': 'double-border',
            'bdr-dotted': 'dotted-border',
            'bdr-dashed': 'dashed',
            'bdr-hair__': 'hairline',
            'bdr-inset_': 'inset',
            'bdr-das-sm': 'dash-small',
            'bdr-dot-sm': 'dot-dash',
            'bdr-dot-do': 'dot-dot-dash',
            'bdr-outset': 'outset',
            'bdr-trippl': 'tripple',
            'bdr-thsm__': 'thick-thin-small',
            'bdr-htsm__': 'thin-thick-small',
            'bdr-hthsm_': 'thin-thick-thin-small',
            'bdr-thm___': 'thick-thin-medium',
            'bdr-htm___': 'thin-thick-medium',
            'bdr-hthm__': 'thin-thick-thin-medium',
            'bdr-thl___': 'thick-thin-large',
            'bdr-hthl__': 'thin-thick-thin-large',
            'bdr-wavy__': 'wavy',
            'bdr-d-wav_': 'double-wavy',
            'bdr-strip_': 'striped',
            'bdr-embos_': 'emboss',
            'bdr-engra_': 'engrave',
            'bdr-frame_': 'frame',
        }

    def parse_border(self, line):
        """
        Requires:
            line -- line with border definition in it, in the form
            cw<bd<TOKEN_____<nu<ATT|ATT:VALUE|...
        Returns:
            A dictionary. For an unknown border token it is empty. For a
            border without attributes it maps the border name to 'none'.
            Otherwise it maps '<border>-<attribute>' to the attribute
            value and, if any style was given, '<border>-style' to the
            style.
        Logic:
            Each attribute is either a style (collected, and reduced to a
            single style at the end) or a plain attribute such as the
            line width. Attributes found in neither table are reported on
            standard error and left out.
        """
        border_dict = {}
        border_style_list = []
        border_type = self.__border_dict.get(line[6:16])
        if not border_type:
            sys.stderr.write(
                'module is border_parse.py\n'
                'function is parse_border\n'
                'token does not have a dictionary value\n'
                'token is "%s"' % line
            )
            return border_dict
        atts = line[20:-1].split('|')
        # cw<bd<bor-cel-ri<nu<
        # border has no value--should be no lines
        if atts == ['']:
            border_dict[border_type] = 'none'
            return border_dict
        # border-paragraph-right
        for att in atts:
            values = att.split(':')
            if len(values) == 2:
                att, value = values
            else:
                value = 'true'
            style_att = self.__border_style_dict.get(att)
            if style_att:
                border_style_list.append(style_att)
                continue
            att_name = self.__border_dict.get(att)
            if not att_name:
                sys.stderr.write(
                    'module is border_parse_def.py\n'
                    'function is parse_border\n'
                    'token does not have an att value\n'
                    'line is "%s"' % line
                )
                # Used to go on and store the value under '<border>-None'.
                continue
            border_dict['%s-%s' % (border_type, att_name)] = value
        border_dict.update(
            self.__determine_styles(border_type, border_style_list))
        return border_dict

    def __determine_styles(self, border_type, border_style_list):
        """
        Reduce the styles found for one border to a single
        '<border>-style' entry. Returns an empty dictionary if no style
        was found.
        """
        if not border_style_list:
            # Indexing the empty list used to raise IndexError here.
            return {}
        att = '%s-style' % border_type
        for style_name, style_value in self.__style_priority:
            if style_name in border_style_list:
                return {att: style_value}
        # Not in the precedence table: use the first style as it is.
        return {att: border_style_list[0]}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,61 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
class CheckBrackets:
    """
    Check that the open and close bracket tokens in a token file match up.
    """
    def __init__(self, bug_handler = None, file=None):
        """
        Optional:
            'bug_handler'--stored, but not used by this class
            'file'--path of the token file to check
        Returns:
            nothing
        """
        self.__file=file
        self.__bug_handler = bug_handler
        self.__bracket_count=0
        self.__ob_count = 0
        self.__cb_count = 0
        # bracket numbers of the groups that are currently open (a stack)
        self.__open_bracket_num = []
    def open_brack(self, line):
        """
        Requires:
            line--an open bracket token line
        Returns:
            nothing
        Logic:
            Push the bracket number (the four characters before the last
            character of the line) so that the matching close bracket can
            be compared against it.
        """
        num = line[-5:-1]
        self.__open_bracket_num.append(num)
        self.__bracket_count += 1
    def close_brack(self, line):
        """
        Requires:
            line--a close bracket token line
        Returns:
            1 if the bracket number matches the most recently opened
            bracket; 0 if it does not, or if no bracket is open.
        """
        num = line[-5:-1]
        try:
            last_num = self.__open_bracket_num.pop()
        except IndexError:
            # a close bracket with nothing left open
            return 0
        if num != last_num:
            return 0
        self.__bracket_count -= 1
        return 1
    def check_brackets(self):
        """
        Requires:
            nothing
        Returns:
            a tuple of (status, message). Status is 1 when every bracket
            matches, and 0 otherwise.
        Logic:
            Read the file one line at a time. The first 16 characters of
            each line identify the token. Stop at the first close bracket
            that does not match. The file is closed on every exit path.
        """
        read_obj = open(self.__file, 'r')
        line_count = 0
        try:
            for line in read_obj:
                line_count += 1
                self.__token_info = line[:16]
                if self.__token_info == 'ob<nu<open-brack':
                    self.open_brack(line)
                if self.__token_info == 'cb<nu<clos-brack':
                    right_count = self.close_brack(line)
                    if not right_count:
                        return (0, "closed bracket doesn't match, line %s" % line_count)
        finally:
            read_obj.close()
        if self.__bracket_count != 0:
            msg = 'At end of file open and closed brackets don\'t match\n'
            msg = msg + 'total number of brackets is %s' % self.__bracket_count
            return (0, msg)
        return (1, "brackets match!")

View File

@ -0,0 +1,29 @@
#!/usr/bin/env python
import sys
class CheckEncoding:
    """
    Report the lines and byte positions of a file that cannot be decoded
    with a given encoding.
    """
    def __init__(self, bug_handler=None):
        """
        Optional:
            'bug_handler'--stored, but not used by this class. It defaults
            to None so that the class can be run as a script.
        """
        self.__bug_handler = bug_handler
    def __get_position_error(self, line, encoding, line_num):
        """
        Requires:
            line--the raw bytes of a line that failed to decode
            encoding--the encoding to test against
            line_num--the number of the line in the file
        Returns:
            nothing
        Logic:
            Decode the line one byte at a time, and write the position
            (starting at 1) and the error for each byte that fails to
            sys.stderr.
        """
        for index in range(len(line)):
            try:
                # slice rather than index, so that a one-byte string is
                # decoded whatever type indexing the line would return
                line[index:index + 1].decode(encoding)
            except UnicodeError:
                # sys.exc_info avoids the version specific syntax for
                # binding the exception to a name
                msg = sys.exc_info()[1]
                sys.stderr.write('line: %s char: %s\n' % (line_num, index + 1))
                sys.stderr.write(str(msg) + '\n')
    def check_encoding(self, path, encoding='us-ascii'):
        """
        Requires:
            path--the file to check
        Optional:
            encoding--the encoding to test (default is us-ascii)
        Returns:
            nothing (problems are written to sys.stderr)
        Logic:
            Read the file as raw bytes, one line at a time. Only when a
            whole line fails to decode is it searched byte by byte.
        """
        read_obj = open(path, 'rb')
        line_num = 0
        try:
            for line in read_obj:
                line_num += 1
                try:
                    line.decode(encoding)
                except UnicodeError:
                    self.__get_position_error(line, encoding, line_num)
        finally:
            read_obj.close()
if __name__ == '__main__':
    check_encoding_obj = CheckEncoding()
    check_encoding_obj.check_encoding(sys.argv[1])

View File

@ -0,0 +1,247 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile, re
from libprs500.ebooks.rtf2xml import copy
class Colors:
    """
    Change lines with color info from color numbers to the actual color names.
    """
    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            run_level = 1
            ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--exception class raised for problems when the
            run level is above 3
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--how strictly problems are treated (default is 1)
        Returns:
            nothing
        """
        self.__file = in_file
        self.__copy = copy
        self.__bug_handler = bug_handler
        # NOTE(review): tempfile.mktemp only reserves a name, which leaves
        # a window before the file is created in convert_colors.
        self.__write_to = tempfile.mktemp()
        self.__run_level = run_level
    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__color_dict = {}
        self.__state = 'before_color_table'
        # The same dictionary is used for the three states and for the
        # tokens that are handled inside the color table.
        self.__state_dict = {
            'before_color_table': self.__before_color_func,
            'in_color_table'    : self.__in_color_func,
            'after_color_table' : self.__after_color_func,
            'cw<ci<red_______'  : self.__default_color_func,
            'cw<ci<green_____'  : self.__default_color_func,
            'cw<ci<blue______'  : self.__blue_func,
            'tx<nu<__________'  : self.__do_nothing_func,
        }
        self.__color_string = '#'
        self.__color_num = 1
        self.__line_color_exp = re.compile(r'bdr-color_:(\d+)')
        # cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2
    def __before_color_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            Check to see if the line marks the beginning of the color table.
            If so, change states.
            Always print out the line.
        """
        # mi<mk<clrtbl-beg
        if self.__token_info == 'mi<mk<clrtbl-beg':
            self.__state = 'in_color_table'
        self.__write_obj.write(line)
    def __default_color_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            get the hex number from the line and add it to the color string.
        """
        hex_num = line[-3:-1]
        self.__color_string += hex_num
    def __blue_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            Get the hex number from the line and add it to the color string.
            Add a key -> value pair to the color dictionary, with the number
            as the key, and the hex number as the value. Write an empty tag
            with the hex number and number as attributes. Add one to the color
            number. Reset the color string to '#'
        """
        hex_num = line[-3:-1]
        self.__color_string += hex_num
        self.__color_dict[self.__color_num] = self.__color_string
        self.__write_obj.write(
            'mi<tg<empty-att_'
            '<color-in-table<num>%s<value>%s\n' % (self.__color_num, self.__color_string)
            )
        self.__color_num += 1
        self.__color_string = '#'
    def __in_color_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            Check if the end of the color table has been reached. If so,
            change the state to after the color table.
            Otherwise, get a function by passing the self.__token_info to the
            state dictionary. A token with no function is reported on
            sys.stderr and skipped.
        """
        #mi<mk<clrtbl-beg
        #cw<ci<red_______<nu<00
        if self.__token_info == 'mi<mk<clrtbl-end':
            self.__state = 'after_color_table'
            return
        action = self.__state_dict.get(self.__token_info)
        if action is None:
            sys.stderr.write('in module colors.py\n'
                'function is self.__in_color_func\n'
                'no action for %s\n' % self.__token_info
                )
            # there is nothing to call, so drop the token
            return
        action(line)
    def __after_color_func(self, line):
        """
        Check the line to see if it contains color info. If it does, extract
        the number and replace it with the value given by __figure_num.
        Otherwise, print out the line.
        Added Oct 10, 2003
        If the number is 0, that indicates no color
        """
        #cw<ci<font-color<nu<2
        if self.__token_info == 'cw<ci<font-color':
            hex_num = int(line[20:-1])
            hex_num = self.__figure_num(hex_num)
            if hex_num:
                self.__write_obj.write(
                    'cw<ci<font-color<nu<%s\n' % hex_num
                    )
        elif line[0:5] == 'cw<bd':
            if line.find('bdr-color_') > -1:
                line = self.__line_color_exp.sub(self.__sub_from_line_color, line)
            self.__write_obj.write(line)
        else:
            self.__write_obj.write(line)
    # cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2
    def __sub_from_line_color(self, match_obj):
        """
        Requires:
            match_obj--a match of self.__line_color_exp
        Returns:
            the replacement text, 'bdr-color_:' followed by the color
        """
        num = match_obj.group(1)
        try:
            num = int(num)
        except ValueError:
            if self.__run_level > 3:
                msg = 'can\'t make integer from string\n'
                raise self.__bug_handler(msg)
            else:
                return 'bdr-color_:no-value'
        hex_num = self.__figure_num(num)
        return 'bdr-color_:%s' % hex_num
    def __figure_num(self, num):
        """
        Requires:
            num--a color number (an integer)
        Returns:
            'false' for 0, the value in the color dictionary for a known
            number, and '0' for an unknown number.
        Raises:
            the bug handler for an unknown number if the run level is
            above 3.
        """
        if num == 0:
            return 'false'
        hex_num = self.__color_dict.get(num)
        if hex_num is None:
            if self.__run_level > 3:
                msg = 'no value in self.__color_dict for key %s\n' % num
                raise self.__bug_handler(msg)
            hex_num = '0'
        return hex_num
    def __do_nothing_func(self, line):
        """
        Bad RTF will have text in the color table
        """
        pass
    def convert_colors(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is before the color table, look for the
            beginning of the color table.
            If the state is in the color table, create the color dictionary
            and print out the tags.
            If the state is after the color table, look for lines with color
            info, and substitute the number with the hex number.
        """
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                for line in read_obj:
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        sys.stderr.write('no matching state in module colors.py\n')
                        sys.stderr.write(self.__state + '\n')
                        continue
                    action(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "color.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,92 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import os, tempfile
from libprs500.ebooks.rtf2xml import copy
class CombineBorders:
    """
    Fold each border control word and the border attribute tokens that
    follow it into a single token line.
    """
    def __init__(self,
            in_file ,
            bug_handler,
            copy = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file'--the token file to rewrite
            'bug_handler'--passed on to the Copy object
        Optional:
            'copy'--whether to keep a copy of the result for debugging
            'run_level'--accepted, but not used by this class
        """
        self.__write_to = tempfile.mktemp()
        self.__file = in_file
        self.__copy = copy
        self.__bug_handler = bug_handler
        self.__bord_att = []
        self.__bord_pos = 'default'
        self.__state = 'default'
    def found_bd(self, line):
        """
        Remember which border the line names, and start collecting its
        attributes.
        """
        #cw<bd<bor-t-r-vi
        self.__bord_pos = line[6:16]
        self.__state = 'border'
    def __default_func(self, line):
        """
        Return the line unchanged, unless it starts a border. In that case
        return an empty string, since the border is written out later.
        """
        #cw<bd<bor-t-r-vi
        if self.__first_five != 'cw<bd':
            return line
        self.found_bd(line)
        return ''
    def end_border(self, line, write_obj):
        """
        Write the collected border as one line. Then handle the line that
        ended it: start a new border if it is one, otherwise write it out.
        """
        attributes = "|".join(self.__bord_att)
        self.__bord_att = []
        write_obj.write('cw<bd<%s<nu<%s\n' % (self.__bord_pos, attributes))
        self.__state = 'default'
        self.__bord_string = ''
        if self.__first_five == 'cw<bd':
            self.found_bd(line)
            return
        write_obj.write(line)
    def add_to_border_desc(self, line):
        """
        Add the attribute on the line to the current border. A value of
        'true' adds the name alone; any other value adds 'name:value'.
        """
        #cw<bt<bdr-hair__<nu<true
        #cw<bt<bdr-linew<nu<0.50
        #tx<__________<some text
        value = line[20:-1]
        suffix = '' if value == 'true' else ':' + value
        self.__bord_att.append(line[6:16] + suffix)
    def __border_func(self, line, write_obj):
        """
        While in a border, collect attribute lines. Any other line ends
        the border.
        """
        if self.__first_five == 'cw<bt':
            self.add_to_border_desc(line)
        else:
            self.end_border(line, write_obj)
    def combine_borders(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line at a time and pass it to the function for the
            current state. The empty string read at the end of the file is
            passed on as well, so that a border still being collected is
            written out.
        """
        source = open(self.__file, 'r')
        target = open(self.__write_to, 'w')
        while True:
            line = source.readline()
            self.__first_five = line[0:5]
            if self.__state == 'border':
                self.__border_func(line, target)
            else:
                target.write(self.__default_func(line))
            if not line:
                break
        source.close()
        target.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "combine_borders.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,151 @@
import os, sys
class Configure:
    """
    Find and read an rtf2xml configuration file, and check its options.
    """
    def __init__( self,
            configuration_file,
            bug_handler,
            debug_dir = None,
            show_config_file = None,
            ):
        """
        Requires:
            configuration_file --file used when no '.rtf2xml' file is found
            bug_handler --exception class raised for a bad configuration
        Optional:
            debug_dir --stored, but not used by this class
            show_config_file --if true, report the file used on sys.stderr
        Returns:
            Nothing.
        """
        self.__configuration_file = configuration_file
        self.__debug_dir = debug_dir
        self.__bug_handler = bug_handler
        self.__show_config_file = show_config_file
    def get_configuration(self, type):
        """
        Requires:
            type --not used by this method
        Returns:
            a dictionary of the options, after checking and conversion
        Raises:
            the bug handler if a line is not of the form option = value,
            or if the options do not pass the checks in __parse_dict.
        Logic:
            Blank lines and lines starting with '#' are skipped. Each
            other line has to contain exactly one '='.
        """
        self.__configuration_file = self.__get_file_name()
        return_dict = {}
        return_dict['config-location'] = self.__configuration_file
        if self.__show_config_file and self.__configuration_file:
            sys.stderr.write('configuration file is "%s"\n' % self.__configuration_file)
        if self.__show_config_file and not self.__configuration_file:
            sys.stderr.write('No configuraiton file found; using default vaules\n')
        if self.__configuration_file:
            read_obj = open(self.__configuration_file, 'r')
            line_num = 0
            try:
                for line in read_obj:
                    line_num += 1
                    line = line.strip()
                    if not line or line[0:1] == '#':
                        continue
                    fields = line.split('=')
                    if len(fields) != 2:
                        msg = line
                        msg += ('Error in configuration.txt, line %s\n' % line_num)
                        msg += ('Options take the form of option = value.\n')
                        msg += ('Please correct the configuration file "%s" before continuing\n'
                            % self.__configuration_file)
                        raise self.__bug_handler(msg)
                    att = fields[0].strip()
                    value = fields[1].strip()
                    return_dict[att] = value
            finally:
                # close the file even when a bad line raises
                read_obj.close()
        return_dict = self.__parse_dict(return_dict)
        if return_dict == 1:
            msg = ('Please correct the configuration file "%s" before continuing\n'
                    % self.__configuration_file)
            raise self.__bug_handler(msg)
        return return_dict
    def __get_file_name(self):
        """
        Return the first '.rtf2xml' file found in the directory named by
        HOME, then by USERPROFILE, then in sys.path[0]. If none exists,
        return the file given to the constructor.
        """
        for env_name in ('HOME', 'USERPROFILE'):
            home_var = os.environ.get(env_name)
            if home_var:
                home_config = os.path.join(home_var, '.rtf2xml')
                if os.path.isfile(home_config):
                    return home_config
        script_file = os.path.join(sys.path[0], '.rtf2xml')
        if os.path.isfile(script_file):
            return script_file
        return self.__configuration_file
    def __parse_dict(self, return_dict):
        """
        Requires:
            return_dict --the options as read, all values strings
        Returns:
            the dictionary with defaults filled in and values converted,
            or 1 if an option is not legal (the problem is written to
            sys.stderr).
        """
        allowable = [
            'configuration-directory',
            'smart-output', # = false
            'level', # = 1
            'convert-symbol',# = true
            'convert-wingdings',# = true
            'convert-zapf-dingbats', # = true
            'convert-caps',# true
            'indent', # = 1
            'group-styles',
            'group-borders',
            'headings-to-sections',
            'lists',
            'raw-dtd-path',
            'write-empty-paragraphs',
            'config-location',
            'script-name',
        ]
        for the_key in return_dict:
            if the_key not in allowable:
                sys.stderr.write('options "%s" not a legal option.\n'
                        % the_key)
                return 1
        configuration_dir = return_dict.get('configuration-directory')
        if configuration_dir is None:
            return_dict['configure-directory'] = None
        else:
            if not os.path.isdir(configuration_dir):
                sys.stderr.write('The dirctory "%s" does not appear to be a directory.\n'
                        % configuration_dir)
                return 1
            else:
                return_dict['configure-directory'] = configuration_dir
        smart_output = return_dict.get('smart-output')
        if not smart_output:
            return_dict['smart-output'] = 0
        elif smart_output != 'true' and smart_output != 'false':
            sys.stderr.write('"smart-output" must be true or false.\n')
            return 1
        elif smart_output == 'false':
            return_dict['smart-output'] = 0
        int_options = ['level', 'indent']
        for int_option in int_options:
            value = return_dict.get(int_option)
            if not value:
                if int_option == 'level':
                    return_dict['level'] = 1
                else:
                    return_dict['indent'] = 0
            else:
                try:
                    int_num = int(return_dict[int_option])
                    return_dict[int_option] = int_num
                except (TypeError, ValueError):
                    sys.stderr.write('"%s" must be a number\n' % int_option)
                    sys.stderr.write('You choose "%s" ' % return_dict[int_option])
                    return 1
        fonts = ['convert-symbol', 'convert-wingdings', 'convert-zapf-dingbats',
            'convert-caps'
            ]
        for font in fonts:
            value = return_dict.get(font)
            if not value:
                return_dict[font] = 0
            elif value != 'true' and value != 'false':
                sys.stderr.write(
                    '"%s" must be true or false.\n' % font)
                # reject the value, as is done for "smart-output"
                return 1
            elif value == 'false':
                return_dict[font] = 0
        return_dict['xslt-processor'] = None
        return_dict['no-namespace'] = None
        return_dict['format'] = 'raw'
        return_dict['no-pyxml'] = 'true'
        return return_dict

View File

@ -0,0 +1,242 @@
import os, tempfile
from libprs500.ebooks.rtf2xml import copy
public_dtd = 'rtf2xml1.0.dtd'
class ConvertToTags:
    """
    Convert file to XML
    """
    def __init__(self,
            in_file,
            bug_handler,
            dtd_path,
            no_dtd,
            indent = None,
            copy = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file'--the token file to convert
            'bug_handler'--exception class raised for problems when the
            run level is above 3
            'dtd_path'--path for a SYSTEM doctype. An empty string means
            no doctype; None means the public doctype.
            'no_dtd'--if true, never write a doctype
        Optional:
            'indent'--if true, write new lines around block elements
            'copy'-- whether to make a copy of result for debugging
            'run_level'--how strictly problems are treated (default is 1)
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__dtd_path = dtd_path
        self.__no_dtd = no_dtd
        self.__indent = indent
        self.__run_level = run_level
        self.__write_to = tempfile.mktemp()
    def __initiate_values(self):
        """
        Set values, including those for the dictionary.
        """
        self.__state = 'default'
        self.__new_line = 0
        # elements that get a new line
        self.__block = ('doc', 'preamble', 'rtf-definition', 'font-table',
            'font-in-table', 'color-table', 'color-in-table', 'style-sheet',
            'paragraph-styles', 'paragraph-style-in-table', 'character-styles',
            'character-style-in-table', 'list-table', 'doc-information', 'title',
            'author', 'operator', 'creation-time', 'revision-time',
            'editing-time', 'time', 'number-of-pages', 'number-of-words',
            'number-of-characters', 'page-definition', 'section-definition',
            'headers-and-footers', 'section', 'para', 'body',
            'paragraph-definition', 'cell', 'row', 'table', 'revision-table',
            'style-group', 'border-group','styles-in-body', 'paragraph-style-in-body',
            'list-in-table', 'level-in-table', 'override-table','override-list',
            )
        # elements that get a second new line. 'row' and 'list-table' are
        # separate entries; without the comma between them Python joins
        # the two strings into 'rowlist-table'.
        self.__two_new_line = ('section', 'body', 'table', 'row', 'list-table')
        self.__state_dict = {
            'default'           : self.__default_func,
            'mi<tg<open______'  : self.__open_func,
            'mi<tg<close_____'  : self.__close_func,
            'mi<tg<open-att__'  : self.__open_att_func,
            'mi<tg<empty-att_'  : self.__empty_att_func,
            'tx<nu<__________'  : self.__text_func,
            'tx<ut<__________'  : self.__text_func,
            'mi<tg<empty_____'  : self.__empty_func,
        }
    def __write_tag_new_lines(self, element_name):
        """
        Reset the new line count, and write the new lines that belong to
        the element, if any.
        """
        self.__new_line = 0
        if element_name in self.__block:
            self.__write_new_line()
        if element_name in self.__two_new_line:
            self.__write_extra_new_line()
    def __write_attributes(self, tokens):
        """
        Requires:
            tokens--a list of strings of the form name>value
        Returns:
            nothing
        Logic:
            Write each token as name="value". A token without a '>' is
            skipped, unless the run level is above 3, in which case the
            bug handler is raised.
        """
        for token in tokens:
            groups = token.split('>')
            try:
                att_name = groups[0]
                att_value = groups[1]
            except IndexError:
                if self.__run_level > 3:
                    msg = 'index out of range\n'
                    raise self.__bug_handler(msg)
                continue
            att_value = att_value.replace('"', '&quot;')
            # NOTE(review): a single quote is also written as &quot;,
            # which turns an apostrophe into a double quote. Kept as is;
            # confirm whether &apos; was intended.
            att_value = att_value.replace("'", '&quot;')
            self.__write_obj.write(
                ' %s="%s"' % (att_name, att_value)
                )
    def __open_func(self, line):
        """
        Print the opening tag and newlines when needed.
        """
        #mi<tg<open______<style-sheet
        info = line[17:-1]
        # for an open tag the new lines come before the tag
        self.__write_tag_new_lines(info)
        self.__write_obj.write('<%s>' % info)
    def __empty_func(self, line):
        """
        Print out empty tag and newlines when needed.
        """
        info = line[17:-1]
        self.__write_obj.write(
            '<%s/>' % info)
        self.__write_tag_new_lines(info)
    def __open_att_func(self, line):
        """
        Process lines for open tags that have attributes.
        The important info is between [17:-1]. Take this info and split it
        with the delimiter '<'. The first token in this group is the element
        name. The rest are attributes, separated from their values by '>'. So
        read each token one at a time, and split them by '>'.
        """
        #mi<tg<open-att__<footnote<num>
        info = line[17:-1]
        tokens = info.split("<")
        element_name = tokens[0]
        self.__write_obj.write('<%s' % element_name)
        self.__write_attributes(tokens[1:])
        self.__write_obj.write('>')
        self.__write_tag_new_lines(element_name)
    def __empty_att_func(self, line):
        """
        Same as the __open_att_func, except a '/' is placed at the end of the tag.
        """
        #mi<tg<open-att__<footnote<num>
        info = line[17:-1]
        tokens = info.split("<")
        element_name = tokens[0]
        self.__write_obj.write('<%s' % element_name)
        self.__write_attributes(tokens[1:])
        self.__write_obj.write('/>')
        self.__write_tag_new_lines(element_name)
    def __close_func(self, line):
        """
        Print out the closed tag and new lines, if appropriate.
        """
        #mi<tg<close_____<style-sheet\n
        info = line[17:-1]
        self.__write_obj.write(
            '</%s>' % info)
        self.__write_tag_new_lines(info)
    def __text_func(self, line):
        """
        Simply print out the information between [17:-1]
        """
        #tx<nu<__________<Normal;
        # change this!
        self.__write_obj.write(line[17:-1])
    def __write_extra_new_line(self):
        """
        Print out an extra new line if fewer than two new lines have been
        counted. Do nothing if there is no indenting.
        """
        if not self.__indent:
            return
        if self.__new_line < 2:
            self.__write_obj.write('\n')
    def __default_func(self, line):
        pass
    def __write_new_line(self):
        """
        Print out a new line if a new line has not already been printed out.
        Do nothing if there is no indenting.
        """
        if not self.__indent:
            return
        if not self.__new_line:
            self.__write_obj.write('\n')
            self.__new_line += 1
    def __write_dec(self):
        """
        Write the XML declaration at the top of the document, followed by
        the doctype, if one is wanted.
        """
        self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
        self.__new_line = 0
        self.__write_new_line()
        if self.__no_dtd:
            pass
        elif self.__dtd_path:
            self.__write_obj.write(
                '<!DOCTYPE doc SYSTEM "%s">' % self.__dtd_path
                )
        elif self.__dtd_path == '':
            # don't print dtd if further transformations are going to take
            # place
            pass
        else:
            self.__write_obj.write(
                    '<!DOCTYPE doc PUBLIC "publicID" '
                    '"http://rtf2xml.sourceforge.net/dtd/%s">' % public_dtd
                )
        self.__new_line = 0
        self.__write_new_line()
    def convert_to_tags(self):
        """
        Read in the file one line at a time. Get the important info, between
        [:16]. Check if this info matches a dictionary entry. If it does, call
        the appropriate function.
        The functions that are called:
            a text function for text
            an open function for open tags
            an open with attribute function for tags with attributes
            an empty with attribute function for tags that are empty but have
            attributes.
            a closed function for closed tags.
            an empty tag function.
        Lines that match no entry are dropped.
        """
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                self.__write_dec()
                for line in read_obj:
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__token_info)
                    if action is not None:
                        action(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "convert_to_tags.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,88 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os
class Copy:
    """Copy each changed file to a directory for debugging purposes"""
    # The debug directory is kept on the class, so that a directory set
    # through one Copy object is used by all of them.
    __dir = ""
    def __init__(self, bug_handler, file = None, deb_dir = None, ):
        """
        Required:
            'bug_handler'--exception class raised by set_dir
        Optional:
            'file'--stored, but not used by this class
            'deb_dir'--accepted, but not used; call set_dir instead
        """
        self.__file = file
        self.__bug_handler = bug_handler
    def set_dir(self, deb_dir):
        """Set the temporary directory to write files to"""
        if deb_dir is None:
            message = "No directory has been provided to write to in the copy.py"
            raise self.__bug_handler(message)
        if not os.path.isdir(deb_dir):
            message = "%s is not a directory" % deb_dir
            raise self.__bug_handler(message)
        Copy.__dir = deb_dir
    def remove_files(self ):
        """Remove files from directory"""
        self.__remove_the_files(Copy.__dir)
    def __remove_the_files(self, the_dir):
        """
        Remove the files in the_dir and in the directories below it. The
        directories themselves are kept, and a file that cannot be removed
        is skipped.
        """
        for entry in os.listdir(the_dir):
            # join with the directory being listed, not with the top
            # directory, so that nested files get the right path
            rem_file = os.path.join(the_dir, entry)
            if os.path.isdir(rem_file):
                self.__remove_the_files(rem_file)
            else:
                try:
                    os.remove(rem_file)
                except OSError:
                    pass
    def copy_file(self, file, new_file):
        """
        Copy the file to a new name in the debug directory.
        The copy is made with shutil on every platform, rather than with
        a shell command, so that paths with spaces or shell characters
        are handled.
        """
        import shutil
        write_file = os.path.join(Copy.__dir,new_file)
        shutil.copyfile(file, write_file)
    def rename(self, source, dest):
        """
        Copy the contents of source to dest. In spite of the name, the
        source file is not removed.
        """
        import shutil
        shutil.copyfile(source, dest)

View File

@ -0,0 +1,94 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import os, re, tempfile
from libprs500.ebooks.rtf2xml import copy
class CorrectUnicode:
    """
    corrects sequences such as \\u201c\\'F0\\'BE
    Where \\'F0\\'BE has to be eliminated.
    """
    def __init__(self,
            in_file,
            exception_handler,
            bug_handler,
            copy = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file'--the token file to correct
            'exception_handler'--stored, but not used by this class
            'bug_handler'--exception class raised for an impossible state
        Optional:
            'copy'--whether to make a copy of result for debugging
            'run_level'--stored, but not used by this class
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = tempfile.mktemp()
        self.__exception_handler = exception_handler
        self.__state = 'outside'
        self.__utf_exp = re.compile(r'&#x(.*?);')
    def __process_token(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            In the 'outside' state, write every line, except that unicode
            text is passed to __handle_unicode. In the 'after' state (a
            unicode character has just been written), drop the hex tokens
            that follow; any other line is written and returns the state
            to 'outside'.
        """
        if self.__state == 'outside':
            if line[:5] == 'tx<ut':
                self.__handle_unicode(line)
            else:
                self.__write_obj.write(line)
        elif self.__state == 'after':
            if line[:5] == 'tx<hx':
                pass
            elif line[:5] == 'tx<ut':
                self.__handle_unicode(line)
            else:
                self.__state = 'outside'
                self.__write_obj.write(line)
        else:
            # a string cannot be raised, so use the bug handler
            msg = 'unknown state "%s" in correct_unicode.py\n' % self.__state
            raise self.__bug_handler(msg)
    def __handle_unicode(self, line):
        """
        Requires:
            line--a unicode text token
        Returns:
            nothing
        Logic:
            Get the number from the first &#x...; in the line. If it is
            above 57343 and below 63743, drop the line and set the state
            to 'outside', so that the hex tokens that follow are kept.
            Otherwise write the line and set the state to 'after'. A line
            with no such number, or with one that is not hexadecimal, is
            written as it is.
        """
        token = line[16:]
        match_obj = self.__utf_exp.search(token)
        dec_num = None
        if match_obj:
            try:
                dec_num = int(match_obj.group(1), 16)
            except ValueError:
                # not a hexadecimal number; treat as ordinary text
                dec_num = None
        if dec_num is None:
            self.__write_obj.write(line)
            self.__state = 'outside'
        elif dec_num > 57343 and dec_num < 63743:
            self.__state = 'outside'
        else:
            self.__write_obj.write(line)
            self.__state = 'after'
    def correct_unicode(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time.
        """
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                for line in read_obj:
                    self.__token_info = line[:16]
                    self.__process_token(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "correct_unicode.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,61 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
class DefaultEncoding:
    """
    Read the header of a token file to find the platform, the code page,
    and the number of the default font.
    """
    def __init__(self, in_file, bug_handler, run_level = 1,):
        """
        Required:
            'in_file'--the token file to read
            'bug_handler'--stored, but not used by this class
        Optional:
            'run_level'--accepted, but not used by this class
        Returns:
            nothing
        """
        self.__bug_handler = bug_handler
        self.__file = in_file
    def find_default_encoding(self):
        """
        Requires:
            nothing
        Returns:
            a tuple of (platform, code page, default font number).
            With nothing found, this is
            ('Windows', 'ansicpg1252', 'not-defined').
        Logic:
            Read lines until the end of the header or of the file. A code
            page token with no number counts as 1252. If the Macintosh
            token is found, the code page is 'mac_roman' whatever code
            page token was read.
        """
        is_mac = False
        font_num = 'not-defined'
        page = 'ansicpg1252'
        token_file = open(self.__file, 'r')
        while True:
            line = token_file.readline()
            self.__token_info = line[:16]
            if not line or self.__token_info == 'mi<mk<rtfhed-end':
                break
            if self.__token_info == 'cw<ri<ansi-codpg':
                #cw<ri<ansi-codpg<nu<10000
                page = 'ansicpg' + (line[20:-1] or '1252')
            elif self.__token_info == 'cw<ri<macintosh_':
                is_mac = True
            elif self.__token_info == 'cw<ri<deflt-font':
                #cw<ri<deflt-font<nu<0
                font_num = line[20:-1]
        token_file.close()
        if is_mac:
            return 'Macintosh', 'mac_roman', font_num
        return 'Windows', page, font_num

View File

@ -0,0 +1,219 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile
from libprs500.ebooks.rtf2xml import copy
class DeleteInfo:
    """
    Delete unnecessary destination groups.

    The input file holds one token per line; the first 16 characters of a
    line identify the token. A small state machine decides, line by line,
    whether the line is written to the output:

    default         -- copy lines, but hold back each open bracket until
                       the following token shows whether an asterisk
                       control word comes next.
    after_asterisk  -- the token after the asterisk decides whether the
                       group is kept, deleted, or treated as a list.
    delete          -- drop lines until the group closes.
    list            -- keep only control words until the group closes.
    """
    def __init__(self,
                in_file ,
                bug_handler,
                copy = None,
                run_level = 1,
                ):
        """
        Requires:
            in_file -- path of the token file to read. delete_info() hands
                this path to Copy.rename() as the destination (presumably
                the file is overwritten with the filtered output).
            bug_handler -- exception class raised on internal errors.
            copy -- if true, delete_info() also asks Copy.copy_file() to
                save the filtered output as "delete_info.data".
            run_level -- strictness; values above 3 and above 5 turn
                some suspicious input into raised bug_handler errors.
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        # The temporary output file is created in delete_info(), so
        # building the object has no side effects on the file system.
        self.__write_to = None
        self.__ob_count = 0
        self.__cb_count = 0
        # Bracket number of the group opened just before the asterisk.
        self.__delete_count = 0
        self.__initiate_allow()
        # The open-bracket line being held back, or 0 if none.
        self.__ob = 0
        # 1 when the closing bracket of a deleted group must be written.
        self.__write_cb = 0
        self.__run_level = run_level
        # Becomes 1 once any asterisk group has been examined.
        self.__found_delete = 0
    def __initiate_allow(self):
        """
        Initiate the tokens which decide the fate of a destination group,
        and the table mapping each state to its handler.
        """
        # Groups starting with one of these tokens are printed out.
        # 'cw<ls<list______' is deliberately absent: it is handled as its
        # own case in __asterisk_func.
        self.__allowable = ('cw<ss<char-style',
                            'cw<it<listtable_',
                            'cw<it<revi-table',
                            'cw<ls<list-lev-d',
                            'cw<fd<field-inst',
                            'cw<an<book-mk-st',
                            'cw<an<book-mk-en',
                            'cw<an<annotation',
                            'cw<cm<comment___',
                            'cw<it<lovr-table',
                            )
        # Groups starting with one of these tokens are deleted.
        self.__not_allowable = (
                'cw<un<unknown___',
                'cw<un<company___',
                'cw<ls<list-level',
                'cw<fd<datafield_',
                )
        self.__state = 'default'
        self.__state_dict = {
            'default'           : self.__default_func,
            'after_asterisk'    : self.__asterisk_func,
            'delete'            : self.__delete_func,
            'list'              : self.__list_func,
        }
    def __default_func(self,line):
        """
        Handle lines when in no special state. Look for an asterisk to
        begin a special state. Otherwise, print out line.

        Returns 1 if the line should be written, 0 if not. An open
        bracket is never written at once: it is stored in self.__ob and
        written later, when the next token turns out not to start a
        group that gets deleted.
        """
        ##cw<ml<asterisk__<nu<true
        if self.__token_info == 'cw<ml<asterisk__':
            self.__state = 'after_asterisk'
            # remember which bracket opened the group being examined
            self.__delete_count = self.__ob_count
        elif self.__token_info == 'ob<nu<open-brack':
            # write previous bracket, if exists
            if self.__ob:
                self.__write_obj.write(self.__ob)
            self.__ob = line
            return 0
        else:
            # write previous bracket, since didn't find asterisk
            if self.__ob:
                self.__write_obj.write(self.__ob)
                self.__ob = 0
        return 1
    def __delete_func(self,line):
        """
        Handle lines when in delete state. Don't print out lines
        unless the state has ended.

        Returns 1 only for the closing bracket of the deleted group, and
        only if its open bracket had already been written. Every other
        line gives a false value (0 or None).
        """
        if self.__delete_count == self.__cb_count:
            self.__state = 'default'
            if self.__write_cb:
                self.__write_cb = 0
                return 1
            return 0
        return None
    def __asterisk_func(self,line):
        """
        Determine whether to delete info in group.

        Note on the self.__write_cb flag:
        If you find that you are in a delete group, and the previous
        token is not an open bracket (self.__ob = 0), that means
        that the delete group is nested inside another acceptable
        destination group. In this case, you have already written
        the open bracket, so you will need to write the closed one
        as well.

        Returns 1 if the line should be written; 0 or None if not.
        """
        self.__found_delete = 1
        # Test for {\*}, in which case don't enter
        # delete state
        if self.__token_info == 'cb<nu<clos-brack':
            if self.__delete_count == self.__cb_count:
                self.__state = 'default'
                self.__ob = 0
                # changed this because haven't printed out start
                return 0
            # not sure what happens here!
            # believe I have a '{\*}
            if self.__run_level > 3:
                msg = 'flag problem\n'
                raise self.__bug_handler(msg)
            return 1
        if self.__token_info in self.__allowable :
            if self.__ob:
                self.__write_obj.write(self.__ob)
                self.__ob = 0
                self.__state = 'default'
            return 1
        if self.__token_info == 'cw<ls<list______':
            self.__ob = 0
            self.__found_list_func(line)
            # The list token itself is not written.
            return None
        if self.__token_info not in self.__not_allowable:
            # Unknown token: deleted like a non-allowable one, unless
            # the run level asks for such input to be reported.
            if self.__run_level > 5:
                msg = 'After an asterisk, and found neither an allowable or non-allowble token\n'
                msg += 'token is "%s"\n' % self.__token_info
                raise self.__bug_handler(msg)
        if not self.__ob:
            self.__write_cb = 1
        self.__ob = 0
        self.__state = 'delete'
        self.__cb_count = 0
        return 0
    def __found_list_func(self, line):
        """
        print out control words in this group
        """
        self.__state = 'list'
    def __list_func(self, line):
        """
        Check to see if the group has ended.
        Return 1 for all control words.
        Return 0 otherwise.
        """
        if self.__delete_count == self.__cb_count and self.__token_info ==\
                'cb<nu<clos-brack':
            self.__state = 'default'
            if self.__write_cb:
                self.__write_cb = 0
                return 1
            return 0
        elif line[0:2] == 'cw':
            return 1
        else:
            return 0
    def __filter_lines(self, read_obj):
        """
        Read read_obj one line at a time and write the lines the current
        state handler accepts to self.__write_obj.
        """
        line = 'dummy'
        while line:
            # readline() returns '' at the end of the file. That empty
            # line is still dispatched once, which lets the default state
            # flush an open bracket it is holding back.
            line = read_obj.readline()
            self.__token_info = line[:16]
            #ob<nu<open-brack<0001
            if self.__token_info == 'ob<nu<open-brack':
                self.__ob_count = line[-5:-1]
            if self.__token_info == 'cb<nu<clos-brack':
                self.__cb_count = line[-5:-1]
            action = self.__state_dict.get(self.__state)
            if not action:
                msg = 'No action in dictionary state is "%s" \n' % self.__state
                sys.stderr.write(msg)
                # Without a handler the line cannot be classified.
                raise self.__bug_handler(msg)
            if action(line):
                self.__write_obj.write(line)
    def delete_info(self):
        """
        Main method for handling other methods. Read one line in at
        a time, and determine whether to print the line based on the state.

        Returns:
            1 if at least one asterisk group was examined, 0 otherwise.
        Raises:
            the bug_handler class given to the constructor, for the
            problems described in __asterisk_func and __filter_lines.
        """
        # mkstemp() creates the file itself, so no other process can claim
        # the name between choosing it and opening it. The descriptor is
        # closed and the file reopened by name to keep plain text mode.
        handle, self.__write_to = tempfile.mkstemp()
        os.close(handle)
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                read_obj = open(self.__file, 'r')
                try:
                    self.__filter_lines(read_obj)
                finally:
                    read_obj.close()
            finally:
                self.__write_obj.close()
            copy_obj = copy.Copy(bug_handler = self.__bug_handler)
            if self.__copy:
                copy_obj.copy_file(self.__write_to, "delete_info.data")
            copy_obj.rename(self.__write_to, self.__file)
        finally:
            # The temporary file is removed on failure as well as success.
            os.remove(self.__write_to)
        return self.__found_delete

View File

@ -0,0 +1,795 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, re
class FieldStrings:
    """
    This module is given a string. It processes the field instruction string and
    returns a list of three values.

    process_string() is the only public method. It looks up the first word
    of the instruction in a dictionary and hands the instruction to the
    handler method found there. In every handler the last item of the
    returned list is the string describing the field.
    """
    def __init__(self, bug_handler, run_level = 1):
        """
        Requires:
            bug_handler -- exception class raised for unknown fields when
                run_level is above 3.
            run_level -- strictness of error handling.
        Returns:
            nothing
        """
        self.__run_level = run_level
        self.__bug_handler = bug_handler
        self.__initiate_values()
    def __initiate_values(self):
        """
        Requires:
            nothing.
        Returns:
            nothing.
        Logic:
            initiate values for rest of class.
            self.__field_instruction_dict:
                The dictionary for all field names. Each value is a
                tuple of (handler method, name used in the output).
        """
        self.__field_instruction_dict = {
            # number type (arabic, etc.) and number format (\# " ")
            'EDITTIME'      : (self.__num_type_and_format_func, 'editing-time'),
            'NUMCHARS'      : (self.__num_type_and_format_func, 'number-of-characters-in-doc'),
            'NUMPAGES'      : (self.__num_type_and_format_func, 'number-of-pages-in-doc'),
            'NUMWORDS'      : (self.__num_type_and_format_func, 'number-of-words-in-doc'),
            'REVNUM'        : (self.__num_type_and_format_func, 'revision-number'),
            'SECTIONPAGES'  : (self.__num_type_and_format_func, 'num-of-pages-in-section'),
            'SECTION'       : (self.__num_type_and_format_func, 'insert-section-number'),
            'QUOTE'         : (self.__num_type_and_format_func, 'quote'),
            # number formatting (\# "")
            'PAGE'          : (self.__default_inst_func, 'insert-page-number'),
            'page'          : (self.__default_inst_func, 'insert-page-number'),
            # date format (\@ "")
            'CREATEDATE'    : (self.__date_func, 'insert-date'),
            'PRINTDATE'     : (self.__date_func, 'insert-date'),
            # PRINTDATE?
            'SAVEDATE'      : (self.__date_func, 'last-saved'),
            'TIME'          : (self.__date_func, 'insert-time'),
            # numbers?
            # these fields take four switches
            'AUTHOR'        : (self.__simple_info_func, 'user-name'),
            'COMMENTS'      : (self.__simple_info_func, 'comments'),
            'FILENAME'      : (self.__simple_info_func, 'file-name'),
            'filename'      : (self.__simple_info_func, 'file-name'),
            'KEYWORDS'      : (self.__simple_info_func, 'keywords'),
            'LASTSAVEDBY'   : (self.__simple_info_func, 'last-saved-by'),
            'SUBJECT'       : (self.__simple_info_func, 'subject'),
            'TEMPLATE'      : (self.__simple_info_func, 'based-on-template'),
            'TITLE'         : (self.__simple_info_func, 'document-title'),
            'USERADDRESS'   : (self.__simple_info_func, 'user-address'),
            'USERINITIALS'  : (self.__simple_info_func, 'user-initials'),
            'USERNAME'      : (self.__simple_info_func, 'user-name'),
            'EQ'            : (self.__equation_func, 'equation'),
            'HYPERLINK'     : (self.__hyperlink_func, 'hyperlink'),
            'INCLUDEPICTURE': (self.__include_pict_func, 'include-picture'),
            'INCLUDETEXT'   : (self.__include_text_func, 'include-text-from-file'),
            'INDEX'         : (self.__index_func, 'index'),
            'NOTEREF'       : (self.__note_ref_func, 'reference-to-note'),
            'PAGEREF'       : (self.__page_ref_func, 'reference-to-page'),
            'REF'           : (self.__ref_func, 'reference'),
            'ref'           : (self.__ref_func, 'reference'),
            'SEQ'           : (self.__sequence_func, 'numbering-sequence'),
            'SYMBOL'        : (self.__symbol_func, 'symbol'),
            'TA'            : (self.__ta_func, 'anchor-for-table-of-authorities'),
            'TOA'           : (self.__toc_table_func, 'table-of-authorities'),
            'TOC'           : (self.__toc_table_func, 'table-of-contents'),
            # no switches
            'AUTONUMOUT'    : (self.__no_switch_func, 'auto-num-out?'),
            'COMPARE'       : (self.__no_switch_func, 'compare'),
            'DOCVARIABLE'   : (self.__no_switch_func, 'document-variable'),
            'GOTOBUTTON'    : (self.__no_switch_func, 'go-button'),
            'NEXT'          : (self.__no_switch_func, 'next'),
            'NEXTIF'        : (self.__no_switch_func, 'next-if'),
            'SKIPIF'        : (self.__no_switch_func, 'skip-if'),
            'IF'            : (self.__no_switch_func, 'if'),
            'MERGEFIELD'    : (self.__no_switch_func, 'merge-field'),
            'MERGEREC'      : (self.__no_switch_func, 'merge-record'),
            'MERGESEQ'      : (self.__no_switch_func, 'merge-sequence'),
            'PLACEHOLDER'   : (self.__no_switch_func, 'place-holder'),
            'PRIVATE'       : (self.__no_switch_func, 'private'),
            'RD'            : (self.__no_switch_func, 'referenced-document'),
            'SET'           : (self.__no_switch_func, 'set'),
            # default instructions (haven't written a method for them
            'ADVANCE'       : (self.__default_inst_func, 'advance'),
            'ASK'           : (self.__default_inst_func, 'prompt-user'),
            'AUTONUMLGL'    : (self.__default_inst_func, 'automatic-number'),
            'AUTONUM'       : (self.__default_inst_func, 'automatic-number'),
            'AUTOTEXTLIST'  : (self.__default_inst_func, 'auto-list-text'),
            'AUTOTEXT'      : (self.__default_inst_func, 'auto-text'),
            'BARCODE'       : (self.__default_inst_func, 'barcode'),
            'CONTACT'       : (self.__default_inst_func, 'contact'),
            'DATABASE'      : (self.__default_inst_func, 'database'),
            'DATE'          : (self.__default_inst_func, 'date'),
            'date'          : (self.__default_inst_func, 'date'),
            'DOCPROPERTY'   : (self.__default_inst_func, 'document-property'),
            'FILESIZE'      : (self.__default_inst_func, 'file-size'),
            'FILLIN'        : (self.__default_inst_func, 'fill-in'),
            'INFO'          : (self.__default_inst_func, 'document-info'),
            'LINK'          : (self.__default_inst_func, 'link'),
            'PA'            : (self.__default_inst_func, 'page'),
            'PRINT'         : (self.__default_inst_func, 'print'),
            'STYLEREF'      : (self.__default_inst_func, 'style-reference'),
            'USERPROPERTY'  : (self.__default_inst_func, 'user-property'),
            'FORMCHECKBOX'  : (self.__default_inst_func, 'form-checkbox'),
            'FORMTEXT'      : (self.__default_inst_func, 'form-text'),
            # buttons
            'MACROBUTTON'   : (self.__default_inst_func, 'macro-button'),
        }
        self.__number_dict = {
            'Arabic'        : 'arabic',
            'alphabetic'    : 'alphabetic',
            'ALPHABETIC'    : 'capital-alphabetic',
            'roman'         : 'roman',
            'ROMAN'         : 'capital-roman',
            'Ordinal'       : 'ordinal',
            'CardText'      : 'cardinal-text',
            'OrdText'       : 'ordinal-text',
            'Hex'           : 'hexidecimal',
            'DollarText'    : 'dollar-text',
            'Upper'         : 'upper-case',
            'Lower'         : 'lower-case',
            'FirstCap'      : 'first-cap',
            'Caps'          : 'caps',
        }
        self.__text_format_dict = {
            'Upper'         : 'upper',
            'Lower'         : 'lower',
            'FirstCap'      : 'first-cap',
            'Caps'          : 'caps',
        }
        self.__symbol_num_exp = re.compile(r'SYMBOL (.*?) ')
        self.__symbol_font_exp = re.compile(r'\\f "(.*?)"')
        self.__symbol_size_exp = re.compile(r'\\s (\d+)')
        ##self.__toc_figure_exp = re.compile(r'\\c "Figure"')
        # \\@ "dddd, MMMM d, yyyy"
        self.__date_exp = re.compile(r'\\@\s{1,}"(.*?)"')
        self.__num_type_exp = re.compile(r'\\\*\s{1,}(Arabic|alphabetic|ALPHABETIC|roman|ROMAN|Ordinal|CardText|OrdText|Hex|DollarText|Upper|Lower|FirstCap|Caps)')
        self.__format_text_exp = re.compile(r'\\\*\s{1,}(Upper|Lower|FirstCap|Caps)')
        self.__merge_format_exp = re.compile(r'\\\*\s{1,}MERGEFORMAT')
        self.__ta_short_field_exp = re.compile(r'\\s\s{1,}"(.*?)"')
        self.__ta_long_field_exp = re.compile(r'\\l\s{1,}"(.*?)"')
        self.__ta_category_exp = re.compile(r'\\c\s{1,}(\d+)')
        # indices
        self.__index_insert_blank_line_exp = re.compile(r'\\h\s{1,}""')
        # The group must capture the quoted text. With an empty group
        # this pattern could only match \h "", which the blank-line
        # pattern above already catches, so no letter was ever found.
        self.__index_insert_letter_exp = re.compile(r'\\h\s{1,}"(.*?)"')
        self.__index_columns_exp = re.compile(r'\\c\s{1,}"(.*?)"')
        self.__bookmark_exp = re.compile(r'\\b\s{1,}(.*?)\s')
        self.__d_separator = re.compile(r'\\d\s{1,}(.*?)\s')
        self.__e_separator = re.compile(r'\\e\s{1,}(.*?)\s')
        self.__l_separator = re.compile(r'\\l\s{1,}(.*?)\s')
        self.__p_separator = re.compile(r'\\p\s{1,}(.*?)\s')
        self.__index_sequence = re.compile(r'\\s\s{1,}(.*?)\s')
        self.__index_entry_typ_exp = re.compile(r'\\f\s{1,}"(.*?)"')
        self.__quote_exp = re.compile(r'"(.*?)"')
        self.__filter_switch = re.compile(r'\\c\s{1,}(.*?)\s')
        self.__link_switch = re.compile(r'\\l\s{1,}(.*?)\s')
    def process_string(self, my_string, type):
        """
        Requires:
            my_string --the string to parse.
            type -- the type of string. (Not used by this method.)
        Returns:
            A list of three values; the last one is the string for a
            field instruction attribute.
        Raises:
            the bug_handler class given to the constructor, if the field
            name is not in the dictionary and run_level is above 3.
        Logic:
            This handles all "large" fields, which means everything except
            toc entries, index entries, and bookmarks
            Join the text of all lines that start with 'tx' (the text is
            whatever follows the first 17 characters of such a line).
            Split the string by spaces, and get the first item in the
            resulting list. This item is the field's type. Check for the
            action in the field instructions dictionary for further parsing.
            If no action is found, print out an error message and fall
            back to a generic result.
        """
        changed_string = ''.join(
            [line[17:] for line in my_string.split('\n') if line[0:2] == 'tx'])
        fields = changed_string.split()
        # An instruction without any text has no field name. Treat it as
        # an unknown field rather than failing with an IndexError.
        if fields:
            field_name = fields[0]
        else:
            field_name = ''
        action, name = self.__field_instruction_dict.get(field_name, (None, None))
        if name:
            if re.search(self.__merge_format_exp, changed_string):
                name += '<update>dynamic'
            else:
                name += '<update>static'
        if action:
            return action(field_name, name, changed_string)
        # no name--not in list above
        # for now, I want users to report bugs
        msg = 'no key for "%s" "%s"\n' % (field_name, changed_string)
        sys.stderr.write(msg)
        if self.__run_level > 3:
            raise self.__bug_handler(msg)
        return self.__fall_back_func(field_name, changed_string)
    def __text_format_tag(self, line):
        """
        Requires:
            line -- the string to be parsed
        Returns:
            '<format>...' if line has a \\* Upper, Lower, FirstCap or
            Caps switch; an empty string otherwise.
        """
        match_group = re.search(self.__format_text_exp, line)
        if not match_group:
            return ''
        switch = match_group.group(1)
        changed_name = self.__text_format_dict.get(switch)
        if changed_name:
            return '<format>%s' % changed_name
        sys.stderr.write('module is fields_string\n')
        sys.stderr.write('method is __text_format_tag\n')
        sys.stderr.write('no dictionary entry for %s\n' % switch)
        return ''
    def __bookmark_tags(self, line):
        """
        Requires:
            line -- the string to be parsed
        Returns:
            One '<bookmark>word' for each word after the field name
            that does not start with a backslash.
        """
        words = line.split()[1:] # get rid of field name
        return ''.join(
            ['<bookmark>%s' % word for word in words if word[0:1] != '\\'])
    def __switch_flags(self, line, switches):
        """
        Requires:
            line -- the string to be parsed
            switches -- sequence of (switch text, tag name) pairs
        Returns:
            '<tag name>true' for each switch text found in line, in the
            order the pairs are given.
        """
        return ''.join(
            ['<%s>true' % tag for switch, tag in switches if switch in line])
    def __default_inst_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field.
        Logic:
            I only need the changed name for the field.
        """
        return [None, None, name]
    def __fall_back_func(self, field_name, line):
        """
        Requires:
            field_name -- the first word in the string
            line -- the string to be parsed (not used)
        Returns:
            The name of the field.
        Logic:
            Used for fields not found in dict
        """
        the_string = field_name
        the_string += '<update>none'
        return [None, None, the_string]
    def __equation_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name --the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field
        """
        return [None, None, name]
    def __no_switch_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name --the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field
        """
        return [None, None, name]
    def __num_type_and_format_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name --the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            list of None, None, and part of a tag
        Logic:
            parse num_format
            parse num_type
            for QUOTE, add the quoted argument
        """
        the_string = name
        num_format = self.__parse_num_format(line)
        if num_format:
            the_string += '<number-format>%s' % num_format
        num_type = self.__parse_num_type(line)
        if num_type:
            the_string += '<number-type>%s' % num_type
        # Only QUOTE takes a (mandatory?) argument
        if field_name == 'QUOTE':
            match_group = re.search(r'QUOTE\s{1,}"(.*?)"', line)
            if match_group:
                arg = match_group.group(1)
                the_string += '<argument>%s' % arg
        return [None, None, the_string]
    def __num_format_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name --the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            list of None, None, and part of a tag
        """
        the_string = name
        num_format = self.__parse_num_format(line)
        if num_format:
            the_string += '<number-format>%s' % num_format
        return [None, None, the_string]
    def __parse_num_format(self, the_string):
        """
        Required:
            the_string -- the string to parse
        Returns:
            a string if the_string contains number formatting information
            None, otherwise
        """
        # NOTE(review): this searches with the date pattern (\@ "..."),
        # while the comments in the field dictionary describe number
        # formatting as \# "...". Kept as is -- confirm which is meant.
        match_group = re.search(self.__date_exp, the_string)
        if match_group:
            return match_group.group(1)
        return None
    def __parse_num_type(self, the_string):
        """
        Required:
            the_string -- the string to parse
        Returns:
            a string if the_string contains number type information
            None, otherwise
        Logic:
            the_string might look like:
            USERNAME \\* Arabic \\* MERGEFORMAT
            Get the \\* Arabic part. Use a dictionary to convert the "Arabic" to
            a more-readable word for the value of the key "number-type".
            (<field number-type = "Arabic">
        """
        match_group = re.search(self.__num_type_exp, the_string)
        if not match_group:
            return None
        name = match_group.group(1)
        changed_name = self.__number_dict.get(name)
        if changed_name:
            return changed_name
        sys.stderr.write('module is fields_string\n')
        sys.stderr.write('method is __parse_num_type\n')
        sys.stderr.write('no dictionary entry for %s\n' % name)
        return None
    def __date_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name --the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            list of None, None, and part of a tag
        Logic:
            Add the text of a \\@ "..." switch as the date format.
        """
        the_string = name
        match_group = re.search(self.__date_exp, line)
        if match_group:
            the_string += '<date-format>%s' % match_group.group(1)
        return [None, None, the_string]
    def __simple_info_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name --the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field
        Logic:
            These fields can only have the following switches:
                1. Upper
                2. Lower
                3. FirstCap
                4. Caps
        """
        the_string = name + self.__text_format_tag(line)
        return [None, None, the_string]
    def __hyperlink_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name --the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field
        Logic:
            Take the value of the \\l switch as the link, then the first
            quoted text of what is left as the argument.
        """
        the_string = name
        match_group = re.search(self.__link_switch, line)
        if match_group:
            link = match_group.group(1)
            link = link.replace('"', "&quot;")
            the_string += '<link>%s' % link
        # \l "txt" "link"
        # the argument is searched for after the \l switch is removed
        line = re.sub(self.__link_switch, '', line)
        match_group = re.search(self.__quote_exp, line)
        if match_group:
            arg = match_group.group(1)
            the_string += '<argument>%s' % arg
        the_string += self.__switch_flags(line, (
            ('\\m', 'html2-image-map'),
            ('\\n', 'new-window'),
            ('\\h', 'no-history'),
        ))
        return [None, None, the_string]
    def __include_text_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name --the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field
        Logic:
            Add the text format, the \\c filter, and the quoted file
            name that remains once the filter is removed.
        """
        the_string = name + self.__text_format_tag(line)
        match_group = re.search(self.__filter_switch, line)
        if match_group:
            arg = match_group.group(1)
            the_string += '<filter>%s' % arg
        # \c "txt" "file name"
        # want "file name" so must get rid of \c "txt"
        line = re.sub(self.__filter_switch, '', line)
        match_group = re.search(self.__quote_exp, line)
        if match_group:
            arg = match_group.group(1)
            arg = arg.replace('"', "&quot;")
            the_string += '<argument>%s' % arg
        else:
            sys.stderr.write('Module is field_strings\n')
            sys.stderr.write('method is include_text_func\n')
            sys.stderr.write('no argument for include text\n')
        the_string += self.__switch_flags(line, (
            ('\\!', 'no-field-update'),
        ))
        return [None, None, the_string]
    def __include_pict_func(self, field_name, name, line):
        """
        Required:
            field_name -- the first word in the string
            name --the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field
        Logic:
            Add the \\c filter and the quoted file name that remains
            once the filter is removed.
        """
        the_string = name
        match_group = re.search(self.__filter_switch, line)
        if match_group:
            arg = match_group.group(1)
            arg = arg.replace('"', "&quot;")
            the_string += '<filter>%s' % arg
        # \c "txt" "file name"
        # want "file name" so must get rid of \c "txt"
        line = re.sub(self.__filter_switch, '', line)
        match_group = re.search(self.__quote_exp, line)
        if match_group:
            arg = match_group.group(1)
            the_string += '<argument>%s' % arg
        else:
            sys.stderr.write('Module is field_strings\n')
            sys.stderr.write('method is include_pict_func\n')
            sys.stderr.write('no argument for include pict\n')
        the_string += self.__switch_flags(line, (
            ('\\d', 'external'),
        ))
        return [None, None, the_string]
    def __ref_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            The name of the field.
        Logic:
            A page reference field looks like this:
                PAGEREF _Toc440880424 \\h
            I want to extract the second word of info, which is used as an
            anchor in the resulting XML file.
        """
        the_string = name + self.__text_format_tag(line)
        line = re.sub(self.__merge_format_exp, '', line)
        the_string += self.__bookmark_tags(line)
        the_string += self.__switch_flags(line, (
            ('\\f', 'include-note-number'),
            ('\\h', 'hyperlink'),
            ('\\n', 'insert-number'),
            ('\\r', 'insert-number-relative'),
            ('\\p', 'paragraph-relative-position'),
            ('\\t', 'suppress-non-delimeter'),
            ('\\w', 'insert-number-full'),
        ))
        return [None, None, the_string]
    def __toc_table_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the name of the first word in the string
            name --the changed name, according to the dictionary.
            line --the string to be parsed.
        Returns:
            A string for a TOC table field.
        Logic:
            If the string contains Figure, it is a table of figures.
            Otherwise, it is a plain old table of contents.
        """
        the_string = name
        if '\\c "Figure"' in line:
            the_string = the_string.replace('table-of-contents', 'table-of-figures')
        # don't really need the first value in this list, I don't believe
        return [name, None, the_string]
    def __sequence_func(self, field_name, name, line):
        """
        Requires:
            field_name --the name of the first word in the string.
            name --the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            A string with a value for the type and label attributes
        Logic:
            The type of sequence--whether figure, graph, my-name, or
            whatever--is represented by the second word in the string. Extract
            and return.
            SEQ Figure \\* ARABIC
            If there is no second word, no label is added.
        """
        fields = line.split()
        if len(fields) > 1:
            my_string = '%s<label>%s' % (name, fields[1])
        else:
            my_string = name
        return [None, None, my_string]
    def __ta_func(self, field_name, name, line):
        """
        Requires:
            field_name --the name of the first word in the string.
            name --the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            A string with values for the short field, long field and
            category, plus bold and italics flags.
        """
        the_string = name
        match_group = re.search(self.__ta_short_field_exp, line)
        if match_group:
            short_name = match_group.group(1)
            the_string += '<short-field>%s' % short_name
        match_group = re.search(self.__ta_long_field_exp, line)
        if match_group:
            long_name = match_group.group(1)
            the_string += '<long-field>%s' % long_name
        match_group = re.search(self.__ta_category_exp, line)
        if match_group:
            category = match_group.group(1)
            the_string += '<category>%s' % category
        the_string += self.__switch_flags(line, (
            ('\\b', 'bold'),
            ('\\i', 'italics'),
        ))
        return [None, None, the_string]
    def __index_func(self, field_name, name, line):
        """
        Requires:
            field_name --the name of the first word in the string.
            name --the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            A string with one attribute for each index switch found.
        Logic:
            \\h "" asks for a blank line; \\h "text" gives a letter.
            The remaining switches are looked up in a fixed order, and
            double quotes are escaped in the values marked below.
        """
        the_string = name
        if re.search(self.__index_insert_blank_line_exp, line):
            the_string += '<insert-blank-line>true'
        else:
            match_group = re.search(self.__index_insert_letter_exp, line)
            if match_group:
                insert_letter = match_group.group(1)
                the_string += '<insert-letter>%s' % insert_letter
        # (pattern, tag name, escape double quotes in the value?)
        switches = (
            (self.__index_columns_exp, 'number-of-columns', False),
            (self.__bookmark_exp, 'use-bookmark', False),
            (self.__d_separator, 'sequence-separator', True),
            (self.__e_separator, 'page-separator', True),
            # The sequence value itself is escaped here.
            (self.__index_sequence, 'use-sequence', True),
            (self.__index_entry_typ_exp, 'entry-type', False),
            (self.__p_separator, 'limit-to-letters', False),
            (self.__l_separator, 'multi-page-separator', True),
        )
        for expression, tag, escape_quotes in switches:
            match_group = re.search(expression, line)
            if match_group:
                value = match_group.group(1)
                if escape_quotes:
                    value = value.replace('"', '&quot;')
                the_string += '<%s>%s' % (tag, value)
        the_string += self.__switch_flags(line, (
            ('\\a', 'accented'),
            ('\\r', 'sub-entry-on-same-line'),
            ('\\t', 'enable-yomi-text'),
        ))
        return [None, None, the_string]
    def __page_ref_func(self, field_name, name, line):
        """
        Requires:
            field_name --first name in the string.
            name -- the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            A string with number format, number type, bookmarks and
            flags.
        """
        the_string = name
        num_format = self.__parse_num_format(line)
        if num_format:
            the_string += '<number-format>%s' % num_format
        num_type = self.__parse_num_type(line)
        if num_type:
            the_string += '<number-type>%s' % num_type
        line = re.sub(self.__merge_format_exp, '', line)
        the_string += self.__bookmark_tags(line)
        the_string += self.__switch_flags(line, (
            ('\\h', 'hyperlink'),
            ('\\p', 'paragraph-relative-position'),
        ))
        return [None, None, the_string]
    def __note_ref_func(self, field_name, name, line):
        """
        Requires:
            field_name --first name in the string.
            name -- the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            A string with bookmarks and flags.
        """
        the_string = name
        line = re.sub(self.__merge_format_exp, '', line)
        the_string += self.__bookmark_tags(line)
        the_string += self.__switch_flags(line, (
            ('\\h', 'hyperlink'),
            ('\\p', 'paragraph-relative-position'),
            ('\\f', 'include-note-number'),
        ))
        return [None, None, the_string]
    def __symbol_func(self, field_name, name, line):
        """
        Requires:
            field_name --first name in the string.
            name -- the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            A string containing font size, font style, and a hexidecimal value.
        Logic:
            The SYMBOL field is one of Microsoft's many quirky ways of
            entering text. The string that results from this method looks like
            this:
                SYMBOL 97 \\f "Symbol" \\s 12
            The first word merely tells us that we have encountered a SYMBOL
            field.
            The next value is the Microsoft decimal value. Change this to
            hexidecimal.
            The pattern '\\f "some font' tells us the font.
            The pattern '\\s some size' tells us the font size.
            Extract all of this information. Store this information in a
            string, and make this string the last item in a list. The first
            item in the list is the simple word 'Symbol', which tells me that
            I don't really have field, but UTF-8 data.
        """
        num = ''
        changed_line = ''
        search_obj = re.search(self.__symbol_num_exp, line)
        if search_obj:
            num = '%X' % int(search_obj.group(1))
        search_obj = re.search(self.__symbol_font_exp, line)
        if search_obj:
            font = search_obj.group(1)
            changed_line += 'cw<ci<font-style<nu<%s\n' % font
        search_obj = re.search(self.__symbol_size_exp, line)
        if search_obj:
            font_size = '%.2f' % int(search_obj.group(1))
            changed_line += 'cw<ci<font-size_<nu<%s\n' % font_size
        changed_line += 'tx<hx<__________<\'%s\n' % num
        return ['Symbol', None, changed_line]

View File

@ -0,0 +1,358 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile
from libprs500.ebooks.rtf2xml import field_strings, copy
class FieldsLarge:
    r"""
    =========================
    Logic
    =========================
    Make tags for fields.
    -Fields reflect text that Microsoft Word automatically generates.
    -Each file contains (or should contain) an inner group called field instructions.
    -Fields can be nested.
    --------------
    Logic
    --------------
    1. As soon as a field is found, make a new text string by appending an empty text string to the field list. Collect all the lines in this string until the field instructions are found.
    2. Collect all the tokens and text in the field instructions. When the end of the field instructions is found, process the string of text with the field_strings module. Store the processed string in the field instructions list.
    3. Continue collecting tokens. Check for paragraphs or sections. If either is found, add to the paragraph or section list.
    4. Continue collecting tokens and text until either the beginning of a new field is found, or the end of this field is found.
    5. If a new field is found, repeat steps 1-3.
    6. If the end of the field is found, process the last text string of the field list.
    7. If the field list is empty (after removing the last text string), there are no more fields. Print out the final string. If the list contains other strings, add the processed string to the last string in the field list.
    ============================
    Examples
    ============================
    This line of RTF:
    {\field{\*\fldinst { CREATEDATE \\* MERGEFORMAT }}{\fldrslt {
    \lang1024 1/11/03 10:34 PM}}}
    Becomes:
    <field type = "insert-time">
    10:34 PM
    </field>
    The simple field in the above example contains no paragraph or sections breaks.
    This line of RTF:
    {{\field{\*\fldinst SYMBOL 97 \\f "Symbol" \\s 12}{\fldrslt\f3\fs24}}}
    Becomes:
    <para><inline font-size="18"><inline font-style="Symbol">&#x03A7;</inline></inline></para>
    The RTF in the example above should be represented as UTF-8 rather than a field.
    This RTF:
    {\field\fldedit{\*\fldinst { TOC \\o "1-3" }}{\fldrslt {\lang1024
    Heading one\tab }{\field{\*\fldinst {\lang1024 PAGEREF _Toc440880424
    \\h }{\lang1024 {\*\datafield
    {\lang1024 1}}}{\lang1024 \par }\pard\plain
    \s18\li240\widctlpar\tqr\tldot\tx8630\aspalpha\aspnum\faauto\adjustright\rin0\lin240\itap0
    \f4\lang1033\cgrid {\lang1024 Heading 2\tab }{\field{\*\fldinst
    {\lang1024 PAGEREF _Toc440880425 \\h }{\lang1024 {\*\datafield
    {\lang1024 1}}}{\lang1024 \par }\pard\plain
    \widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
    \f4\lang1033\cgrid }}\pard\plain
    \widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
    \f4\lang1033\cgrid {\fs28 \u214\'85 \par }{\fs36 {\field{\*\fldinst
    SYMBOL 67 \\f "Symbol" \\s 18}{\fldrslt\f3\fs36}}}
    Becomes:
    <field-block type="table-of-contents">
    <paragraph-definition language="1033" nest-level="0"
    font-style="Times" name="toc 1" adjust-right="true"
    widow-control="true">
    <para><inline language="1024">Heading one&#x009;</inline><field
    type="reference-to-page" ref="_Toc440880424"><inline
    language="1024">1</inline></field></para>
    </paragraph-definition>
    <paragraph-definition language="1033" nest-level="0" left-indent="12"
    font-style="Times" name="toc 2" adjust-right="true"
    widow-control="true">
    <para><inline language="1024">Heading 2&#x009;</inline><field
    type="reference-to-page" ref="_Toc440880425"><inline
    language="1024">1</inline></field></para>
    </paragraph-definition>
    </field-block>
    """
    # The class docstring is a raw string: it quotes RTF verbatim, and
    # sequences such as "\u214" are not valid escapes in a normal literal.

    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file'--path of the tokenized file to parse
            'bug_handler'--stored and handed to the helper objects
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--passed on to the field_strings processor
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        # NOTE(review): tempfile.mktemp() only reserves a *name* and is
        # documented as race-prone; kept because the rest of the pipeline
        # expects the path not to exist until fix_fields() opens it.
        self.__write_to = tempfile.mktemp()

    def __initiate_values(self):
        """
        Initiate all values: create the field-instruction processor and
        reset the state machine.
        """
        self.__string_obj = field_strings.FieldStrings(run_level = self.__run_level,
            bug_handler= self.__bug_handler,)
        self.__reset_state()

    def __reset_state(self):
        """
        Reset every value the line-by-line state machine reads or writes.
        """
        self.__text_string = ''
        self.__marker = 'mi<mk<inline-fld\n'
        self.__state = 'before_body'
        self.__state_dict = {
            'before_body'       : self.__before_body_func,
            'in_body'           : self.__in_body_func,
            'field'             : self.__in_field_func,
            'field_instruction' : self.__field_instruction_func,
        }
        self.__in_body_dict = {
            'cw<fd<field_____'  : self.__found_field_func,
        }
        self.__field_dict = {
            'cw<fd<field-inst'  : self.__found_field_instruction_func,
            'cw<fd<field_____'  : self.__found_field_func,
            'cw<pf<par-end___'  : self.__par_in_field_func,
            'cw<sc<section___'  : self.__sec_in_field_func,
        }
        # Bracket counters start at 0 so that a field token seen before
        # any bracket token cannot raise AttributeError.
        self.__ob_count = 0
        self.__cb_count = 0
        self.__field_instruction_count = 0
        self.__token_info = ''
        self.__field_count = [] # open-bracket number of each open field
        self.__field_instruction = [] # one instruction slot per open field
        self.__symbol = 0 # whether or not the field is really UTF-8
                          # (these fields cannot be nested.)
        self.__field_instruction_string = '' # string that collects field instruction
        self.__par_in_field = [] # paragraphs in field?
        self.__sec_in_field = [] # sections in field?
        self.__field_string = [] # list of field strings

    def __before_body_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing (changes the state and writes a line)
        Logic:
            Check for the beginning of the body. If found, change the state.
            Always write out the line.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'in_body'
        self.__write_obj.write(line)

    def __in_body_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing. (Writes a line to the output file, or performs other actions.)
        Logic:
            Check for the beginning of a field. Always output the line.
        """
        action = self.__in_body_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)

    def __found_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Set the values for parsing the field. Every per-field list gets
            one new item, so that all of them can be popped together when
            the field ends. The instruction slot starts out empty and is
            filled in once the field instruction group has been processed;
            a field without instructions therefore ends up with an empty
            type instead of raising IndexError or taking the instruction
            of an enclosing field.
        """
        self.__state = 'field'
        self.__cb_count = 0
        self.__field_string.append('')
        self.__field_count.append(self.__ob_count)
        self.__field_instruction.append('')
        self.__sec_in_field.append(0)
        self.__par_in_field.append(0)

    def __in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing.
        Logic:
            Check for the end of the field; a paragraph break; a section break;
            the beginning of another field; or the beginning of the field
            instruction.
        """
        if self.__cb_count == self.__field_count[-1]:
            self.__field_string[-1] += line
            self.__end_field_func()
            return
        action = self.__field_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__field_string[-1] += line

    def __par_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Add the line to the current field string and set the last item
            in the paragraph in field list to true.
        """
        self.__field_string[-1] += line
        self.__par_in_field[-1] = 1

    def __sec_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Add the line to the current field string and set the last item
            in the section in field list to true.
        """
        self.__field_string[-1] += line
        self.__sec_in_field[-1] = 1

    def __found_field_instruction_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Change the state to field instruction. Set the open bracket count of
            the beginning of this field so you know when it ends. Set the closed
            bracket count to 0 so you don't prematurely exit this state.
        """
        self.__state = 'field_instruction'
        self.__field_instruction_count = self.__ob_count
        self.__cb_count = 0

    def __field_instruction_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Collect all the lines until the end of the field instruction is
            reached. Process these lines with the field_strings module and
            store the result in the slot of the innermost open field.
            Check if the field instruction is 'Symbol' (really UTF-8).
        """
        if self.__cb_count != self.__field_instruction_count:
            self.__field_instruction_string += line
            return
        # The closing bracket should be written, since the opening bracket
        # was written
        self.__field_string[-1] += line
        my_list = self.__string_obj.process_string(
                self.__field_instruction_string, 'field_instruction')
        self.__field_instruction[-1] = my_list[2]
        if my_list[0] == 'Symbol':
            self.__symbol = 1
        self.__state = 'field'
        self.__field_instruction_string = ''

    def __end_field_func(self):
        """
        Requires:
            nothing
        Returns:
            Nothing
        Logic:
            Pop the last values in the instructions list, the fields list, the
            paragraph list, and the section list.
            If the field is a symbol, do not write the tags <field></field>,
            since this field is really just UTF-8.
            If the field contains paragraph or section breaks, it is a
            field-block rather than just a field.
            Write the paragraph or section markers for later parsing of the
            file.
            If the field list contains more strings, add the latest
            (processed) string to the last string in the list. Otherwise,
            write the string to the output file.
        """
        last_bracket = self.__field_count.pop()
        instruction = self.__field_instruction.pop()
        inner_field_string = self.__field_string.pop()
        sec_in_field = self.__sec_in_field.pop()
        par_in_field = self.__par_in_field.pop()
        if self.__symbol:
            # a symbol field is replaced by its processed instruction plus
            # the closing bracket of the field group
            inner_field_string = '%scb<nu<clos-brack<%s\n' % \
                (instruction, last_bracket)
        elif sec_in_field or par_in_field:
            inner_field_string = \
                'mi<mk<fldbkstart\n'\
                'mi<tg<open-att__<field-block<type>%s\n%s'\
                'mi<mk<fldbk-end_\n' \
                'mi<tg<close_____<field-block\n'\
                'mi<mk<fld-bk-end\n' \
                % ( instruction, inner_field_string)
        else:
            # write a marker to show an inline field for later parsing
            inner_field_string = \
                '%s' \
                'mi<tg<open-att__<field<type>%s\n%s'\
                'mi<tg<close_____<field\n'\
                % (self.__marker, instruction, inner_field_string)
        if sec_in_field:
            inner_field_string = 'mi<mk<sec-fd-beg\n' + inner_field_string + \
                'mi<mk<sec-fd-end\n'
        if par_in_field:
            inner_field_string = 'mi<mk<par-in-fld\n' + inner_field_string
        if not self.__field_string:
            self.__write_field_string(inner_field_string)
        else:
            self.__field_string[-1] += inner_field_string
        self.__symbol = 0

    def __write_field_string(self, the_string):
        """
        Write the string of an outermost field and return to the body state.
        """
        self.__state = 'in_body'
        self.__write_obj.write(the_string)

    def __process_line(self, line):
        """
        Requires:
            line --one line of the tokenized file
        Returns:
            nothing
        Logic:
            Record the token info and the bracket numbers, then hand the
            line to the method registered for the current state. An
            unknown state is reported on stderr and the line is skipped
            (calling the missing handler would raise TypeError).
        """
        self.__token_info = line[:16]
        if self.__token_info == 'ob<nu<open-brack':
            self.__ob_count = line[-5:-1]
        if self.__token_info == 'cb<nu<clos-brack':
            self.__cb_count = line[-5:-1]
        action = self.__state_dict.get(self.__state)
        if action is None:
            sys.stderr.write('no matching state in module fields_large.py\n')
            sys.stderr.write(self.__state + '\n')
            return
        action(line)

    def fix_fields(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is before the body, look for the
            beginning of the body.
            If the state is body, send the line to the body method.
            Both files are closed even if processing a line raises.
        """
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                for line in read_obj:
                    self.__process_line(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "fields_large.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,448 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile, re
from libprs500.ebooks.rtf2xml import field_strings, copy
class FieldsSmall:
    """
    =================
    Purpose
    =================
    Write tags for bookmarks, index and toc entry fields in a tokenized file.
    This module does not handle toc or index tables. (This module won't be
    of any use to you unless you use it as part of the other modules.)
    -----------
    Method
    -----------
    Look for the beginning of a bookmark, index, or toc entry. When such a token
    is found, store the opening bracket count in a variable. Collect all the text
    until the closing bracket entry is found. Convert the collected string
    into a single empty field tag and write it to the output file.
    """
    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file'--path of the tokenized file to parse
            'bug_handler'--stored and handed to the helper objects
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--stored on the instance
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        # NOTE(review): tempfile.mktemp() only reserves a *name* and is
        # documented as race-prone; kept because the rest of the pipeline
        # expects the path not to exist until fix_fields() opens it.
        self.__write_to = tempfile.mktemp()
        self.__run_level = run_level

    def __initiate_values(self):
        """
        Initiate all values: create the field_strings helper and reset the
        state machine.
        """
        self.__string_obj = field_strings.FieldStrings(bug_handler = self.__bug_handler)
        self.__reset_state()

    def __reset_state(self):
        """
        Reset every value the line-by-line state machine reads or writes.
        """
        self.__state = 'before_body'
        self.__text_string = ''
        self.__marker = 'mi<mk<inline-fld\n'
        self.__token_info = ''
        # Bracket counters start at 0 so that an entry token seen before
        # any bracket token cannot raise AttributeError.
        self.__ob_count = 0
        self.__cb_count = 0
        self.__state_dict = {
            'before_body'   : self.__before_body_func,
            'body'          : self.__body_func,
            'bookmark'      : self.__bookmark_func,
            'toc_index'     : self.__toc_index_func,
        }
        self.__body_dict = {
            'cw<an<book-mk-st'  : (self.__found_bookmark_func, 'start'),
            'cw<an<book-mk-en'  : (self.__found_bookmark_func, 'end'),
            'cw<an<toc_______'  : (self.__found_toc_index_func, 'toc'),
            'cw<an<index-mark'  : (self.__found_toc_index_func, 'index'),
        }
        ob = 'ob<nu<open-brack.....'
        cb = 'cb<nu<clos-brack'
        bk_st = 'cw<an<book-mk-st<nu<true'
        tx = 'tx<nu<__________<(.*?)'
        reg_st = ob + bk_st + tx + cb
        self.__book_start = re.compile(reg_st)

    def __before_body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of the body. When found, change the state
            to body. Always print out the line.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'body'
        self.__write_obj.write(line)

    def __body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines in the body of the documents.
            Look for a bookmark, index or toc entry and take the appropriate
            action; any other line is written out unchanged.
        """
        action, tag = \
           self.__body_dict.get(self.__token_info, (None, None))
        if action:
            action(line, tag)
        else:
            self.__write_obj.write(line)

    def __found_bookmark_func(self, line, tag):
        """
        Requires:
            line --the line to parse
            tag --'start' or 'end'
        Returns:
            nothing
        Logic:
            This function is called when a bookmark is found. The opening
            bracket count is stored in the beginning bracket count. The state
            is changed to 'bookmark.'
        """
        self.__beg_bracket_count = self.__ob_count
        self.__cb_count = 0
        self.__state = 'bookmark'
        self.__type_of_bookmark = tag

    def __bookmark_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines within a bookmark. It adds the
            text of each text line to a string until the end of the bookmark
            is found. It then turns the string into a field tag and prints
            out the marker, the tag and the closing line.
        """
        if self.__beg_bracket_count == self.__cb_count:
            self.__state = 'body'
            bookmark_type = 'bookmark-%s' % self.__type_of_bookmark
            my_string = self.__parse_bookmark_func(
                    self.__text_string, bookmark_type)
            self.__write_obj.write(self.__marker)
            self.__write_obj.write(my_string)
            self.__text_string = ''
            self.__write_obj.write(line)
        elif line[0:2] == 'tx':
            self.__text_string += line[17:-1]

    def __parse_index_func(self, my_string):
        """
        Requires:
            my_string --string to parse
        Returns:
            A string for an index instruction field.
        Logic:
            I want to eliminate paragraph endings, and I want to divide the
            entry into a main entry and (if it exists) a sub entry.
            Split the string by newlines. Read one token at a time. If the
            token is a special colon, end the main entry element and start the
            sub entry element.
            If the token is a paragraph ending, ignore it, since I don't want
            paragraphs within toc or index entries.
        """
        my_string, see_string = self.__index_see_func(my_string)
        my_string, bookmark_string = self.__index_bookmark_func( my_string)
        italics, bold = self.__index__format_func(my_string)
        found_sub = 0
        my_changed_string = 'mi<tg<empty-att_<field<type>index-entry'
        my_changed_string += '<update>static'
        if see_string:
            my_changed_string += '<additional-text>%s' % see_string
        if bookmark_string:
            my_changed_string += '<bookmark>%s' % bookmark_string
        if italics:
            my_changed_string += '<italics>true'
        if bold:
            my_changed_string += '<bold>true'
        main_entry = ''
        sub_entry = ''
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'cw<ml<colon_____':
                found_sub = 1
            elif token_info[0:2] == 'tx':
                if found_sub:
                    sub_entry += line[17:]
                else:
                    main_entry += line[17:]
        my_changed_string += '<main-entry>%s' % main_entry
        if found_sub:
            my_changed_string += '<sub-entry>%s' % sub_entry
        my_changed_string += '\n'
        return my_changed_string

    def __index_see_func(self, my_string):
        """
        Requires:
            my_string -- string of all the lines in the index entry
        Returns:
            changed_string -- the string minus the "see" group
            see_string -- the text found inside the "see" group
        """
        in_see = 0
        bracket_count = 0
        see_string = ''
        changed_string = ''
        # Only compared while in_see is true, and it is always assigned
        # together with in_see, so no numeric sentinel is needed.
        end_bracket_count = None
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            if token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_see:
                if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
                    in_see = 0
                elif token_info == 'tx<nu<__________':
                    see_string += line[17:]
            else:
                if token_info == 'cw<in<index-see_':
                    end_bracket_count = bracket_count - 1
                    in_see = 1
                changed_string += '%s\n' % line
        return changed_string, see_string

    def __index_bookmark_func(self, my_string):
        """
        Requires:
            my_string -- string of all the lines in the index entry
        Returns:
            index_string -- string minus the bookmark text
            bookmark_string -- the text string of the bookmark
        """
        # cw<an<place_____<nu<true
        in_bookmark = 0
        bracket_count = 0
        bookmark_string = ''
        index_string = ''
        # see __index_see_func: assigned before it is ever compared
        end_bracket_count = None
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            if token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_bookmark:
                if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
                    in_bookmark = 0
                    index_string += '%s\n' % line
                elif token_info == 'tx<nu<__________':
                    bookmark_string += line[17:]
                else:
                    index_string += '%s\n' % line
            else:
                if token_info == 'cw<an<place_____':
                    end_bracket_count = bracket_count - 1
                    in_bookmark = 1
                index_string += '%s\n' % line
        return index_string, bookmark_string

    def __index__format_func(self, my_string):
        """
        Requires:
            my_string -- string of all the lines in the index entry
        Returns:
            italics, bold -- 1 if the matching index format token occurs
            in the string, otherwise 0
        """
        italics = 0
        bold = 0
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'cw<in<index-bold':
                bold = 1
            if token_info == 'cw<in<index-ital':
                italics = 1
        return italics, bold

    def __parse_toc_func(self, my_string):
        """
        Requires:
            my_string -- all the string in the toc
        Returns:
            modified string
        Logic:
            Remove the bookmarks from the string, then collect the text,
            the toc level and the suppress-number flag into one empty
            field tag.
        """
        toc_level = 0
        toc_suppress = 0
        my_string, book_start_string, book_end_string =\
            self.__parse_bookmark_for_toc(my_string)
        main_entry = ''
        my_changed_string = 'mi<tg<empty-att_<field<type>toc-entry'
        my_changed_string += '<update>static'
        if book_start_string:
            my_changed_string += '<bookmark-start>%s' % book_start_string
        if book_end_string:
            my_changed_string += '<bookmark-end>%s' % book_end_string
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info[0:2] == 'tx':
                main_entry += line[17:]
            if token_info == 'cw<tc<toc-level_':
                toc_level = line[20:]
            if token_info == 'cw<tc<toc-sup-nu':
                toc_suppress = 1
        if toc_level:
            my_changed_string += '<toc-level>%s' % toc_level
        if toc_suppress:
            my_changed_string += '<toc-suppress-number>true'
        my_changed_string += '<main-entry>%s' % main_entry
        my_changed_string += '\n'
        return my_changed_string

    def __parse_bookmark_for_toc(self, my_string):
        """
        Requires:
            my_string --string of toc, with new lines
        Returns:
            toc_string -- string minus bookmark text
            book_start_string -- text of a bookmark start
            book_end_string -- text of a bookmark end
        Logic:
            Walk through the lines, keeping a relative bracket count. Text
            that belongs to a bookmark group is collected separately;
            everything else is kept in the toc string.
        """
        in_bookmark = 0
        bracket_count = 0
        book_start_string = ''
        book_end_string = ''
        book_type = 0
        toc_string = ''
        # see __index_see_func: assigned before it is ever compared
        end_bracket_count = None
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            if token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_bookmark:
                if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
                    in_bookmark = 0
                    toc_string += '%s\n' % line
                elif token_info == 'tx<nu<__________':
                    if book_type == 'start':
                        book_start_string += line[17:]
                    elif book_type == 'end':
                        book_end_string += line[17:]
                else:
                    toc_string += '%s\n' % line
            else:
                if token_info == 'cw<an<book-mk-st' or token_info =='cw<an<book-mk-en':
                    if token_info == 'cw<an<book-mk-st':
                        book_type = 'start'
                    if token_info == 'cw<an<book-mk-en':
                        book_type = 'end'
                    end_bracket_count = bracket_count - 1
                    in_bookmark = 1
                toc_string += '%s\n' % line
        return toc_string, book_start_string, book_end_string

    def __parse_bookmark_func(self, my_string, type):
        """
        Requires:
            my_string --string to parse
            type --type of string
        Returns:
            A string formatted for a field instruction.
        Logic:
            The type is the name (either bookmark-end or bookmark-start). The
            id is the complete text string.
        """
        my_changed_string = ('mi<tg<empty-att_<field<type>%s'
        '<number>%s<update>none\n' % (type, my_string))
        return my_changed_string

    def __found_toc_index_func(self, line, tag):
        """
        Requires:
            line --the line to parse
            tag --'toc' or 'index'
        Returns:
            nothing
        Logic:
            This function is called when a toc or index entry is found. The opening
            bracket count is stored in the beginning bracket count. The state
            is changed to 'toc_index.'
        """
        self.__beg_bracket_count = self.__ob_count
        self.__cb_count = 0
        self.__state = 'toc_index'
        self.__tag = tag

    def __toc_index_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines within a toc or index entry. It
            adds each line to a string until the end of the entry is found. It
            then turns the string into a field tag and prints out the marker,
            the tag and the closing line.
        """
        if self.__beg_bracket_count == self.__cb_count:
            self.__state = 'body'
            entry_type = self.__tag
            if entry_type == 'index':
                my_string = self.__parse_index_func(
                        self.__text_string)
            elif entry_type == 'toc':
                my_string = self.__parse_toc_func(
                        self.__text_string)
            self.__write_obj.write(self.__marker)
            self.__write_obj.write(my_string)
            self.__text_string = ''
            self.__write_obj.write(line)
        else:
            self.__text_string += line

    def __process_line(self, line):
        """
        Requires:
            line --one line of the tokenized file
        Returns:
            nothing
        Logic:
            Record the token info and the bracket numbers, then hand the
            line to the method registered for the current state. An
            unknown state is reported on stderr and the line is skipped
            (calling the missing handler would raise TypeError).
        """
        self.__token_info = line[:16]
        if self.__token_info == 'ob<nu<open-brack':
            self.__ob_count = line[-5:-1]
        if self.__token_info == 'cb<nu<clos-brack':
            self.__cb_count = line[-5:-1]
        action = self.__state_dict.get(self.__state)
        if action is None:
            sys.stderr.write('no matching state in module fields_small.py\n')
            sys.stderr.write(self.__state + '\n')
            return
        action(line)

    def fix_fields(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is before the body, look for the
            beginning of the body.
            The other two states are toc_index (for toc and index entries) and
            bookmark.
            Both files are closed even if processing a line raises.
        """
        self.__initiate_values()
        read_obj = open(self.__file)
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                for line in read_obj:
                    self.__process_line(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "fields_small.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,223 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile
from libprs500.ebooks.rtf2xml import copy
class Fonts:
    """
    Change lines with font info from font numbers to the actual font names.
    """
    def __init__(self,
            in_file,
            bug_handler,
            default_font_num,
            copy = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file'--path of the tokenized file to parse
            'bug_handler'--raised when a font number cannot be resolved
                and run_level is above 3
            'default_font_num'--the default font number
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--strictness; see 'bug_handler'
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__default_font_num = default_font_num
        # NOTE(review): tempfile.mktemp() only reserves a *name* and is
        # documented as race-prone; kept because the rest of the pipeline
        # expects the path not to exist until convert_fonts() opens it.
        self.__write_to = tempfile.mktemp()
        self.__run_level = run_level

    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__special_font_dict = {
            'Symbol'        :   0,
            'Wingdings'     :   0,
            'Zapf Dingbats' :   0,
        }
        self.__special_font_list = [
            'Symbol', 'Wingdings', 'Zapf Dingbats'
        ]
        self.__state = 'default'
        self.__token_info = ''
        self.__state_dict = {
            'default'           : self.__default_func,
            'font_table'        : self.__font_table_func,
            'after_font_table'  : self.__after_font_table_func,
            'font_in_table'     : self.__font_in_table_func,
        }
        self.__font_table = {}
        # individual font written
        self.__wrote_ind_font = 0

    def __default_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            Handle all lines before the font table. Check for the beginning
            of the font table. If found, change the state. Print out all
            lines.
        """
        if self.__token_info == 'mi<mk<fonttb-beg':
            self.__state = 'font_table'
        self.__write_obj.write(line)

    def __font_table_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            If the self.__token_info indicates that you have reached the end of
            the font table, then change the state to after the font table.
            If the self.__token_info indicates that there is a font in the
            table, change the state to font in table. Reset the number of the
            font to the default font (in case there is no number provided, in
            which case RTF assumes the number will be the default font.) Reset
            the text string (for the font name) to ''.
            Lines of the font table itself are not written out.
        """
        if self.__token_info == 'mi<mk<fonttb-end':
            self.__state = 'after_font_table'
        elif self.__token_info == 'mi<mk<fontit-beg':
            self.__state = 'font_in_table'
            self.__font_num = self.__default_font_num
            self.__text_line = ''

    def __font_in_table_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            Check for four conditions:
            The line contains font-info. In this case, store the number in
            self.__font_num.
            The line contains text. In this case, add to the text string
            self.__text_line.
            The line marks the end of the individual font in the table. In
            this case, add a new key-> value pair to the font-table
            dictionary. Also create an empty tag with the name and number
            as attributes.
            Premature end of font table.
        """
        #cw<ci<font-style<nu<4
        #tx<nu<__________<Times;
        if self.__token_info == 'mi<mk<fontit-end':
            self.__wrote_ind_font = 1
            self.__state = 'font_table'
            # Drop the ';' that terminates a font entry -- but only when
            # it is really there, so that a name without the terminator
            # does not lose its last letter.
            if self.__text_line.endswith(';'):
                self.__text_line = self.__text_line[:-1]
            self.__font_table[self.__font_num] = self.__text_line
            self.__write_obj.write(
                'mi<tg<empty-att_'
                '<font-in-table<name>%s<num>%s\n' % (self.__text_line, self.__font_num)
            )
        elif self.__token_info == 'cw<ci<font-style':
            self.__font_num = line[20:-1]
        elif self.__token_info == 'tx<nu<__________' or \
            self.__token_info == 'tx<ut<__________':
            self.__text_line += line[17:-1]
        elif self.__token_info == 'mi<mk<fonttb-end':
            self.__found_end_font_table_func()
            self.__state = 'after_font_table'

    def __found_end_font_table_func(self):
        """
        Required:
            nothing
        Returns:
            nothing
        Logic:
            If no individual fonts have been written, write one out
        """
        if not self.__wrote_ind_font:
            self.__write_obj.write(
                'mi<tg<empty-att_'
                '<font-in-table<name>Times<num>0\n' )

    def __after_font_table_func(self, line):
        """
        Required:
            line
        Returns:
            nothing
        Raises:
            self.__bug_handler --when the font number is not in the font
            table and the run level is above 3
        Logic:
            Check the self.__token_info. If this matches a token with font
            info, then extract the number from the line, and look up the font
            name in the font dictionary. If no name exists for that number,
            raise an error at a high run level (at lower run levels the
            line is dropped). Otherwise print out the same line, except with
            the name rather than the number.
            If the line does not contain font info, simply print it out to the
            file.
        """
        if self.__token_info != 'cw<ci<font-style':
            self.__write_obj.write(line)
            return
        font_num = line[20:-1]
        font_name = self.__font_table.get(font_num)
        if font_name is None:
            if self.__run_level > 3:
                msg = 'no value for %s in self.__font_table\n' % font_num
                # call form of raise: valid on Python 2 and Python 3
                raise self.__bug_handler(msg)
            return
        # remember which of the special fonts are actually used
        if font_name in self.__special_font_list:
            self.__special_font_dict[font_name] = 1
        self.__write_obj.write(
            'cw<ci<font-style<nu<%s\n' % font_name
        )

    def __process_line(self, line):
        """
        Requires:
            line --one line of the tokenized file
        Returns:
            nothing
        Logic:
            Record the token info, then hand the line to the method
            registered for the current state. An unknown state is
            reported on stderr and the line is skipped (calling the
            missing handler would raise TypeError).
        """
        self.__token_info = line[:16]
        action = self.__state_dict.get(self.__state)
        if action is None:
            sys.stderr.write('no matching state in module fonts.py\n')
            sys.stderr.write(self.__state + '\n')
            return
        action(line)

    def convert_fonts(self):
        """
        Required:
            nothing
        Returns:
            a dictionary with values for the special fonts, plus the name
            of the default font under the key 'default-font'
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is font_table, look for individual fonts
            and add the number and font name to a dictionary. Also create a
            tag for each individual font in the font table.
            If the state is after the font table, look for lines with font
            info. Substitute a font name for a font number.
            Both files are closed even if processing a line raises.
        """
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                for line in read_obj:
                    self.__process_line(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        default_font_name = self.__font_table.get(self.__default_font_num)
        if not default_font_name:
            default_font_name = 'Not Defined'
        self.__special_font_dict['default-font'] = default_font_name
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "fonts.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        return self.__special_font_dict

View File

@ -0,0 +1,268 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import os, tempfile
from libprs500.ebooks.rtf2xml import copy
class Footnote:
    """
    Move footnotes out of the body of a token file and back again.

    Two public methods are available. separate_footnotes() pulls every
    footnote out of the body and appends it to the bottom of the file,
    between 'mi<mk<footnt-beg' and 'mi<mk<footnt-end' markers, where it is
    easier to process. join_footnotes() puts each footnote back at the
    'mi<mk<footnt-ind' marker that was left in its place.

    The file is read one line at a time; the first 16 characters of a line
    are used as the token identifier.
    """
    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file' -- path of the file to process
            'bug_handler' -- handed on to copy.Copy
        Optional:
            'copy' -- if true, keep a debugging copy of each result
            'run_level' -- accepted but not used by this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        # NOTE(review): tempfile.mktemp only picks a name and is documented
        # as unsafe; switching to mkstemp would change when the file gets
        # created, so it is left as is here.
        self.__write_to = tempfile.mktemp()
        self.__found_a_footnote = 0

    def __first_line_func(self, line):
        """
        Print the tag info for footnotes. Check whether footnote is an
        endnote and make the tag according to that.
        """
        if self.__token_info == 'cw<nt<type______':
            self.__write_to_foot_obj.write(
                'mi<tg<open-att__<footnote<type>endnote<num>%s\n'
                % self.__footnote_count)
        else:
            self.__write_to_foot_obj.write(
                'mi<tg<open-att__<footnote<num>%s\n' % self.__footnote_count)
        self.__first_line = 0

    def __in_footnote_func(self, line):
        """Handle all tokens that are part of footnote"""
        if self.__first_line:
            self.__first_line_func(line)
        if self.__token_info == 'cw<ci<footnot-mk':
            num = str(self.__footnote_count)
            self.__write_to_foot_obj.write(line)
            self.__write_to_foot_obj.write('tx<nu<__________<%s\n' % num)
        if self.__cb_count == self.__footnote_bracket_count:
            # the bracket that opened the footnote has been closed: the
            # closing bracket goes to the body, the closing tags to the
            # footnote file
            self.__in_footnote = 0
            self.__write_obj.write(line)
            self.__write_to_foot_obj.write('mi<mk<foot___clo\n')
            self.__write_to_foot_obj.write('mi<tg<close_____<footnote\n')
            self.__write_to_foot_obj.write('mi<mk<footnt-clo\n')
        else:
            self.__write_to_foot_obj.write(line)

    def __found_footnote(self, line):
        """ Found a footnote"""
        self.__found_a_footnote = 1
        self.__in_footnote = 1
        self.__first_line = 1
        self.__footnote_count += 1
        # temporarily set this to zero so I can enter loop
        self.__cb_count = 0
        self.__footnote_bracket_count = self.__ob_count
        # leave an index marker in the body and open the footnote in the
        # footnote file under the same four-digit number
        self.__write_obj.write(
            'mi<mk<footnt-ind<%04d\n' % self.__footnote_count)
        self.__write_to_foot_obj.write(
            'mi<mk<footnt-ope<%04d\n' % self.__footnote_count)

    def __default_sep(self, line):
        """Handle all tokens that are not footnote tokens"""
        if self.__token_info == 'cw<nt<footnote__':
            self.__found_footnote(line)
        self.__write_obj.write(line)
        if self.__token_info == 'cw<ci<footnot-mk':
            # the marker precedes its footnote, so the count is one behind
            num = str(self.__footnote_count + 1)
            self.__write_obj.write('tx<nu<__________<%s\n' % num)

    def __initiate_sep_values(self):
        """
        initiate counters for separate_footnotes method.
        """
        self.__bracket_count = 0
        self.__ob_count = 0
        self.__cb_count = 0
        self.__footnote_bracket_count = 0
        self.__in_footnote = 0
        self.__first_line = 0 # have not processed the first line of footnote
        self.__footnote_count = 0

    def separate_footnotes(self):
        """
        Separate all the footnotes in an RTF file and put them at the bottom,
        where they are easier to process. Each time a footnote is found,
        print all of its contents to a temporary file. Close both the main and
        temporary file. Print the footnotes from the temporary file to the
        bottom of the main file.
        """
        self.__initiate_sep_values()
        self.__footnote_holder = tempfile.mktemp()
        read_obj = open(self.__file)
        self.__write_obj = open(self.__write_to, 'w')
        self.__write_to_foot_obj = open(self.__footnote_holder, 'w')
        try:
            for line in read_obj:
                self.__token_info = line[:16]
                # keep track of opening and closing brackets; the bracket
                # number is the four characters before the newline
                if self.__token_info == 'ob<nu<open-brack':
                    self.__ob_count = line[-5:-1]
                if self.__token_info == 'cb<nu<clos-brack':
                    self.__cb_count = line[-5:-1]
                if self.__in_footnote:
                    # in the middle of footnote text
                    self.__in_footnote_func(line)
                else:
                    # not in the middle of footnote text
                    self.__default_sep(line)
        finally:
            self.__write_obj.close()
            read_obj.close()
            self.__write_to_foot_obj.close()
        # append the collected footnotes to the bottom of the body
        read_obj = open(self.__footnote_holder, 'r')
        write_obj = open(self.__write_to, 'a')
        try:
            write_obj.write(
                'mi<mk<sect-close\n'
                'mi<mk<body-close\n'
                'mi<tg<close_____<section\n'
                'mi<tg<close_____<body\n'
                'mi<tg<close_____<doc\n'
                'mi<mk<footnt-beg\n')
            for line in read_obj:
                write_obj.write(line)
            write_obj.write('mi<mk<footnt-end\n')
        finally:
            read_obj.close()
            write_obj.close()
        os.remove(self.__footnote_holder)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "footnote_separate.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

    def update_info(self, file, copy):
        """
        Unused method
        """
        self.__file = file
        self.__copy = copy

    def __get_foot_body_func(self, line):
        """
        Process lines in main body and look for beginning of footnotes.
        """
        if self.__token_info == 'mi<mk<footnt-beg':
            self.__state = 'foot'
        else:
            self.__write_obj.write(line)

    def __get_foot_foot_func(self, line):
        """
        Copy footnotes from bottom of file to a separate, temporary file.
        """
        if self.__token_info == 'mi<mk<footnt-end':
            self.__state = 'body'
        else:
            self.__write_to_foot_obj.write(line)

    def __get_footnotes(self):
        """
        Private method to remove footnotes from main file. Read one line from
        the main file at a time. If the state is 'body', call on the private
        __get_foot_body_func. Otherwise, call on the __get_foot_foot_func.
        These two functions do the work of separating the footnotes from the
        body.
        """
        read_obj = open(self.__file)
        self.__write_obj = open(self.__write_to, 'w')
        self.__write_to_foot_obj = open(self.__footnote_holder, 'w')
        try:
            for line in read_obj:
                self.__token_info = line[:16]
                if self.__state == 'body':
                    self.__get_foot_body_func(line)
                elif self.__state == 'foot':
                    self.__get_foot_foot_func(line)
        finally:
            read_obj.close()
            self.__write_obj.close()
            self.__write_to_foot_obj.close()

    def __get_foot_from_temp(self, num):
        """
        Private method for joining footnotes to body. This method reads from
        the temporary file until the proper footnote marker is found. It
        collects all the tokens until the end of the footnote, and returns
        them as a string.

        If the end of the temporary file is reached first, whatever was
        collected (possibly the empty string) is returned, so the caller
        always gets something it can write.
        """
        look_for = 'mi<mk<footnt-ope<' + num + '\n'
        found_foot = 0
        collected = []
        while 1:
            # readline (not iteration) because the file position must
            # carry over from one call to the next
            line = self.__read_from_foot_obj.readline()
            if not line:
                break
            if found_foot:
                if line == 'mi<mk<footnt-clo\n':
                    break
                collected.append(line)
            elif line == look_for:
                found_foot = 1
        return ''.join(collected)

    def __join_from_temp(self):
        """
        Private method for rejoining footnotes to body. Read from the
        newly-created, temporary file that contains the body text but no
        footnotes. Each time a footnote marker is found, call the private
        method __get_foot_from_temp(). This method will return a string to
        print out to the third file.
        If no footnote marker is found, simply print out the token (line).
        """
        self.__read_from_foot_obj = open(self.__footnote_holder, 'r')
        read_obj = open(self.__write_to, 'r')
        self.__write_obj = open(self.__write_to2, 'w')
        try:
            for line in read_obj:
                if line[:16] == 'mi<mk<footnt-ind':
                    # the footnote number follows the 17-character prefix
                    line = self.__get_foot_from_temp(line[17:-1])
                self.__write_obj.write(line)
        finally:
            read_obj.close()
            self.__write_obj.close()
            self.__read_from_foot_obj.close()

    def join_footnotes(self):
        """
        Join the footnotes from the bottom of the file and put them in their
        former places. First, remove the footnotes from the bottom of the
        input file, outputting them to a temporary file. This creates two new
        files, one without footnotes, and one of just footnotes. Open both
        these files to read. When a marker is found in the main file, find the
        corresponding marker in the footnote file. Output the mix of body and
        footnotes to a third file.

        Does nothing if separate_footnotes() found no footnote.
        """
        if not self.__found_a_footnote:
            return
        self.__write_to2 = tempfile.mktemp()
        self.__state = 'body'
        self.__get_footnotes()
        self.__join_from_temp()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to2, "footnote_joined.data")
        copy_obj.rename(self.__write_to2, self.__file)
        os.remove(self.__write_to2)
        os.remove(self.__footnote_holder)
        # the footnote-free body written by __get_footnotes is no longer
        # needed either
        os.remove(self.__write_to)

View File

@ -0,0 +1,67 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
class GetCharMap:
    """
    Return the character map for the given value
    """
    def __init__(self, bug_handler, char_file):
        """
        Required:
            'bug_handler'--exception class raised when a map is missing
            'char_file'--the file with the mappings (an open, seekable
            file object)
        Returns:
            nothing
        """
        self.__char_file = char_file
        self.__bug_handler = bug_handler

    def get_char_map(self, map):
        """
        Required:
            'map'--name of the map; its entries are the lines between the
            line containing '<map>' and the line containing '</map>'
        Returns:
            a dictionary built from the colon-separated entries, keyed on
            the second field with the fourth field as value. An escaped
            '\\colon' in the key is turned back into ':'.
        Raises:
            the bug handler, if no line containing '<map>' is found
        """
        begin_element = '<%s>' % map
        end_element = '</%s>' % map
        found_map = 0
        map_dict = {}
        self.__char_file.seek(0)
        for line in self.__char_file:
            if not line.strip():
                continue
            if not found_map:
                if begin_element in line:
                    found_map = 1
                continue
            if end_element in line:
                break
            fields = line.split(':')
            # str.replace returns a new string; the result has to be kept
            key = fields[1].replace('\\colon', ':')
            # the value is the fourth field exactly as split, unstripped
            map_dict[key] = fields[3]
        if not found_map:
            msg = 'no map found\n'
            msg += 'map is "%s"\n' % (map,)
            raise self.__bug_handler(msg)
        return map_dict

View File

@ -0,0 +1,332 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
"""
Gets options for main part of script
"""
import sys, os
from libprs500.ebooks.rtf2xml import options_trem, configure_txt
class GetOptions:
    """
    Build the options dictionary for the main script by reading the
    configuration file and then applying the command-line options on top.
    """
    def __init__(self,
            system_arguments,
            rtf_dir,
            bug_handler,
            configuration_file = None,
            ):
        """
        Required:
            'system_arguments' -- handed to options_trem.ParseOptions
            'rtf_dir' -- stored; not used by this class
            'bug_handler' -- handed to configure_txt.Configure
        Optional:
            'configuration_file' -- handed to configure_txt.Configure
        Returns:
            nothing
        """
        self.__system_arguments = system_arguments
        self.__rtf_dir = rtf_dir
        self.__configuration_file = configuration_file
        self.__bug_handler = bug_handler

    def get_options(self):
        """
        Required:
            nothing
        Returns:
            the options dictionary. 'valid' is set to 0 when something is
            wrong with the command line; for --help, --config and
            --version the dictionary is returned as soon as the
            corresponding key has been set to 1.
        """
        return_options = self.__get_config_options()
        # presumably [takes-a-value flag, optional short form]; the exact
        # meaning is defined by options_trem.ParseOptions
        options_dict = {
            'dir' : [1],
            'help' : [0, 'h'],
            'show-warnings' : [0],
            'caps' : [0],
            'no-caps' : [0],
            'symbol' : [0],
            'no-symbol' : [0],
            'wingdings' : [0],
            # historical misspelling of 'wingdings', kept so existing
            # command lines continue to parse
            'windings' : [0],
            'no-wingdings' : [0],
            'zapf' : [0],
            'no-zapf' : [0],
            'font' : [0],
            'no-font' : [0],
            'dtd' : [1],
            'no-dtd' : [0],
            'version' : [0],
            'output' : [1, 'o'],
            'no-namespace' : [0],
            'level' : [1],
            'indent' : [1],
            'no-lists' : [0],
            'lists' : [0],
            'group-styles' : [0],
            'no-group-styles' : [0],
            'group-borders' : [0],
            'no-group-borders' : [0],
            'headings-to-sections' : [0],
            'no-headings-to-sections' : [0],
            'empty-para' : [0],
            'no-empty-para' : [0],
            'format' : [1, 'f'],
            'config' : [0],
        }
        options_obj = options_trem.ParseOptions(
            system_string = self.__system_arguments,
            options_dict = options_dict
            )
        options, arguments = options_obj.parse_options()
        if options == 0:
            return_options['valid'] = 0
            return return_options
        the_keys = options.keys()
        # informational options: report and stop
        return_options['help'] = 0
        if 'help' in the_keys:
            return_options['help'] = 1
            return return_options
        return_options['config'] = 0
        if 'config' in the_keys:
            return_options['config'] = 1
            return return_options
        return_options['version'] = 0
        if 'version' in the_keys:
            return_options['version'] = 1
            return return_options
        # unused
        return_options['out-dir'] = 0
        if 'dir' in the_keys:
            out_dir = options['dir']
            if not os.path.isdir(out_dir):
                sys.stderr.write('Your output must be an existing directory.\n')
                return_options['valid'] = 0
            else:
                return_options['dir'] = out_dir
        # a missing output file is not an error here; see smart-output below
        return_options['out-file'] = 0
        if 'output' in the_keys:
            return_options['out-file'] = options['output']
        if 'level' in the_keys:
            return_options['level'] = options['level']
        the_level = return_options.get('level')
        if the_level:
            try:
                return_options['level'] = int(the_level)
            except ValueError:
                sys.stderr.write('The options "--level" must be a number.\n')
                return_options['valid'] = 0
                return return_options
        if 'dtd' in the_keys:
            return_options['raw-dtd-path'] = options['dtd']
        acceptable = ['sdoc', 'raw', 'tei']
        if 'format' in the_keys:
            format = options['format']
            if format not in acceptable:
                sys.stderr.write('--format must take one of \'sdoc\', '
                    '\'raw\' or \'tei\'\n')
                return_options['valid'] = 0
                return return_options
            return_options['format'] = format
        return_options['show-warnings'] = 0
        if 'show-warnings' in the_keys:
            return_options['show-warnings'] = 1
        # font conversion: the blanket switches first, so that the
        # per-font switches below can override them
        if 'no-font' in the_keys:
            return_options['convert-symbol'] = 0
            return_options['convert-zapf'] = 0
            return_options['convert-wingdings'] = 0
        if 'font' in the_keys:
            return_options['convert-symbol'] = 1
            return_options['convert-zapf'] = 1
            return_options['convert-wingdings'] = 1
        if 'symbol' in the_keys:
            return_options['convert-symbol'] = 1
        if 'no-symbol' in the_keys:
            return_options['convert-symbol'] = 0
        if 'wingdings' in the_keys or 'windings' in the_keys:
            return_options['convert-wingdings'] = 1
        if 'no-wingdings' in the_keys:
            return_options['convert-wingdings'] = 0
        if 'zapf' in the_keys:
            return_options['convert-zapf'] = 1
        if 'no-zapf' in the_keys:
            return_options['convert-zapf'] = 0
        if 'caps' in the_keys:
            return_options['convert-caps'] = 1
        if 'no-caps' in the_keys:
            return_options['convert-caps'] = 0
        if 'no-dtd' in the_keys:
            return_options['no-dtd'] = 1
        else:
            return_options['no-dtd'] = 0
        return_options['no-ask'] = 0
        # NOTE(review): 'no-ask' is not declared in options_dict above, so
        # this can only fire if the parser hands back undeclared keys
        if 'no-ask' in the_keys:
            return_options['no-ask'] = 1
            sys.stderr.write('You can also permanetly set the no-ask option in the rtf2xml file.\n')
        if 'no-namespace' in the_keys:
            return_options['no-namespace'] = 1
        if 'headings-to-sections' in the_keys:
            return_options['headings-to-sections'] = 1
        elif 'no-headings-to-sections' in the_keys:
            return_options['headings-to-sections'] = 0
        if 'no-lists' in the_keys:
            return_options['form-lists'] = 0
        elif 'lists' in the_keys:
            return_options['form-lists'] = 1
        if 'group-styles' in the_keys:
            return_options['group-styles'] = 1
        elif 'no-group-styles' in the_keys:
            return_options['group-styles'] = 0
        if 'group-borders' in the_keys:
            return_options['group-borders'] = 1
        elif 'no-group-borders' in the_keys:
            return_options['group-borders'] = 0
        if 'empty-para' in the_keys:
            return_options['empty-paragraphs'] = 1
        elif 'no-empty-para' in the_keys:
            return_options['empty-paragraphs'] = 0
        # exactly one file to convert
        if len(arguments) == 0:
            sys.stderr.write(
                'You must provide a file to convert.\n')
            return_options['valid'] = 0
            return return_options
        if len(arguments) > 1:
            sys.stderr.write(
                'You can only convert one file at a time.\n')
            return_options['valid'] = 0
            # stop here: there is no 'in-file' for the code below to use
            return return_options
        return_options['in-file'] = arguments[0]
        # check for out file
        smart_output = return_options.get('smart-output')
        if smart_output == 'false':
            smart_output = 0
        if smart_output and not return_options['out-file']:
            # derive the output name from the input name
            in_file = return_options['in-file']
            the_file_name, ext = os.path.splitext(in_file)
            if ext != '.rtf':
                sys.stderr.write(
                    'Sorry, but this file does not have an "rtf" extension, so \n'
                    'the script will not attempt to convert it.\n'
                    'If it is in fact an rtf file, use the "-o" option.\n'
                    )
                return_options['valid'] = 0
            else:
                return_options['out-file'] = '%s.xml' % the_file_name
        if 'indent' in the_keys:
            try:
                return_options['indent'] = int(options['indent'])
            except ValueError:
                sys.stderr.write('--indent must take an integer\n')
                return_options['valid'] = 0
        return return_options

    def __flag(self, value, default):
        """
        Turn a configuration-file value into 1 or 0: 'true' and '1' give 1,
        'false' and '0' give 0, anything else gives 'default'.
        """
        if value == 'true' or value == '1':
            return 1
        if value == 'false' or value == '0':
            return 0
        return default

    def __get_config_options(self):
        """
        Required:
            nothing
        Returns:
            the dictionary read from the configuration file, with 'valid'
            set to 1 and the true/false strings of the known switches
            replaced by 1 and 0.
        Logic:
            Exits the program with status 1 if the configuration object
            reports failure by returning 1.
        """
        configure_obj = configure_txt.Configure(
            bug_handler = self.__bug_handler,
            configuration_file = self.__configuration_file)
        options_dict = configure_obj.get_configuration(type = 'normal')
        if options_dict == 1:
            sys.exit(1)
        options_dict['valid'] = 1
        # for these three only an explicit 'false' is rewritten; any other
        # value is left exactly as read
        for key in ('convert-caps', 'convert-symbol', 'convert-wingdings'):
            if options_dict.get(key) == 'false':
                options_dict[key] = 0
        # read under one name, stored under another
        convert_zapf = options_dict.get('convert-zapf-dingbats')
        if convert_zapf == 'false':
            options_dict['convert-zapf'] = 0
        elif convert_zapf == 'true':
            options_dict['convert-zapf'] = 1
        options_dict['headings-to-sections'] = self.__flag(
            options_dict.get('headings-to-sections'), 0)
        # empty paragraphs are written unless switched off
        options_dict['empty-paragraphs'] = self.__flag(
            options_dict.get('write-empty-paragraphs'), 1)
        options_dict['form-lists'] = self.__flag(
            options_dict.get('lists'), 0)
        options_dict['group-styles'] = self.__flag(
            options_dict.get('group-styles'), 0)
        options_dict['group-borders'] = self.__flag(
            options_dict.get('group-borders'), 0)
        return options_dict

View File

@ -0,0 +1,292 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile, re
from libprs500.ebooks.rtf2xml import copy
class GroupBorders:
    """
    Group paragraph definitions by their border attributes.
    A paragraph definition that carries border-paragraph attributes starts a
    border-group; following paragraph definitions with the same border
    attributes stay in that group. The border attributes are moved from the
    paragraph definition to the border-group tag.
    """
    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            run_level = 1,
            wrap = 0,
            ):
        """
        Required:
            'file'
            'bug_handler'--exception class raised on internal errors
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'-- above 2, an unexpected token raises the bug
            handler instead of only printing a warning
            'wrap'-- stored; not used by this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = tempfile.mktemp()
        self.__wrap = wrap

    def __initiate_values(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            The self.__end_list is a list of tokens that will force a
            border group to end.
        """
        self.__state = "default"
        self.__left_indent = 0
        self.__border_num = 0
        self.__list_type = 'not-defined'
        self.__pard_def = ""
        self.__all_lists = []
        self.__list_chunk = ''
        self.__state_dict = {
            'default'       : self.__default_func,
            'in_pard'       : self.__in_pard_func,
            'after_pard'    : self.__after_pard_func,
        }
        self.__end_list = [
            # section end
            'mi<mk<sect-close',
            'mi<mk<sect-start',
            # table begin
            'mi<mk<tabl-start',
            # field block begin
            'mi<mk<fldbk-end_',
            'mi<mk<fldbkstart',
            # cell end
            'mi<mk<close_cell',
            # item end
            'mi<tg<item_end__',
            # footnote end
            'mi<mk<foot___clo',
            'mi<mk<footnt-ope',
            # heading end
            'mi<mk<header-beg',
            'mi<mk<header-end',
            'mi<mk<head___clo',
            # lists
            'mi<mk<list_start',
            # style-group
            'mi<mk<style-grp_',
            'mi<mk<style_grp_',
            'mi<mk<style_gend',
            'mi<mk<stylegend_',
            # don't use
            # 'mi<mk<body-close',
            # 'mi<mk<par-in-fld',
            # 'cw<tb<cell______',
            # 'cw<tb<row-def___',
            # 'cw<tb<row_______',
            # 'mi<mk<sec-fd-beg',
        ]
        # <name>Normal<
        self.__name_regex = re.compile(r'(<name>[^<]+)')
        self.__found_appt = 0
        self.__line_num = 0
        self.__border_regex = re.compile(
            r'(<border-paragraph[^<]+|<border-for-every-paragraph[^<]+)')
        self.__last_border_string = ''

    def __in_pard_func(self, line):
        """
        Required:
            line -- the line of current text.
        Return:
            Nothing
        Logic:
            You are in a border group, in the middle of a paragraph
            definition. Copy everything through until the end of the
            paragraph definition; the closing tag itself is held back.
        """
        if self.__token_info == 'mi<tg<close_____' \
                and line[17:-1] == 'paragraph-definition':
            self.__state = 'after_pard'
        else:
            self.__write_obj.write(line)

    def __after_pard_func(self, line):
        """
        Required:
            line -- the line of current text.
        Return:
            Nothing
        Logic:
            A paragraph definition inside a border group has just ended,
            but its closing tag has not been written yet. Collect lines in
            self.__list_chunk until either another paragraph definition or
            a token from self.__end_list decides where the group ends.
        """
        if self.__token_info == 'mi<tg<open-att__' \
                and line[17:37] == 'paragraph-definition':
            # found paragraph definition
            self.__pard_after_par_def_func(line)
        elif self.__token_info == 'mi<tg<close_____' \
                and line[17:-1] == 'paragraph-definition':
            sys.stderr.write('Wrong flag in __after_pard_func\n')
            if self.__run_level > 2:
                msg = 'wrong flag'
                raise self.__bug_handler(msg)
        elif self.__token_info in self.__end_list:
            self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
            self.__write_end_border_tag()
            self.__write_obj.write(self.__list_chunk)
            self.__list_chunk = ''
            self.__state = 'default'
            self.__write_obj.write(line)
        else:
            self.__list_chunk += line

    def __close_pard_(self, line):
        """
        Flush the collected lines, close the paragraph definition and the
        border group, and go back to the default state.
        """
        self.__write_obj.write(self.__list_chunk)
        self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
        self.__write_end_border_tag()
        self.__list_chunk = ''
        self.__state = 'default'

    def __pard_after_par_def_func(self, line):
        """
        Required:
            line -- the line of current text.
        Return:
            Nothing
        Logic:
            A new paragraph definition follows one that was in a border
            group. If it has no border, the group ends. If it has the same
            border attributes, the group continues. Otherwise the group
            ends and a new one is started.
        """
        self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
        if not self.__is_border_func(line):
            self.__write_end_border_tag()
            self.__write_obj.write(self.__list_chunk)
            self.__write_obj.write(line)
            self.__state = 'default'
        else:
            border_string, pard_string = self.__parse_pard_with_border(line)
            if self.__last_border_string == border_string:
                # just keep going
                self.__write_obj.write(self.__list_chunk)
            else:
                # different borders for the paragraph definition
                self.__write_end_border_tag()
                self.__write_obj.write(self.__list_chunk)
                self.__write_start_border_tag(border_string)
                self.__last_border_string = border_string
            self.__write_obj.write(pard_string)
            self.__state = 'in_pard'
        self.__list_chunk = ''

    def __default_func(self, line):
        """
        Required:
            self, line
        Returns:
            Nothing
        Logic
            Look for the start of a paragraph definition. If one is found,
            check if it contains border attributes. If it does, start a
            border group and change the state to in_pard.
        """
        if self.__token_info == 'mi<tg<open-att__' \
                and line[17:37] == 'paragraph-definition' \
                and self.__is_border_func(line):
            border_string, pard_string = self.__parse_pard_with_border(line)
            self.__write_start_border_tag(border_string)
            self.__write_obj.write(pard_string)
            self.__last_border_string = border_string
            self.__state = 'in_pard'
        else:
            self.__write_obj.write(line)

    def __write_start_border_tag(self, the_string):
        """
        Write the marker and the opening border-group tag. The tag gets
        the border attributes in 'the_string' plus a running number of the
        form s0001.
        """
        self.__write_obj.write('mi<mk<start-brdg\n')
        self.__border_num += 1
        the_string += '<num>s%04d' % self.__border_num
        self.__write_obj.write(
            'mi<tg<open-att__<border-group%s\n' % the_string)

    def __write_end_border_tag(self):
        """Write the marker and the closing border-group tag."""
        self.__write_obj.write('mi<mk<end-brdg__\n')
        self.__write_obj.write('mi<tg<close_____<border-group\n')

    def __is_border_func(self, line):
        """
        Return 1 if the line contains 'border-paragraph' anywhere but in
        the value of its name attribute, otherwise 0.
        """
        line = re.sub(self.__name_regex, '', line)
        if line.find('border-paragraph') > -1:
            return 1
        return 0

    def __parse_pard_with_border(self, line):
        """
        Split a paragraph-definition line into its border-paragraph
        attributes and everything else.
        Returns:
            border_string, pard_string
        """
        border_string = ''
        pard_string = ''
        for token in re.split(self.__border_regex, line):
            if token.startswith('<border-paragraph'):
                border_string += token
            else:
                pard_string += token
        return border_string, pard_string

    def __write_pard_with_border(self, line):
        """
        Start a border group for the line and write what is left of the
        paragraph definition.
        """
        border_string, pard_string = self.__parse_pard_with_border(line)
        self.__write_start_border_tag(border_string)
        self.__write_obj.write(pard_string)

    def __get_style_name(self, line):
        """Remember the style name given by a style-name marker."""
        if self.__token_info == 'mi<mk<style-name':
            self.__style_name = line[17:-1]

    def group_borders(self):
        """
        Required:
            nothing
        Returns:
            original file will be changed
        Logic:
            Read the file one line at a time and hand each line to the
            function for the current state. The result is written to a
            temporary file, which then replaces the original.
        """
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        self.__write_obj = open(self.__write_to, 'w')
        try:
            for line in read_obj:
                self.__token_info = line[:16]
                self.__get_style_name(line)
                action = self.__state_dict.get(self.__state)
                action(line)
        finally:
            read_obj.close()
            self.__write_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "group_borders.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,241 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile, re
from libprs500.ebooks.rtf2xml import copy
class GroupStyles:
"""
Form lists.
Use RTF's own formatting to determine if a paragraph definition is part of a
list.
Use indents to determine items and how lists are nested.
"""
def __init__(self,
in_file,
bug_handler,
copy = None,
run_level = 1,
wrap = 0,
):
"""
Required:
'file'
Optional:
'copy'-- whether to make a copy of result for debugging
'temp_dir' --where to output temporary results (default is
directory from which the script is run.)
Returns:
nothing
"""
self.__file = in_file
self.__bug_handler = bug_handler
self.__copy = copy
self.__run_level = run_level
self.__write_to = tempfile.mktemp()
self.__wrap = wrap
def __initiate_values(self):
"""
Required:
Nothing
Return:
Nothing
Logic:
The self.__end_list is a list of tokens that will force a list to end.
Likewise, the self.__end_lines is a list of lines that forces a list to end.
"""
self.__state = "default"
self.__left_indent = 0
self.__list_type = 'not-defined'
self.__pard_def = ""
self.__all_lists = []
self.__list_chunk = ''
self.__state_dict={
'default' : self.__default_func,
'in_pard' : self.__in_pard_func,
'after_pard' : self.__after_pard_func,
}
# section end
self.__end_list = [
# section end
'mi<mk<sect-close',
'mi<mk<sect-start',
# table begin
'mi<mk<tabl-start',
# field block begin
'mi<mk<fldbk-end_',
'mi<mk<fldbkstart',
# cell end
'mi<mk<close_cell',
# item end
'mi<tg<item_end__',
# footnote end
'mi<mk<foot___clo',
'mi<mk<footnt-ope',
# heading end
'mi<mk<header-beg',
'mi<mk<header-end',
'mi<mk<head___clo',
# lists
'mi<tg<item_end__',
'mi<tg<item_end__',
'mi<mk<list_start'
# body close
# don't use
# 'mi<mk<body-close',
# 'mi<mk<par-in-fld',
# 'cw<tb<cell______',
# 'cw<tb<row-def___',
# 'cw<tb<row_______',
# 'mi<mk<sec-fd-beg',
]
self.__name_regex = re.compile(r'<name>')
self.__found_appt = 0
self.__line_num = 0
def __in_pard_func(self, line):
"""
Required:
line -- the line of current text.
Return:
Nothing
Logic:
You are in a list, but in the middle of a paragraph definition.
Don't do anything until you find the end of the paragraph definition.
"""
if self.__token_info == 'mi<tg<close_____' \
and line[17:-1] == 'paragraph-definition':
self.__state = 'after_pard'
else:
self.__write_obj.write(line)
def __after_pard_func(self, line):
"""
Required:
line -- the line of current text.
Return:
Nothing
Logic:
"""
if self.__token_info == 'mi<tg<open-att__' \
and line[17:37] == 'paragraph-definition':
# found paragraph definition
self.__pard_after_par_def_func(line)
elif self.__token_info == 'mi<tg<close_____' \
and line[17:-1] == 'paragraph-definition':
sys.stderr.write('Wrong flag in __after_pard_func\n')
if self.__run_level > 2:
msg = 'wrong flag'
raise self.__bug_handler, msg
elif self.__token_info in self.__end_list:
self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
self.__write_end_wrap()
self.__write_obj.write(self.__list_chunk)
self.__list_chunk = ''
self.__state = 'default'
self.__write_obj.write(line)
else:
self.__list_chunk += line
def __close_pard_(self, line):
self.__write_obj.write(self.__list_chunk)
self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
self.__write_end_wrap()
self.__list_chunk = ''
self.__state = 'default'
def __write_start_wrap(self, name):
if self.__wrap:
self.__write_obj.write('mi<mk<style-grp_<%s\n' % name)
self.__write_obj.write('mi<tg<open-att__<style-group<name>%s\n' % name)
self.__write_obj.write('mi<mk<style_grp_<%s\n' % name)
def __write_end_wrap(self):
if self.__wrap:
self.__write_obj.write('mi<mk<style_gend\n' )
self.__write_obj.write('mi<tg<close_____<style-group\n')
self.__write_obj.write('mi<mk<stylegend_\n' )
def __pard_after_par_def_func(self, line):
"""
Required:
line -- the line of current text.
id -- the id of the current list
Return:
Nothing
Logic:
"""
if self.__last_style_name == self.__style_name:
# just keep going
if self.__wrap:
self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
self.__write_obj.write(self.__list_chunk)
self.__list_chunk = ''
self.__state = 'in_pard'
if self.__wrap:
self.__write_obj.write(line)
else:
# different name for the paragraph definition
self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
self.__write_end_wrap()
self.__write_obj.write(self.__list_chunk)
self.__write_start_wrap(self.__style_name)
self.__write_obj.write(line)
self.__state = 'in_pard'
self.__last_style_name = self.__style_name
self.__list_chunk = ''
def __default_func(self, line):
"""
Required:
self, line
Returns:
Nothing
Logic
Look for the start of a paragraph defintion. If one is found, check if
it contains a list-id. If it does, start a list. Change the state to
in_pard.
"""
if self.__token_info == 'mi<tg<open-att__' \
and line[17:37] == 'paragraph-definition':
self.__state = 'in_pard'
self.__last_style_name = self.__style_name
self.__write_start_wrap(self.__last_style_name)
self.__write_obj.write(line)
else:
self.__write_obj.write(line)
def __get_style_name(self, line):
    """
    Required:
        line -- the current line
    Returns:
        Nothing
    Logic:
        If the current token is a style-name marker, remember the text that
        follows the token (without the trailing newline) as the current
        style name. Any other token leaves the style name alone.
    """
    if self.__token_info != 'mi<mk<style-name':
        return
    self.__style_name = line[17:-1]
def group_styles(self):
    """
    Required:
        nothing
    Returns:
        nothing; the original file will be changed
    Logic:
        Read the file one line at a time. For each line, record the token
        (the first 16 characters), update the current style name, and pass
        the line to the function registered for the current state. The
        result is written to a temporary file, which is then copied over
        the original and removed. If self.__copy is set, a debugging copy
        called "group_styles.data" is made first.
    """
    self.__initiate_values()
    read_obj = open(self.__file, 'r')
    try:
        self.__write_obj = open(self.__write_to, 'w')
        try:
            while True:
                line = read_obj.readline()
                self.__token_info = line[:16]
                self.__get_style_name(line)
                action = self.__state_dict.get(self.__state)
                # The empty string read at the end of the file is passed on
                # to the state function as well, as it always has been.
                action(line)
                if not line:
                    break
        finally:
            # close the files even if a state function raises
            self.__write_obj.close()
    finally:
        read_obj.close()
    copy_obj = copy.Copy(bug_handler = self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "group_styles.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)

View File

@ -0,0 +1,265 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile
from libprs500.ebooks.rtf2xml import copy
class Header:
    """
    Two public methods are available. The first (separate_headers) separates
    all of the headers and footers from the body and puts them at the bottom
    of the text, where they are easier to process. The second (join_headers)
    joins those headers and footers to the proper places in the body.
    """
    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file' -- the file to process (it is changed in place)
            'bug_handler' -- handed on to copy.Copy
        Optional:
            'copy' -- whether to make a copy of each result for debugging
            'run_level' -- accepted, but not used by this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        # NOTE(review): tempfile.mktemp only picks a name; the file is
        # created later by open(). The documented replacement is mkstemp,
        # but that would create the file right here.
        self.__write_to = tempfile.mktemp()
        self.__found_a_header = 0

    def __in_header_func(self, line):
        """
        Handle all tokens that are part of a header or footer.

        When the closing-bracket count equals the count recorded at the
        start of the header, the header is finished: the line goes to the
        main file and the closing tokens go to the header file. Otherwise
        the line belongs to the header and goes to the header file.
        """
        if self.__cb_count == self.__header_bracket_count:
            self.__in_header = 0
            self.__write_obj.write(line)
            head_out = self.__write_to_head_obj
            head_out.write('mi<mk<head___clo\n')
            head_out.write('mi<tg<close_____<header-or-footer\n')
            head_out.write('mi<mk<header-clo\n')
        else:
            self.__write_to_head_obj.write(line)

    def __found_header(self, line):
        """
        Found a header (or a footer).

        Leave a numbered marker in the main file, and start the header in
        the header file with the same number and a tag giving its type.
        """
        self.__found_a_header = 1
        self.__in_header = 1
        self.__header_count += 1
        # temporarily set this to zero so I can enter loop
        self.__cb_count = 0
        self.__header_bracket_count = self.__ob_count
        self.__write_obj.write(
            'mi<mk<header-ind<%04d\n' % self.__header_count)
        self.__write_to_head_obj.write(
            'mi<mk<header-ope<%04d\n' % self.__header_count)
        # characters 6 to 16 of the token say which kind of header this is
        header_type = self.__head_dict.get(line[6:16])
        if header_type:
            self.__write_to_head_obj.write(
                'mi<tg<open-att__<header-or-footer<type>%s\n' % (header_type)
            )
        else:
            sys.stderr.write('module is header\n')
            sys.stderr.write('method is __found_header\n')
            sys.stderr.write('no dict entry\n')
            sys.stderr.write('line is %s' % line)
            self.__write_to_head_obj.write(
                'mi<tg<open-att__<header-or-footer<type>none\n'
            )

    def __default_sep(self, line):
        """Handle all tokens that are not header tokens"""
        if self.__token_info[3:5] == 'hf':
            self.__found_header(line)
        self.__write_obj.write(line)

    def __initiate_sep_values(self):
        """
        Initiate the counters and the lookup table for the
        separate_headers method.
        """
        self.__bracket_count=0
        self.__ob_count = 0
        self.__cb_count = 0
        self.__header_bracket_count = 0
        self.__in_header = 0
        self.__header_count = 0
        self.__head_dict = {
        'head-left_'      :  ('header-left'),
        'head-right'      :  ('header-right'),
        'foot-left_'      :  ('footer-left'),
        'foot-right'      :  ('footer-right'),
        'head-first'      :  ('header-first' ),
        'foot-first'      :  ('footer-first' ),
        'header____'      :  ('header' ),
        'footer____'      :  ('footer' ),
        }

    def __separate_lines(self, read_obj):
        """
        Send each line of read_obj either to the main file or to the header
        file, keeping track of the opening and closing bracket counts.
        """
        while True:
            line = read_obj.readline()
            self.__token_info = line[:16]
            # keep track of opening and closing brackets
            if self.__token_info == 'ob<nu<open-brack':
                self.__ob_count = line[-5:-1]
            if self.__token_info == 'cb<nu<clos-brack':
                self.__cb_count = line[-5:-1]
            if self.__in_header:
                # in the middle of header text
                self.__in_header_func(line)
            else:
                self.__default_sep(line)
            # The empty string read at the end of the file is processed as
            # well (as it always has been) before the loop stops.
            if not line:
                break

    def __append_headers_to_body(self):
        """
        Append the contents of the header file to the main temporary file,
        between a header-beg and a header-end marker.
        """
        read_obj = open(self.__header_holder, 'r')
        try:
            write_obj = open(self.__write_to, 'a')
            try:
                write_obj.write('mi<mk<header-beg\n')
                for line in read_obj:
                    write_obj.write(line)
                write_obj.write('mi<mk<header-end\n')
            finally:
                write_obj.close()
        finally:
            read_obj.close()

    def separate_headers(self):
        """
        Separate all the headers and footers in an RTF file and put them at
        the bottom, where they are easier to process. Each time a header is
        found, print all of its contents to a temporary file. Close both
        the main and temporary file. Print the headers from the temporary
        file to the bottom of the main file.
        """
        self.__initiate_sep_values()
        self.__header_holder = tempfile.mktemp()
        read_obj = open(self.__file)
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                self.__write_to_head_obj = open(self.__header_holder, 'w')
                try:
                    self.__separate_lines(read_obj)
                finally:
                    self.__write_to_head_obj.close()
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        self.__append_headers_to_body()
        os.remove(self.__header_holder)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "header_separate.info")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

    def update_info(self, file, copy):
        """
        Unused method
        """
        self.__file = file
        self.__copy = copy

    def __get_head_body_func(self, line):
        """
        Process lines in main body and look for beginning of headers.
        """
        if self.__token_info == 'mi<mk<header-beg':
            self.__state = 'head'
        else:
            self.__write_obj.write(line)

    def __get_head_head_func(self, line):
        """
        Copy headers and footers from bottom of file to a separate,
        temporary file.
        """
        if self.__token_info == 'mi<mk<header-end':
            self.__state = 'body'
        else:
            self.__write_to_head_obj.write(line)

    def __get_headers(self):
        """
        Private method to remove the headers from the main file. Read one
        line from the main file at a time. If the state is 'body', call on
        the private __get_head_body_func. Otherwise, call on the
        __get_head_head_func. These two functions do the work of separating
        the headers from the body.
        """
        read_obj = open(self.__file)
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                self.__write_to_head_obj = open(self.__header_holder, 'w')
                try:
                    for line in read_obj:
                        self.__token_info = line[:16]
                        if self.__state == 'body':
                            self.__get_head_body_func(line)
                        elif self.__state == 'head':
                            self.__get_head_head_func(line)
                finally:
                    self.__write_to_head_obj.close()
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()

    def __get_head_from_temp(self, num):
        """
        Private method for joining headers and footers to body. This method
        reads from the temporary file until the proper header marker is
        found. It collects all the tokens until the end of the header, and
        returns them as a string.

        If the end of the file is reached first, whatever was collected so
        far is returned (an empty string when the marker was never found),
        so that the caller always gets a string to write.
        """
        look_for = 'mi<mk<header-ope<' + num + '\n'
        found_head = 0
        collected = []
        while True:
            line = self.__read_from_head_obj.readline()
            if not line:
                break
            if found_head:
                if line == 'mi<mk<header-clo\n':
                    break
                collected.append(line)
            elif line == look_for:
                found_head = 1
        return ''.join(collected)

    def __join_from_temp(self):
        """
        Private method for rejoining headers to body. Read from the
        newly-created, temporary file that contains the body text but no
        headers. Each time a header marker is found, call the private
        method __get_head_from_temp(). This method will return a string to
        print out to the third file.
        If no header marker is found, simply print out the token (line).
        All three files are closed before returning.
        """
        self.__read_from_head_obj = open(self.__header_holder, 'r')
        try:
            read_obj = open(self.__write_to, 'r')
            try:
                self.__write_obj = open(self.__write_to2, 'w')
                try:
                    # Iterate over the file itself: the text that replaces a
                    # marker may be empty, and must not end the loop.
                    for line in read_obj:
                        if line[:16] == 'mi<mk<header-ind':
                            line = self.__get_head_from_temp(line[17:-1])
                        self.__write_obj.write(line)
                finally:
                    self.__write_obj.close()
            finally:
                read_obj.close()
        finally:
            self.__read_from_head_obj.close()

    def join_headers(self):
        """
        Join the headers from the bottom of the file and put them in their
        former places. First, remove the headers from the bottom of the
        input file, outputting them to a temporary file. This creates two
        new files, one without headers, and one of just headers. Open both
        these files to read. When a marker is found in the main file, find
        the corresponding marker in the header file. Output the mix of body
        and headers to a third file, and copy that third file over the
        original.

        Nothing is done if separate_headers did not find any header.
        """
        if not self.__found_a_header:
            return
        self.__write_to2 = tempfile.mktemp()
        self.__state = 'body'
        self.__get_headers()
        self.__join_from_temp()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        # The joined text is in the third file (self.__write_to2);
        # self.__write_to only holds the body without the headers.
        if self.__copy:
            copy_obj.copy_file(self.__write_to2, "header_join.data")
        copy_obj.rename(self.__write_to2, self.__file)
        os.remove(self.__write_to)
        os.remove(self.__write_to2)
        os.remove(self.__header_holder)

View File

@ -0,0 +1,215 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import os, tempfile, re
from libprs500.ebooks.rtf2xml import copy
class HeadingsToSections:
    """
    Start a section at each paragraph whose style name is one of
    'heading 1' to 'heading 9', and close the sections again where needed.
    """
    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file' -- the file to process (it is changed in place)
            'bug_handler' -- handed on to copy.Copy
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level' -- accepted, but not used by this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = tempfile.mktemp()

    def __initiate_values(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Set the starting state, the table of state functions, and the
            lists used while reading the file. self.__end_list is the list
            of tokens that force all open sections to be closed.
            self.__headings is the list of style names that start a section.
        """
        self.__state = "default"
        self.__all_sections = []
        self.__chunk = ''
        self.__state_dict={
        'default'           : self.__default_func,
        'in_table'          : self.__in_table_func,
        'in_list'           : self.__in_list_func,
        'after_body'        : self.__after_body_func,
        }
        self.__list_depth = 0
        # 'mi<mk<par-in-fld', 'mi<mk<fldbk-end_' and 'mi<mk<sec-fd-beg'
        # were taken out of this list on 2004-04-26.
        self.__end_list = [
        'mi<mk<body-close',
        'mi<mk<sect-close', # right before close of section
        'mi<mk<sect-start', # right before section start
        ]
        self.__headings = [
        'heading 1', 'heading 2', 'heading 3', 'heading 4',
        'heading 5', 'heading 6', 'heading 7', 'heading 8',
        'heading 9'
        ]
        self.__section_num = [0]
        self.__id_regex = re.compile(r'\<list-id\>(\d+)')

    def __close_lists(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Reverse the list of dictionaries. Iterate through the list and
            get the indent for each list. If the current indent is less than
            or equal to the indent in the dictionary, close that level.
            Keep track of how many levels you close. Reduce the list by that
            many levels.
            Reverse the list again.

        NOTE(review): nothing in this class calls this method, and it uses
        attributes and methods (__left_indent, __all_lists,
        __write_end_item, __write_end_list) that this class never defines.
        It is kept unchanged; consider removing it.
        """
        current_indent = self.__left_indent
        self.__all_lists.reverse()
        num_levels_closed = 0
        for the_dict in self.__all_lists:
            list_indent = the_dict.get('left-indent')
            if current_indent <= list_indent:
                self.__write_end_item()
                self.__write_end_list()
                num_levels_closed += 1
        self.__all_lists = self.__all_lists[num_levels_closed:]
        self.__all_lists.reverse()

    def __close_sections(self, current_level):
        """
        Required:
            current_level -- the level of the heading just found (0 closes
            every open section)
        Return:
            Nothing
        Logic:
            Write the end of a section for each open section whose level is
            greater than or equal to current_level, and take that many
            levels off the end of the list of open sections.
        """
        num_levels_closed = 0
        for level in self.__all_sections:
            if current_level <= level:
                self.__write_end_section()
                num_levels_closed += 1
        if num_levels_closed:
            self.__all_sections = self.__all_sections[:-num_levels_closed]

    def __write_start_section(self, current_level, name):
        """
        Required:
            current_level -- the number of the heading (not used)
            name -- the style name of the heading, used as the type
        Return:
            Nothing
        Logic:
            Write the marker and the opening tag for a section. The tag
            carries the full section number (the counters joined with
            dots), the number within the current level, and the level
            itself (the number of open sections).
        """
        section_num = '.'.join(['%s' % the_num for the_num in self.__section_num])
        level = len(self.__all_sections)
        num_in_level = self.__section_num[level]
        self.__write_obj.write(
            'mi<mk<sect-start\n'
            )
        self.__write_obj.write (
            'mi<tg<open-att__<section<num>%s<num-in-level>%s<level>%s'
            '<type>%s\n'
                % (section_num, num_in_level, level, name)
            )

    def __write_end_section(self):
        """
        Write the marker and the tag that close a section.
        """
        self.__write_obj.write('mi<mk<sect-close\n')
        self.__write_obj.write('mi<tg<close_____<section\n')

    def __default_func(self, line):
        """
        Required:
            self, line
        Returns:
            Nothing
        Logic
            At the start of a section, advance the first counter and drop
            the counters below it. Change the state at the start of a table
            or a list. Close all open sections at any token in
            self.__end_list. If the token is a style name and the name is
            a heading, start a new section. After the close of the body,
            change the state to 'after_body'. The line is always written.
        """
        token = self.__token_info
        if token == 'mi<mk<sect-start':
            self.__section_num[0] += 1
            self.__section_num = self.__section_num[0:1]
        if token == 'mi<mk<tabl-start':
            self.__state = 'in_table'
        elif token == 'mi<mk<list_start':
            self.__state = 'in_list'
            self.__list_depth += 1
        elif token in self.__end_list:
            self.__close_sections(0)
        elif token == 'mi<mk<style-name':
            name = line[17:-1]
            if name in self.__headings:
                self.__handle_heading(name)
        if token == 'mi<mk<body-close':
            self.__state = 'after_body'
        self.__write_obj.write(line)

    def __handle_heading(self, name):
        """
        Required:
            name -- the style name of the heading
        Return:
            Nothing
        Logic:
            Close the open sections at this level or deeper, open a new
            level, bring the list of counters to the right length (adding
            a counter that starts at 1, or advancing the last one), and
            write the start of the section.
        """
        num = self.__headings.index(name) + 1
        self.__close_sections(num)
        self.__all_sections.append(num)
        level_depth = len(self.__all_sections) + 1
        self.__section_num = self.__section_num[:level_depth]
        if len(self.__section_num) < level_depth:
            self.__section_num.append(1)
        else:
            self.__section_num[-1] += 1
        self.__write_start_section(num, name)

    def __in_table_func(self, line):
        """
        Copy the line, going back to the default state at the end of the
        table.
        """
        if self.__token_info == 'mi<mk<table-end_':
            self.__state = 'default'
        self.__write_obj.write(line)

    def __in_list_func(self, line):
        """
        Copy the line, keeping track of how deep the lists are nested and
        going back to the default state when the outermost list is closed.
        """
        if self.__token_info == 'mi<mk<list_close':
            self.__list_depth -= 1
        elif self.__token_info == 'mi<mk<list_start':
            self.__list_depth += 1
        if self.__list_depth == 0:
            self.__state = 'default'
        self.__write_obj.write(line)

    def __after_body_func(self, line):
        """
        Copy the line unchanged.
        """
        self.__write_obj.write(line)

    def make_sections(self):
        """
        Required:
            nothing
        Returns:
            nothing; the original file will be changed
        Logic:
            Read the file one line at a time and pass each line to the
            function registered for the current state. The result is
            written to a temporary file, which is then copied over the
            original and removed. If self.__copy is set, a debugging copy
            called "sections_to_headings.data" is made first.
        """
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                for line in read_obj:
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
                    action(line)
            finally:
                # close the files even if a state function raises
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "sections_to_headings.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,579 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile, cStringIO
from libprs500.ebooks.rtf2xml import get_char_map, copy
from libprs500.ebooks.rtf2xml.char_set import char_set
class Hex2Utf8:
    """
    Convert Microsoft hexadecimal numbers to utf-8
    """
    def __init__(self,
            in_file,
            area_to_convert,
            char_file,
            default_char_map,
            bug_handler,
            invalid_rtf_handler,
            copy=None,
            temp_dir=None,
            symbol = None,
            wingdings = None,
            caps = None,
            convert_caps = None,
            dingbats = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file' -- the file to process (it is changed in place)
            'area_to_convert'--the area of file to convert ('preamble' or
            'body')
            'char_file'--the file containing the character mappings
            'default_char_map'--name of default character map
            'bug_handler'--raised for developer errors; also handed on to
            the helper objects
            'invalid_rtf_handler'--stored, but not used in this class
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --accepted, but not used
            'symbol'--whether to load the symbol character map
            'wingdings'--whether to load the wingdings character map
            'dingbats'--whether to load the dingbats character map
            'caps'--stored, but not used in this class
            'convert_caps'--accepted, but not used here; the conversion
            flags are set with update_values
            'run_level'--above 4, an unknown character raises bug_handler
        Returns:
            nothing
        """
        self.__file = in_file
        self.__copy = copy
        # The handlers have to be set first: the check below raises one.
        self.__bug_handler = bug_handler
        self.__invalid_rtf_handler = invalid_rtf_handler
        if area_to_convert != 'preamble' and area_to_convert != 'body':
            msg = (
                'Developer error! Wrong flag.\n'
                'in module "hex_2_utf8.py\n'
                '"area_to_convert" must be "body" or "preamble"\n'
                )
            raise self.__bug_handler(msg)
        self.__char_file = char_file
        self.__area_to_convert = area_to_convert
        self.__default_char_map = default_char_map
        self.__symbol = symbol
        self.__wingdings = wingdings
        self.__dingbats = dingbats
        self.__caps = caps
        self.__convert_caps = 0
        self.__convert_symbol = 0
        self.__convert_wingdings = 0
        self.__convert_zapf = 0
        self.__run_level = run_level
        self.__write_to = tempfile.mktemp()

    def update_values(  self,
                        file,
                        area_to_convert,
                        char_file,
                        convert_caps,
                        convert_symbol,
                        convert_wingdings,
                        convert_zapf,
                        copy=None,
                        temp_dir=None,
                        symbol = None,
                        wingdings = None,
                        caps = None,
                        dingbats = None,
                    ):
        """
        Required:
            'file' -- the file to process
            'area_to_convert'--the area of file to convert ('preamble' or
            'body')
            'char_file'--accepted, but not used here
            'convert_caps'--whether to convert text in caps to upper case
            'convert_symbol'--whether to use the symbol character map for
            text in the Symbol font
            'convert_wingdings'--the same for the Wingdings font
            'convert_zapf'--the same for the Zapf Dingbats font
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --accepted, but not used
            'symbol'--whether to load the symbol character map
            'wingdings'--whether to load the wingdings character map
            'dingbats'--whether to load the dingbats character map
            'caps'--stored, but not used in this class
        Returns:
            nothing
        """
        self.__file=file
        self.__copy = copy
        if area_to_convert != 'preamble' and area_to_convert != 'body':
            msg = (
                'in module "hex_2_utf8.py\n'
                '"area_to_convert" must be "body" or "preamble"\n'
                )
            raise self.__bug_handler(msg)
        self.__area_to_convert = area_to_convert
        self.__symbol = symbol
        self.__wingdings = wingdings
        self.__dingbats = dingbats
        self.__caps = caps
        self.__convert_caps = convert_caps
        self.__convert_symbol = convert_symbol
        self.__convert_wingdings = convert_wingdings
        self.__convert_zapf = convert_zapf

    def __initiate_values(self):
        """
        Required:
            Nothing
        Set values, including those for the dictionaries.
        The file that contains the maps is broken down into many different
        sets. For example, for the Symbol font, there is the standard part
        for hexadecimal numbers, and the part for Microsoft characters.
        Read each part in, and then combine them.
        """
        # the default encoding system, the lower map for characters 0
        # through 128, and the encoding system for Microsoft characters.
        # New on 2004-05-8: the character map is read from the char_set
        # string, not from a file in the directory with the other modules.
        self.__char_file = cStringIO.StringIO(char_set)
        char_map_obj =  get_char_map.GetCharMap(
                char_file = self.__char_file,
                bug_handler = self.__bug_handler,
                )
        up_128_dict = char_map_obj.get_char_map(map=self.__default_char_map)
        bt_128_dict = char_map_obj.get_char_map(map = 'bottom_128')
        ms_standard_dict = char_map_obj.get_char_map(map = 'ms_standard')
        self.__def_dict = {}
        self.__def_dict.update(up_128_dict)
        self.__def_dict.update(bt_128_dict)
        self.__def_dict.update(ms_standard_dict)
        self.__current_dict = self.__def_dict
        self.__current_dict_name = 'default'
        self.__in_caps = 0
        self.__special_fonts_found = 0
        if self.__symbol:
            symbol_base_dict = char_map_obj.get_char_map(map = 'SYMBOL')
            ms_symbol_dict = char_map_obj.get_char_map(map = 'ms_symbol')
            self.__symbol_dict = {}
            self.__symbol_dict.update(symbol_base_dict)
            self.__symbol_dict.update(ms_symbol_dict)
        if self.__wingdings:
            wingdings_base_dict = char_map_obj.get_char_map(map = 'wingdings')
            ms_wingdings_dict = char_map_obj.get_char_map(map = 'ms_wingdings')
            self.__wingdings_dict = {}
            self.__wingdings_dict.update(wingdings_base_dict)
            self.__wingdings_dict.update(ms_wingdings_dict)
        if self.__dingbats:
            dingbats_base_dict = char_map_obj.get_char_map(map = 'dingbats')
            ms_dingbats_dict = char_map_obj.get_char_map(map = 'ms_dingbats')
            self.__dingbats_dict = {}
            self.__dingbats_dict.update(dingbats_base_dict)
            self.__dingbats_dict.update(ms_dingbats_dict)
        # load dictionary for caps
        self.__caps_uni_dict = char_map_obj.get_char_map(map='caps_uni')
        # This dictionary holds the functions for the two states as well
        # as the functions for the tokens handled in the preamble.
        self.__preamble_state_dict = {
            'preamble'          :       self.__preamble_func,
            'body'              :       self.__body_func,
            'mi<mk<body-open_'  :       self.__found_body_func,
            'tx<hx<__________'  :       self.__hex_text_func,
            }
        self.__body_state_dict = {
            'preamble'          :       self.__preamble_for_body_func,
            'body'              :       self.__body_for_body_func,
            }
        self.__in_body_dict = {
            'mi<mk<body-open_'  :       self.__found_body_func,
            'tx<ut<__________'  :       self.__utf_to_caps_func,
            'tx<hx<__________'  :       self.__hex_text_func,
            'tx<mc<__________'  :       self.__hex_text_func,
            'tx<nu<__________'  :       self.__text_func,
            'mi<mk<font______'  :       self.__start_font_func,
            'mi<mk<caps______'  :       self.__start_caps_func,
            'mi<mk<font-end__'  :       self.__end_font_func,
            'mi<mk<caps-end__'  :       self.__end_caps_func,
        }
        self.__caps_list = ['false']
        self.__font_list = ['not-defined']

    def __caps_apply(self):
        """
        Return a true value if caps are to be converted, the innermost caps
        value is 'true', and the current dictionary is not one of the
        special fonts.
        """
        return (
            self.__convert_caps
            and self.__caps_list[-1] == 'true'
            and self.__current_dict_name not in (
                'Symbol', 'Wingdings', 'Zapf Dingbats')
            )

    def __hex_text_func(self, line):
        """
        Required:
            'line' -- the line
        Logic:
            get the hex_num and look it up in the current dictionary. If
            the token is in the dictionary, then check if the value starts
            with a "&". If it does, then tag the result as utf text.
            Otherwise, tag it as normal text.
            If the hex_num is not in the dictionary, then a mistake has
            been made. Numbers of 10 or less are dropped; for anything
            else an udef_symbol tag is written, and if the run level is
            above 4 the bug handler is raised.
        """
        hex_num = line[17:-1]
        converted = self.__current_dict.get(hex_num)
        if converted is not None:
            # tag as utf-8
            if converted[0:1] == "&":
                if self.__caps_apply():
                    converted = self.__utf_token_to_caps_func(converted)
                self.__write_obj.write(
                'tx<ut<__________<%s\n' % converted
                )
            # tag as normal text
            else:
                if self.__caps_apply():
                    converted = converted.upper()
                self.__write_obj.write(
                'tx<nu<__________<%s\n' % converted
                )
        # error
        else:
            token = hex_num.replace("'", '')
            the_num = 0
            if token:
                the_num = int(token, 16)
            if the_num > 10:
                self.__write_obj.write('mi<tg<empty-att_<udef_symbol<num>%s<description>not-in-table\n' %
                    hex_num)
                if self.__run_level > 4:
                    msg = 'Character "&#x%s;" does not appear to be valid (or is a control character)\n' % token
                    raise self.__bug_handler(msg)

    def __found_body_func(self, line):
        """
        The body starts here: change the state and write the line.
        """
        self.__state = 'body'
        self.__write_obj.write(line)

    def __body_func(self, line):
        """
        When parsing the preamble, lines in the body are copied unchanged.
        """
        self.__write_obj.write(line)

    def __preamble_func(self, line):
        """
        When parsing the preamble, call the function for the token if
        there is one. Otherwise copy the line.
        """
        action = self.__preamble_state_dict.get(self.__token_info)
        if action is not None:
            action(line)
        else:
            self.__write_obj.write(line)

    def __run_conversion(self, start_state, state_dict, debug_name):
        """
        Required:
            start_state -- the state to start in
            state_dict -- the dictionary of functions, one for each state
            debug_name -- the name for the debugging copy of the result
        Returns:
            nothing; the original file will be changed
        Logic:
            Read the file one line at a time and pass each line to the
            function for the current state. The result is written to a
            temporary file, which is then copied over the original and
            removed.
        """
        self.__state = start_state
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                for line in read_obj:
                    self.__token_info = line[:16]
                    action = state_dict.get(self.__state)
                    if action is None:
                        # should not happen; report it and keep the line
                        sys.stderr.write(
                            'error no state found in hex_2_utf8: %s\n'
                            % self.__state
                            )
                        self.__write_obj.write(line)
                    else:
                        action(line)
            finally:
                # close the files even if a function raises
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, debug_name)
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

    def __convert_preamble(self):
        """
        Convert the hexadecimal numbers in the preamble.
        """
        self.__run_conversion(
            'preamble', self.__preamble_state_dict,
            "preamble_utf_convert.data")

    def __preamble_for_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Used when parsing the body. Copy the line. At the start of the
            body, __found_body_func changes the state and writes the line,
            so the line must not be written a second time here.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__found_body_func(line)
        else:
            self.__write_obj.write(line)

    def __body_for_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Used when parsing the body. Call the function for the token if
            there is one. Otherwise copy the line.
        """
        action = self.__in_body_dict.get(self.__token_info)
        if action is not None:
            action(line)
        else:
            self.__write_obj.write(line)

    def __use_dict_for_font(self, face):
        """
        Required:
            face -- the name of a font
        Returns:
            nothing
        Logic:
            Make the dictionary of a special font the current one, if the
            face is that font and its conversion is switched on. Otherwise
            go back to the default dictionary.
        """
        if face == 'Symbol' and self.__convert_symbol:
            self.__current_dict_name = 'Symbol'
            self.__current_dict = self.__symbol_dict
        elif face == 'Wingdings' and self.__convert_wingdings:
            self.__current_dict_name = 'Wingdings'
            self.__current_dict = self.__wingdings_dict
        elif face == 'Zapf Dingbats' and self.__convert_zapf:
            self.__current_dict_name = 'Zapf Dingbats'
            self.__current_dict = self.__dingbats_dict
        else:
            self.__current_dict_name = 'default'
            self.__current_dict = self.__def_dict

    def __start_font_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            add font face to font_list, and use the dictionary for it.
            The line itself is not written.
        """
        face = line[17:-1]
        self.__font_list.append(face)
        self.__use_dict_for_font(face)

    def __end_font_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            pop font_list, and use the dictionary for the font that is now
            last in the list. The line itself is not written.
        """
        if len(self.__font_list) > 1:
            self.__font_list.pop()
        else:
            sys.stderr.write('module is hex_2_utf8\n')
            sys.stderr.write('method is end_font_func\n')
            sys.stderr.write('self.__font_list should be greater than one?\n')
        self.__use_dict_for_font(self.__font_list[-1])

    def __start_special_font_func_old(self, line):
        """
        Required:
            line -- line
        Returns;
            nothing
        Logic:
            change the dictionary to use in conversion

        NOTE(review): old method, not in any of the dictionaries of
        functions. It calls append on self.__current_dict, which
        __initiate_values sets to a dictionary. Kept unchanged.
        """
        if self.__token_info == 'mi<mk<font-symbo':
            self.__current_dict.append(self.__symbol_dict)
            self.__special_fonts_found += 1
            self.__current_dict_name = 'Symbol'
        elif self.__token_info == 'mi<mk<font-wingd':
            self.__special_fonts_found += 1
            self.__current_dict.append(self.__wingdings_dict)
            self.__current_dict_name = 'Wingdings'
        elif self.__token_info == 'mi<mk<font-dingb':
            self.__current_dict.append(self.__dingbats_dict)
            self.__special_fonts_found += 1
            self.__current_dict_name = 'Zapf Dingbats'

    def __end_special_font_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            pop the last dictionary, which should be a special font

        NOTE(review): not in any of the dictionaries of functions; goes
        with __start_special_font_func_old. Kept unchanged.
        """
        if len(self.__current_dict) < 2:
            sys.stderr.write('module is hex_2_utf 8\n')
            sys.stderr.write('method is __end_special_font_func\n')
            sys.stderr.write('less than two dictionaries --can\'t pop\n')
            self.__special_fonts_found -= 1
        else:
            self.__current_dict.pop()
            self.__special_fonts_found -= 1
            self.__dict_name = 'default'

    def __start_caps_func_old(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            A marker that marks the start of caps has been found. Set
            self.__in_caps to 1

        NOTE(review): old method, not in any of the dictionaries of
        functions. Kept unchanged.
        """
        self.__in_caps = 1

    def __start_caps_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            A marker that marks the start of caps has been found. Set
            self.__in_caps to 1, and add the value on the line to the list
            of caps values. The line itself is not written.
        """
        self.__in_caps = 1
        value = line[17:-1]
        self.__caps_list.append(value)

    def __end_caps_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            A marker that marks the end of caps has been found. Pop the
            list of caps values. The line itself is not written.
        """
        if len(self.__caps_list) > 1:
            self.__caps_list.pop()
        else:
            sys.stderr.write('Module is hex_2_utf8\n')
            sys.stderr.write('method is __end_caps_func\n')
            sys.stderr.write('caps list should be more than one?\n')

    def __text_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            If the current dictionary is for a special font, look up each
            letter by its hexadecimal number and write out the converted
            letters. Otherwise, if in caps, convert to upper case. Then
            print out.
        """
        text = line[17:-1]
        if self.__current_dict_name in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
            pieces = []
            for letter in text:
                # the keys look like '41: a quote and the number in upper
                # case hexadecimal
                hex_num = "'%X" % ord(letter)
                converted = self.__current_dict.get(hex_num)
                if converted is None:
                    sys.stderr.write('module is hex_2_ut8\n')
                    sys.stderr.write('method is __text_func\n')
                    sys.stderr.write('no hex value for "%s"\n' % hex_num)
                else:
                    pieces.append(converted)
            self.__write_obj.write('tx<nu<__________<%s\n' % ''.join(pieces))
        else:
            if self.__caps_apply():
                text = text.upper()
            self.__write_obj.write('tx<nu<__________<%s\n' % text)

    def __utf_to_caps_func(self, line):
        """
        Required:
            line -- line to parse
        returns
            nothing
        Logic
            Get the text, and use another method to convert
        """
        utf_text = line[17:-1]
        if self.__caps_list[-1] == 'true' and self.__convert_caps:
            utf_text = self.__utf_token_to_caps_func(utf_text)
        self.__write_obj.write('tx<ut<__________<%s\n' % utf_text)

    def __utf_token_to_caps_func(self, char_entity):
        """
        Required:
            char_entity -- such as &#xXXXX;
        Returns:
            token converted to the capital equivalent
        Logic:
            RTF often stores text in the improper values. For example, a
            capital letter with an umlaut may be stored as the small
            letter. This function swaps the case by looking up the value
            in a dictionary. The number is first padded with zeros on the
            left, since the keys of the dictionary have four digits.
        """
        hex_num = char_entity[3:]
        length = len(hex_num)
        if length == 3:
            hex_num = '00%s' % hex_num
        elif length == 4:
            hex_num = '0%s' % hex_num
        new_char_entity = '&#x%s' % hex_num
        converted = self.__caps_uni_dict.get(new_char_entity)
        if not converted:
            # bullets and other entities don't have capital equivalents
            return char_entity
        else:
            return converted

    def __convert_body(self):
        """
        Convert the hexadecimal numbers in the body.
        """
        self.__run_conversion(
            'body', self.__body_state_dict, "body_utf_convert.data")

    def convert_hex_2_utf8(self):
        """
        Load the character maps, and convert either the preamble or the
        body of the file, depending on the area given to __init__ or
        update_values.
        """
        self.__initiate_values()
        if self.__area_to_convert == 'preamble':
            self.__convert_preamble()
        else:
            self.__convert_body()
"""
how to swap case for non-capitals
my_string.swapcase()
An example of how to use a hash for the caps function
(but I shouldn't need this, since utf text is separate
from regular text?)
sub_dict = {
"&#x0430;" : "some other value"
}
def my_sub_func(matchobj):
info = matchobj.group(0)
value = sub_dict.get(info)
return value
return "f"
line = "&#x0430; more text"
reg_exp = re.compile(r'(?P<name>&#x0430;|&#x0431;)')
line2 = re.sub(reg_exp, my_sub_func, line)
print line2
"""

View File

@ -0,0 +1,255 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile
from libprs500.ebooks.rtf2xml import copy
class Info:
    """
    Make tags for document-information
    """
    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file'--file to parse; fix_info() replaces its contents
            'bug_handler'--exception raised (with a message) when
                run_level is above 3 and an unknown token is met
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--strictness level; see 'bug_handler'
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        # NOTE(review): tempfile.mktemp() only reserves a name and is
        # open to a race; consider tempfile.mkstemp().
        self.__write_to = tempfile.mktemp()

    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__text_string = ''
        self.__state = 'before_info_table'
        self.__state_dict = {
            'before_info_table': self.__before_info_table_func,
            'after_info_table': self.__after_info_table_func,
            'in_info_table'    : self.__in_info_table_func,
            'collect_text'      : self.__collect_text_func,
            'collect_tokens'      : self.__collect_tokens_func,
        }
        # token => (handler, name of the tag to write)
        self.__info_table_dict = {
            'cw<di<title_____'  : (self.__found_tag_with_text_func, 'title'),
            'cw<di<author____'  : (self.__found_tag_with_text_func, 'author'),
            'cw<di<keywords__'  : (self.__found_tag_with_text_func, 'keywords'),
            'cw<di<doc-notes_'  : (self.__found_tag_with_text_func, 'doc-notes'),
            'cw<di<subject___'  : (self.__found_tag_with_text_func, 'subject'),
            'cw<di<operator__'  : (self.__found_tag_with_text_func, 'operator'),
            'cw<di<create-tim'  : (self.__found_tag_with_tokens_func, 'creation-time'),
            'cw<di<revis-time'  : (self.__found_tag_with_tokens_func, 'revision-time'),
            'cw<di<edit-time_'  : (self.__single_field_func, 'editing-time'),
            'cw<di<num-of-wor'  : (self.__single_field_func, 'number-of-words'),
            'cw<di<num-of-chr'  : (self.__single_field_func, 'number-of-characters'),
            'cw<di<num-of-pag'  : (self.__single_field_func, 'number-of-pages'),
        }
        # abbreviated token name => readable attribute name
        self.__token_dict = {
            'year______'        : 'year',
            'month_____'        : 'month',
            'day_______'        : 'day',
            'minute____'        : 'minute',
            'revis-time'        : 'revision-time',
            'num-of-wor'        : 'number-of-words',
            'num-of-chr'        : 'number-of-characters',
            'num-of-pag'        : 'number-of-pages',
        }

    def __before_info_table_func(self, line):
        """
        Required:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            Check for the beginning of the information table. When found,
            set the state to the information table. Always write the line.
        """
        if self.__token_info == 'mi<mk<doc-in-beg':
            self.__state = 'in_info_table'
        self.__write_obj.write(line)

    def __in_info_table_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing.
        Logic:
            Check for the end of information. If not found, check if the
            token has a special value in the info table dictionary. If it
            does, execute that function.
            Otherwise, output the line to the file.
            The end marker itself is not written.
        """
        if self.__token_info == 'mi<mk<doc-in-end':
            self.__state = 'after_info_table'
            return
        action, tag = self.__info_table_dict.get(
            self.__token_info, (None, None))
        if action:
            action(line, tag)
        else:
            self.__write_obj.write(line)

    def __found_tag_with_text_func(self, line, tag):
        """
        Requires:
            line -- line to parse
            tag --what kind of line
        Returns:
            nothing
        Logic:
            This function marks the beginning of information fields that
            have text that must be collected. Set the type of information
            field with the tag option. Set the state to collecting text.
        """
        self.__tag = tag
        self.__state = 'collect_text'

    def __collect_text_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            If the end of the information field is found, write the text
            string to the file, wrapped in an open and a close tag.
            Otherwise, if the line contains text, add it to the text
            string. Any other line is dropped.
        """
        if self.__token_info == 'mi<mk<docinf-end':
            self.__state = 'in_info_table'
            self.__write_obj.write(
                'mi<tg<open______<%s\n'
                'tx<nu<__________<%s\n'
                'mi<tg<close_____<%s\n' % (self.__tag, self.__text_string, self.__tag)
            )
            self.__text_string = ''
        elif line[0:2] == 'tx':
            # text starts after the 16-character token and its '<';
            # the trailing newline is cut off
            self.__text_string += line[17:-1]

    def __found_tag_with_tokens_func(self, line, tag):
        """
        Requires:
            line -- line to parse
            tag -- type of field
        Returns:
            nothing
        Logic:
            Some fields have a series of tokens (cw<di<year______<nu<2003)
            that must be parsed as attributes for the element.
            Set the state to collect tokens, and set the text string to
            start an empty element with attributes.
        """
        self.__state = 'collect_tokens'
        #mi<tg<empty-att_<page-definition<margin>33\n
        self.__text_string = 'mi<tg<empty-att_<%s' % tag

    def __collect_tokens_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Collect the token information and add it to the text string
            until the end of the field is found, then write the text
            string to the file.
            The token names are abbreviated (num-of-wor), so each one is
            looked up in a dictionary to get the readable form
            (num-of-wor => number-of-words). A name without an entry is
            skipped, or raises the bug handler when the run level is
            above 3.
        """
        #cw<di<year______<nu<2003
        if self.__token_info == 'mi<mk<docinf-end':
            self.__state = 'in_info_table'
            self.__write_obj.write(
                '%s\n' % self.__text_string
            )
            self.__text_string = ''
            return
        att = line[6:16]
        value = line[20:-1]
        att_changed = self.__token_dict.get(att)
        if att_changed is None:
            if self.__run_level > 3:
                msg = 'no dictionary match for %s\n' % att
                # call form of raise: same meaning, valid in Python 2 and 3
                raise self.__bug_handler(msg)
        else:
            self.__text_string += '<%s>%s' % (att_changed, value)

    def __single_field_func(self, line, tag):
        """
        Requires:
            line -- line to parse
            tag -- name of the element and of its single attribute
        Returns:
            nothing
        Logic:
            Write an empty element whose one attribute, named like the
            element, holds the value found on the line.
        """
        value = line[20:-1]
        self.__write_obj.write(
            'mi<tg<empty-att_<%s'
            '<%s>%s\n' % (tag, tag, value)
        )

    def __after_info_table_func(self, line):
        """
        Requires:
            line --line to write to file
        Returns:
            nothing
        Logic:
            After the end of the information table, simply write the line
            to the file.
        """
        self.__write_obj.write(line)

    def fix_info(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take
            based on the state. Before the information table, look for
            its beginning. Inside the table, use the other methods to
            turn the information into tags. After the table, simply
            write the line to the output file.
        """
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                while True:
                    line = read_obj.readline()
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        # report the unknown state and skip the line
                        # rather than calling None
                        sys.stderr.write('no matching state in module info.py\n')
                        sys.stderr.write(self.__state + '\n')
                    else:
                        action(line)
                    if not line:
                        # as before, the handler also sees the empty
                        # string returned at end of file
                        break
            finally:
                self.__write_obj.close()
        finally:
            # close both files even when a handler raises
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "info.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,411 @@
import sys, os, tempfile
from libprs500.ebooks.rtf2xml import copy
"""
States.
1. default
1. an open bracket ends this state.
2. Text print out text. Print out any groups_in_waiting.
3. closed bracket. Close groups
2. after an open bracket
1. The lack of a control word ends this state.
2. paragraph end -- close out all tags
3. footnote beg -- close out all tags
"""
class Inline:
    """
    Make inline tags within lists.
    Logic:
        Each open bracket starts a group. Character-formatting control
        words right after the bracket are collected for that group. The
        group's inline tag is only written once text is found, so groups
        without text produce no tags. Groups whose tag has not been
        written yet are the "groups in waiting".
    """
    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level = 1,):
        """
        Required:
            'in_file'--file to parse; form_tags() replaces its contents
            'bug_handler'--exception raised (with a message) when
                run_level is above 3 and an inconsistency is found
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--strictness level; see 'bug_handler'
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        # NOTE(review): tempfile.mktemp() only reserves a name and is
        # open to a race; consider tempfile.mkstemp().
        self.__write_to = tempfile.mktemp()

    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__state_dict = {
            'default': self.__default_func,
            'after_open_bracket': self.__after_open_bracket_func,
        }
        self.__default_dict = {
            'ob<nu<open-brack': self.__found_open_bracket_func,
            'tx<nu<__________'  : self.__found_text_func,
            'tx<hx<__________'  : self.__found_text_func,
            'tx<ut<__________'  : self.__found_text_func,
            'mi<mk<inline-fld'  : self.__found_text_func,
            'text'              : self.__found_text_func,
            'cb<nu<clos-brack'  : self.__close_bracket_func,
            'mi<mk<par-end___'  : self.__end_para_func,
            'mi<mk<footnt-ope'  : self.__end_para_func,
            'mi<mk<footnt-ind'  : self.__end_para_func,
        }
        self.__after_open_bracket_dict = {
            'cb<nu<clos-brack'  : self.__close_bracket_func,
            'tx<nu<__________'  : self.__found_text_func,
            'tx<hx<__________'  : self.__found_text_func,
            'tx<ut<__________'  : self.__found_text_func,
            'text'              : self.__found_text_func,
            'mi<mk<inline-fld'  : self.__found_text_func,
            'ob<nu<open-brack': self.__found_open_bracket_func,
            'mi<mk<par-end___'  : self.__end_para_func,
            'mi<mk<footnt-ope'  : self.__end_para_func,
            'mi<mk<footnt-ind'  : self.__end_para_func,
            # never reached: lines starting with 'cw' are taken by
            # __handle_control_word before this table is consulted
            'cw<fd<field_____'  : self.__found_field_func,
        }
        self.__state = 'default'
        self.__brac_count = 0 # do I need this?
        # one stack of group dictionaries and one waiting counter for
        # list text, another pair for everything else
        self.__list_inline_list = []
        self.__body_inline_list = []
        self.__groups_in_waiting_list = [0]
        self.__groups_in_waiting_body = [0]
        self.__groups_in_waiting = self.__groups_in_waiting_body
        self.__place = 'non_list'
        self.__inline_list = self.__body_inline_list
        self.__in_para = 0 # not in paragraph
        self.__char_dict = {
            # character info => ci
            'annotation'    :   'annotation',
            'blue______'    :   'blue',
            'bold______'    :   'bold',
            'caps______'    :   'caps',
            'char-style'    :   'character-style',
            'dbl-strike'    :   'double-strike-through',
            'emboss____'    :   'emboss',
            'engrave___'    :   'engrave',
            'font-color'    :   'font-color',
            'font-down_'    :   'subscript',
            'font-size_'    :   'font-size',
            'font-style'    :   'font-style',
            'font-up___'    :   'superscript',
            'footnot-mk'    :   'footnote-marker',
            'green_____'    :   'green',
            'hidden____'    :   'hidden',
            'italics___'    :   'italics',
            'outline___'    :   'outline',
            'red_______'    :   'red',
            'shadow____'    :   'shadow',
            'small-caps'    :   'small-caps',
            'strike-thr'    :   'strike-through',
            'subscript_'    :   'subscript',
            'superscrip'    :   'superscript',
            'underlined'    :   'underlined',
        }
        self.__caps_list = ['false']

    def __set_list_func(self, line):
        """
        Requires:
            line--line of text
        Returns:
            nothing
        Logic:
            Switch between the list stack/counter and the body
            stack/counter when the markers for the beginning and the end
            of list text are found.
        """
        if self.__place == 'in_list':
            if self.__token_info == 'mi<mk<lst-tx-end':
                # same spelling as the initial value; the place is only
                # ever compared with 'in_list'
                self.__place = 'non_list'
                self.__inline_list = self.__body_inline_list
                self.__groups_in_waiting = self.__groups_in_waiting_body
        else:
            if self.__token_info == 'mi<mk<lst-tx-beg':
                self.__place = 'in_list'
                self.__inline_list = self.__list_inline_list
                self.__groups_in_waiting = self.__groups_in_waiting_list

    def __default_func(self, line):
        """
        Requires:
            line-- line of text
        Returns:
            nothing
        Logic:
            Run the handler registered for the token, if any. Always
            write the line afterwards.
        """
        action = self.__default_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)

    def __found_open_bracket_func(self, line):
        """
        Requires:
            line -- current line of text
        Returns:
            nothing
        Logic:
            Change the state to 'after_open_bracket'. Push an empty group
            on the stack and count it as waiting.
        """
        self.__state = 'after_open_bracket'
        self.__brac_count += 1
        self.__groups_in_waiting[0] += 1
        self.__inline_list.append({'contains_inline': 0})

    def __after_open_bracket_func(self, line):
        """
        Requires:
            line --line of text
        Returns:
            nothing
        Logic:
            If the token is a control word, use another method to add it
            to the group dictionary. Otherwise use the dictionary to get
            the appropriate function; finding one ends this state.
            Always print out the line.
        """
        if line[0:2] == 'cw':
            self.__handle_control_word(line)
        else:
            action = self.__after_open_bracket_dict.get(self.__token_info)
            if action:
                # set the state first: the action may change it again
                # (another open bracket)
                self.__state = 'default' # a non control word?
                action(line)
        self.__write_obj.write(line)

    def __handle_control_word(self, line):
        """
        Required:
            line --line of text
        Returns:
            nothing
        Logic:
            Handle the control word for inline groups.
            If the word is a known kind of character info, add its name
            and value to the dictionary of the innermost group and mark
            that group as containing inline info.
        """
        # cw<ci<shadow_____<nu<true
        char_info = line[6:16]
        char_value = line[20:-1]
        name = self.__char_dict.get(char_info)
        if name:
            self.__inline_list[-1]['contains_inline'] = 1
            self.__inline_list[-1][name] = char_value
        # Markers for the Symbol, Wingdings and Zapf Dingbats fonts used
        # to be written here; that code was already disabled.

    def __write_open_tags(self, the_dict):
        """
        Requires:
            the_dict -- dictionary of one group that contains inline info
        Returns:
            nothing
        Logic:
            Write the font and caps markers if the group has them, then
            the open tag with every attribute of the group.
        """
        the_keys = the_dict.keys()
        if 'font-style' in the_keys:
            face = the_dict['font-style']
            self.__write_obj.write('mi<mk<font______<%s\n' % face)
        if 'caps' in the_keys:
            value = the_dict['caps']
            self.__write_obj.write('mi<mk<caps______<%s\n' % value)
        self.__write_obj.write('mi<tg<open-att__<inline')
        for the_key in the_keys:
            if the_key != 'contains_inline':
                self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
        self.__write_obj.write('\n')

    def __close_bracket_func(self, line):
        """
        Requires:
            line --line of text
        Returns:
            Nothing
        Logic:
            If there are no inline groups, do nothing.
            Take the last dictionary in the inline groups. If it contains
            inline info and its tag was written (no groups are waiting),
            write a close tag, followed by the end markers for font and
            caps if the group has them. Outside of a list this is only
            done inside a paragraph.
            Remove the group and lower the number of waiting groups.
        """
        if len(self.__inline_list) == 0:
            # nothing to add
            return
        the_dict = self.__inline_list[-1]
        the_keys = the_dict.keys()
        tag_is_open = 'contains_inline' in the_keys \
            and the_dict['contains_inline'] == 1 \
            and self.__groups_in_waiting[0] == 0
        if self.__place != 'in_list':
            # close out only if in a paragraph
            tag_is_open = tag_is_open and self.__in_para
        if tag_is_open:
            self.__write_obj.write('mi<tg<close_____<inline\n')
            if 'font-style' in the_keys:
                self.__write_obj.write('mi<mk<font-end__\n')
            if 'caps' in the_keys:
                self.__write_obj.write('mi<mk<caps-end__\n')
        self.__inline_list.pop()
        if self.__groups_in_waiting[0] != 0:
            self.__groups_in_waiting[0] -= 1

    def __found_text_func(self, line):
        """
        Required:
            line--line of text
        Return:
            nothing
        Logic:
            Two cases:
            1. in a list. Simply write inline
            2. Not in a list
                Text can mark the start of a paragraph.
                If already in a paragraph, check to see if any groups are
                waiting to be added. If so, use another method to write
                these groups.
        """
        if self.__place == 'in_list':
            self.__write_inline()
        elif not self.__in_para:
            self.__in_para = 1
            self.__start_para_func(line)
        elif self.__groups_in_waiting[0] != 0:
            self.__write_inline()

    def __write_inline(self):
        """
        Required:
            nothing
        Returns
            Nothing
        Logic:
            Method for writing inline when text is found.
            Only write those groups that are "waiting", or that have no
            tags yet: slice the end of the stack to get just those
            groups, and write markers and an open tag for each one that
            contains inline info. (The font marker is used later when
            converting hex text to utf8.)
            Afterwards no group is waiting.
        """
        waiting = self.__groups_in_waiting[0]
        if waiting != 0:
            inline_list = self.__inline_list[-1 * waiting:]
            if len(inline_list) <= 0:
                if self.__run_level > 3:
                    msg = 'self.__inline_list is %s\n' % self.__inline_list
                    # call form of raise: valid in Python 2 and 3
                    raise self.__bug_handler(msg)
                self.__write_obj.write('error\n')
                self.__groups_in_waiting[0] = 0
                return
            for the_dict in inline_list:
                if the_dict['contains_inline']:
                    self.__write_open_tags(the_dict)
        self.__groups_in_waiting[0] = 0

    def __end_para_func(self, line):
        """
        Requires:
            line -- line of text
        Returns:
            nothing
        Logic:
            Do nothing outside of a paragraph. Otherwise take the groups
            that are not waiting (their tags are open), and for each one
            that contains inline info write the end markers and a closing
            tag. The groups stay on the stack. Leave the paragraph.
        """
        if not self.__in_para:
            return
        if self.__groups_in_waiting[0] == 0:
            inline_list = self.__inline_list
        else:
            last_index = -1 * self.__groups_in_waiting[0]
            inline_list = self.__inline_list[0:last_index]
        for the_dict in inline_list:
            if the_dict.get('contains_inline'):
                the_keys = the_dict.keys()
                if 'font-style' in the_keys:
                    self.__write_obj.write('mi<mk<font-end__\n')
                if 'caps' in the_keys:
                    self.__write_obj.write('mi<mk<caps-end__\n')
                self.__write_obj.write('mi<tg<close_____<inline\n')
        self.__in_para = 0

    def __start_para_func(self, line):
        """
        Requires:
            line -- line of text
        Returns:
            nothing
        Logic:
            Iterate through the self.__inline_list to get each dict.
            If the dict contains inline info, write its markers and its
            open tag with all attributes. Afterwards no group is waiting.
        """
        for the_dict in self.__inline_list:
            if the_dict.get('contains_inline'):
                self.__write_open_tags(the_dict)
        self.__groups_in_waiting[0] = 0

    def __found_field_func(self, line):
        """
        Just a default function to make sure I don't prematurely exit
        default state
        """
        pass

    def form_tags(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take
            based on the state. A few special-character tokens are
            treated like ordinary text.
        """
        text_tokens = (
            'tx<mc<__________<rdblquote',
            'tx<mc<__________<ldblquote',
            'tx<mc<__________<lquote',
            'tx<mc<__________<rquote',
            'tx<mc<__________<emdash',
            'tx<mc<__________<endash',
            'tx<mc<__________<bullet',
        )
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                while True:
                    line = read_obj.readline()
                    if line[0:-1] in text_tokens:
                        self.__token_info = 'text'
                    else:
                        self.__token_info = line[:16]
                    self.__set_list_func(line)
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        # report the unknown state and skip the line
                        # rather than calling None
                        sys.stderr.write('No matching state in module inline_for_lists.py\n')
                        sys.stderr.write(self.__state + '\n')
                    else:
                        action(line)
                    if not line:
                        # as before, the handler also sees the empty
                        # string returned at end of file
                        break
            finally:
                self.__write_obj.close()
        finally:
            # close both files even when a handler raises
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "inline.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,67 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import os, tempfile, re
from libprs500.ebooks.rtf2xml import copy
class FixLineEndings:
    """Fix line endings"""
    # Characters removed when replace_illegals is set:
    # 0x00-0x08, 0x0B and 0x0E-0x13.
    __illegal_regx = re.compile('[\x00-\x08\x0B\x0E-\x13]')

    def __init__(self,
            bug_handler,
            in_file = None,
            copy = None,
            run_level = 1,
            replace_illegals = 1,
            ):
        """
        Required:
            'bug_handler'--passed on to the copy helper
        Optional:
            'in_file'--file to fix; fix_endings() replaces its contents
            'copy'-- whether to make a copy of result for debugging
            'run_level'--stored, not used by this class
            'replace_illegals'--whether to remove control characters
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        # NOTE(review): tempfile.mktemp() only reserves a name and is
        # open to a race; consider tempfile.mkstemp().
        self.__write_to = tempfile.mktemp()
        self.__replace_illegals = replace_illegals

    def __clean_chunk(self, chunk):
        """
        Requires:
            chunk -- a piece of text read from the file
        Returns:
            the chunk with every carriage return turned into a newline
            and, if requested, the illegal control characters removed
        """
        chunk = chunk.replace('\r', '\n')
        if self.__replace_illegals:
            chunk = self.__illegal_regx.sub('', chunk)
        return chunk

    def fix_endings(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Always rewrite the file, since illegal characters have to be
            removed even when the line endings are fine. Read the file in
            pieces of 1000 characters, clean each piece and write it to a
            temporary file, which then replaces the original.
        """
        read_obj = open(self.__file, 'r')
        try:
            write_obj = open(self.__write_to, 'w')
            try:
                while True:
                    chunk = read_obj.read(1000)
                    if not chunk:
                        break
                    write_obj.write(self.__clean_chunk(chunk))
            finally:
                write_obj.close()
        finally:
            # close both files even when reading or writing fails
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "line_endings.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,193 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import os, tempfile
from libprs500.ebooks.rtf2xml import copy
class ListNumbers:
    """
    RTF puts list numbers outside of the paragraph. The public method
    in this class put the list numbers inside the paragraphs.
    """
    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file'--file to parse; fix_list_numbers() replaces its
                contents
            'bug_handler'--passed on to the copy helper
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--accepted, not used by this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        # NOTE(review): tempfile.mktemp() only reserves a name and is
        # open to a race; consider tempfile.mkstemp().
        self.__write_to = tempfile.mktemp()

    def __initiate_values(self):
        """
        initiate values for fix_list_numbers.
        Required:
            Nothing
        Return:
            Nothing
        """
        self.__state = "default"
        self.__list_chunk = ''
        self.__previous_line = ''
        # number of the bracket that opened the list text; this is the
        # attribute the handlers actually read
        self.__list_text_ob = ''
        # numbers of the last open and close bracket seen
        self.__ob_count = ''
        self.__cb_count = ''
        self.__state_dict={
            'default'           : self.__default_func,
            'after_ob'          : self.__after_ob_func,
            'list_text'         : self.__list_text_func,
            'after_list_text'   : self.__after_list_text_func
        }

    def __after_ob_func(self, line):
        """
        Handle the line immediately after an open bracket. If it starts
        list text, begin collecting the chunk with the bracket and this
        line. Otherwise write both lines and go back to the default
        state.
        Required:
            self, line
        Returns:
            Nothing
        """
        if self.__token_info == 'cw<ls<list-text_':
            self.__state = 'list_text'
            self.__list_chunk = self.__list_chunk + \
                self.__previous_line + line
            self.__list_text_ob = self.__ob_count
            self.__cb_count = 0
        else:
            self.__write_obj.write(self.__previous_line)
            self.__write_obj.write(line)
            self.__state = 'default'

    def __after_list_text_func(self, line):
        """
        Look for an open bracket or a line of text, and then print out
        the self.__list_chunk, wrapped in markers and a list-text tag.
        Print out the line.
        """
        if line[0:2] == 'ob' or line[0:2] == 'tx':
            self.__state = 'default'
            self.__write_obj.write('mi<mk<lst-txbeg_\n')
            self.__write_obj.write('mi<mk<para-beg__\n')
            self.__write_obj.write('mi<mk<lst-tx-beg\n')
            self.__write_obj.write('mi<tg<open-att__<list-text\n')
            self.__write_obj.write(self.__list_chunk)
            self.__write_obj.write('mi<tg<close_____<list-text\n')
            self.__write_obj.write('mi<mk<lst-tx-end\n')
            self.__list_chunk = ''
        self.__write_obj.write(line)

    def __determine_list_type(self, chunk):
        """
        Determine if the list is ordered or itemized
        Required:
            chunk -- the lines of the list text
        Returns:
            'unordered' if a hex text line holds 'B7; otherwise
            'ordered' if the plain text, without periods and
            parentheses, consists of digits; otherwise 'unordered'
        """
        text_string = ''
        for line in chunk.split('\n'):
            if line[0:5] == 'tx<hx':
                if line[17:] == '\'B7':
                    return "unordered"
            elif line[0:5] == 'tx<nu':
                text_string += line[17:]
        for punctuation in ('.', '(', ')'):
            text_string = text_string.replace(punctuation, '')
        if text_string.isdigit():
            return 'ordered'
        # Just a guess. NOTE(review): the earlier comment here spoke of
        # "some type of ordered list", yet the value has always been
        # 'unordered' -- confirm which one is intended.
        return 'unordered'

    def __list_text_func(self, line):
        """
        Handle lines that are part of the list text. If the end of the
        list text is found (the closing bracket matches the
        self.__list_text_ob), then change the state and write the type
        of the list. Add the line to the self.__list_chunk, unless it is
        a paragraph definition.
        Required:
            self, line
        Returns:
            Nothing
        """
        if self.__list_text_ob == self.__cb_count:
            self.__state = 'after_list_text'
            self.__right_after_list_text = 1
            self.__list_type = self.__determine_list_type(self.__list_chunk)
            self.__write_obj.write('mi<mk<list-type_<%s\n' % self.__list_type)
        if self.__token_info != 'cw<pf<par-def___':
            self.__list_chunk = self.__list_chunk + line

    def __default_func(self, line):
        """
        Handle the lines that are not part of any special state. Look
        for an opening bracket. If an open bracket is found, keep this
        line in self.__previous_line, which other methods need.
        Otherwise, print out the line.
        Required:
            self, line
        Returns:
            Nothing
        """
        if self.__token_info == 'ob<nu<open-brack':
            self.__state = 'after_ob'
            self.__previous_line = line
        else:
            self.__write_obj.write(line)

    def fix_list_numbers(self):
        """
        Required:
            nothing
        Returns:
            original file will be changed
        Logic:
            Read in one line a time from the file. Keep track of opening
            and closing brackets. Determine the method ('action') by
            passing the state to the self.__state_dict.
            Simply print out the line to a temp file until an open
            bracket is found. Check the next line. If it is list-text,
            then start adding to the self.__list_chunk until the closing
            bracket is found.
            Next, look for an open bracket or text. When either is
            found, print out self.__list_chunk and the line.
        """
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                while True:
                    line = read_obj.readline()
                    self.__token_info = line[:16]
                    # the bracket number is the four characters before
                    # the newline
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
                    action = self.__state_dict.get(self.__state)
                    action(line)
                    if not line:
                        # as before, the handler also sees the empty
                        # string returned at end of file; this writes out
                        # an open bracket that is still held back
                        break
            finally:
                self.__write_obj.close()
        finally:
            # close both files even when a handler raises
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "list_numbers.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,431 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
class ListTable:
"""
Parse the list table line. Make a string. Form a dictionary.
Return the string and the dictionary.
"""
def __init__(
self,
bug_handler,
run_level = 1,
):
self.__bug_handler = bug_handler
self.__initiate_values()
self.__run_level = run_level
def __initiate_values(self):
    """
    Set every working value to its starting value: the state, the
    collected lists and strings, and the dictionaries that map states
    to methods and tokens to attribute names.
    """
    # state => method that handles a line in that state
    self.__state = 'default'
    self.__state_dict = {
        'default'       : self.__default_func,
        'level'         : self.__level_func,
        'list'          : self.__list_func,
        'unsure_ob'     : self.__after_bracket_func,
        'level_number'  : self.__level_number_func,
        'level_text'    : self.__level_text_func,
        'list_name'     : self.__list_name_func,
    }
    # results
    self.__list_table_final = ''
    self.__final_dict = {}
    self.__list_dict = {}
    # all_lists =
    # [{anything here?}
    #     [{list-templateid = ""}
    #         [{level-indent}],[{level-indent}]
    #     ]
    # ],
    self.__all_lists = []
    # values collected for the level text and the level numbers
    self.__level_text_string = ''
    self.__level_text_list = []
    self.__found_level_text_length = 0
    self.__level_text_position = None
    self.__prefix_string = None
    self.__level_numbers_string = ''
    # token => name of an attribute of the list itself
    self.__main_list_dict = {
        'cw<ls<ls-tem-id_'  :       'list-template-id',
        'cw<ls<list-hybri'  :       'list-hybrid',
        'cw<ls<lis-tbl-id'  :       'list-table-id',
    }
    # token => name of an attribute of one level of a list
    self.__level_dict = {
        'cw<ls<level-star'  :       'list-number-start',
        'cw<ls<level-spac'  :       'list-space',
        'cw<ls<level-inde'  :       'level-indent',
        'cw<ls<fir-ln-ind'  :       'first-line-indent',
        'cw<ls<left-inden'  :       'left-indent',
        'cw<ls<tab-stop__'  :       'tabs',
        'cw<ls<level-type'  :       'numbering-type',
        'cw<pf<right-inde'  :       'right-indent',
        'cw<pf<left-inden'  :       'left-indent',
        'cw<pf<fir-ln-ind'  :       'first-line-indent',
        'cw<ci<italics___'  :       'italics',
        'cw<ci<bold______'  :       'bold',
        'cw<ss<para-style'  :       'paragraph-style-name',
    }
def __parse_lines(self, line):
"""
Required : line --line to parse
Returns: nothing
Logic:
Split the lines into a list by a new line. Process the line
according to the state.
"""
lines = line.split('\n')
self.__ob_count = 0
self.__ob_group = 0
for line in lines:
self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack':
self.__ob_count = line[-4:]
self.__ob_group += 1
if self.__token_info == 'cb<nu<clos-brack':
self.__cb_count = line[-4:]
self.__ob_group -= 1
action = self.__state_dict.get(self.__state)
if action == None:
print self.__state
action(line)
self.__write_final_string()
# self.__add_to_final_line()
def __default_func(self, line):
"""
Requires: line --line to process
Return: nothing
Logic:
This state is used at the start and end of a list. Look for an
opening bracket, which marks the change of state.
"""
if self.__token_info == 'ob<nu<open-brack':
self.__state = 'unsure_ob'
def __found_list_func(self, line):
"""
Requires: line -- line to process
Returns: nothing
Logic:
I have found \list.
Change the state to list
Get the open bracket count so you know when this state ends.
Append an empty list to all lists.
Create a temporary dictionary. This dictionary has the key of
"list-id" and the value of an empty list. Later, this empty list
will be filled with all the ids for which the formatting is valid.
Append the temporary dictionary to the new list.
"""
self.__state = 'list'
self.__list_ob_count = self.__ob_count
self.__all_lists.append([])
the_dict = {'list-id': []}
self.__all_lists[-1].append(the_dict)
def __list_func(self, line):
"""
Requires: line --line to process
Returns: nothing
Logic:
This method is called when you are in a list, but outside of a level.
Check for the end of the list. Otherwise, use the self.__mainlist_dict
to determine if you need to add a lines values to the main list.
"""
if self.__token_info == 'cb<nu<clos-brack' and\
self.__cb_count == self.__list_ob_count:
self.__state = 'default'
elif self.__token_info == 'ob<nu<open-brack':
self.__state = 'unsure_ob'
else:
att = self.__main_list_dict.get(self.__token_info)
if att:
value = line[20:]
# dictionary is always the first item in the last list
# [{att:value}, [], [att:value, []]
self.__all_lists[-1][0][att] = value
def __found_level_func(self, line):
    """
    Requires:
        line -- the token line that announced the level (not examined)
    Returns:
        nothing
    Logic:
        A level definition has been found inside the current list.
        Switch the state to 'level' and remember the count of the
        bracket that opened the group, so its close bracket can be
        recognised.
        Append a new level to the newest list. A level is stored as a
        one-item list holding an (initially empty) attribute dictionary,
        so self.__all_lists then looks like:
            [[{list-id: []}, [{}]]]
        Where:
            self.__all_lists[-1][0] => dictionary of list attributes
            self.__all_lists[-1][-1] => the level just added
            self.__all_lists[-1][-1][0] => dictionary of level attributes
    """
    self.__state = 'level'
    self.__level_ob_count = self.__ob_count
    # The original ended with a bare "self.__level_dict" expression, which
    # had no effect; it has been removed.
    self.__all_lists[-1].append([{}])
def __level_func(self, line):
    """
    Requires:
        line -- line to parse
    Returns:
        nothing
    Logic:
        Called while inside a level group.
        A close bracket whose count equals the one saved when the level
        opened returns to the 'list' state.
        An open bracket starts a nested group (state 'unsure_ob').
        Any other token is looked up in self.__level_dict; when it maps
        to an attribute name, line[20:] is stored under that name in the
        attribute dictionary of the newest level.
    """
    token = self.__token_info
    if (token == 'cb<nu<clos-brack'
            and self.__cb_count == self.__level_ob_count):
        self.__state = 'list'
        return
    if token == 'ob<nu<open-brack':
        self.__state = 'unsure_ob'
        return
    attribute = self.__level_dict.get(token)
    if attribute:
        self.__all_lists[-1][-1][0][attribute] = line[20:]
def __level_number_func(self, line):
    """
    Requires:
        line -- line to process
    Returns:
        nothing
    Logic:
        Called while inside a level-numbers group. The group's content
        is collected in self.__level_numbers_string:
            a hexadecimal text token adds a backslash, the entity
            &#x0027; and line[18:];
            a plain text token adds line[17:].
        When the close bracket matching the group's open bracket is
        found, the collected string is stored as the 'level-numbers'
        attribute of the newest level, the buffer is emptied and the
        state returns to 'level'.
    """
    token = self.__token_info
    if token == 'tx<hx<__________':
        self.__level_numbers_string += '\\&#x0027;%s' % line[18:]
    elif token == 'tx<nu<__________':
        self.__level_numbers_string += line[17:]
    elif (token == 'cb<nu<clos-brack'
            and self.__cb_count == self.__level_number_ob_count):
        self.__state = 'level'
        level_attributes = self.__all_lists[-1][-1][0]
        level_attributes['level-numbers'] = self.__level_numbers_string
        self.__level_numbers_string = ''
def __level_text_func(self, line):
    """
    Requires:
        line -- line to process
    Returns:
        nothing
    Logic:
        Called while inside a level-text group.
        Hexadecimal text is handed to __parse_level_text_length.
        Plain text is taken from line[17:]; if it ends with a semicolon,
        every semicolon is removed. The text becomes the pending prefix
        when no level position has been seen yet; otherwise it is stored
        under the attribute named by self.__level_text_position (an
        example is "level1-suffix = '.'").
        A level-template-id control word is stored as an attribute.
        At the close bracket matching the group's open bracket, a
        pending prefix on a level whose 'numbering-type' is 'bullet' has
        its underscores removed and is stored as 'bullet-type'. The
        state then returns to 'level' and the per-group scratch values
        are reset.
    """
    token = self.__token_info
    if token == 'tx<hx<__________':
        self.__parse_level_text_length(line)
    elif token == 'tx<nu<__________':
        text = line[17:]
        if text.endswith(';'):
            text = text.replace(';', '')
        if self.__level_text_position:
            self.__all_lists[-1][-1][0][self.__level_text_position] = text
        else:
            self.__prefix_string = text
    elif token == 'cw<ls<lv-tem-id_':
        self.__all_lists[-1][-1][0]['level-template-id'] = line[20:]
    elif (token == 'cb<nu<clos-brack'
            and self.__cb_count == self.__level_text_ob_count):
        if self.__prefix_string:
            level_attributes = self.__all_lists[-1][-1][0]
            if level_attributes['numbering-type'] == 'bullet':
                self.__prefix_string = self.__prefix_string.replace('_', '')
                level_attributes['bullet-type'] = self.__prefix_string
        self.__state = 'level'
        self.__level_text_string = ''
        self.__found_level_text_length = 0
def __parse_level_text_length(self, line):
    """
    Requires:
        line -- line whose text (line[18:]) is a hexadecimal number
    Returns:
        nothing
    Logic:
        Used for hexadecimal tokens inside a level-text group.
        The first number seen in a group is stored, in decimal, as the
        'list-text-length' attribute of the newest level.
        Every later number, plus one, names a level N. For it:
            'show-levelN' is set to 'true';
            self.__level_text_position becomes 'levelN-suffix', so the
            next plain text is stored under that name;
            a pending prefix, if any, is stored as 'levelN-prefix' and
            then cleared.
    """
    number = int(line[18:], 16)
    level_attributes = self.__all_lists[-1][-1][0]
    if not self.__found_level_text_length:
        level_attributes['list-text-length'] = str(number)
        self.__found_level_text_length = 1
        return
    level_id = str(number + 1)
    self.__level_text_position = 'level%s-suffix' % level_id
    level_attributes['show-level%s' % level_id] = 'true'
    if self.__prefix_string:
        level_attributes['level%s-prefix' % level_id] = self.__prefix_string
        self.__prefix_string = None
def __list_name_func(self, line):
    """
    Requires:
        line -- line to process (not examined)
    Returns:
        nothing
    Logic:
        Everything inside a list-name group is ignored. The only thing
        watched for is the close bracket whose count equals the one
        saved when the group opened; it returns the state to 'list'.
    """
    if self.__token_info != 'cb<nu<clos-brack':
        return
    if self.__cb_count == self.__list_name_ob_count:
        self.__state = 'list'
def __after_bracket_func(self, line):
    """
    Requires:
        line -- the token line that follows an open bracket
    Returns:
        nothing
    Raises:
        self.__bug_handler -- when the token names no known group and
            self.__run_level is greater than 3
    Logic:
        The last token found was "{". The token after it decides which
        group has been entered:
            level-text    => state 'level_text'
            level-numbers => state 'level_number'
            list level    => handled by __found_level_func
            list          => handled by __found_list_func
            list-name     => state 'list_name'
        For the three states set here, the count of the opening bracket
        is saved so the matching close bracket can end the group.
        WARNING: if no group is recognised and the run level is 3 or
        less, the state stays 'unsure_ob' and the following token is
        examined by this method again.
    """
    token = self.__token_info
    if token == 'cw<ls<level-text':
        self.__state = 'level_text'
        self.__level_text_ob_count = self.__ob_count
    elif token == 'cw<ls<level-numb':
        self.__level_number_ob_count = self.__ob_count
        self.__state = 'level_number'
    elif token == 'cw<ls<list-tb-le':
        self.__found_level_func(line)
    elif token == 'cw<ls<list-in-tb':
        self.__found_list_func(line)
    elif token == 'cw<ls<list-name_':
        self.__state = 'list_name'
        self.__list_name_ob_count = self.__ob_count
    elif self.__run_level > 3:
        msg = 'No matching token after open bracket\n'
        msg += 'token is "%s"\n' % (line)
        # The message used to be built and then thrown away by a bare
        # "raise self.__bug_handler".
        # NOTE(review): assumes bug_handler is an exception class that
        # accepts a message -- confirm against the caller.
        raise self.__bug_handler(msg)
def __add_to_final_line(self):
    """
    Method no longer used.

    Replaces self.__list_table_final with an empty list-table: the
    opening markers and tag followed directly by the closing ones. The
    'mi<mk<listabbeg_' marker appears twice, exactly as the original
    string arithmetic produced it.
    """
    self.__list_table_final = ''.join((
        'mi<mk<listabbeg_\n',
        'mi<tg<open______<list-table\n',
        'mi<mk<listab-beg\n',
        'mi<mk<listabbeg_\n',
        'mi<mk<listab-end\n',
        'mi<tg<close_____<list-table\n',
        'mi<mk<listabend_\n',
    ))
def __write_final_string(self):
    """
    Requires:
        nothing
    Returns:
        nothing (the result is stored in self.__list_table_final)
    Logic:
        Write out the list-table start markers and tag.
        For each list in self.__all_lists write one list-in-table tag.
        Its attributes come from the list's dictionary (the first item);
        the 'list-id' key is left out.
        Every remaining item of the list is a level, holding one
        dictionary. For each level write one empty level-in-table tag,
        numbered from 1, with the dictionary's key => value pairs.
        For a level whose 'numbering-type' is 'bullet', the show-level*,
        *suffix and *prefix attributes are left out.
        Close each list, then write the list-table end tag and markers.
    """
    not_allow = ['list-id',]
    # the start marker is emitted twice; that is what the original string
    # arithmetic produced, so it is kept
    pieces = [
        'mi<mk<listabbeg_\n',
        'mi<tg<open______<list-table\n',
        'mi<mk<listab-beg\n',
        'mi<mk<listabbeg_\n',
    ]
    for one_list in self.__all_lists:
        pieces.append('mi<tg<open-att__<list-in-table')
        list_attributes = one_list[0]
        for name in list_attributes.keys():
            if name not in not_allow:
                pieces.append('<%s>%s' % (name, list_attributes[name]))
        pieces.append('\n')
        for position, level in enumerate(one_list[1:]):
            pieces.append('mi<tg<empty-att_<level-in-table')
            pieces.append('<level>%s' % (str(position + 1)))
            level_attributes = level[0]
            is_bullet = level_attributes.get('numbering-type') == 'bullet'
            for name in level_attributes.keys():
                if name in not_allow:
                    continue
                if is_bullet and (
                        name[0:10] == 'show-level'
                        or name[-6:] == 'suffix'
                        or name[-6:] == 'prefix'):
                    # numbering details are not written for bullets
                    continue
                pieces.append('<%s>%s' % (name, level_attributes[name]))
            pieces.append('\n')
        pieces.append('mi<tg<close_____<list-in-table\n')
    pieces.append('mi<mk<listab-end\n')
    pieces.append('mi<tg<close_____<list-table\n')
    pieces.append('mi<mk<listabend_\n')
    self.__list_table_final = ''.join(pieces)
def parse_list_table(self, line):
    """
    Requires:
        line -- the text holding the list table; it is passed unchanged
            to self.__parse_lines
    Returns:
        A tuple: the finished list-table string and self.__all_lists,
        the nested list of list and level attribute dictionaries.
    Logic:
        All the work is done by __parse_lines; this method only hands
        back the two results it leaves on the object.
    """
    self.__parse_lines(line)
    result = (self.__list_table_final, self.__all_lists)
    return result

View File

@ -0,0 +1,442 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile, re
from libprs500.ebooks.rtf2xml import copy
class MakeLists:
    """
    Form lists.
    Use RTF's own formatting to determine if a paragraph definition is part of a
    list.
    Use indents to determine items and how lists are nested.
    The token file is rewritten in place: list and item tags and markers
    are inserted around the paragraph definitions that carry a list-id.
    """
    def __init__(self,
            in_file,
            bug_handler,
            headings_to_sections,
            list_of_lists,
            copy = None,
            run_level = 1,
            no_headings_as_list = 1,
            write_list_info = 0,
            ):
        """
        Required:
            in_file -- path of the token file; make_lists replaces its
                contents
            bug_handler -- handed on to copy.Copy
            headings_to_sections -- if true, paragraphs whose style is
                'heading 1' .. 'heading 9' never become list items
            list_of_lists -- list-table data, laid out as described in
                __write_start_list; may be empty
        Optional:
            copy -- if true, also copy the result to "make_lists.data"
                for debugging
            run_level -- above 0, a list-id missing from list_of_lists
                is reported on stderr
            no_headings_as_list -- if true, heading paragraphs never
                become list items
            write_list_info -- if true, repeat the list-table attributes
                on every list start tag
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__run_level = run_level
        self.__no_headings_as_list = no_headings_as_list
        self.__headings_to_sections = headings_to_sections
        self.__copy = copy
        # the temporary output file is created (safely) in make_lists
        self.__write_to = None
        self.__list_of_lists = list_of_lists
        self.__write_list_info = write_list_info

    def __initiate_values(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Reset all parsing state.
            self.__end_list holds the tokens that force every open list
            to end. self.__end_lines holds whole lines meant to do the
            same; nothing in this class consults it.
        """
        self.__state = "default"
        self.__left_indent = 0
        self.__list_type = 'not-defined'
        self.__pard_def = ""
        # stack of open lists, outermost first; each entry is a dictionary
        # with the keys 'id' and 'left-indent'
        self.__all_lists = []
        self.__level = 0
        # lines seen after a paragraph definition, held back until the
        # next decision about the list is made
        self.__list_chunk = ''
        self.__state_dict={
            'default' : self.__default_func,
            'in_pard' : self.__in_pard_func,
            'after_pard' : self.__after_pard_func,
        }
        self.__headings = [
            'heading 1', 'heading 2', 'heading 3', 'heading 4',
            'heading 5', 'heading 6', 'heading 7', 'heading 8',
            'heading 9'
        ]
        self.__allow_levels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
        self.__style_name = ''
        self.__end_list = [
            'mi<mk<body-close',
            'mi<mk<par-in-fld',
            'cw<tb<cell______',
            'cw<tb<row-def___',
            'cw<tb<row_______',
            'mi<mk<sect-close',
            'mi<mk<sect-start',
            'mi<mk<header-beg',
            'mi<mk<header-end',
            'mi<mk<head___clo',
            'mi<mk<fldbk-end_',
            'mi<mk<close_cell',
            'mi<mk<footnt-ope',
            'mi<mk<foot___clo',
            'mi<mk<tabl-start',
            # 'mi<mk<sec-fd-beg',
        ]
        self.__end_lines = [
            'mi<tg<close_____<cell\n',
        ]
        self.__id_regex = re.compile(r'\<list-id\>(\d+)')
        self.__lv_regex = re.compile(r'\<list-level\>(\d+)')

    def __in_pard_func(self, line):
        """
        Required:
            line -- the line of current text.
        Return:
            Nothing
        Logic:
            You are in a list, in the middle of a paragraph definition.
            Copy every line straight through; the end-of-definition
            marker switches the state to 'after_pard'.
        """
        if self.__token_info == 'mi<mk<pard-end__':
            self.__state = 'after_pard'
        self.__write_obj.write(line)

    def __find_list_id(self, line):
        """
        Required:
            line -- a paragraph-definition line
        Return:
            the list-id found in the line (a string of digits), or None
        Logic:
            When a list-id is present, also remember the line's
            list-level, if it has one, in self.__level.
        """
        id_match = self.__id_regex.search(line)
        if not id_match:
            return None
        level_match = self.__lv_regex.search(line)
        if level_match:
            self.__level = level_match.group(1)
        return id_match.group(1)

    def __flush_list_chunk(self):
        """
        Write the held-back lines to the output and empty the buffer.
        """
        self.__write_obj.write(self.__list_chunk)
        self.__list_chunk = ''

    def __end_all_lists(self, line):
        """
        Required:
            line -- the line that forces the lists to end
        Return:
            Nothing
        Logic:
            Set the indent so low that __close_lists closes every open
            list, write the held-back lines and then the line itself,
            and return to the 'default' state.
        """
        self.__left_indent = -1000
        self.__close_lists()
        self.__flush_list_chunk()
        self.__state = 'default'
        self.__write_obj.write(line)

    def __after_pard_func(self, line):
        """
        Required:
            line -- the line of current text.
        Return:
            Nothing
        Logic:
            You are in a list, but after a paragraph definition. You have
            to determine if the next paragraph definition ends the list,
            continues it, or starts a new one.
            A heading paragraph, or a token that opens or closes a bigger
            block (such as a section or a cell), ends all lists.
            A paragraph definition with a list-id is passed to
            __list_after_par_def_func.
            A paragraph definition without a list-id closes the lists
            that are indented at least as far as it is.
            Any other line is added to a buffer.
        """
        is_pard = (self.__token_info == 'mi<tg<open-att__'
                   and line[17:37] == 'paragraph-definition')
        if is_pard:
            if self.__is_a_heading():
                self.__end_all_lists(line)
                return
            list_id = self.__find_list_id(line)
            if list_id is not None:
                self.__list_after_par_def_func(line, list_id)
                self.__write_obj.write(line)
                self.__state = 'in_pard'
                return
            # normal paragraph with no list id
            self.__close_lists()
            self.__flush_list_chunk()
            self.__write_obj.write(line)
            if len(self.__all_lists) == 0:
                self.__state = 'default'
            else:
                self.__state = 'in_pard'
        elif self.__token_info in self.__end_list:
            self.__end_all_lists(line)
        else:
            self.__list_chunk += line

    def __list_after_par_def_func(self, line, id):
        """
        Required:
            line -- the line of current text.
            id -- the id of the current list
        Return:
            Nothing
        Logic:
            You have found the end of a paragraph definition, and have
            found another paragraph definition with a list id.
            If the list-id is different from that of the innermost open
            list, close lists as the indent demands, write the buffer
            and start a new list.
            If the list-id is the same, compare indents. A greater indent
            starts a nested list without ending the current item.
            Otherwise the current item ends and a new one starts, with
            the buffer written between the two.
        """
        innermost = self.__all_lists[-1]
        if id != innermost['id']:
            self.__close_lists()
            self.__flush_list_chunk()
            self.__write_start_list(id)
        elif self.__left_indent > innermost['left-indent']:
            self.__flush_list_chunk()
            self.__write_start_list(id)
        else:
            self.__write_end_item()
            self.__flush_list_chunk()
            self.__write_start_item()

    def __close_lists(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Look at the open lists from the innermost outwards. For each
            one whose indent is at least the current indent, write the
            end of an item and the end of a list. Then drop that many
            lists from the inner end of self.__all_lists.
        """
        current_indent = self.__left_indent
        num_levels_closed = 0
        for the_dict in reversed(self.__all_lists):
            if current_indent <= the_dict.get('left-indent'):
                self.__write_end_item()
                self.__write_end_list()
                num_levels_closed += 1
        if num_levels_closed:
            del self.__all_lists[-num_levels_closed:]

    def __write_end_list(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Write the end of a list.
        """
        self.__write_obj.write('mi<tg<close_____<list\n')
        self.__write_obj.write('mi<mk<list_close\n')

    def __get_level_dict(self, table_entry):
        """
        Required:
            table_entry -- one item of self.__list_of_lists:
                [{list attributes}, [{level}], [{level}], ...]
        Return:
            the attribute dictionary of the level matching self.__level,
            or None if the entry holds no levels at all
        Logic:
            Paragraphs sometimes carry a level that the list table does
            not define (bogus levels are written for empty paragraphs).
            Instead of failing with an IndexError, fall back to the
            deepest level that is defined.
        """
        levels = table_entry[1:]
        if not levels:
            return None
        wanted = min(int(self.__level), len(levels) - 1)
        return levels[wanted][0]

    def __write_start_list(self, id):
        """
        Required:
            id -- the id of the current list.
        Return:
            Nothing
        Logic:
            Write the start of a list and add the id and left-indent to
            self.__all_lists.
            Write cues of when a list starts for later processing.
            The type of list comes from self.__list_of_lists, which looks
            like:
                [[{list-id: [1, 2]}, [{}], [{}]], [{list-id: [3, 4]}, [{}]]]
            The first item of each inner list is a dictionary whose
            'list-id' value lists the ids the entry applies to. The
            remaining items are the levels. A level whose
            'numbering-type' is 'bullet' gives an unordered list;
            anything else gives an ordered one.
            Without a matching entry (older RTF has no list table) the
            type recorded from the paragraph itself is used.
        """
        self.__all_lists.append(
            {'left-indent': self.__left_indent, 'id': id})
        self.__write_obj.write('mi<mk<list_start\n')
        # bogus levels are sometimes written for empty paragraphs
        if str(self.__level) not in self.__allow_levels:
            lev_num = '0'
        else:
            lev_num = self.__level
        self.__write_obj.write(
            'mi<tg<open-att__<list<list-id>%s<level>%s'
            % (id, lev_num)
        )
        list_dict = {}
        level_dict = {}
        list_type = self.__list_type
        if self.__list_of_lists: # older RTF won't generate a list_of_lists
            index_of_list = self.__get_index_of_list(id)
            if index_of_list is not None: # found a matching id
                table_entry = self.__list_of_lists[index_of_list]
                list_dict = table_entry[0]
                found_level = self.__get_level_dict(table_entry)
                if found_level is not None:
                    level_dict = found_level
                    if level_dict.get('numbering-type') == 'bullet':
                        list_type = 'unordered'
                    else:
                        list_type = 'ordered'
        self.__write_obj.write('<list-type>%s' % (list_type))
        # if you want to dump all the info to the list, rather than
        # keeping it in the table above, change self.__write_list_info
        # to true.
        if self.__list_of_lists and self.__write_list_info and list_dict:
            not_allow = ['list-id',]
            for the_key in list_dict.keys():
                if the_key in not_allow:
                    continue
                self.__write_obj.write('<%s>%s' % (the_key, list_dict[the_key]))
            for the_key in level_dict.keys():
                self.__write_obj.write('<%s>%s' % (the_key, level_dict[the_key]))
        self.__write_obj.write('\n')
        self.__write_obj.write('mi<mk<liststart_\n')
        self.__write_start_item()

    def __get_index_of_list(self, id):
        """
        Requires:
            id -- id of current paragraph-definition
        Returns:
            the index of the entry in list_of_lists whose 'list-id' list
            contains the id, or None if there is none
        Logic:
            Iterate through the list passed to this module and look at
            the first item of each entry, the dictionary.
            The id '0' is never looked up.
            If no match is found and the run level is above 0, print out
            an error message.
        """
        # some RTF use 0 indexed list. Don't know what to do?
        if id == '0':
            return None
        for the_index, table_entry in enumerate(self.__list_of_lists):
            if id in table_entry[0].get('list-id'):
                return the_index
        if self.__run_level > 0:
            sys.stderr.write('Module is make_lists.py\n'
                'Method is __get_index_of_list\n'
                'The main list does not appear to have a matching id for %s \n'
                % (id)
                )
        return None

    def __write_start_item(self):
        """
        Write the tag and the two markers that open a list item.
        """
        self.__write_obj.write('mi<mk<item_start\n')
        self.__write_obj.write('mi<tg<open______<item\n')
        self.__write_obj.write('mi<mk<itemstart_\n')

    def __write_end_item(self):
        """
        Write the tag and the two marker lines that close a list item.
        """
        self.__write_obj.write('mi<tg<item_end__\n')
        self.__write_obj.write('mi<tg<close_____<item\n')
        self.__write_obj.write('mi<tg<item__end_\n')

    def __default_func(self, line):
        """
        Required:
            self, line
        Returns:
            Nothing
        Logic
            Look for the start of a paragraph definition. If one is found
            that is not a heading and contains a list-id, start a list
            and change the state to in_pard.
            The line itself is always written.
        """
        is_pard = (self.__token_info == 'mi<tg<open-att__'
                   and line[17:37] == 'paragraph-definition')
        if is_pard and not self.__is_a_heading():
            list_id = self.__find_list_id(line)
            if list_id is not None:
                self.__state = 'in_pard'
                self.__write_start_list(list_id)
        self.__write_obj.write(line)

    def __is_a_heading(self):
        """
        Return 1 if the current style is one of 'heading 1' .. 'heading 9'
        and either headings become sections or headings are not allowed
        as lists; otherwise return 0.
        """
        if self.__style_name not in self.__headings:
            return 0
        if self.__headings_to_sections or self.__no_headings_as_list:
            return 1
        return 0

    def __get_indent(self, line):
        """
        Remember the left indent carried by a left-indent marker line.
        """
        if self.__token_info == 'mi<mk<left_inden':
            self.__left_indent = float(line[17:-1])

    def __get_list_type(self, line):
        """
        Remember the list type carried by a list-type marker line; the
        type 'item' is recorded as 'unordered'.
        """
        if self.__token_info == 'mi<mk<list-type_': # <ordered
            self.__list_type = line[17:-1]
            if self.__list_type == 'item':
                self.__list_type = "unordered"

    def __get_style_name(self, line):
        """
        Remember the style name carried by a style-name marker line.
        """
        if self.__token_info == 'mi<mk<style-name':
            self.__style_name = line[17:-1]

    def make_lists(self):
        """
        Required:
            nothing
        Returns:
            nothing; the original file will be changed
        Logic:
            Read the token file line by line. Each line first updates
            the remembered indent, list type and style name and is then
            handled by the function for the current state. The result is
            written to a temporary file, which finally replaces the
            original file's contents and is removed.
        """
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        try:
            # mkstemp creates the file itself, so no other process can
            # claim the name first (mktemp only returned a name)
            handle, self.__write_to = tempfile.mkstemp()
            self.__write_obj = os.fdopen(handle, 'w')
            try:
                for line in read_obj:
                    self.__token_info = line[:16]
                    self.__get_indent(line)
                    self.__get_list_type(line)
                    self.__get_style_name(line)
                    action = self.__state_dict.get(self.__state)
                    action(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "make_lists.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,132 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys
"""
"""
class OldRtf:
    """
    Check to see if the RTF is an older version
    Logic:
        Inside the body, look for inline formatting control words (bold,
        italics, font size, ...). If one occurs while the bracket depth
        is the same as it was when the body opened, the file is treated
        as old RTF. Control words that directly follow a paragraph
        definition are not examined.
    """
    def __init__(self, in_file, bug_handler, run_level ):
        """
        Required:
            in_file -- path of the token file to examine
            bug_handler -- stored on the object; not used by this class
            run_level -- accepted for compatibility; not used
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__initiate_values()
        self.__ob_group = 0

    def __initiate_values(self):
        """
        Reset the parsing state and set up the table of inline control
        words and the state => function table.
        """
        self.__previous_token = ''
        self.__new_found = 0
        # inline formatting control words, as found in line[6:16]
        self.__allowable = [
            'annotation' ,
            'blue______' ,
            'bold______',
            'caps______',
            'char-style' ,
            'dbl-strike' ,
            'emboss____',
            'engrave___' ,
            'font-color',
            'font-down_' ,
            'font-size_',
            'font-style',
            'font-up___',
            'footnot-mk' ,
            'green_____' ,
            'hidden____',
            'italics___',
            'outline___',
            'red_______',
            'shadow____' ,
            'small-caps',
            'strike-thr',
            'subscript_',
            'superscrip' ,
            'underlined' ,
        ]
        self.__state = 'before_body'
        self.__action_dict = {
            'before_body' : self.__before_body_func,
            'in_body' : self.__check_tokens_func,
            'after_pard' : self.__after_pard_func,
        }
        self.__is_old = 0
        self.__found_new = 0

    def __check_tokens_func(self, line):
        """
        In the body: return 'old_rtf' for an inline control word at the
        body's own bracket depth; count one found deeper. A paragraph
        definition switches to the 'after_pard' state.
        """
        if self.__inline_info in self.__allowable:
            if self.__ob_group == self.__base_ob_count:
                return 'old_rtf'
            self.__found_new += 1
        elif self.__token_info == 'cw<pf<par-def___':
            self.__state = 'after_pard'

    def __before_body_func(self, line):
        """
        Wait for the body to open; remember the bracket depth at that
        point.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'in_body'
            self.__base_ob_count = self.__ob_group

    def __after_pard_func(self, line):
        """
        Skip the control words that follow a paragraph definition; the
        first line that is not a control word returns to 'in_body'.
        """
        if line[0:2] != 'cw':
            self.__state = 'in_body'

    def check_if_old_rtf(self):
        """
        Requires:
            nothing
        Returns:
            1 if file is older RTf
            0 if file is newer RTF
        Logic:
            Read the file line by line, keeping track of the bracket
            depth, and let the function for the current state examine
            each line. Stop at the first verdict or at the end of the
            body. The file is closed however the method is left.
        """
        self.__initiate_values()
        # Start the depth count afresh for this run. The reset used to sit
        # inside the loop, which wiped the count on every line and made
        # the comparison with the body's depth always succeed.
        self.__ob_group = 0
        read_obj = open(self.__file, 'r')
        try:
            for line in read_obj:
                self.__token_info = line[:16]
                if self.__token_info == 'mi<mk<body-close':
                    return 0
                if self.__token_info == 'ob<nu<open-brack':
                    self.__ob_group += 1
                    self.__ob_count = line[-5:-1]
                if self.__token_info == 'cb<nu<clos-brack':
                    self.__ob_group -= 1
                    self.__cb_count = line[-5:-1]
                self.__inline_info = line[6:16]
                if self.__state == 'after_body':
                    return 0
                action = self.__action_dict.get(self.__state)
                if not action:
                    # nothing can examine this line; report it and go on
                    # rather than fail by calling None
                    sys.stderr.write('No action for state!\n')
                    continue
                result = action(line)
                if result == 'new_rtf':
                    return 0
                elif result == 'old_rtf':
                    return 1
                self.__previous_token = line[6:16]
        finally:
            read_obj.close()
        return 0

View File

@ -0,0 +1,274 @@
import sys
from libprs500.ebooks import rtf2xml
class ParseOptions:
    """
    Requires:
        system_string -- the command line as a list (sys.argv); the first
            item, the program name, is ignored
        options_dict -- a dictionary with the key equal to the option, and
            a list describing that option: [takes_argument, short_name]
    Returns:
        parse_options returns a tuple. The first item in the tuple is a
        dictionary mapping each option given to its argument (or None).
        The second is a list of the remaining arguments.
        If invalid options are passed to the module, 0,0 is returned.
    Examples:
        Your script has the option '--indents', and '--output=file'.
        You want to give short option names as well:
            -i and -o=file
        Use this:
            options_dict = {'output': [1, 'o'],
                            'indents': [0, 'i']
                        }
            options_obj = ParseOptions(
                                system_string = sys.argv,
                                options_dict = options_dict
                            )
            options, arguments = options_obj.parse_options()
        The result will be:
            {indents:None, output:'/home/paul/file'}, ['/home/paul/input']
    """
    def __init__(self, system_string, options_dict):
        self.__system_string = system_string[1:]
        long_list = self.__make_long_list_func(options_dict)
        short_list = self.__make_short_list_func(options_dict)
        self.__legal_options = long_list + short_list
        self.__short_long_dict = self.__make_short_long_dict_func(options_dict)
        self.__opt_with_args = self.__make_options_with_arg_list(options_dict)
        self.__options_okay = 1

    def __make_long_list_func(self, options_dict):
        """
        Required:
            options_dict -- the dictionary mapping options to a list
        Returns:
            a list of legal long options ('--name')
        """
        return ['--' + key for key in options_dict.keys()]

    def __make_short_list_func(self, options_dict):
        """
        Required:
            options_dict --the dictionary mapping options to a list
        Returns:
            a list of legal short options ('-n'); options described
            without a short name are left out
        """
        legal_list = []
        for values in options_dict.values():
            if len(values) > 1:
                legal_list.append('-' + values[1])
        return legal_list

    def __make_short_long_dict_func(self, options_dict):
        """
        Required:
            options_dict --the dictionary mapping options to a list
        Returns:
            a dictionary with keys of short options and values of long options
        Logic:
            read through the options dictionary and pair short options with long options
        """
        short_long_dict = {}
        for key, values in options_dict.items():
            if len(values) > 1:
                short_long_dict['-' + values[1]] = '--' + key
        return short_long_dict

    def __make_options_with_arg_list(self, options_dict):
        """
        Required:
            options_dict --the dictionary mapping options to a list
        Returns:
            a list of the long options that take arguments.
        """
        opt_with_arg = []
        for key, values in options_dict.items():
            if len(values) > 0 and values[0]:
                opt_with_arg.append('--' + key)
        return opt_with_arg

    def __sub_short_with_long(self):
        """
        Required:
            nothing
        Returns:
            a new system string
        Logic:
            iterate through the system string and replace short options
            with long options. A short option written with its value
            attached ('-o=file') is replaced too, keeping the value.
        """
        new_string = []
        for item in self.__system_string:
            pieces = item.split('=', 1)
            long_name = self.__short_long_dict.get(pieces[0])
            if long_name is not None:
                pieces[0] = long_name
                item = '='.join(pieces)
            new_string.append(item)
        return new_string

    def __pair_arg_with_option(self):
        """
        Required:
            nothing
        Returns
            a new system string
        Logic:
            iterate through the system string, and match arguments with options:
            old_list = ['--foo', 'bar']
            new_list = ['--foo=bar']
            An option that needs an argument but is last, or is followed
            by another option, is reported on stderr and marks the
            options as not okay.
        """
        opt_len = len(self.__system_string)
        new_system_string = []
        counter = 0
        slurp_value = 0
        for arg in self.__system_string:
            # counter is now the index of the item after arg
            counter += 1
            if slurp_value:
                # this item was already attached to the previous option
                slurp_value = 0
                continue
            needs_value = (arg.startswith('-')
                           and '=' not in arg
                           and arg in self.__opt_with_args)
            if not needs_value:
                new_system_string.append(arg)
                continue
            if (counter >= opt_len
                    or self.__system_string[counter].startswith('-')):
                sys.stderr.write('option "%s" must take an argument\n' % arg)
                new_system_string.append(arg)
                self.__options_okay = 0
            else:
                new_system_string.append(arg + '=' + self.__system_string[counter])
                slurp_value = 1
        return new_system_string

    def __get_just_options(self):
        """
        Requires:
            nothing
        Returns:
            list of options, list of arguments
        Logic:
            Iterate through the self.__system string, looking for the last
            option. The options are everything in the system string up to
            and including the last option; the rest are arguments.
            Check to see that the options contain no arguments.
        """
        highest = -1
        for counter, item in enumerate(self.__system_string):
            if item.startswith('-'):
                highest = counter
        just_options = self.__system_string[:highest + 1]
        arguments = self.__system_string[highest + 1:]
        for item in just_options:
            if not item.startswith('-'):
                sys.stderr.write('%s is an argument in an option list\n' % item)
                self.__options_okay = 0
        return just_options, arguments

    def __is_legal_option_func(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            Check each value in the newly created options list to see if it
            matches what the user describes as a legal option. Illegal
            options are listed on stderr.
        """
        illegal_options = []
        for arg in self.__system_string:
            name = arg.split('=', 1)[0]
            if name.startswith('-') and name not in self.__legal_options:
                illegal_options.append(name)
        if illegal_options:
            self.__options_okay = 0
            sys.stderr.write('The following options are not permitted:\n')
            for not_legal in illegal_options:
                sys.stderr.write('%s\n' % not_legal)

    def __make_options_dict(self, options):
        """
        Requires:
            options -- list of options such as '--output=file' or '--indents'
        Returns:
            a dictionary mapping each option name, without its leading
            dashes, to its argument, or to None if it has none
        Logic:
            Split on the first '=' only, so that a value may itself
            contain '='. Remove up to two leading dashes.
        """
        options_dict = {}
        for item in options:
            if '=' in item:
                option, arg = item.split('=', 1)
            else:
                option = item
                arg = None
            for unused in (1, 2):
                if option.startswith('-'):
                    option = option[1:]
            options_dict[option] = arg
        return options_dict

    def parse_options(self):
        """
        Requires:
            nothing
        Returns:
            (options dictionary, list of arguments), or 0,0 if the
            command line was not acceptable
        """
        self.__system_string = self.__sub_short_with_long()
        self.__system_string = self.__pair_arg_with_option()
        options, arguments = self.__get_just_options()
        self.__is_legal_option_func()
        if self.__options_okay:
            options_dict = self.__make_options_dict(options)
            return options_dict, arguments
        else:
            return 0,0
if __name__ == '__main__':
    # Manual check: parse this process's own command line against a small
    # sample option table and show the result.
    this_dict = {
        'indents': [0, 'i'],
        'output': [1, 'o'],
        'test3': [1, 't'],
    }
    test_obj = ParseOptions(
        system_string = sys.argv,
        options_dict = this_dict,
    )
    options, the_args = test_obj.parse_options()
    sys.stdout.write('%s %s\n' % (options, the_args))
    # sample values kept from the original notes:
    # this_options = ['--foo', '-o']
    # this_opt_with_args = ['--foo']

View File

@ -0,0 +1,147 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, codecs
from libprs500.ebooks import rtf2xml
class Output:
    """
    Output file

    Copies the finished xml file to one of three places: a file inside
    an output directory, a named output file, or standard output.
    """
    def __init__(self,
            file,
            orig_file,
            output_dir = None,
            out_file = None
            ):
        """
        Required:
            'file' -- xml file ready to output
            orig_file -- original rtf file
        Optional:
            output_dir -- directory in which to create the output file
            out_file -- the file to output to
        Returns:
            nothing
        """
        self.__file = file
        self.__orig_file = orig_file
        self.__output_dir = output_dir
        # Always 1, so an existing file is over-written without asking.
        self.__no_ask = 1
        self.__out_file = out_file
    def output(self):
        """
        Required:
            nothing
        Returns:
            nothing
        Logic:
            If an output directory was given, write a file inside it.
            Otherwise, if an output file was given, write to that file.
            Otherwise output to the screen.
        """
        if self.__output_dir:
            self.__output_to_dir_func()
        elif self.__out_file:
            self.__output_xml(self.__file, self.__out_file)
        else:
            self.__output_to_standard_func()
    def __copy_lines(self, read_obj, write_obj):
        """
        Copy read_obj to write_obj one line at a time. Both objects are
        already open; the caller closes them.
        """
        line = read_obj.readline()
        while line:
            write_obj.write(line)
            line = read_obj.readline()
    def __copy_file(self, in_file, out_file):
        """
        Copy the file at path in_file to path out_file, closing both
        handles even if the copy fails.
        """
        # The source is opened first, so a missing source never
        # truncates an existing destination.
        read_obj = open(in_file, 'r')
        try:
            write_obj = open(out_file, 'w')
            try:
                self.__copy_lines(read_obj, write_obj)
            finally:
                write_obj.close()
        finally:
            read_obj.close()
    def __output_to_dir_func(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            Create a file within the output directory. The name is the
            base name of the original file with an .xml extension,
            unless the user asked for a specific file name.
            Read one line at a time. Output line to the newly-created file.
        """
        base_name = os.path.basename(self.__orig_file)
        base_name, ext = os.path.splitext(base_name)
        output_file = os.path.join(self.__output_dir, '%s.xml' % base_name)
        # change if user wants to output to a specific file
        if self.__out_file:
            output_file = os.path.join(self.__output_dir, self.__out_file)
        user_response = 'o'
        # The prompt is only reached when __no_ask is switched off.
        if os.path.isfile(output_file) and not self.__no_ask:
            msg = 'Do you want to over-write %s?\n' % output_file
            msg += 'Type "o" to over-write.\n'
            msg += 'Type any other key to print to standard output.\n'
            sys.stderr.write(msg)
            user_response = raw_input()
        if user_response == 'o':
            self.__copy_file(self.__file, output_file)
        else:
            self.__output_to_standard_func()
    def __output_to_file_func(self):
        """
        Required:
            nothing
        Returns:
            nothing
        Logic:
            read one line at a time. Output to the output file.
        """
        self.__copy_file(self.__file, self.__out_file)
    def __output_to_standard_func(self):
        """
        Required:
            nothing
        Returns:
            nothing
        Logic:
            read one line at a time. Output to standard
        """
        read_obj = open(self.__file, 'r')
        try:
            self.__copy_lines(read_obj, sys.stdout)
        finally:
            read_obj.close()
    def __output_xml(self, in_file, out_file):
        """
        output the ill-formed xml file

        Each file is opened exactly once. Earlier code also opened each
        file through a utf-8 codec wrapper and then discarded that
        handle without closing it. Lines are copied through unchanged.
        NOTE(review): assumes in_file already holds utf-8 data -- confirm.
        """
        self.__copy_file(in_file, out_file)

View File

@ -0,0 +1,203 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys,os
from libprs500.ebooks import rtf2xml
class OverrideTable:
    """
    Parse a line of text to make the override table. Return a string
    (which will convert to XML) and the dictionary containing all the
    information about the lists. This dictionary is the result of the
    dictionary that is first passed to this module. This module
    modifies the dictionary, assigning lists numbers to each list.
    """
    def __init__(
        self,
        list_of_lists,
        run_level = 1,
        bug_handler = None,
        ):
        """
        Required:
            list_of_lists -- the information about the RTF lists; each
            item is a list whose first element is a dictionary holding
            'list-table-id' and a 'list-id' list
        Optional:
            run_level -- above 3, malformed input raises an error
            instead of being ignored
            bug_handler -- exception class to raise for those errors
            (defaults to Exception)
        Returns:
            nothing
        """
        self.__list_of_lists = list_of_lists
        self.__initiate_values()
        self.__run_level = run_level
        # This attribute used to be read without ever being set.
        if bug_handler is None:
            bug_handler = Exception
        self.__bug_handler = bug_handler
    def __initiate_values(self):
        """
        Set the starting state, the state dispatch table, and the table
        that maps tokens to attribute names.
        """
        self.__override_table_final = ''
        self.__state = 'default'
        self.__override_list = []
        self.__state_dict = {
            'default'   : self.__default_func,
            'override'  : self.__override_func,
            'unsure_ob' : self.__after_bracket_func,
        }
        self.__override_dict = {
            'cw<ls<lis-tbl-id'  : 'list-table-id',
            'cw<ls<list-id___'  : 'list-id',
        }
    def __override_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            The group {\\override has been found.
            Check for the end of the group.
            Otherwise, add appropriate tokens to the override dictionary.
        """
        if self.__token_info == 'cb<nu<clos-brack' and\
            self.__cb_count == self.__override_ob_count:
            self.__state = 'default'
            self.__parse_override_dict()
        else:
            att = self.__override_dict.get(self.__token_info)
            if att:
                # the value follows the 16 character token and '<nu<'
                self.__override_list[-1][att] = line[20:]
    def __parse_override_dict(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            The list of all information about RTF lists has been passed to
            this module. As of this point, this python list has no id number,
            which is needed later to identify which lists in the body should
            be assigned which formatting commands from the list-table.
            In order to get an id, I have to check to see when the
            list-table-id from the override_dict (generated in this module)
            matches the list-table-id in list_of_lists. When a match is
            found, append the list's number to the 'list-id' list of that
            entry:
                [[{list-id:[HERE!],[{}]]
            This is a list, since one list in the table in the preamble of
            RTF can apply to multiple lists in the body.
        """
        override_dict = self.__override_list[-1]
        list_id = override_dict.get('list-id')
        # This test used to read an attribute that was never defined
        # (__level), so a missing list-id always raised AttributeError.
        if list_id is None and self.__run_level > 3:
            msg = 'This override does not appear to have a list-id\n'
            raise self.__bug_handler(msg)
        current_table_id = override_dict.get('list-table-id')
        if current_table_id is None and self.__run_level > 3:
            msg = 'This override does not appear to have a list-table-id\n'
            raise self.__bug_handler(msg)
        # Only the first list with a matching table id is updated.
        for one_list in self.__list_of_lists:
            info_dict = one_list[0]
            if info_dict.get('list-table-id') == current_table_id:
                info_dict['list-id'].append(list_id)
                break
    def __parse_lines(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Break the line into tokens by splitting it on the newline.
            Call on the method according to the state.
        """
        self.__ob_count = 0
        self.__ob_group = 0
        for token in line.split('\n'):
            self.__token_info = token[:16]
            if self.__token_info == 'ob<nu<open-brack':
                self.__ob_count = token[-4:]
                self.__ob_group += 1
            if self.__token_info == 'cb<nu<clos-brack':
                self.__cb_count = token[-4:]
                self.__ob_group -= 1
            action = self.__state_dict.get(self.__state)
            if action is None:
                # Report the unknown state and skip the token rather
                # than calling None.
                sys.stderr.write(
                    'no matching state in module override_table.py\n')
                sys.stderr.write('%s\n' % self.__state)
                continue
            action(token)
        self.__write_final_string()
    def __default_func(self, line):
        """
        Requires:
            line -- line to parse
        Return:
            nothing
        Logic:
            Look for an open bracket and change states when found.
        """
        if self.__token_info == 'ob<nu<open-brack':
            self.__state = 'unsure_ob'
    def __after_bracket_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            The last token was an open bracket. You need to determine
            the group based on the token after.
            WARNING: this could cause problems. If no group is found, the
            state will remain unsure_ob, which means no other text will be
            parsed. I should do states by a list and simply pop this
            unsure_ob state to get the previous state.
        """
        if self.__token_info == 'cw<ls<lis-overid':
            self.__state = 'override'
            self.__override_ob_count = self.__ob_count
            self.__override_list.append({})
        elif self.__run_level > 3:
            msg = 'No matching token after open bracket\n'
            msg += 'token is "%s\n"' % (line)
            raise self.__bug_handler(msg)
    def __write_final_string(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            First write out the override-table tag.
            Iterate through the dictionaries in the main override_list.
            For each dictionary, write an empty tag "override-list". Add
            the attributes and values of the tag from the dictionary.
        """
        # The opening marker appears both before and after the open
        # tag. This reproduces the text this method has always
        # produced; later parsing may depend on it.
        pieces = [
            'mi<mk<over_beg_\n',
            'mi<tg<open______<override-table\n',
            'mi<mk<overbeg__\n',
            'mi<mk<over_beg_\n',
        ]
        for the_dict in self.__override_list:
            pieces.append('mi<tg<empty-att_<override-list')
            for the_key in the_dict.keys():
                pieces.append('<%s>%s' % (the_key, the_dict[the_key]))
            pieces.append('\n')
        pieces.append('\n')
        pieces.append('mi<mk<overri-end\n')
        pieces.append('mi<tg<close_____<override-table\n')
        pieces.append('mi<mk<overribend_\n')
        self.__override_table_final = ''.join(pieces)
    def parse_override_table(self, line):
        """
        Requires:
            line -- the tokens of the override table, one per line
        Returns:
            A string that will be converted to XML, and a dictionary of
            all the properties of the RTF lists.
        Logic:
            Parse the tokens, then hand back the string that was built
            and the (modified) list of lists.
        """
        self.__parse_lines(line)
        return self.__override_table_final, self.__list_of_lists

View File

@ -0,0 +1,739 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile
from libprs500.ebooks.rtf2xml import copy, border_parse
class ParagraphDef:
"""
=================
Purpose
=================
Write paragraph definition tags.
States:
1. before_1st_para_def.
Before any para_def token is found. This means all the text in the preamble.
Look for the token 'cw<pf<par-def___'. This will changet the state to collect_tokens.
2. collect_tokens.
Found a paragraph_def. Need to get all tokens.
Change with start of a paragrph ('mi<mk<para-start'). State then becomes
in_paragraphs
If another paragraph definition is found, the state does not change.
But the dictionary is reset.
3. in_paragraphs
State changes when 'mi<mk<para-end__', or end of paragraph is found.
State then becomes 'self.__state = 'after_para_end'
4. after_para_end
If 'mi<mk<para-start' (the start of a paragraph) or 'mi<mk<para-end__' (the end of a paragraph--must be empty paragraph?) are found:
state changes to 'in_paragraphs'
If 'cw<pf<par-def___' (paragraph_definition) is found:
state changes to collect_tokens
if 'mi<mk<body-close', 'mi<mk<par-in-fld', 'cw<tb<cell______','cw<tb<row-def___','cw<tb<row_______', 'mi<mk<sect-close', 'mi<mk<header-beg', 'mi<mk<header-end'
are found. (All these tokens mark the start of a bigger element. para_def must
be closed:
state changes to 'after_para_def'
5. after_para_def
'mi<mk<para-start' changes state to in_paragraphs
if another paragraph_def is found, the state changes to collect_tokens.
"""
def __init__(self,
in_file,
bug_handler,
default_font,
copy = None,
run_level = 1,):
"""
Required:
'file'--file to parse
'default_font' --document default font
Optional:
'copy'-- whether to make a copy of result for debugging
'temp_dir' --where to output temporary results (default is
directory from which the script is run.)
Returns:
nothing
"""
self.__file = in_file
self.__bug_handler = bug_handler
self.__default_font = default_font
self.__copy = copy
self.__run_level = run_level
self.__write_to = tempfile.mktemp()
def __initiate_values(self):
"""
Initiate all values.
"""
# Dictionary needed to convert shortened style names to readable names
self.__token_dict={
# paragraph formatting => pf
'par-end___' : 'para',
'par-def___' : 'paragraph-definition',
'keep-w-nex' : 'keep-with-next',
'widow-cntl' : 'widow-control',
'adjust-rgt' : 'adjust-right',
'language__' : 'language',
'right-inde' : 'right-indent',
'fir-ln-ind' : 'first-line-indent',
'left-inden' : 'left-indent',
'space-befo' : 'space-before',
'space-afte' : 'space-after',
'line-space' : 'line-spacing',
'default-ta' : 'default-tab',
'align_____' : 'align',
'widow-cntr' : 'widow-control',
# stylesheet = > ss
'style-shet' : 'stylesheet',
'based-on__' : 'based-on-style',
'next-style' : 'next-style',
'char-style' : 'character-style',
# this is changed to get a nice attribute
'para-style' : 'name',
# graphics => gr
'picture___' : 'pict',
'obj-class_' : 'obj_class',
'mac-pic___' : 'mac-pict',
# section => sc
'section___' : 'section-new',
'sect-defin' : 'section-reset',
'sect-note_' : 'endnotes-in-section',
# list=> ls
'list-text_' : 'list-text',
# this line must be wrong because it duplicates an earlier one
'list-text_' : 'list-text',
'list______' : 'list',
'list-lev-d' : 'list-level-definition',
'list-cardi' : 'list-cardinal-numbering',
'list-decim' : 'list-decimal-numbering',
'list-up-al' : 'list-uppercase-alphabetic-numbering',
'list-up-ro' : 'list-uppercae-roman-numbering',
'list-ord__' : 'list-ordinal-numbering',
'list-ordte' : 'list-ordinal-text-numbering',
'list-bulli' : 'list-bullet',
'list-simpi' : 'list-simple',
'list-conti' : 'list-continue',
'list-hang_' : 'list-hang',
# 'list-tebef' : 'list-text-before',
'list-level' : 'level',
'list-id___' : 'list-id',
'list-start' : 'list-start',
'nest-level' : 'nest-level',
# duplicate
'list-level' : 'list-level',
# notes => nt
'footnote__' : 'footnote',
'type______' : 'type',
# anchor => an
'toc_______' : 'anchor-toc',
'book-mk-st' : 'bookmark-start',
'book-mk-en' : 'bookmark-end',
'index-mark' : 'anchor-index',
'place_____' : 'place',
# field => fd
'field_____' : 'field',
'field-inst' : 'field-instruction',
'field-rslt' : 'field-result',
'datafield_' : 'data-field',
# info-tables => it
'font-table' : 'font-table',
'colr-table' : 'color-table',
'lovr-table' : 'list-override-table',
'listtable_' : 'list-table',
'revi-table' : 'revision-table',
# character info => ci
'hidden____' : 'hidden',
'italics___' : 'italics',
'bold______' : 'bold',
'strike-thr' : 'strike-through',
'shadow____' : 'shadow',
'outline___' : 'outline',
'small-caps' : 'small-caps',
'caps______' : 'caps',
'dbl-strike' : 'double-strike-through',
'emboss____' : 'emboss',
'engrave___' : 'engrave',
'subscript_' : 'subscript',
'superscrip' : 'superscipt',
'font-style' : 'font-style',
'font-color' : 'font-color',
'font-size_' : 'font-size',
'font-up___' : 'superscript',
'font-down_' : 'subscript',
'red_______' : 'red',
'blue______' : 'blue',
'green_____' : 'green',
# table => tb
'row-def___' : 'row-definition',
'cell______' : 'cell',
'row_______' : 'row',
'in-table__' : 'in-table',
'columns___' : 'columns',
'row-pos-le' : 'row-position-left',
'cell-posit' : 'cell-position',
# preamble => pr
# underline
'underlined' : 'underlined',
# border => bd
'bor-t-r-hi' : 'border-table-row-horizontal-inside',
'bor-t-r-vi' : 'border-table-row-vertical-inside',
'bor-t-r-to' : 'border-table-row-top',
'bor-t-r-le' : 'border-table-row-left',
'bor-t-r-bo' : 'border-table-row-bottom',
'bor-t-r-ri' : 'border-table-row-right',
'bor-cel-bo' : 'border-cell-bottom',
'bor-cel-to' : 'border-cell-top',
'bor-cel-le' : 'border-cell-left',
'bor-cel-ri' : 'border-cell-right',
'bor-par-bo' : 'border-paragraph-bottom',
'bor-par-to' : 'border-paragraph-top',
'bor-par-le' : 'border-paragraph-left',
'bor-par-ri' : 'border-paragraph-right',
'bor-par-bo' : 'border-paragraph-box',
'bor-for-ev' : 'border-for-every-paragraph',
'bor-outsid' : 'border-outisde',
'bor-none__' : 'border',
# border type => bt
'bdr-single' : 'single',
'bdr-doubtb' : 'double-thickness-border',
'bdr-shadow' : 'shadowed-border',
'bdr-double' : 'double-border',
'bdr-dotted' : 'dotted-border',
'bdr-dashed' : 'dashed',
'bdr-hair__' : 'hairline',
'bdr-inset_' : 'inset',
'bdr-das-sm' : 'dash-small',
'bdr-dot-sm' : 'dot-dash',
'bdr-dot-do' : 'dot-dot-dash',
'bdr-outset' : 'outset',
'bdr-trippl' : 'tripple',
'bdr-thsm__' : 'thick-thin-small',
'bdr-htsm__' : 'thin-thick-small',
'bdr-hthsm_' : 'thin-thick-thin-small',
'bdr-thm__' : 'thick-thin-medium',
'bdr-htm__' : 'thin-thick-medium',
'bdr-hthm_' : 'thin-thick-thin-medium',
'bdr-thl__' : 'thick-thin-large',
'bdr-hthl_' : 'think-thick-think-large',
'bdr-wavy_' : 'wavy',
'bdr-d-wav' : 'double-wavy',
'bdr-strip' : 'striped',
'bdr-embos' : 'emboss',
'bdr-engra' : 'engrave',
'bdr-frame' : 'frame',
'bdr-li-wid' : 'line-width',
}
self.__tabs_dict = {
'cw<pf<tab-stop__' : self.__tab_stop_func,
'cw<pf<tab-center' : self.__tab_type_func,
'cw<pf<tab-right_' : self.__tab_type_func,
'cw<pf<tab-dec___' : self.__tab_type_func,
'cw<pf<leader-dot' : self.__tab_leader_func,
'cw<pf<leader-hyp' : self.__tab_leader_func,
'cw<pf<leader-und' : self.__tab_leader_func,
'cw<pf<tab-bar-st' : self.__tab_bar_func,
}
self.__tab_type_dict = {
'cw<pf<tab-center' : 'center',
'cw<pf<tab-right_' : 'right',
'cw<pf<tab-dec___' : 'decimal',
'cw<pf<leader-dot' : 'leader-dot',
'cw<pf<leader-hyp' : 'leader-hyphen',
'cw<pf<leader-und' : 'leader-underline',
}
self.__border_obj = border_parse.BorderParse()
self.__style_num_strings = []
self.__body_style_strings = []
self.__state = 'before_1st_para_def'
self.__att_val_dict = {}
self.__start_marker = 'mi<mk<pard-start\n' # outside para tags
self.__start2_marker = 'mi<mk<pardstart_\n' # inside para tags
self.__end2_marker = 'mi<mk<pardend___\n' # inside para tags
self.__end_marker = 'mi<mk<pard-end__\n' # outside para tags
self.__text_string = ''
self.__state_dict = {
'before_1st_para_def' : self.__before_1st_para_def_func,
'collect_tokens' : self.__collect_tokens_func,
'after_para_def' : self.__after_para_def_func,
'in_paragraphs' : self.__in_paragraphs_func,
'after_para_end' : self.__after_para_end_func,
}
self.__collect_tokens_dict = {
'mi<mk<para-start' : self.__end_para_def_func,
'cw<pf<par-def___' : self.__para_def_in_para_def_func,
'cw<tb<cell______' : self.__empty_table_element_func,
'cw<tb<row_______' : self.__empty_table_element_func,
}
self.__after_para_def_dict = {
'mi<mk<para-start' : self.__start_para_after_def_func,
'cw<pf<par-def___' : self.__found_para_def_func,
'cw<tb<cell______' : self.__empty_table_element_func,
'cw<tb<row_______' : self.__empty_table_element_func,
}
self.__in_paragraphs_dict = {
'mi<mk<para-end__' : self.__found_para_end_func,
}
self.__after_para_end_dict = {
'mi<mk<para-start' : self.__continue_block_func,
'mi<mk<para-end__' : self.__continue_block_func,
'cw<pf<par-def___' : self.__new_para_def_func,
'mi<mk<body-close' : self.__stop_block_func,
'mi<mk<par-in-fld' : self.__stop_block_func,
'cw<tb<cell______' : self.__stop_block_func,
'cw<tb<row-def___' : self.__stop_block_func,
'cw<tb<row_______' : self.__stop_block_func,
'mi<mk<sect-close' : self.__stop_block_func,
'mi<mk<sect-start' : self.__stop_block_func,
'mi<mk<header-beg' : self.__stop_block_func,
'mi<mk<header-end' : self.__stop_block_func,
'mi<mk<head___clo' : self.__stop_block_func,
'mi<mk<fldbk-end_' : self.__stop_block_func,
'mi<mk<lst-txbeg_' : self.__stop_block_func,
}
def __before_1st_para_def_func(self, line):
"""
Required:
line -- line to parse
Returns:
nothing
Logic:
Look for the beginning of a paragaraph definition
"""
##cw<pf<par-def___<nu<true
if self.__token_info == 'cw<pf<par-def___':
self.__found_para_def_func()
else:
self.__write_obj.write(line)
def __found_para_def_func(self):
self.__state = 'collect_tokens'
# not exactly right--have to reset the dictionary--give it default
# values
self.__reset_dict()
def __collect_tokens_func(self, line):
"""
Required:
line --line to parse
Returns:
nothing
Logic:
Check the collect_tokens_dict for either the beginning of a
paragraph or a new paragraph definition. Take the actions
according to the value in the dict.
Otherwise, check if the token is not a control word. If it is not,
change the state to after_para_def.
Otherwise, check if the token is a paragraph definition word; if
so, add it to the attributes and values dictionary.
"""
action = self.__collect_tokens_dict.get(self.__token_info)
if action:
action(line)
elif line[0:2] != 'cw':
self.__write_obj.write(line)
self.__state = 'after_para_def'
elif line[0:5] == 'cw<bd':
self.__parse_border(line)
else:
action = self.__tabs_dict.get(self.__token_info)
if action:
action(line)
else:
token = self.__token_dict.get(line[6:16])
if token:
self.__att_val_dict[token] = line[20:-1]
def __tab_stop_func(self, line):
"""
"""
type = 'tabs-%s' % self.__tab_type
self.__att_val_dict['tabs'] += '%s:' % self.__tab_type
self.__att_val_dict['tabs'] += '%s;' % line[20:-1]
self.__tab_type = 'left'
def __tab_type_func(self, line):
"""
"""
type = self.__tab_type_dict.get(self.__token_info)
if type != None:
self.__tab_type = type
else:
if self.__run_level > 3:
msg = 'no entry for %s\n' % self.__token_info
raise self.__bug_handler, msg
def __tab_leader_func(self, line):
"""
"""
leader = self.__tab_type_dict.get(self.__token_info)
if leader != None:
type = 'tabs-%s' % self.__tab_type
self.__att_val_dict['tabs'] += '%s^' % leader
else:
if self.__run_level > 3:
msg = 'no entry for %s\n' % self.__token_info
raise self.__bug_handler, msg
def __tab_bar_func(self, line):
"""
"""
# self.__att_val_dict['tabs-bar'] += '%s:' % line[20:-1]
self.__att_val_dict['tabs'] += 'bar:%s;' % (line[20:-1])
self.__tab_type = 'left'
def __parse_border(self, line):
"""
Requires:
line --line to parse
Returns:
nothing (updates dictionary)
Logic:
Uses the border_parse module to return a dictionary of attribute
value pairs for a border line.
"""
border_dict = self.__border_obj.parse_border(line)
self.__att_val_dict.update(border_dict)
def __para_def_in_para_def_func(self, line):
"""
Requires:
line --line to parse
Returns:
nothing
Logic:
I have found a \pard while I am collecting tokens. I want to reset
the dectionary and do nothing else.
"""
# Change this
self.__state = 'collect_tokens'
self.__reset_dict()
def __end_para_def_func(self, line):
"""
Requires:
Nothing
Returns:
Nothing
Logic:
The previous state was collect tokens, and I have found the start
of a paragraph. I want to outut the defintion tag; output the line
itself (telling me of the beginning of a paragraph);change the
state to 'in_paragraphs';
"""
self.__write_para_def_beg()
self.__write_obj.write(line)
self.__state = 'in_paragraphs'
def __start_para_after_def_func(self, line):
"""
Requires:
Nothing
Returns:
Nothing
Logic:
The state was is after_para_def. and I have found the start of a
paragraph. I want to outut the defintion tag; output the line
itself (telling me of the beginning of a paragraph);change the
state to 'in_paragraphs'.
(I now realize that this is absolutely identical to the function above!)
"""
self.__write_para_def_beg()
self.__write_obj.write(line)
self.__state = 'in_paragraphs'
def __after_para_def_func(self, line):
"""
Requires:
line -- line to parse
Returns:
nothing
Logic:
Check if the token info is the start of a paragraph. If so, call
on the function found in the value of the dictionary.
"""
action = self.__after_para_def_dict.get(self.__token_info)
if self.__token_info == 'cw<pf<par-def___':
self.__found_para_def_func()
elif action:
action(line)
else:
self.__write_obj.write(line)
def __in_paragraphs_func(self, line):
"""
Requires:
line --current line
Returns:
nothing
Logic:
Look for the end of a paragraph, the start of a cell or row.
"""
action = self.__in_paragraphs_dict.get(self.__token_info)
if action:
action(line)
else:
self.__write_obj.write(line)
def __found_para_end_func(self,line):
"""
Requires:
line -- line to print out
Returns:
Nothing
Logic:
State is in paragraphs. You have found the end of a paragraph. You
need to print out the line and change the state to after
paragraphs.
"""
self.__state = 'after_para_end'
self.__write_obj.write(line)
def __after_para_end_func(self, line):
"""
Requires:
line -- line to output
Returns:
nothing
Logic:
The state is after the end of a paragraph. You are collecting all
the lines in a string and waiting to see if you need to write
out the paragraph definition. If you find another paragraph
definition, then you write out the old paragraph dictionary and
print out the string. You change the state to collect tokens.
If you find any larger block elemens, such as cell, row,
field-block, or section, you write out the paragraph defintion and
then the text string.
If you find the beginning of a paragraph, then you don't need to
write out the paragraph definition. Write out the string, and
change the state to in paragraphs.
"""
self.__text_string += line
action = self.__after_para_end_dict.get(self.__token_info)
if action:
action(line)
def __continue_block_func(self, line):
"""
Requires:
line --line to print out
Returns:
Nothing
Logic:
The state is after the end of a paragraph. You have found the
start of a paragaph, so you don't need to print out the paragaph
definition. Print out the string, the line, and change the state
to in paragraphs.
"""
self.__state = 'in_paragraphs'
self.__write_obj.write(self.__text_string)
self.__text_string = ''
# found a new paragraph definition after an end of a paragraph
def __new_para_def_func(self, line):
"""
Requires:
line -- line to output
Returns:
Nothing
Logic:
You have found a new paragraph defintion at the end of a
paragraph. Output the end of the old paragraph defintion. Output
the text string. Output the line. Change the state to collect
tokens. (And don't forget to set the text string to ''!)
"""
self.__write_para_def_end_func()
self.__found_para_def_func()
# after a paragraph and found reason to stop this block
def __stop_block_func(self, line):
"""
Requires:
line --(shouldn't be here?)
Returns:
nothing
Logic:
The state is after a paragraph, and you have found a larger block
than paragraph-definition. You want to write the end tag of the
old defintion and reset the text string (handled by other
methods).
"""
self.__write_para_def_end_func()
self.__state = 'after_para_def'
def __write_para_def_end_func(self):
"""
Requires:
nothing
Returns:
nothing
Logic:
Print out the end of the pargraph definition tag, and the markers
that let me know when I have reached this tag. (These markers are
used for later parsing.)
"""
self.__write_obj.write(self.__end2_marker)
self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
self.__write_obj.write(self.__end_marker)
self.__write_obj.write(self.__text_string)
self.__text_string = ''
keys = self.__att_val_dict.keys()
if 'font-style' in keys:
self.__write_obj.write('mi<mk<font-end__\n')
if 'caps' in keys:
self.__write_obj.write('mi<mk<caps-end__\n')
def __get_num_of_style(self):
"""
Requires:
nothing
Returns:
nothing
Logic:
Get a unique value for each style.
"""
my_string = ''
new_style = 0
# when determining uniqueness for a style, ingorne these values, since
# they don't tell us if the style is unique
ignore_values = ['style-num', 'nest-level', 'in-table']
keys = self.__att_val_dict.keys()
keys.sort()
for key in keys:
if key in ignore_values:
continue
my_string += '%s:%s' % (key, self.__att_val_dict[key])
if my_string in self.__style_num_strings:
num = self.__style_num_strings.index(my_string)
num += 1 # since indexing starts at zero, rather than 1
else:
self.__style_num_strings.append(my_string)
num = len(self.__style_num_strings)
new_style = 1
num = '%04d' % num
self.__att_val_dict['style-num'] = 's' + str(num)
if new_style:
self.__write_body_styles()
def __write_body_styles(self):
style_string = ''
style_string += 'mi<tg<empty-att_<paragraph-style-in-body'
style_string += '<name>%s' % self.__att_val_dict['name']
style_string += '<style-number>%s' % self.__att_val_dict['style-num']
tabs_list = ['tabs-left', 'tabs-right', 'tabs-decimal', 'tabs-center',
'tabs-bar', 'tabs']
if self.__att_val_dict['tabs'] != '':
the_value = self.__att_val_dict['tabs']
# the_value = the_value[:-1]
style_string += ('<%s>%s' % ('tabs', the_value))
keys = self.__att_val_dict.keys()
keys.sort()
for key in keys:
if key != 'name' and key !='style-num' and key != 'in-table'\
and key not in tabs_list:
style_string += ('<%s>%s' % (key, self.__att_val_dict[key]))
style_string += '\n'
self.__body_style_strings.append(style_string)
def __write_para_def_beg(self):
"""
Requires:
nothing
Returns:
nothing
Logic:
Print out the beginning of the pargraph definition tag, and the markers
that let me know when I have reached this tag. (These markers are
used for later parsing.)
"""
self.__get_num_of_style()
table = self.__att_val_dict.get('in-table')
if table:
# del self.__att_val_dict['in-table']
self.__write_obj.write('mi<mk<in-table__\n')
else:
self.__write_obj.write('mi<mk<not-in-tbl\n')
left_indent = self.__att_val_dict.get('left-indent')
if left_indent:
self.__write_obj.write('mi<mk<left_inden<%s\n' % left_indent)
is_list = self.__att_val_dict.get('list-id')
if is_list:
self.__write_obj.write('mi<mk<list-id___<%s\n' % is_list)
else:
self.__write_obj.write('mi<mk<no-list___\n')
self.__write_obj.write('mi<mk<style-name<%s\n' % self.__att_val_dict['name'])
self.__write_obj.write(self.__start_marker)
self.__write_obj.write('mi<tg<open-att__<paragraph-definition')
self.__write_obj.write('<name>%s' % self.__att_val_dict['name'])
self.__write_obj.write('<style-number>%s' % self.__att_val_dict['style-num'])
tabs_list = ['tabs-left', 'tabs-right', 'tabs-decimal', 'tabs-center',
'tabs-bar', 'tabs']
"""
for tab_item in tabs_list:
if self.__att_val_dict[tab_item] != '':
the_value = self.__att_val_dict[tab_item]
the_value = the_value[:-1]
self.__write_obj.write('<%s>%s' % (tab_item, the_value))
"""
if self.__att_val_dict['tabs'] != '':
the_value = self.__att_val_dict['tabs']
# the_value = the_value[:-1]
self.__write_obj.write('<%s>%s' % ('tabs', the_value))
keys = self.__att_val_dict.keys()
keys.sort()
for key in keys:
if key != 'name' and key !='style-num' and key != 'in-table'\
and key not in tabs_list:
self.__write_obj.write('<%s>%s' % (key, self.__att_val_dict[key]))
self.__write_obj.write('\n')
self.__write_obj.write(self.__start2_marker)
if 'font-style' in keys:
face = self.__att_val_dict['font-style']
self.__write_obj.write('mi<mk<font______<%s\n' % face)
if 'caps' in keys:
value = self.__att_val_dict['caps']
self.__write_obj.write('mi<mk<caps______<%s\n' % value)
def __empty_table_element_func(self, line):
self.__write_obj.write('mi<mk<in-table__\n')
self.__write_obj.write(line)
self.__state = 'after_para_def'
def __reset_dict(self):
"""
Requires:
nothing
Returns:
nothing
Logic:
The dictionary containing values and attributes must be reset each
time a new paragraphs definition is found.
"""
self.__att_val_dict.clear()
self.__att_val_dict['name'] = 'Normal'
self.__att_val_dict['font-style'] = self.__default_font
self.__tab_type = 'left'
self.__att_val_dict['tabs-left'] = ''
self.__att_val_dict['tabs-right'] = ''
self.__att_val_dict['tabs-center'] = ''
self.__att_val_dict['tabs-decimal'] = ''
self.__att_val_dict['tabs-bar'] = ''
self.__att_val_dict['tabs'] = ''
def make_paragraph_def(self):
    """
    Requires:
        nothing
    Returns:
        self.__body_style_strings (the original file is rewritten in place)
    Logic:
        Read one line in at a time. Determine what action to take based on
        the state. As in the original loop, the handler is also called once
        with the empty string that readline() returns at end of file.
        If no handler is registered for the current state, a diagnostic is
        written to stderr and the line is skipped (the handler lookup used
        to fall through and call None).
        Both files are closed even if a handler raises.
    """
    self.__initiate_values()
    read_obj = open(self.__file, 'r')
    try:
        self.__write_obj = open(self.__write_to, 'w')
        try:
            line_to_read = 1
            while line_to_read:
                line_to_read = read_obj.readline()
                line = line_to_read
                self.__token_info = line[:16]
                action = self.__state_dict.get(self.__state)
                if action is None:
                    sys.stderr.write(
                        'no matching state in module paragraph_def.py\n')
                    sys.stderr.write(self.__state + '\n')
                else:
                    action(line)
        finally:
            self.__write_obj.close()
    finally:
        read_obj.close()
    copy_obj = copy.Copy(bug_handler = self.__bug_handler)
    if self.__copy:
        # keep a copy of the intermediate result for debugging
        copy_obj.copy_file(self.__write_to, "paragraphs_def.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
    return self.__body_style_strings

View File

@ -0,0 +1,253 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile
from libprs500.ebooks.rtf2xml import copy
class Paragraphs:
    r"""
    =================
    Purpose
    =================
    Write paragraph tags for a tokenized file. (This module won't be of any
    use to you unless you use it as part of the other modules.)
    -------------
    Method
    -------------
    RTF does not tell you when a paragraph begins. It only tells you when the
    paragraph ends.
    In order to make paragraphs out of this limited info, the parser starts in
    the body of the documents and assumes it is not in a paragraph. It looks
    for clues to begin a paragraph. Text starts a paragraph; so does an inline
    field or list-text. If an end of paragraph marker (\par) is found, then
    this indicates a blank paragraph.
    Once a paragraph is found, the state changes to 'paragraph.' In this
    state, clues are looked for to find the end of a paragraph. The end of a
    paragraph marker (\par) marks the end of a paragraph. So does the end of a
    footnote or heading; the end of a field-block; the end of a cell; and the
    beginning or end of a section. A paragraph definition found inside a
    paragraph is replaced by a 'bogus-pard' marker.
    """
    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            write_empty_para = 1,
            run_level = 1,
            ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--stored and handed to copy.Copy when the result
            is written back
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'write_empty_para'--whether to write tags for empty paragraphs
            'run_level'--stored; not otherwise used by this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_empty_para = write_empty_para
        self.__run_level = run_level
        # mkstemp creates the file atomically; tempfile.mktemp only returned
        # a name that another process could claim before it was opened.
        # The (empty) file is removed at the end of make_paragraphs().
        handle, self.__write_to = tempfile.mkstemp()
        os.close(handle)
    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__state = 'before_body'
        self.__start_marker = 'mi<mk<para-start\n' # outside para tags
        self.__start2_marker = 'mi<mk<par-start_\n' # inside para tags
        self.__end2_marker = 'mi<mk<par-end___\n' # inside para tags
        self.__end_marker = 'mi<mk<para-end__\n' # outside para tags
        # state name => handler for a line read in that state
        self.__state_dict = {
        'before_body'       : self.__before_body_func,
        'not_paragraph'     : self.__not_paragraph_func,
        'paragraph'         : self.__paragraph_func,
        }
        # tokens that matter while inside a paragraph
        self.__paragraph_dict = {
        'cw<pf<par-end___'  : self.__close_para_func, # end of paragraph
        'mi<mk<headi_-end'  : self.__close_para_func, # end of header or footer
        'mi<mk<fldbk-end_'  : self.__close_para_func, # end of field-block
        'mi<mk<body-close'  : self.__close_para_func, # end of body
        'mi<mk<sect-close'  : self.__close_para_func, # end of section
        'mi<mk<sect-start'  : self.__close_para_func, # start of section
        'mi<mk<foot___clo'  : self.__close_para_func, # end of footnote
        'cw<tb<cell______'  : self.__close_para_func, # end of cell
        'mi<mk<par-in-fld'  : self.__close_para_func, # start of block field
        'cw<pf<par-def___'  : self.__bogus_para__def_func, # paragraph definition
        }
        # tokens that matter while outside a paragraph
        self.__not_paragraph_dict = {
        'tx<nu<__________'  : self.__start_para_func,
        'tx<hx<__________'  : self.__start_para_func,
        'tx<ut<__________'  : self.__start_para_func,
        'tx<mc<__________'  : self.__start_para_func,
        'mi<mk<inline-fld'  : self.__start_para_func,
        'mi<mk<para-beg__'  : self.__start_para_func,
        'cw<pf<par-end___'  : self.__empty_para_func,
        'mi<mk<pict-start'  : self.__start_para_func,
        'cw<pf<page-break'  : self.__empty_pgbk_func, # page break
        }
    def __before_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines before the start of the body.
            Once the body starts, the state is switched to 'not_paragraph'.
            Every line is passed through unchanged.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'not_paragraph'
        self.__write_obj.write(line)
    def __not_paragraph_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines that are outside of the paragraph.
            It looks for clues that start a paragraph, and when found,
            switches states and writes the start tags. The line itself is
            always written after any tags.
        """
        action = self.__not_paragraph_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)
    def __paragraph_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines that are in the paragraph. It
            looks for clues to the end of the paragraph. When a clue is found,
            it calls on another method to write the end of the tag and change
            the state. That method decides whether the line is written; lines
            without a registered action are passed through.
        """
        action = self.__paragraph_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)
    def __start_para_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function writes the beginning tags for a paragraph and
            changes the state to paragraph.
        """
        self.__write_obj.write(self.__start_marker) # marker for later parsing
        self.__write_obj.write(
            'mi<tg<open______<para\n'
            )
        self.__write_obj.write(self.__start2_marker)
        self.__state = 'paragraph'
    def __empty_para_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function writes the empty tags for a paragraph.
            It does not do anything if self.__write_empty_para is 0.
        """
        if self.__write_empty_para:
            self.__write_obj.write(self.__start_marker) # marker for later parsing
            self.__write_obj.write(
                'mi<tg<empty_____<para\n'
                )
            self.__write_obj.write(self.__end_marker) # marker for later parsing
    def __empty_pgbk_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function writes the empty tags for a page break.
        """
        self.__write_obj.write(
            'mi<tg<empty_____<page-break\n'
            )
    def __close_para_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function writes the end tags for a paragraph, then the line
            that closed it, and changes the state to not_paragraph.
        """
        self.__write_obj.write(self.__end2_marker) # marker for later parser
        self.__write_obj.write(
            'mi<tg<close_____<para\n'
            )
        self.__write_obj.write(self.__end_marker) # marker for later parser
        self.__write_obj.write(line)
        self.__state = 'not_paragraph'
    def __bogus_para__def_func(self, line):
        r"""
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            If a \pard occurs in a paragraph, I want to ignore it. (I
            believe.) The line is replaced by a marker.
        """
        self.__write_obj.write('mi<mk<bogus-pard\n')
    def make_paragraphs(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is before the body, look for the
            beginning of the body.
            When the body is found, change the state to 'not_paragraph'. The
            only other state is 'paragraph'.
            As in the original loop, the handler is also called once with the
            empty string that readline() returns at end of file.
            If no handler is registered for the current state, a diagnostic
            is written to stderr and the line is skipped (the lookup used to
            fall through and call None). Both files are closed even if a
            handler raises.
        """
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                line_to_read = 1
                while line_to_read:
                    line_to_read = read_obj.readline()
                    line = line_to_read
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        sys.stderr.write(
                            'no matching state in module paragraphs.py\n')
                        sys.stderr.write(self.__state + '\n')
                    else:
                        action(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            # keep a copy of the intermediate result for debugging
            copy_obj.copy_file(self.__write_to, "paragraphs.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,186 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile
from libprs500.ebooks.rtf2xml import copy
class Pict:
    """
    Process graphic information.

    Picture groups found in the tokenized file are replaced by an empty,
    numbered 'pict' tag; the picture data itself is collected in a separate
    'picts.rtf' file inside a '<name>_rtf_pict_dir' directory.
    """
    def __init__(self,
            in_file,
            bug_handler,
            out_file,
            copy = None,
            orig_file = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file'--tokenized file to parse
            'bug_handler'--raised if the picture directory cannot be made;
            also handed to copy.Copy
            'out_file'--final output file; when set, the picture directory
            is created next to it
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'orig_file'--the original RTF file; its base name is used to
            name the picture directory
            'run_level'--values above 1 make directory clean-up verbose
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        # mkstemp creates the file atomically; tempfile.mktemp only returned
        # a name that another process could claim before it was opened.
        # The (empty) file is removed at the end of process_pict().
        handle, self.__write_to = tempfile.mkstemp()
        os.close(handle)
        self.__bracket_count = 0
        self.__ob_count = 0
        self.__cb_count = 0
        self.__pict_count = 0
        self.__in_pict = 0
        self.__already_found_pict = 0
        self.__orig_file = orig_file
        self.__initiate_pict_dict()
        self.__out_file = out_file
        # this is left over: never ask before emptying the pict directory
        self.__no_ask = 1
    def __initiate_pict_dict(self):
        """Map the tokens that are copied into the pict file to converters."""
        self.__pict_dict = {
        'ob<nu<open-brack'  : self.__open_br_func,
        'cb<nu<clos-brack'  : self.__close_br_func,
        'tx<nu<__________'  : self.__text_func,
        }
    def __open_br_func(self, line):
        """Return the RTF form of an open bracket token."""
        return "{\n"
    def __close_br_func(self, line):
        """Return the RTF form of a close bracket token."""
        return "}\n"
    def __text_func(self, line):
        """
        Return the text carried by a text token.
        The token looks like 'tx<nu<__________<true text': sixteen
        characters of token info plus one '<', so the text starts at index
        17. (The slice used to start at 18 and dropped the first character.)
        """
        return line[17:]
    def __make_dir(self):
        """
        Make a directory to put the image data in.
        The directory is '<base name of orig_file>_rtf_pict_dir/', placed
        next to out_file when that is set, otherwise next to orig_file.
        If it already exists, the files in it are removed (best effort).
        Raises self.__bug_handler if the directory cannot be created.
        """
        base_name = os.path.basename(self.__orig_file)
        base_name = os.path.splitext(base_name)[0]
        if self.__out_file:
            dir_name = os.path.dirname(self.__out_file)
        else:
            dir_name = os.path.dirname(self.__orig_file)
        self.__dir_name = os.path.join(dir_name, base_name + "_rtf_pict_dir/")
        if not os.path.isdir(self.__dir_name):
            try:
                os.mkdir(self.__dir_name)
            except OSError:
                # sys.exc_info avoids version-specific 'except' syntax.
                # The reason used to be built and then thrown away; report
                # it before raising the configured handler.
                reason = sys.exc_info()[1]
                sys.stderr.write(
                    "Couldn't make directory '%s':\n%s\n"
                    % (self.__dir_name, reason))
                raise self.__bug_handler
            return
        if self.__no_ask:
            user_response = 'r'
        else:
            msg = 'Do you want to remove all files in %s?\n' % self.__dir_name
            msg += 'Type "r" to remove.\n'
            msg += 'Type any other key to keep files in place.\n'
            sys.stderr.write(msg)
            user_response = raw_input()
        if user_response != 'r':
            return
        if self.__run_level > 1:
            sys.stderr.write('Removing files from old pict directory...\n')
        for the_file in os.listdir(self.__dir_name):
            the_file = os.path.join(self.__dir_name, the_file)
            try:
                os.remove(the_file)
            except OSError:
                # deliberate best effort: leave what cannot be removed
                pass
        if self.__run_level > 1:
            sys.stderr.write('Files removed.\n')
    def __create_pict_file(self):
        """
        Create a file for all the pict data to be written to.
        The file is first truncated, then reopened for appending.
        """
        self.__pict_file = os.path.join(self.__dir_name, 'picts.rtf')
        write_pic_obj = open(self.__pict_file, 'w')
        write_pic_obj.close()
        self.__write_pic_obj = open(self.__pict_file, 'a')
    def __in_pict_func(self, line):
        """
        Handle a line while inside a picture group.
        Returns 1 when the bracket that closes the picture group has been
        reached (the caller then writes the line to the main output), and 0
        otherwise. Brackets and text are copied to the pict file; all other
        tokens are dropped.
        """
        if self.__cb_count == self.__pict_br_count:
            self.__in_pict = 0
            self.__write_pic_obj.write("}\n")
            return 1
        action = self.__pict_dict.get(self.__token_info)
        if action:
            self.__write_pic_obj.write(action(line))
        return 0
    def __default(self, line, write_obj):
        """
        Determine if each token marks the beginning of pict data.
        If it does, create a new file to write data to (if that file
        has not already been created), write a numbered empty 'pict' tag to
        write_obj, set the self.__in_pict flag to true and return 0.
        If the line does not contain pict data, return 1.
        """
        if self.__token_info != 'cw<gr<picture___':
            return 1
        self.__pict_count += 1
        write_obj.write('mi<mk<pict-start\n')
        write_obj.write('mi<tg<empty-att_<pict<num>%03d\n' % self.__pict_count)
        write_obj.write('mi<mk<pict-end__\n')
        if not self.__already_found_pict:
            self.__create_pict_file()
            self.__already_found_pict = 1
            self.__print_rtf_header()
        self.__in_pict = 1
        # the picture ends at the close bracket numbered like the most
        # recently opened bracket
        self.__pict_br_count = self.__ob_count
        self.__cb_count = 0
        self.__write_pic_obj.write("{\\pict\n")
        return 0
    def __print_rtf_header(self):
        """
        Print to pict file the necessary RTF data for the file to be
        recognized as an RTF file.
        """
        self.__write_pic_obj.write("{\\rtf1 \n")
        self.__write_pic_obj.write("{\\fonttbl\\f0\\null;} \n")
        self.__write_pic_obj.write("{\\colortbl\\red255\\green255\\blue255;} \n")
        self.__write_pic_obj.write("\\pard \n")
    def process_pict(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read the tokenized file one line at a time, tracking the number
            of the last open and close bracket. Lines outside a picture are
            copied; picture groups are replaced by an empty tag and their
            data is written to the pict file. If no picture was found, the
            (empty) picture directory is removed again.
            All files are closed even if processing fails.
        """
        self.__make_dir()
        read_obj = open(self.__file)
        try:
            write_obj = open(self.__write_to, 'w')
            try:
                line_to_read = 'dummy'
                while line_to_read:
                    line_to_read = read_obj.readline()
                    line = line_to_read
                    self.__token_info = line[:16]
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
                    if not self.__in_pict:
                        to_print = self.__default(line, write_obj)
                    else:
                        to_print = self.__in_pict_func(line)
                    if to_print:
                        write_obj.write(line)
                if self.__already_found_pict:
                    # close the group opened by the RTF header
                    self.__write_pic_obj.write("}\n")
            finally:
                write_obj.close()
                if self.__already_found_pict:
                    self.__write_pic_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            # keep a copy of the intermediate result for debugging
            copy_obj.copy_file(self.__write_to, "pict.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        if self.__pict_count == 0:
            try:
                os.rmdir(self.__dir_name)
            except OSError:
                # deliberate best effort: the directory may not be empty
                pass

View File

@ -0,0 +1,554 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile
from libprs500.ebooks.rtf2xml import copy, override_table, list_table
class PreambleDiv:
    """
    Break the preamble into divisions.

    The tokenized file is read line by line. Each table found in the
    preamble (fonts, colors, styles, lists, overrides, revisions, document
    info) is collected into its own string; when the body is reached all
    strings are written out in a fixed order, followed by the body.
    """
    def __init__(self, in_file,
            bug_handler,
            copy = None,
            no_namespace = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file'--tokenized file to parse
            'bug_handler'--handed to the list table parser and to copy.Copy
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'no_namespace'--if true, the doc tag is written without the
            xmlns attribute
            'run_level'--handed to the list and override table parsers
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__no_namespace = no_namespace
        # mkstemp creates the file atomically; tempfile.mktemp only returned
        # a name that another process could claim before it was opened.
        # The (empty) file is removed at the end of make_preamble_divisions().
        handle, self.__write_to = tempfile.mkstemp()
        os.close(handle)
        self.__run_level = run_level
    def __initiate_values(self):
        """
        Set values, including those for the dictionary.
        """
        self.__all_lists = {}
        self.__page = {
        'margin-top'    : 72,
        'margin-bottom' : 72,
        'margin-left'   : 90,
        'margin-right'  : 90,
        'gutter'        : 0,
        }
        self.__cb_count = ''
        self.__ob_count = ''
        self.__state = 'preamble'
        self.__rtf_final = ''
        self.__close_group_count = ''
        self.__found_font_table = 0
        self.__list_table_final = ''
        self.__override_table_final = ''
        self.__revision_table_final = ''
        self.__doc_info_table_final = ''
        # bookkeeping for the 'ignore' state; both are set again by whoever
        # switches to that state
        self.__ignore_num = None
        self.__previous_state = 'preamble'
        # This dictionary serves two look-ups: state name => handler (used
        # by make_preamble_divisions) and token => handler (used by
        # __preamble_func).
        self.__state_dict = {
        'default'           : self.__default_func,
        'rtf_header'        : self.__rtf_head_func,
        'preamble'          : self.__preamble_func,
        'font_table'        : self.__font_table_func,
        'color_table'       : self.__color_table_func,
        'style_sheet'       : self.__style_sheet_func,
        'list_table'        : self.__list_table_func,
        'override_table'    : self.__override_table_func,
        'revision_table'    : self.__revision_table_func,
        'doc_info'          : self.__doc_info_func,
        'body'              : self.__body_func,
        'ignore'            : self.__ignore_func,
        'cw<ri<rtf_______'  : self.__found_rtf_head_func,
        'cw<pf<par-def___'  : self.__para_def_func,
        'tx<nu<__________'  : self.__text_func,
        'cw<tb<row-def___'  : self.__row_def_func,
        'cw<sc<section___'  : self.__new_section_func,
        'cw<sc<sect-defin'  : self.__new_section_func,
        'cw<it<font-table'  : self.__found_font_table_func,
        'cw<it<colr-table'  : self.__found_color_table_func,
        'cw<ss<style-shet'  : self.__found_style_sheet_func,
        'cw<it<listtable_'  : self.__found_list_table_func,
        'cw<it<lovr-table'  : self.__found_override_table_func,
        'cw<it<revi-table'  : self.__found_revision_table_func,
        'cw<di<doc-info__'  : self.__found_doc_info_func,
        'cw<pa<margin-lef'  : self.__margin_func,
        'cw<pa<margin-rig'  : self.__margin_func,
        'cw<pa<margin-top'  : self.__margin_func,
        'cw<pa<margin-bot'  : self.__margin_func,
        'cw<pa<gutter____'  : self.__margin_func,
        'cw<pa<paper-widt'  : self.__margin_func,
        'cw<pa<paper-hght'  : self.__margin_func,
        # 'cw<tb<columns___'  : self.__section_func,
        }
        self.__margin_dict = {
        'margin-lef'    : 'margin-left',
        'margin-rig'    : 'margin-right',
        'margin-top'    : 'margin-top',
        'margin-bot'    : 'margin-bottom',
        'gutter____'    : 'gutter',
        'paper-widt'    : 'paper-width',
        'paper-hght'    : 'paper-height',
        }
        self.__translate_sec = {
        'columns___'    : 'column',
        }
        self.__section = {}
        self.__color_table_final = ''
        self.__style_sheet_final = ''
        self.__individual_font = 0
        self.__old_font = 0
        self.__ob_group = 0 # depth of group
        # empty until a font table is found or a default one is made
        self.__font_table_final = ''
        self.__list_table_obj = list_table.ListTable(
            run_level = self.__run_level,
            bug_handler = self.__bug_handler,
            )
    def __ignore_func(self, line):
        """
        Ignore all lines, until the bracket is found that marks the end of
        the group. Then go back to the state that was active before.
        """
        if self.__ignore_num == self.__cb_count:
            self.__state = self.__previous_state
    def __found_rtf_head_func(self, line):
        """The rtf control word was found: start collecting the header."""
        self.__state = 'rtf_header'
    def __rtf_head_func(self, line):
        """
        Collect the lines of the RTF header.
        The header ends when bracket number 0002 is opened (back to the
        preamble), or when text or a paragraph definition is found (no
        preamble at all: write a default font table and the preamble, and
        start the body with this line).
        """
        if self.__ob_count == '0002':
            self.__rtf_final = (
                    'mi<mk<rtfhed-beg\n' +
                    self.__rtf_final +
                    'mi<mk<rtfhed-end\n'
                )
            self.__state = 'preamble'
        elif self.__token_info == 'tx<nu<__________' or \
            self.__token_info == 'cw<pf<par-def___':
            self.__state = 'body'
            self.__rtf_final = (
                    'mi<mk<rtfhed-beg\n' +
                    self.__rtf_final +
                    'mi<mk<rtfhed-end\n'
                )
            self.__make_default_font_table()
            self.__write_preamble()
            self.__write_obj.write(line)
        else:
            self.__rtf_final = self.__rtf_final + line
    def __make_default_font_table(self):
        """
        If no font table is found, need to write one out.
        """
        self.__font_table_final = 'mi<tg<open______<font-table\n'
        self.__font_table_final += 'mi<mk<fonttb-beg\n'
        self.__font_table_final += 'mi<mk<fontit-beg\n'
        self.__font_table_final += 'cw<ci<font-style<nu<0\n'
        self.__font_table_final += 'tx<nu<__________<Times;\n'
        self.__font_table_final += 'mi<mk<fontit-end\n'
        self.__font_table_final += 'mi<mk<fonttb-end\n'
        self.__font_table_final += 'mi<tg<close_____<font-table\n'
    def __make_default_color_table(self):
        """
        If no color table is found, write a string for a default one
        """
        self.__color_table_final = 'mi<tg<open______<color-table\n'
        self.__color_table_final += 'mi<mk<clrtbl-beg\n'
        self.__color_table_final += 'cw<ci<red_______<nu<00\n'
        self.__color_table_final += 'cw<ci<green_____<nu<00\n'
        self.__color_table_final += 'cw<ci<blue______<en<00\n'
        self.__color_table_final += 'mi<mk<clrtbl-end\n'
        self.__color_table_final += 'mi<tg<close_____<color-table\n'
    def __make_default_style_table(self):
        """
        If no style sheet is found, make a string for a default one
        """
        self.__style_sheet_final = (
            'mi<tg<open______<style-table\n'
            'mi<mk<styles-beg\n'
            'mi<mk<stylei-beg\n'
            'cw<ci<font-style<nu<0\n'
            'tx<nu<__________<Normal;\n'
            'mi<mk<stylei-end\n'
            'mi<mk<stylei-beg\n'
            'cw<ss<char-style<nu<0\n'
            'tx<nu<__________<Default Paragraph Font;\n'
            'mi<mk<stylei-end\n'
            'mi<mk<styles-end\n'
            'mi<tg<close_____<style-table\n'
            )
    def __found_font_table_func(self, line):
        """
        Start collecting the font table. A second font table is ignored:
        everything up to the bracket that closes its group is skipped and
        the preamble state is resumed. (The ignore bookkeeping used to be
        left unset here, so __ignore_func failed or resumed a stale state.)
        """
        if self.__found_font_table:
            self.__previous_state = 'preamble'
            self.__ignore_num = self.__ob_count
            self.__state = 'ignore'
        else:
            self.__state = 'font_table'
            self.__font_table_final = ''
        self.__close_group_count = self.__ob_count
        self.__cb_count = 0
        self.__found_font_table = 1
    def __font_table_func(self, line):
        """
        Keep adding to the self.__font_table_final string until the end of
        the group is found. If a bracket is found, check that it is only one
        bracket deep. If it is, then set the marker for an individual font.
        If it is not, then ignore all data in this group.
        Older RTF does not put each font in its own group; there a font
        starts at its font-style token
            cw<ci<font-style<nu<0
        and ends at the text that contains the semicolon.
        """
        if self.__cb_count == self.__close_group_count:
            self.__state = 'preamble'
            self.__font_table_final = 'mi<tg<open______<font-table\n' + \
            'mi<mk<fonttb-beg\n' + self.__font_table_final
            self.__font_table_final += \
            'mi<mk<fonttb-end\n' + 'mi<tg<close_____<font-table\n'
        elif self.__token_info == 'ob<nu<open-brack':
            if int(self.__ob_count) == int(self.__close_group_count) + 1:
                self.__font_table_final += \
                    'mi<mk<fontit-beg\n'
                self.__individual_font = 1
            else:
                # ignore
                self.__previous_state = 'font_table'
                self.__state = 'ignore'
                self.__ignore_num = self.__ob_count
        elif self.__token_info == 'cb<nu<clos-brack':
            if int(self.__cb_count) == int(self.__close_group_count) + 1:
                self.__individual_font = 0
                self.__font_table_final += \
                    'mi<mk<fontit-end\n'
        elif self.__individual_font:
            if self.__old_font and self.__token_info == 'tx<nu<__________':
                if ';' in line:
                    self.__font_table_final += line
                    self.__font_table_final += 'mi<mk<fontit-end\n'
                    self.__individual_font = 0
            else:
                self.__font_table_final += line
        elif self.__token_info == 'cw<ci<font-style':
            self.__old_font = 1
            self.__individual_font = 1
            self.__font_table_final += 'mi<mk<fontit-beg\n'
            self.__font_table_final += line
    def __old_font_func(self, line):
        r"""
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            Placeholder without a body. It was meant for older forms of RTF:
            \f3\fswiss\fcharset77 Helvetica-Oblique;\f4\fnil\fcharset77 Geneva;}
            Note how each font is not divided by a bracket
        """
    def __found_color_table_func(self, line):
        """
        all functions that start with __found operate the same. They set the
        state, initiate a string, determine the self.__close_group_count, and
        set self.__cb_count to zero.
        """
        self.__state = 'color_table'
        self.__color_table_final = ''
        self.__close_group_count = self.__ob_count
        self.__cb_count = 0
    def __color_table_func(self, line):
        """Collect the color table until the bracket closing its group."""
        if int(self.__cb_count) == int(self.__close_group_count):
            self.__state = 'preamble'
            self.__color_table_final = 'mi<tg<open______<color-table\n' + \
            'mi<mk<clrtbl-beg\n' + self.__color_table_final
            self.__color_table_final += \
            'mi<mk<clrtbl-end\n' + 'mi<tg<close_____<color-table\n'
        else:
            self.__color_table_final += line
    def __found_style_sheet_func(self, line):
        """Start collecting the style sheet."""
        self.__state = 'style_sheet'
        self.__style_sheet_final = ''
        self.__close_group_count = self.__ob_count
        self.__cb_count = 0
    def __style_sheet_func(self, line):
        """
        Same logic as the font_table_func: brackets one level deep become
        markers for an individual style; deeper brackets are dropped.
        """
        if self.__cb_count == self.__close_group_count:
            self.__state = 'preamble'
            self.__style_sheet_final = 'mi<tg<open______<style-table\n' + \
            'mi<mk<styles-beg\n' + self.__style_sheet_final
            self.__style_sheet_final += \
            'mi<mk<styles-end\n' + 'mi<tg<close_____<style-table\n'
        elif self.__token_info == 'ob<nu<open-brack':
            if int(self.__ob_count) == int(self.__close_group_count) + 1:
                self.__style_sheet_final += \
                    'mi<mk<stylei-beg\n'
        elif self.__token_info == 'cb<nu<clos-brack':
            if int(self.__cb_count) == int(self.__close_group_count) + 1:
                self.__style_sheet_final += \
                    'mi<mk<stylei-end\n'
        else:
            self.__style_sheet_final += line
    def __found_list_table_func(self, line):
        """Start collecting the list table."""
        self.__state = 'list_table'
        self.__list_table_final = ''
        self.__close_group_count = self.__ob_count
        self.__cb_count = 0
    def __list_table_func(self, line):
        """
        Collect the list table until the bracket closing its group, then
        hand the collected string to the list table parser.
        """
        if self.__cb_count == self.__close_group_count:
            self.__state = 'preamble'
            self.__list_table_final, self.__all_lists =\
            self.__list_table_obj.parse_list_table(
                self.__list_table_final)
        elif self.__token_info == '':
            # end of file
            pass
        else:
            self.__list_table_final += line
    def __found_override_table_func(self, line):
        """Start collecting the list override table (cw<it<lovr-table)."""
        self.__override_table_obj = override_table.OverrideTable(
            run_level = self.__run_level,
            list_of_lists = self.__all_lists,
            )
        self.__state = 'override_table'
        self.__override_table_final = ''
        self.__close_group_count = self.__ob_count
        self.__cb_count = 0
    def __override_table_func(self, line):
        """
        Collect the override table until the bracket closing its group,
        then hand the collected string to the override table parser.
        """
        if self.__cb_count == self.__close_group_count:
            self.__state = 'preamble'
            self.__override_table_final, self.__all_lists =\
            self.__override_table_obj.parse_override_table(self.__override_table_final)
        elif self.__token_info == '':
            # end of file
            pass
        else:
            self.__override_table_final += line
    def __found_revision_table_func(self, line):
        """Start collecting the revision table."""
        self.__state = 'revision_table'
        self.__revision_table_final = ''
        self.__close_group_count = self.__ob_count
        self.__cb_count = 0
    def __revision_table_func(self, line):
        """Collect the revision table until the bracket closing its group."""
        if int(self.__cb_count) == int(self.__close_group_count):
            self.__state = 'preamble'
            self.__revision_table_final = 'mi<tg<open______<revision-table\n' + \
            'mi<mk<revtbl-beg\n' + self.__revision_table_final
            self.__revision_table_final += \
            'mi<mk<revtbl-end\n' + 'mi<tg<close_____<revision-table\n'
        else:
            self.__revision_table_final += line
    def __found_doc_info_func(self, line):
        """Start collecting the document information."""
        self.__state = 'doc_info'
        self.__doc_info_table_final = ''
        self.__close_group_count = self.__ob_count
        self.__cb_count = 0
    def __doc_info_func(self, line):
        """
        Same logic as the style_sheet_func: brackets one level deep become
        markers for an individual item; deeper brackets are dropped.
        """
        if self.__cb_count == self.__close_group_count:
            self.__state = 'preamble'
            self.__doc_info_table_final = 'mi<tg<open______<doc-information\n' + \
            'mi<mk<doc-in-beg\n' + self.__doc_info_table_final
            self.__doc_info_table_final += \
            'mi<mk<doc-in-end\n' + 'mi<tg<close_____<doc-information\n'
        elif self.__token_info == 'ob<nu<open-brack':
            if int(self.__ob_count) == int(self.__close_group_count) + 1:
                self.__doc_info_table_final += \
                    'mi<mk<docinf-beg\n'
        elif self.__token_info == 'cb<nu<clos-brack':
            if int(self.__cb_count) == int(self.__close_group_count) + 1:
                self.__doc_info_table_final += \
                    'mi<mk<docinf-end\n'
        else:
            self.__doc_info_table_final += line
    def __margin_func(self, line):
        """
        Handles lines that describe page info. Add the appropriate info in
        the token to the self.__page dictionary, under the name found in the
        self.__margin_dict dictionary. A token looks like
            cw<pa<margin-lef<nu<1728
        An unknown token is reported on stderr, like __section_func does.
        """
        info = line[6:16]
        changed = self.__margin_dict.get(info)
        if changed is None:
            sys.stderr.write('woops!\n')
        else:
            self.__page[changed] = line[20:-1]
    def __print_page_info(self):
        """Write the page definition tag with one attribute per page value."""
        self.__write_obj.write('mi<tg<empty-att_<page-definition')
        for key in self.__page.keys():
            self.__write_obj.write(
            '<%s>%s' % (key, self.__page[key])
            )
        self.__write_obj.write('\n')
    def __print_sec_info(self):
        """
        Check if there is any section info. If so, print it out.
        If not, print out an empty tag to satisfy the dtd.
        """
        if len(self.__section.keys()) == 0:
            self.__write_obj.write(
            'mi<tg<open______<section-definition\n'
            )
        else:
            self.__write_obj.write(
                'mi<tg<open-att__<section-definition')
            keys = self.__section.keys()
            for key in keys:
                self.__write_obj.write(
                '<%s>%s' % (key, self.__section[key])
                )
            self.__write_obj.write('\n')
    def __section_func(self, line):
        """
        Add info pertaining to section to the self.__section dictionary, to be
        printed out later.
        """
        info = self.__translate_sec.get(line[6:16])
        if info is None:
            sys.stderr.write('woops!\n')
        else:
            self.__section[info] = 'true'
    def __body_func(self, line):
        """In the body every line is passed through unchanged."""
        self.__write_obj.write(line)
    def __default_func(self, line):
        """Do nothing (either in preamble or in body)."""
        pass
    def __para_def_func(self, line):
        """
        A paragraph definition found when the last closed bracket is number
        0002 marks the start of the body.
        """
        # if self.__ob_group == 1
        # this tells depth of group
        if self.__cb_count == '0002':
            self.__state = 'body'
            self.__write_preamble()
        self.__write_obj.write(line)
    def __text_func(self, line):
        """
        If no bracket has been closed yet, or the last closed bracket is
        number 0002, you have hit the body.
        For older RTF
        Newer RTF should never have to use this function
        """
        if self.__cb_count == '':
            cb_count = '0002'
        else:
            cb_count = self.__cb_count
        # ignore previous lines
        # should be
        # if self.__ob_group == 1
        # this tells depth of group
        if cb_count == '0002':
            self.__state = 'body'
            self.__write_preamble()
        self.__write_obj.write(line)
    def __row_def_func(self, line):
        """
        A row definition found when the last closed bracket is number 0002
        marks the start of the body.
        """
        # if self.__ob_group == 1
        # this tells depth of group
        if self.__cb_count == '0002':
            self.__state = 'body'
            self.__write_preamble()
        self.__write_obj.write(line)
    def __new_section_func(self, line):
        """
        This is new. The start of a section marks the end of the preamble
        """
        if self.__cb_count == '0002':
            self.__state = 'body'
            self.__write_preamble()
        else:
            sys.stderr.write('module is preamble_div\n')
            sys.stderr.write('method is __new_section_func\n')
            sys.stderr.write('bracket count should be 2?\n')
        self.__write_obj.write(line)
    def __write_preamble(self):
        """
        Write all the strings, which represent all the data in the preamble.
        Default font, color and style tables are made for those that were
        not found. Write a body beginning.
        """
        if self.__no_namespace:
            self.__write_obj.write(
                    'mi<tg<open______<doc\n'
                    )
        else:
            self.__write_obj.write(
            'mi<tg<open-att__<doc<xmlns>http://rtf2xml.sourceforge.net/\n')
        self.__write_obj.write('mi<tg<open______<preamble\n')
        self.__write_obj.write(self.__rtf_final)
        if not self.__color_table_final:
            self.__make_default_color_table()
        if not self.__font_table_final:
            self.__make_default_font_table()
        self.__write_obj.write(self.__font_table_final)
        self.__write_obj.write(self.__color_table_final)
        if not self.__style_sheet_final:
            self.__make_default_style_table()
        self.__write_obj.write(self.__style_sheet_final)
        self.__write_obj.write(self.__list_table_final)
        self.__write_obj.write(self.__override_table_final)
        self.__write_obj.write(self.__revision_table_final)
        self.__write_obj.write(self.__doc_info_table_final)
        self.__print_page_info()
        self.__write_obj.write('ob<nu<open-brack<0001\n')
        self.__write_obj.write('ob<nu<open-brack<0002\n')
        self.__write_obj.write('cb<nu<clos-brack<0002\n')
        self.__write_obj.write('mi<tg<close_____<preamble\n')
        self.__write_obj.write('mi<tg<open______<body\n')
        # self.__write_obj.write('mi<tg<open-att__<section<num>1\n')
        # self.__print_sec_info()
        # self.__write_obj.write('mi<tg<open______<headers-and-footers\n')
        # self.__write_obj.write('mi<mk<head_foot_<\n')
        # self.__write_obj.write('mi<tg<close_____<headers-and-footers\n')
        self.__write_obj.write('mi<mk<body-open_\n')
    def __preamble_func(self, line):
        """
        Check if the token info belongs to the dictionary. If so, take the
        appropriate action.
        """
        action = self.__state_dict.get(self.__token_info)
        if action:
            action(line)
    def make_preamble_divisions(self):
        """
        Requires:
            nothing
        Returns:
            self.__all_lists, as left by the list and override table
            parsers (the original file is rewritten in place)
        Logic:
            Read one line in at a time, keep track of the number of the
            last opened and closed bracket, and call the handler for the
            current state. As in the original loop, the handler is also
            called once with the empty string that readline() returns at
            end of file.
            If no handler is registered for the current state, a diagnostic
            is written to stderr and the line is skipped (the state used to
            be printed to stdout before None was called).
            Both files are closed even if a handler raises.
        """
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                line_to_read = 1
                while line_to_read:
                    line_to_read = read_obj.readline()
                    line = line_to_read
                    self.__token_info = line[:16]
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                        self.__ob_group += 1
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
                        self.__ob_group -= 1
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        sys.stderr.write(
                            'no matching state in module preamble_div.py\n')
                        sys.stderr.write(self.__state + '\n')
                    else:
                        action(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            # keep a copy of the intermediate result for debugging
            copy_obj.copy_file(self.__write_to, "preamble_div.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        return self.__all_lists

View File

@ -0,0 +1,145 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys,os
from libprs500.ebooks.rtf2xml import copy
class Preamble:
    """
    Fix the remaining parts of the preamble. This module does very little. It
    makes sure that no text gets put in the revision or list table. In the
    future, when I understand how to interpret the revision table and list
    table, I will make these methods more functional.
    """
    def __init__(self, file, bug_handler, platform, default_font, code_page,
            copy=None, temp_dir=None):
        """
        Required:
            file--file to parse
            bug_handler -- exception class raised when the state machine
            has no handler for its state; also handed to copy.Copy
            platform --Windows or Macintosh
            default_font -- the default font
            code_page --the code page (ansi1252, for example)
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --where to output temporary results (default is
            directory from which the script is run.)
        Returns:
            nothing
        """
        self.__file=file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__default_font = default_font
        self.__code_page = code_page
        self.__platform = platform
        if temp_dir:
            self.__write_to = os.path.join(temp_dir,"info_table_info.data")
        else:
            self.__write_to = "info_table_info.data"
    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__state = 'default'
        self.__text_string = ''
        # handler for each state of the state machine
        self.__state_dict = {
            'default'       : self.__default_func,
            'revision'      : self.__revision_table_func,
            'list_table'    : self.__list_table_func,
            'body'          : self.__body_func,
        }
        # marker lines that trigger an action while in the default state
        self.__default_dict = {
            'mi<mk<rtfhed-beg'  : self.__found_rtf_head_func,
            'mi<mk<listabbeg_'  : self.__found_list_table_func,
            'mi<mk<revtbl-beg'  : self.__found_revision_table_func,
            'mi<mk<body-open_'  : self.__found_body_func,
        }
    def __default_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            If the token info is one of the markers in the default
            dictionary, run the action for that marker. Otherwise copy the
            line to the output file.
        """
        action = self.__default_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)
    def __found_rtf_head_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing.
        Logic:
            Write to the output file the default font info, the code page
            info, and the platform info.
        """
        self.__write_obj.write(
            'mi<tg<empty-att_<rtf-definition'
            '<default-font>%s<code-page>%s'
            '<platform>%s\n' % (self.__default_font, self.__code_page,
            self.__platform)
        )
    def __found_list_table_func(self, line):
        """
        Enter the list table state. The marker line is not written.
        """
        self.__state = 'list_table'
    def __list_table_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            Go back to the default state at the end-of-list-table marker.
            Drop any text lines (lines starting with 'tx'); copy all other
            lines to the output file.
        """
        if self.__token_info == 'mi<mk<listabend_':
            self.__state = 'default'
        elif line[0:2] == 'tx':
            pass
        else:
            self.__write_obj.write(line)
    def __found_revision_table_func(self, line):
        """
        Enter the revision table state. The marker line is not written.
        """
        self.__state = 'revision'
    def __revision_table_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            Go back to the default state at the end-of-revision-table
            marker. Drop any text lines (lines starting with 'tx'); copy
            all other lines to the output file.
        """
        if self.__token_info == 'mi<mk<revtbl-end':
            self.__state = 'default'
        elif line[0:2] == 'tx':
            pass
        else:
            self.__write_obj.write(line)
    def __found_body_func(self, line):
        """
        Enter the body state and copy the marker line to the output file.
        """
        self.__state = 'body'
        self.__write_obj.write(line)
    def __body_func(self, line):
        """
        Copy the line to the output file unchanged.
        """
        self.__write_obj.write(line)
    def __process_line(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            Store the first 16 characters of the line as the token info
            and pass the line to the handler for the current state.
        Raises:
            self.__bug_handler if there is no handler for the state.
            (Before, a message was written to stderr and None was then
            called, ending in an unrelated TypeError.)
        """
        self.__token_info = line[:16]
        action = self.__state_dict.get(self.__state)
        if action is None:
            msg = 'no matching state in module preamble_rest.py\n'
            msg += 'state is "%s"\n' % self.__state
            raise self.__bug_handler(msg)
        action(line)
    def fix_preamble(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. The state can either be default, the revision table,
            the list table, or the body.
        """
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                for line in read_obj:
                    self.__process_line(line)
            finally:
                # close both files even if a handler raises
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            # NOTE(review): this debug copy uses the name "preamble_div.data",
            # which looks copied from the preamble_div module -- confirm
            # whether "preamble_rest.data" was intended.
            copy_obj.copy_file(self.__write_to, "preamble_div.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,826 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import os, re, tempfile
from libprs500.ebooks.rtf2xml import copy, check_brackets
class ProcessTokens:
"""
Process each token on a line and add information that will be useful for
later processing. Information will be put on one line, delimited by "<"
for main fields, and ">" for sub fields
"""
def __init__(self,
in_file,
exception_handler,
bug_handler,
copy = None,
run_level = 1,
):
self.__file = in_file
self.__bug_handler = bug_handler
self.__copy = copy
self.__run_level = run_level
self.__write_to = tempfile.mktemp()
self.initiate_token_dict()
##self.initiate_token_actions()
self.compile_expressions()
self.__bracket_count=0
self.__exception_handler = exception_handler
self.__bug_handler = bug_handler
def compile_expressions(self):
self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
self.__utf_exp = re.compile(r'(&.*?;)')
def initiate_token_dict(self):
self.__return_code = 0
self.dict_token={
# unicode
'mshex' : ('nu', '__________', self.__ms_hex_func),
# brackets
'{' : ('nu', '{', self.ob_func),
'}' : ('nu', '}', self.cb_func),
# microsoft characters
'ldblquote' : ('mc', 'ldblquote', self.ms_sub_func),
'rdblquote' : ('mc', 'rdblquote', self.ms_sub_func),
'rquote' : ('mc', 'rquote', self.ms_sub_func),
'lquote' : ('mc', 'lquote', self.ms_sub_func),
'emdash' : ('mc', 'emdash', self.ms_sub_func),
'endash' : ('mc', 'endash', self.ms_sub_func),
'bullet' : ('mc', 'bullet', self.ms_sub_func),
'~' : ('mc', '~', self.ms_sub_func),
'tab' : ('mc', 'tab', self.ms_sub_func),
'_' : ('mc', '_', self.ms_sub_func),
';' : ('mc', ';', self.ms_sub_func),
# this must be wrong
'-' : ('mc', '-', self.ms_sub_func),
# misc => ml
'*' : ('ml', 'asterisk__', self.default_func),
':' : ('ml', 'colon_____', self.default_func),
# text
'backslash' : ('nu', '\\', self.text_func),
'ob' : ('nu', '{', self.text_func),
'cb' : ('nu', '}', self.text_func),
# paragraph formatting => pf
'page' : ('pf', 'page-break', self.default_func),
'par' : ('pf', 'par-end___', self.default_func),
'pard' : ('pf', 'par-def___', self.default_func),
'keepn' : ('pf', 'keep-w-nex', self.bool_st_func),
'widctlpar' : ('pf', 'widow-cntl', self.bool_st_func),
'adjustright' : ('pf', 'adjust-rgt', self.bool_st_func),
'lang' : ('pf', 'language__', self.__language_func),
'ri' : ('pf', 'right-inde', self.divide_by_20),
'fi' : ('pf', 'fir-ln-ind', self.divide_by_20),
'li' : ('pf', 'left-inden', self.divide_by_20),
'sb' : ('pf', 'space-befo', self.divide_by_20),
'sa' : ('pf', 'space-afte', self.divide_by_20),
'sl' : ('pf', 'line-space', self.divide_by_20),
'deftab' : ('pf', 'default-ta', self.divide_by_20),
'ql' : ('pf', 'align_____<left', self.two_part_func),
'qc' : ('pf', 'align_____<cent', self.two_part_func),
'qj' : ('pf', 'align_____<just', self.two_part_func),
'qr' : ('pf', 'align_____<right', self.two_part_func),
'nowidctlpar' : ('pf', 'widow-cntr<false', self.two_part_func),
'tx' : ('pf', 'tab-stop__', self.divide_by_20),
'tb' : ('pf', 'tab-bar-st', self.divide_by_20),
'tqr' : ('pf', 'tab-right_', self.default_func),
'tqdec' : ('pf', 'tab-dec___', self.default_func),
'tqc' : ('pf', 'tab-center', self.default_func),
'tlul' : ('pf', 'leader-und', self.default_func),
'tlhyph' : ('pf', 'leader-hyp', self.default_func),
'tldot' : ('pf', 'leader-dot', self.default_func),
# stylesheet = > ss
'stylesheet' : ('ss', 'style-shet', self.default_func),
'sbasedon' : ('ss', 'based-on__', self.default_func),
'snext' : ('ss', 'next-style', self.default_func),
'cs' : ('ss', 'char-style', self.default_func),
's' : ('ss', 'para-style', self.default_func),
# graphics => gr
'pict' : ('gr', 'picture___', self.default_func),
'objclass' : ('gr', 'obj-class_', self.default_func),
'macpict' : ('gr', 'mac-pic___', self.default_func),
# section => sc
'sect' : ('sc', 'section___', self.default_func),
'sectd' : ('sc', 'sect-defin', self.default_func),
'endhere' : ('sc', 'sect-note_', self.default_func),
# list=> ls
'pntext' : ('ls', 'list-text_', self.default_func),
# this line must be wrong because it duplicates an earlier one
'listtext' : ('ls', 'list-text_', self.default_func),
'pn' : ('ls', 'list______', self.default_func),
'pnseclvl' : ('ls', 'list-level', self.default_func),
'pncard' : ('ls', 'list-cardi', self.bool_st_func),
'pndec' : ('ls', 'list-decim', self.bool_st_func),
'pnucltr' : ('ls', 'list-up-al', self.bool_st_func),
'pnucrm' : ('ls', 'list-up-ro', self.bool_st_func),
'pnord' : ('ls', 'list-ord__', self.bool_st_func),
'pnordt' : ('ls', 'list-ordte', self.bool_st_func),
'pnlvlblt' : ('ls', 'list-bulli', self.bool_st_func),
'pnlvlbody' : ('ls', 'list-simpi', self.bool_st_func),
'pnlvlcont' : ('ls', 'list-conti', self.bool_st_func),
'pnhang' : ('ls', 'list-hang_', self.bool_st_func),
'pntxtb' : ('ls', 'list-tebef', self.bool_st_func),
'ilvl' : ('ls', 'list-level', self.default_func),
'ls' : ('ls', 'list-id___', self.default_func),
'pnstart' : ('ls', 'list-start', self.default_func),
'itap' : ('ls', 'nest-level', self.default_func),
'leveltext' : ('ls', 'level-text', self.default_func),
'levelnumbers' : ('ls', 'level-numb', self.default_func),
'list' : ('ls', 'list-in-tb', self.default_func),
'listlevel' : ('ls', 'list-tb-le', self.default_func),
'listname' : ('ls', 'list-name_', self.default_func),
'listtemplateid' : ('ls', 'ls-tem-id_', self.default_func),
'leveltemplateid' : ('ls', 'lv-tem-id_', self.default_func),
'listhybrid' : ('ls', 'list-hybri', self.default_func),
'levelstartat' : ('ls', 'level-star', self.default_func),
'levelspace' : ('ls', 'level-spac', self.divide_by_20),
'levelindent' : ('ls', 'level-inde', self.default_func),
'levelnfc' : ('ls', 'level-type', self.__list_type_func),
'levelnfcn' : ('ls', 'level-type', self.__list_type_func),
'listid' : ('ls', 'lis-tbl-id', self.default_func),
'listoverride' : ('ls', 'lis-overid', self.default_func),
# duplicate
'pnlvl' : ('ls', 'list-level', self.default_func),
# root info => ri
'rtf' : ('ri', 'rtf_______', self.default_func),
'deff' : ('ri', 'deflt-font', self.default_func),
'mac' : ('ri', 'macintosh_', self.default_func),
'ansi' : ('ri', 'ansi______', self.default_func),
'ansicpg' : ('ri', 'ansi-codpg', self.default_func),
# notes => nt
'footnote' : ('nt', 'footnote__', self.default_func),
'ftnalt' : ('nt', 'type______<endnote', self.two_part_func),
# anchor => an
'tc' : ('an', 'toc_______', self.default_func),
'bkmkstt' : ('an', 'book-mk-st', self.default_func),
'bkmkstart' : ('an', 'book-mk-st', self.default_func),
'bkmkend' : ('an', 'book-mk-en', self.default_func),
'xe' : ('an', 'index-mark', self.default_func),
'rxe' : ('an', 'place_____', self.default_func),
# index => in
'bxe' : ('in', 'index-bold', self.default_func),
'ixe' : ('in', 'index-ital', self.default_func),
'txe' : ('in', 'index-see_', self.default_func),
# table of contents => tc
'tcl' : ('tc', 'toc-level_', self.default_func),
'tcn' : ('tc', 'toc-sup-nu', self.default_func),
# field => fd
'field' : ('fd', 'field_____', self.default_func),
'fldinst' : ('fd', 'field-inst', self.default_func),
'fldrslt' : ('fd', 'field-rslt', self.default_func),
'datafield' : ('fd', 'datafield_', self.default_func),
# info-tables => it
'fonttbl' : ('it', 'font-table', self.default_func),
'colortbl' : ('it', 'colr-table', self.default_func),
'listoverridetable' : ('it', 'lovr-table', self.default_func),
'listtable' : ('it', 'listtable_', self.default_func),
'revtbl' : ('it', 'revi-table', self.default_func),
# character info => ci
'b' : ('ci', 'bold______', self.bool_st_func),
'blue' : ('ci', 'blue______', self.color_func),
'caps' : ('ci', 'caps______', self.bool_st_func),
'cf' : ('ci', 'font-color', self.default_func),
'chftn' : ('ci', 'footnot-mk', self.bool_st_func),
'dn' : ('ci', 'font-down_', self.divide_by_2),
'embo' : ('ci', 'emboss____', self.bool_st_func),
'f' : ('ci', 'font-style', self.default_func),
'fs' : ('ci', 'font-size_', self.divide_by_2),
'green' : ('ci', 'green_____', self.color_func),
'i' : ('ci', 'italics___', self.bool_st_func),
'impr' : ('ci', 'engrave___', self.bool_st_func),
'outl' : ('ci', 'outline___', self.bool_st_func),
'plain' : ('ci', 'plain_____', self.bool_st_func),
'red' : ('ci', 'red_______', self.color_func),
'scaps' : ('ci', 'small-caps', self.bool_st_func),
'shad' : ('ci', 'shadow____', self.bool_st_func),
'strike' : ('ci', 'strike-thr', self.bool_st_func),
'striked' : ('ci', 'dbl-strike', self.bool_st_func),
'sub' : ('ci', 'subscript_', self.bool_st_func),
'super' : ('ci', 'superscrip', self.bool_st_func),
'nosupersub' : ('ci', 'no-su-supe', self.__no_sup_sub_func),
'up' : ('ci', 'font-up___', self.divide_by_2),
'v' : ('ci', 'hidden____', self.default_func),
# table => tb
'trowd' : ('tb', 'row-def___', self.default_func),
'cell' : ('tb', 'cell______', self.default_func),
'row' : ('tb', 'row_______', self.default_func),
'intbl' : ('tb', 'in-table__', self.default_func),
'cols' : ('tb', 'columns___', self.default_func),
'trleft' : ('tb', 'row-pos-le', self.divide_by_20),
'cellx' : ('tb', 'cell-posit', self.divide_by_20),
'trhdr' : ('tb', 'row-header', self.default_func),
# preamble => pr
# document information => di
'info' : ('di', 'doc-info__', self.default_func),
'author' : ('di', 'author____', self.default_func),
'operator' : ('di', 'operator__', self.default_func),
'title' : ('di', 'title_____', self.default_func),
'keywords' : ('di', 'keywords__', self.default_func),
'doccomm' : ('di', 'doc-notes_', self.default_func),
'comment' : ('di', 'doc-notes_', self.default_func),
'subject' : ('di', 'subject___', self.default_func),
'creatim' : ('di', 'create-tim', self.default_func),
'yr' : ('di', 'year______', self.default_func),
'mo' : ('di', 'month_____', self.default_func),
'dy' : ('di', 'day_______', self.default_func),
'min' : ('di', 'minute____', self.default_func),
'revtim' : ('di', 'revis-time', self.default_func),
'nofwords' : ('di', 'num-of-wor', self.default_func),
'nofchars' : ('di', 'num-of-chr', self.default_func),
'nofpages' : ('di', 'num-of-pag', self.default_func),
'edmins' : ('di', 'edit-time_', self.default_func),
# headers and footers => hf
'headerf' : ('hf', 'head-first', self.default_func),
'headerl' : ('hf', 'head-left_', self.default_func),
'headerr' : ('hf', 'head-right', self.default_func),
'footerf' : ('hf', 'foot-first', self.default_func),
'footerl' : ('hf', 'foot-left_', self.default_func),
'footerr' : ('hf', 'foot-right', self.default_func),
'header' : ('hf', 'header____', self.default_func),
'footer' : ('hf', 'footer____', self.default_func),
# page => pa
'margl' : ('pa', 'margin-lef', self.divide_by_20),
'margr' : ('pa', 'margin-rig', self.divide_by_20),
'margb' : ('pa', 'margin-bot', self.divide_by_20),
'margt' : ('pa', 'margin-top', self.divide_by_20),
'gutter' : ('pa', 'gutter____', self.divide_by_20),
'paperw' : ('pa', 'paper-widt', self.divide_by_20),
'paperh' : ('pa', 'paper-hght', self.divide_by_20),
# annotation => an
'annotation' : ('an', 'annotation', self.default_func),
# underline
'ul' : ('ul', 'underlined<continous', self.two_part_func),
'uld' : ('ul', 'underlined<dotted', self.two_part_func),
'uldash' : ('ul', 'underlined<dash', self.two_part_func),
'uldashd' : ('ul', 'underlined<dash-dot', self.two_part_func),
'uldashdd' : ('ul', 'underlined<dash-dot-dot', self.two_part_func),
'uldb' : ('ul', 'underlined<double', self.two_part_func),
'ulhwave' : ('ul', 'underlined<heavy-wave', self.two_part_func),
'ulldash' : ('ul', 'underlined<long-dash', self.two_part_func),
'ulth' : ('ul', 'underlined<thich', self.two_part_func),
'ulthd' : ('ul', 'underlined<thick-dotted', self.two_part_func),
'ulthdash' : ('ul', 'underlined<thick-dash', self.two_part_func),
'ulthdashd' : ('ul', 'underlined<thick-dash-dot', self.two_part_func),
'ulthdashdd' : ('ul', 'underlined<thick-dash-dot-dot', self.two_part_func),
'ulthldash' : ('ul', 'underlined<thick-long-dash', self.two_part_func),
'ululdbwave' : ('ul', 'underlined<double-wave', self.two_part_func),
'ulw' : ('ul', 'underlined<word', self.two_part_func),
'ulwave' : ('ul', 'underlined<wave', self.two_part_func),
'ulnone' : ('ul', 'underlined<false', self.two_part_func),
# border => bd
'trbrdrh' : ('bd', 'bor-t-r-hi', self.default_func),
'trbrdrv' : ('bd', 'bor-t-r-vi', self.default_func),
'trbrdrt' : ('bd', 'bor-t-r-to', self.default_func),
'trbrdrl' : ('bd', 'bor-t-r-le', self.default_func),
'trbrdrb' : ('bd', 'bor-t-r-bo', self.default_func),
'trbrdrr' : ('bd', 'bor-t-r-ri', self.default_func),
'clbrdrb' : ('bd', 'bor-cel-bo', self.default_func),
'clbrdrt' : ('bd', 'bor-cel-to', self.default_func),
'clbrdrl' : ('bd', 'bor-cel-le', self.default_func),
'clbrdrr' : ('bd', 'bor-cel-ri', self.default_func),
'brdrb' : ('bd', 'bor-par-bo', self.default_func),
'brdrt' : ('bd', 'bor-par-to', self.default_func),
'brdrl' : ('bd', 'bor-par-le', self.default_func),
'brdrr' : ('bd', 'bor-par-ri', self.default_func),
'box' : ('bd', 'bor-par-bx', self.default_func),
'chbrdr' : ('bd', 'bor-par-bo', self.default_func),
'brdrbtw' : ('bd', 'bor-for-ev', self.default_func),
'brdrbar' : ('bd', 'bor-outsid', self.default_func),
'brdrnone' : ('bd', 'bor-none__<false', self.two_part_func),
# border type => bt
'brdrs' : ('bt', 'bdr-single', self.default_func),
'brdrth' : ('bt', 'bdr-doubtb', self.default_func),
'brdrsh' : ('bt', 'bdr-shadow', self.default_func),
'brdrdb' : ('bt', 'bdr-double', self.default_func),
'brdrdot' : ('bt', 'bdr-dotted', self.default_func),
'brdrdash' : ('bt', 'bdr-dashed', self.default_func),
'brdrhair' : ('bt', 'bdr-hair__', self.default_func),
'brdrinset' : ('bt', 'bdr-inset_', self.default_func),
'brdrdashsm' : ('bt', 'bdr-das-sm', self.default_func),
'brdrdashd' : ('bt', 'bdr-dot-sm', self.default_func),
'brdrdashdd' : ('bt', 'bdr-dot-do', self.default_func),
'brdroutset' : ('bt', 'bdr-outset', self.default_func),
'brdrtriple' : ('bt', 'bdr-trippl', self.default_func),
'brdrtnthsg' : ('bt', 'bdr-thsm__', self.default_func),
'brdrthtnsg' : ('bt', 'bdr-htsm__', self.default_func),
'brdrtnthtnsg' : ('bt', 'bdr-hthsm_', self.default_func),
'brdrtnthmg' : ('bt', 'bdr-thm___', self.default_func),
'brdrthtnmg' : ('bt', 'bdr-htm___', self.default_func),
'brdrtnthtnmg' : ('bt', 'bdr-hthm__', self.default_func),
'brdrtnthlg' : ('bt', 'bdr-thl___', self.default_func),
'brdrtnthtnlg' : ('bt', 'bdr-hthl__', self.default_func),
'brdrwavy' : ('bt', 'bdr-wavy__', self.default_func),
'brdrwavydb' : ('bt', 'bdr-d-wav_', self.default_func),
'brdrdashdotstr' : ('bt', 'bdr-strip_', self.default_func),
'brdremboss' : ('bt', 'bdr-embos_', self.default_func),
'brdrengrave' : ('bt', 'bdr-engra_', self.default_func),
'brdrframe' : ('bt', 'bdr-frame_', self.default_func),
'brdrw' : ('bt', 'bdr-li-wid', self.divide_by_20),
'brsp' : ('bt', 'bdr-sp-wid', self.divide_by_20),
'brdrcf' : ('bt', 'bdr-color_', self.default_func),
# comments
# 'comment' : ('cm', 'comment___', self.default_func),
}
self.__number_type_dict = {
0: 'Arabic',
1: 'uppercase Roman numeral',
2: 'lowercase Roman numeral',
3: 'uppercase letter',
4: 'lowercase letter',
5: 'ordinal number',
6: 'cardianl text number',
7: 'ordinal text number',
10: 'Kanji numbering without the digit character',
11: 'Kanji numbering with the digit character',
1246: 'phonetic Katakana characters in aiueo order',
1346: 'phonetic katakana characters in iroha order',
14: 'double byte character',
15: 'single byte character',
16: 'Kanji numbering 3',
17: 'Kanji numbering 4',
18: 'Circle numbering' ,
19: 'double-byte Arabic numbering',
2046: 'phonetic double-byte Katakana characters',
2146: 'phonetic double-byte katakana characters',
22: 'Arabic with leading zero',
23: 'bullet',
24: 'Korean numbering 2',
25: 'Korean numbering 1',
26: 'Chinese numbering 1',
27: 'Chinese numbering 2',
28: 'Chinese numbering 3',
29: 'Chinese numbering 4',
30: 'Chinese Zodiac numbering 1',
31: 'Chinese Zodiac numbering 2',
32: 'Chinese Zodiac numbering 3',
33: 'Taiwanese double-byte numbering 1',
34: 'Taiwanese double-byte numbering 2',
35: 'Taiwanese double-byte numbering 3',
36: 'Taiwanese double-byte numbering 4',
37: 'Chinese double-byte numbering 1',
38: 'Chinese double-byte numbering 2',
39: 'Chinese double-byte numbering 3',
40: 'Chinese double-byte numbering 4',
41: 'Korean double-byte numbering 1',
42: 'Korean double-byte numbering 2',
43: 'Korean double-byte numbering 3',
44: 'Korean double-byte numbering 4',
45: 'Hebrew non-standard decimal',
46: 'Arabic Alif Ba Tah',
47: 'Hebrew Biblical standard',
48: 'Arabic Abjad style',
255: 'No number',
}
self.__language_dict = {
1078 : 'Afrikaans',
1052 : 'Albanian',
1025 : 'Arabic',
5121 : 'Arabic Algeria',
15361 : 'Arabic Bahrain',
3073 : 'Arabic Egypt',
1 : 'Arabic General',
2049 : 'Arabic Iraq',
11265 : 'Arabic Jordan',
13313 : 'Arabic Kuwait',
12289 : 'Arabic Lebanon',
4097 : 'Arabic Libya',
6145 : 'Arabic Morocco',
8193 : 'Arabic Oman',
16385 : 'Arabic Qatar',
10241 : 'Arabic Syria',
7169 : 'Arabic Tunisia',
14337 : 'Arabic U.A.E.',
9217 : 'Arabic Yemen',
1067 : 'Armenian',
1101 : 'Assamese',
2092 : 'Azeri Cyrillic',
1068 : 'Azeri Latin',
1069 : 'Basque',
1093 : 'Bengali',
4122 : 'Bosnia Herzegovina',
1026 : 'Bulgarian',
1109 : 'Burmese',
1059 : 'Byelorussian',
1027 : 'Catalan',
2052 : 'Chinese China',
4 : 'Chinese General',
3076 : 'Chinese Hong Kong',
4100 : 'Chinese Singapore',
1028 : 'Chinese Taiwan',
1050 : 'Croatian',
1029 : 'Czech',
1030 : 'Danish',
2067 : 'Dutch Belgium',
1043 : 'Dutch Standard',
3081 : 'English Australia',
10249 : 'English Belize',
2057 : 'English British',
4105 : 'English Canada',
9225 : 'English Caribbean',
9 : 'English General',
6153 : 'English Ireland',
8201 : 'English Jamaica',
5129 : 'English New Zealand',
13321 : 'English Philippines',
7177 : 'English South Africa',
11273 : 'English Trinidad',
1033 : 'English United States',
1061 : 'Estonian',
1080 : 'Faerose',
1065 : 'Farsi',
1035 : 'Finnish',
1036 : 'French',
2060 : 'French Belgium',
11276 : 'French Cameroon',
3084 : 'French Canada',
12300 : 'French Cote d\'Ivoire',
5132 : 'French Luxembourg',
13324 : 'French Mali',
6156 : 'French Monaco',
8204 : 'French Reunion',
10252 : 'French Senegal',
4108 : 'French Swiss',
7180 : 'French West Indies',
9228 : 'French Democratic Republic of the Congo',
1122 : 'Frisian',
1084 : 'Gaelic',
2108 : 'Gaelic Ireland',
1110 : 'Galician',
1079 : 'Georgian',
1031 : 'German',
3079 : 'German Austrian',
5127 : 'German Liechtenstein',
4103 : 'German Luxembourg',
2055 : 'German Switzerland',
1032 : 'Greek',
1095 : 'Gujarati',
1037 : 'Hebrew',
1081 : 'Hindi',
1038 : 'Hungarian',
1039 : 'Icelandic',
1057 : 'Indonesian',
1040 : 'Italian',
2064 : 'Italian Switzerland',
1041 : 'Japanese',
1099 : 'Kannada',
1120 : 'Kashmiri',
2144 : 'Kashmiri India',
1087 : 'Kazakh',
1107 : 'Khmer',
1088 : 'Kirghiz',
1111 : 'Konkani',
1042 : 'Korean',
2066 : 'Korean Johab',
1108 : 'Lao',
1062 : 'Latvian',
1063 : 'Lithuanian',
2087 : 'Lithuanian Classic',
1086 : 'Malay',
2110 : 'Malay Brunei Darussalam',
1100 : 'Malayalam',
1082 : 'Maltese',
1112 : 'Manipuri',
1102 : 'Marathi',
1104 : 'Mongolian',
1121 : 'Nepali',
2145 : 'Nepali India',
1044 : 'Norwegian Bokmal',
2068 : 'Norwegian Nynorsk',
1096 : 'Oriya',
1045 : 'Polish',
1046 : 'Portuguese (Brazil)',
2070 : 'Portuguese (Portugal)',
1094 : 'Punjabi',
1047 : 'Rhaeto-Romanic',
1048 : 'Romanian',
2072 : 'Romanian Moldova',
1049 : 'Russian',
2073 : 'Russian Moldova',
1083 : 'Sami Lappish',
1103 : 'Sanskrit',
3098 : 'Serbian Cyrillic',
2074 : 'Serbian Latin',
1113 : 'Sindhi',
1051 : 'Slovak',
1060 : 'Slovenian',
1070 : 'Sorbian',
11274 : 'Spanish Argentina',
16394 : 'Spanish Bolivia',
13322 : 'Spanish Chile',
9226 : 'Spanish Colombia',
5130 : 'Spanish Costa Rica',
7178 : 'Spanish Dominican Republic',
12298 : 'Spanish Ecuador',
17418 : 'Spanish El Salvador',
4106 : 'Spanish Guatemala',
18442 : 'Spanish Honduras',
2058 : 'Spanish Mexico',
3082 : 'Spanish Modern',
19466 : 'Spanish Nicaragua',
6154 : 'Spanish Panama',
15370 : 'Spanish Paraguay',
10250 : 'Spanish Peru',
20490 : 'Spanish Puerto Rico',
1034 : 'Spanish Traditional',
14346 : 'Spanish Uruguay',
8202 : 'Spanish Venezuela',
1072 : 'Sutu',
1089 : 'Swahili',
1053 : 'Swedish',
2077 : 'Swedish Finland',
1064 : 'Tajik',
1097 : 'Tamil',
1092 : 'Tatar',
1098 : 'Telugu',
1054 : 'Thai',
1105 : 'Tibetan',
1073 : 'Tsonga',
1074 : 'Tswana',
1055 : 'Turkish',
1090 : 'Turkmen',
1058 : 'Ukranian',
1056 : 'Urdu',
2080 : 'Urdu India',
2115 : 'Uzbek Cyrillic',
1091 : 'Uzbek Latin',
1075 : 'Venda',
1066 : 'Vietnamese',
1106 : 'Welsh',
1076 : 'Xhosa',
1085 : 'Yiddish',
1077 : 'Zulu',
1024 : 'Unkown',
255 : 'Unkown',
}
"""
# unknown
# These must get passed on because they occure after \*
'do' : ('un', 'unknown___', self.default_func),
'company' : ('un', 'company___', self.default_func),
'shpinst' : ('un', 'unknown___', self.default_func),
'panose' : ('un', 'unknown___', self.default_func),
'falt' : ('un', 'unknown___', self.default_func),
'listoverridetable' : ('un', 'unknown___', self.default_func),
'category' : ('un', 'unknown___', self.default_func),
'template' : ('un', 'unknown___', self.default_func),
'ud' : ('un', 'unknown___', self.default_func),
'formfield' : ('un', 'unknown___', self.default_func),
'ts' : ('un', 'unknown___', self.default_func),
'rsidtbl' : ('un', 'unknown___', self.default_func),
'generator' : ('un', 'unknown___', self.default_func),
'ftnsep' : ('un', 'unknown___', self.default_func),
'aftnsep' : ('un', 'unknown___', self.default_func),
'aftnsepc' : ('un', 'unknown___', self.default_func),
'aftncn' : ('un', 'unknown___', self.default_func),
'objclass' : ('un', 'unknown___', self.default_func),
'objdata' : ('un', 'unknown___', self.default_func),
'picprop' : ('un', 'unknown___', self.default_func),
'blipuid' : ('un', 'unknown___', self.default_func),
"""
def __ms_hex_func(self, pre, token, num):
num = num[1:] # chop off leading 0, which I added
num = num.upper() # the mappings store hex in caps
return 'tx<hx<__________<\'%s\n' % num # add an ' for the mappings
def ms_sub_func(self, pre, token, num):
return 'tx<mc<__________<%s\n' % token
def default_func(self, pre, token, num):
if num == None:
num = 'true'
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
def __list_type_func(self, pre, token, num):
type = 'arabic'
if num == None:
type = 'Arabic'
else:
try:
num = int(num)
except ValueError:
if self.__run_level > 3:
msg = 'number "%s" cannot be converted to integer\n' % num
raise self.__bug_handler, msg
type = self.__number_type_dict.get(num)
if type == None:
if self.__run_level > 3:
msg = 'No type for "%s" in self.__number_type_dict\n'
raise self.__bug_handler
type = 'Arabic'
return 'cw<%s<%s<nu<%s\n' % (pre, token, type)
def __language_func(self, pre, token, num):
lang_name = self.__language_dict.get(int(num))
if not lang_name:
lang_name = "not defined"
if self.__run_level > 3:
msg = 'No entry for number "%s"' % num
raise self.__bug_handler, msg
return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name)
def two_part_func(self, pre, token, num):
list = token.split("<")
token = list[0]
num = list[1]
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
##return 'cw<nu<nu<nu<%s>num<%s\n' % (token, num)
def divide_by_2(self, pre, token, num):
num = self.divide_num(num, 2)
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
def divide_by_20(self, pre, token, num):
num = self.divide_num(num, 20)
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
def text_func(self, pre, token, num=None):
return 'tx<nu<__________<%s\n' % token
def ob_func(self, pre, token, num=None):
self.__bracket_count += 1
##return 'ob<%04d\n' % self.__bracket_count
return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
def cb_func(self, pre, token, num=None):
##line = 'cb<%04d\n' % self.__bracket_count
line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
self.__bracket_count -= 1
return line
def color_func(self, pre, token, num):
third_field = 'nu'
if num[-1] == ';':
num = num[:-1]
third_field = 'en'
num = str('%X' % int(num))
if len(num) != 2:
num = "0" + num
return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, num)
##return 'cw<cl<%s<nu<nu<%s>%s<%s\n' % (third_field, token, num, token)
def bool_st_func(self, pre, token, num):
if num is None or num == '' or num == '1':
return 'cw<%s<%s<nu<true\n' % (pre, token)
##return 'cw<nu<nu<nu<%s>true<%s\n' % (token, token)
elif num == '0':
return 'cw<%s<%s<nu<false\n' % (pre, token)
##return 'cw<nu<nu<nu<%s>false<%s\n' % (token, token)
else:
msg = 'boolean should have some value module process tokens\n'
msg += 'token is ' + token + "\n"
msg += "'" + num + "'" + "\n"
raise self.__bug_handler, msg
def __no_sup_sub_func(self, pre, token, num):
the_string = 'cw<ci<subscript_<nu<false\n'
the_string += 'cw<ci<superscrip<nu<false\n'
return the_string
def divide_num(self, numerator, denominator):
try:
numerator = float(numerator)
except TypeError, msg:
if self.__run_level > 3:
msg = 'no number to process?\n'
msg += 'this indicates that the token '
msg += ' \(\\li\) should have a number and does not\n'
msg += 'numerator is "%s"\n' % numerator
msg += 'denominator is "%s"\n' % denominator
raise self.__bug_handler, msg
if 5 > self.__return_code:
self.__return_code = 5
return 0
num = '%0.2f' % round(numerator/denominator, 2)
return num
string_num = str(num)
if string_num[-2:] == ".0":
string_num = string_num[:-2]
return string_num
def split_let_num(self, token):
match_obj = re.search(self.__num_exp,token)
if match_obj != None:
first = match_obj.group(1)
second = match_obj.group(2)
if not second:
if self.__run_level > 3:
msg = "token is '%s' \n" % token
raise self.__bug_handler, msg
return first, 0
else:
if self.__run_level > 3:
msg = "token is '%s' \n" % token
raise self.__bug_handler
return token, 0
return first, second
def convert_to_hex(self,number):
"""Convert a string to uppercase hexidecimal"""
num = int(number)
try:
hex_num = "%X" % num
return hex_num
except:
raise self.__bug_handler
def process_cw(self, token):
"""Change the value of the control word by determining what dictionary
it belongs to"""
special = [ '*', ':', '}', '{', '~', '_', '-', ';' ]
##if token != "{" or token != "}":
token = token[1:] # strip off leading \
token = token.replace(" ", "")
##if not token: return
only_alpha = token.isalpha()
num = None
if not only_alpha and token not in special:
token, num = self.split_let_num(token)
pre, token, action = self.dict_token.get(token, (None, None, None))
if action:
return action(pre, token, num)
# unused function
def initiate_token_actions(self):
self.action_for_token={
'{' : self.ob_func,
'}' : self.cb_func,
'\\' : self.process_cw,
}
# unused function
def evaluate_token(self,token):
"""Evaluate tokens. Return a value if the token is not a
control word. Otherwise, pass token onto another method
for further evaluation."""
token, action = self.dict_token.get(token[0:1])
if action:
line = action(token)
return line
else :
return 'tx<nu<nu<nu<nu<%s\n' % token
def __check_brackets(self, in_file):
self.__check_brack_obj = check_brackets.CheckBrackets\
(file = in_file)
good_br = self.__check_brack_obj.check_brackets()[0]
if not good_br:
return 1
def process_tokens(self):
    """
    Requires:
        nothing
    Returns:
        self.__return_code, once the file has been rewritten in place
    Raises:
        self.__exception_handler, if the file is not ascii, does not start
        with an open bracket followed by a control word beginning with
        rtf, contains the token "backslash space", is empty, or has
        unmatched brackets.
    Logic:
        Main method for handling other methods. Read one token per line.
        Control words (lines starting with a backslash) are handed to
        process_cw; everything else is split on self.__utf_exp and written
        out as text. Pieces starting with an ampersand are written as
        'ut' text, all other pieces as 'nu' text.
        Both files are closed even when an exception is raised part way
        through the file.
    """
    import sys
    first_token = 0
    second_token = 0
    line_count = 0
    read_obj = open(self.__file, 'r')
    try:
        write_obj = open(self.__write_to, 'w')
        try:
            for line_to_read in read_obj:
                token = line_to_read.replace("\n","")
                if not token:
                    continue
                line_count += 1
                try:
                    token.decode('us-ascii')
                except UnicodeError:
                    # sys.exc_info() rather than "except X, name" so that
                    # the statement parses on every version of Python
                    msg = str(sys.exc_info()[1])
                    msg += 'Invalid RTF: File not ascii encoded.\n'
                    raise self.__exception_handler(msg)
                if not first_token:
                    if token != '\\{':
                        msg = 'Invalid RTF: document doesn\'t start with {\n'
                        raise self.__exception_handler(msg)
                    first_token = 1
                elif not second_token:
                    if token[0:4] != '\\rtf':
                        msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
                        raise self.__exception_handler(msg)
                    second_token = 1
                if token.find('\\ ') > -1:
                    msg ='Invalid RTF: token "\\ " not valid. \n'
                    raise self.__exception_handler(msg)
                elif token[0:1] == "\\":
                    line = self.process_cw(token)
                    if line is not None:
                        write_obj.write(line)
                else:
                    for field in re.split(self.__utf_exp, token):
                        if not field:
                            continue
                        if field[0:1] == '&':
                            write_obj.write('tx<ut<__________<%s\n' % field)
                        else:
                            write_obj.write('tx<nu<__________<%s\n' % field)
        finally:
            write_obj.close()
    finally:
        read_obj.close()
    if not line_count:
        msg ='Invalid RTF: file appears to be empty. \n'
        raise self.__exception_handler(msg)
    copy_obj = copy.Copy(bug_handler = self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "processed_tokens.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
    if self.__check_brackets(self.__file):
        msg = 'Invalid RTF: document does not have matching brackets.\n'
        raise self.__exception_handler(msg)
    return self.__return_code

View File

@ -0,0 +1,52 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import os, tempfile
from libprs500.ebooks.rtf2xml import copy
class ReplaceIllegals:
    """
    Replace illegal lower ascii characters.
    The characters with the codes listed in __illegal_nums are deleted
    from the file; everything else is left alone.
    """
    # codes of the characters that are removed. Tab (9), line feed (10) and
    # form feed (12) are not in the list, and neither is anything above 19.
    # NOTE(review): 12 and 20-31 may have been left out by accident --
    # confirm before widening the list, since it changes the output.
    __illegal_nums = (0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 18, 19)

    def __init__(self,
            in_file,
            copy = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file'--file to clean
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level' -- stored, not used by this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__copy = copy
        self.__run_level = run_level
        # NOTE(review): tempfile.mktemp() only picks a name; the standard
        # library documents it as unsafe. Kept so that no file is created
        # before replace_illegals() is called.
        self.__write_to = tempfile.mktemp()

    def __remove_illegals(self, line):
        """
        Requires:
            line -- a string
        Returns:
            the string without any of the characters in __illegal_nums
        """
        for num in self.__illegal_nums:
            line = line.replace(chr(num), '')
        return line

    def replace_illegals(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Copy the file line by line to a temporary file, removing the
            illegal characters on the way. Then put the result in place of
            the original and remove the temporary file. Both files are
            closed even if reading or writing fails.
        """
        read_obj = open(self.__file, 'r')
        try:
            write_obj = open(self.__write_to, 'w')
            try:
                for line in read_obj:
                    write_obj.write(self.__remove_illegals(line))
            finally:
                write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy()
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "replace_illegals.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,513 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile
from libprs500.ebooks.rtf2xml import copy
class Sections:
    r"""
    =================
    Purpose
    =================
    Write section tags for a tokenized file. (This module won't be of any use
    to you unless you use it as part of the other modules.)
    ---------------
    logic
    ---------------
    The tags for the first section breaks have already been written.
    RTF stores section breaks with the \sect tag. Each time this tag is
    encountered, add one to the counter.
    When I encounter the \sectd tag, I want to collect all the appropriate tokens
    that describe the section. When I reach a \pard, I know I can stop collecting
    tokens and write the section tags.
    The exception to this method occurs when sections occur in field blocks, such
    as the index. Normally, two section breaks occur within the index and other
    field-blocks. (If fewer or more section breaks occur, this code may not work.)
    I want the sections to occur outside of the index. That is, the index
    should be nested inside one section tag. After the index is complete, a new
    section should begin.
    In order to write the sections outside of the field blocks, I have to store
    all of the field block as a string. When I encounter the \sect tag, add one to
    the section counter, but store this number in a list. Likewise, store the
    information describing the section in another list.
    When I reach the end of the field block, choose the first item from the
    numbered list as the section number. Choose the first item in the description
    list as the values and attributes of the section. Enclose the field string
    between the section tags.
    Start a new section outside the field-block strings. Use the second number in
    the list; use the second item in the description list.
    CHANGE (2004-04-26) No longer write sections that occur in field-blocks.
    Instead, ignore all section information in a field-block.
    """
    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            run_level = 1):
        """
        Required:
            'in_file'--file to parse
            'bug_handler' -- exception to raise when the module itself
            seems to be at fault
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level' -- above 3, internal inconsistencies raise the
            bug handler instead of being ignored
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        # NOTE(review): tempfile.mktemp() only picks a name; the standard
        # library documents it as unsafe.
        self.__write_to = tempfile.mktemp()

    def __initiate_values(self):
        """
        Initiate all values: the markers written around a section tag, the
        counters, the state, and the dictionaries that map a state or a
        token to the method that handles it.
        """
        self.__mark_start = 'mi<mk<sect-start\n'
        self.__mark_end = 'mi<mk<sect-end__\n'
        self.__in_field = 0
        self.__section_values = {}
        self.__list_of_sec_values = []
        self.__field_num = []
        self.__section_num = 0
        self.__state = 'before_body'
        self.__found_first_sec = 0
        self.__text_string = ''
        self.__field_instruction_string = ''
        # appended to by several methods, so it has to exist from the start
        self.__sec_in_field_string = ''
        self.__state_dict = {
            'before_body'       : self.__before_body_func,
            'body'              : self.__body_func,
            'before_first_sec'  : self.__before_first_sec_func,
            'section'           : self.__section_func,
            'section_def'       : self.__section_def_func,
            'sec_in_field'      : self.__sec_in_field_func,
        }
        # cw<sc<sect-defin<nu<true
        self.__body_dict = {
            'cw<sc<section___'  : self.__found_section_func,
            'mi<mk<sec-fd-beg'  : self.__found_sec_in_field_func,
            'cw<sc<sect-defin'  : self.__found_section_def_bef_sec_func,
        }
        self.__section_def_dict = {
            'cw<pf<par-def___'  : (self.__end_sec_def_func, None),
            'mi<mk<body-open_'  : (self.__end_sec_def_func, None),
            'cw<tb<columns___'  : (self.__attribute_func, 'columns'),
            'cw<pa<margin-lef'  : (self.__attribute_func, 'margin-left'),
            'cw<pa<margin-rig'  : (self.__attribute_func, 'margin-right'),
            'mi<mk<header-ind'  : (self.__end_sec_def_func, None),
            # premature endings
            'tx<nu<__________'  : (self.__end_sec_premature_func, None),
            'cw<ci<font-style'  : (self.__end_sec_premature_func, None),
            'cw<ci<font-size_'  : (self.__end_sec_premature_func, None),
        }
        # changed 2004-04-26: sections and section definitions inside a
        # field block are no longer handled, only the end of the block
        self.__sec_in_field_dict = {
            'mi<mk<sec-fd-end'  : self.__end_sec_in_field_func,
        }

    def __found_section_def_func(self, line):
        """
        Required:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            I have found a section definition. Change the state to
            section_def (so subsequent lines will be processed as part of
            the section definition), and clear the section_values dictionary.
        """
        self.__state = 'section_def'
        self.__section_values.clear()

    def __attribute_func(self, line, name):
        """
        Required:
            line -- the line to be parsed
            name -- the changed, readable name (as opposed to the
            abbreviated one)
        Returns:
            nothing
        Logic:
            I need to add the right data to the section values dictionary so I
            can retrieve it later. The attribute (or key) is the name; the
            value is the last part of the text string.
            ex: cw<tb<columns___<nu<2
        """
        self.__section_values[name] = line[20:-1]

    def __found_section_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            I have found the beginning of a section, so change the state
            accordingly. Also add one to the section counter.
        """
        self.__state = 'section'
        self.__write_obj.write(line)
        self.__section_num += 1

    def __found_section_def_bef_sec_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            I have found a section definition that was not preceded by a
            section break. Add one to the section counter, start
            collecting the definition, and write the line.
        """
        self.__section_num += 1
        self.__found_section_def_func(line)
        self.__write_obj.write(line)

    def __section_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            A section break has been found. Wait for the section
            definition; write every line.
        """
        if self.__token_info == 'cw<sc<sect-defin':
            self.__found_section_def_func(line)
        self.__write_obj.write(line)

    def __section_def_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            I have found a section definition. Check if the line is the end of
            the definition (a paragraph definition), or if it contains info that
            should be added to the values dictionary. In either case the
            line itself is kept: it goes to the field string inside a field
            block and to the output file otherwise.
        """
        action, name = self.__section_def_dict.get(self.__token_info, (None, None))
        if action:
            action(line, name)
            if self.__in_field:
                self.__sec_in_field_string += line
                return
        self.__write_obj.write(line)

    def __end_sec_def_func(self, line, name):
        """
        Requires:
            line --the line to parse
            name --changed, readable name
        Returns:
            nothing
        Logic:
            The end of the section definition has been found. Reset the state.
            Call on the write_section method.
        """
        if not self.__in_field:
            self.__state = 'body'
        else:
            self.__state = 'sec_in_field'
        self.__write_section(line)

    def __end_sec_premature_func(self, line, name):
        r"""
        Requires:
            line --the line to parse
            name --changed, readable name
        Returns:
            nothing
        Logic:
            Text or control words indicating text have been found
            before \pard. This should indicate older RTF. Reset the state.
            Write the section definition. Insert a paragraph definition.
            Insert {} to mark the end of a paragraph definition.
        """
        if not self.__in_field:
            self.__state = 'body'
        else:
            self.__state = 'sec_in_field'
        self.__write_section(line)
        self.__write_obj.write('cw<pf<par-def___<nu<true\n')
        self.__write_obj.write('ob<nu<open-brack<0000\n')
        self.__write_obj.write('cb<nu<clos-brack<0000\n')

    def __write_section(self, line):
        """
        Requires:
            line -- the line that ended the definition (not used)
        Returns:
            nothing
        Logic:
            Form a string of attributes and values. If you are not in a field
            block, write this string to the output file. Otherwise, call on
            the handle_sec_def method to handle this string.
        """
        my_string = self.__mark_start
        if self.__found_first_sec:
            my_string += 'mi<tg<close_____<section\n'
        else:
            self.__found_first_sec = 1
        my_string += 'mi<tg<open-att__<section<num>%s' % str(self.__section_num)
        my_string += '<num-in-level>%s' % str(self.__section_num)
        my_string += '<type>rtf-native'
        my_string += '<level>0'
        for key in self.__section_values.keys():
            my_string += '<%s>%s' % (key, self.__section_values[key])
        my_string += '\n'
        my_string += self.__mark_end
        if self.__state == 'body':
            self.__write_obj.write(my_string)
        elif self.__state == 'sec_in_field':
            self.__handle_sec_def(my_string)
        elif self.__run_level > 3:
            msg = 'missed a flag\n'
            raise self.__bug_handler(msg)

    def __handle_sec_def(self, my_string):
        """
        Requires:
            my_string -- the string of attributes and values (not used)
        Returns:
            nothing
        Logic:
            I need to append the dictionary of attributes and values to list
            so I can use it later when I reach the end of the field-block.
            A copy is stored: the working dictionary is cleared at the next
            section definition, which would otherwise empty every stored
            entry as well.
        """
        self.__list_of_sec_values.append(dict(self.__section_values))

    def __body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of a section. Otherwise, print the line to
            the output file.
        """
        action = self.__body_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)

    def __before_body_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of the body. Always print out the line.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'before_first_sec'
        self.__write_obj.write(line)

    def __before_first_sec_func(self, line):
        r"""
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of the first section. This can be \sectd,
            but in older RTF it could mean any paragraph definition or
            text. The line itself is always written.
        """
        if self.__token_info == 'cw<sc<sect-defin':
            self.__state = 'section_def'
            self.__section_num += 1
            self.__section_values.clear()
        elif self.__token_info == 'cw<pf<par-def___':
            self.__state = 'body'
            self.__section_num += 1
            self.__write_obj.write (
                    'mi<tg<open-att__<section<num>%s'
                    '<num-in-level>%s'
                    '<type>rtf-native'
                    '<level>0\n'
                    % (str(self.__section_num), str(self.__section_num))
                    )
            self.__found_first_sec = 1
        elif self.__token_info == 'tx<nu<__________':
            self.__state = 'body'
            self.__section_num += 1
            self.__write_obj.write (
                    'mi<tg<open-att__<section<num>%s'
                    '<num-in-level>%s'
                    '<type>rtf-native'
                    '<level>0\n'
                    % (str(self.__section_num), str(self.__section_num))
                    )
            # NOTE(review): every other paragraph definition in this class
            # is written as 'cw<pf<par-def___<nu<true'; confirm whether the
            # missing '<nu' is intended before changing it.
            self.__write_obj.write(
                    'cw<pf<par-def___<true\n'
                    )
            self.__found_first_sec = 1
        self.__write_obj.write(line)

    def __found_sec_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            I have found the beginning of a field that has a section (or
            really, two) inside of it. Change the state, and start adding to
            one long string.
        """
        self.__state = 'sec_in_field'
        self.__sec_in_field_string = line
        self.__in_field = 1

    def __sec_in_field_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Check for the end of the field.
            CHANGED 2004-04-26! Just print out each line. Ignore any
            sections or section definition info.
        """
        action = self.__sec_in_field_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)

    def __end_sec_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            The end of the field block has been found. Change the state
            back to body and write the line.
            CHANGED 2004-04-26: section tags are no longer written around
            the field string; the field string is not written at all.
        """
        self.__state = 'body'
        self.__in_field = 0
        self.__write_obj.write(line)

    def __print_field_sec_attributes(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            Get the number and dictionary of values from the lists. The number
            and dictionary will be the first item of each list. Write the
            close tag. Write the start tag. Write the attribute and values in
            the dictionary. Get rid of the first item in each list.
            Nothing in this class calls this method since the change of
            2004-04-26.
        """
        num = self.__field_num[0]
        self.__field_num = self.__field_num[1:]
        self.__write_obj.write(
            'mi<tg<close_____<section\n'
            'mi<tg<open-att__<section<num>%s' % str(num)
                )
        if self.__list_of_sec_values:
            keys = self.__list_of_sec_values[0].keys()
            for key in keys:
                self.__write_obj.write(
                '<%s>%s\n' % (key, self.__list_of_sec_values[0][key]))
            self.__list_of_sec_values = self.__list_of_sec_values[1:]
        self.__write_obj.write('<level>0')
        self.__write_obj.write('<type>rtf-native')
        self.__write_obj.write('<num-in-level>%s' % str(self.__section_num))
        self.__write_obj.write('\n')

    def __found_section_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            I have found a section in a field block. Add one to section
            counter, and append this number to a list.
            No dictionary refers to this method since the change of
            2004-04-26.
        """
        self.__section_num += 1
        self.__field_num.append(self.__section_num)
        self.__sec_in_field_string += line

    def __found_section_def_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            I have found a section definition in a field block. Change the
            state and clear the values dictionary.
            No dictionary refers to this method since the change of
            2004-04-26.
        """
        self.__state = 'section_def'
        self.__section_values.clear()

    def make_sections(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is before the body, look for the
            beginning of the body.
            If the state is body, send the line to the body method.
            Both files are closed even if a handler raises. A state without
            a handler is reported on stderr and the line is copied to the
            output unchanged, instead of calling None.
        """
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                for line in read_obj:
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        sys.stderr.write('no no matching state in module sections.py\n')
                        sys.stderr.write(self.__state + '\n')
                        self.__write_obj.write(line)
                        continue
                    action(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "sections.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,705 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile
from libprs500.ebooks.rtf2xml import copy, border_parse
class Styles:
"""
Change lines with style numbers to actual style names.
"""
def __init__(self,
in_file,
bug_handler,
copy = None,
run_level = 1,
):
"""
Required:
'file'--file to parse
Optional:
'copy'-- whether to make a copy of result for debugging
'temp_dir' --where to output temporary results (default is
directory from which the script is run.)
Returns:
nothing
"""
self.__file = in_file
self.__bug_handler = bug_handler
self.__copy = copy
self.__write_to = tempfile.mktemp()
self.__run_level = run_level
def __initiate_values(self):
    """
    Requires:
        nothing
    Returns:
        nothing
    Logic:
        Initiate all values: the border parser, the (empty) dictionary of
        styles, the state, and the lookup tables used while parsing.
        Keys that were listed twice in the token dictionary are now listed
        once, with the value that was in effect (the last one), so the
        table reads the way it behaves.
    """
    self.__border_obj = border_parse.BorderParse()
    self.__styles_dict = {'par':{}, 'char':{}}
    self.__styles_num = '0'
    self.__type_of_style = 'par'
    self.__text_string = ''
    self.__state = 'before_styles_table'
    # maps both states and tokens to the method that handles them
    self.__state_dict = {
        'before_styles_table': self.__before_styles_func,
        'in_styles_table' : self.__in_styles_func,
        'in_individual_style' : self.__in_individual_style_func,
        'after_styles_table' : self.__after_styles_func,
        'mi<mk<styles-beg' : self.__found_styles_table_func,
        'mi<mk<styles-end' : self.__found_end_styles_table_func,
        'mi<mk<stylei-beg' : self.__found_beg_ind_style_func,
        'mi<mk<stylei-end' : self.__found_end_ind_style_func,
        'cw<ss<para-style' : self.__para_style_func,
        'cw<ss<char-style' : self.__char_style_func,
    }
    # A separate dictionary for parsing the body text
    self.__body_dict = {
        'cw<ss<para-style' : (self.__para_style_in_body_func, 'par'),
        'cw<ss<char-style' : (self.__para_style_in_body_func, 'char'),
    }
    # Dictionary needed to convert shortened style names to readable names
    self.__token_dict={
        # paragraph formatting => pf
        'par-end___' : 'para',
        'par-def___' : 'paragraph-definition',
        'keep-w-nex' : 'keep-with-next',
        'widow-cntl' : 'widow-control',
        'adjust-rgt' : 'adjust-right',
        'language__' : 'language',
        'right-inde' : 'right-indent',
        'fir-ln-ind' : 'first-line-indent',
        'left-inden' : 'left-indent',
        'space-befo' : 'space-before',
        'space-afte' : 'space-after',
        'line-space' : 'line-spacing',
        'default-ta' : 'default-tab',
        'align_____' : 'align',
        'widow-cntr' : 'widow-control',
        # page formatting mixed in! (Just in older RTF?)
        'margin-lef' : 'left-indent',
        'margin-rig' : 'right-indent',
        'margin-bot' : 'space-after',
        'margin-top' : 'space-before',
        # stylesheet = > ss
        'style-shet' : 'stylesheet',
        'based-on__' : 'based-on-style',
        'next-style' : 'next-style',
        'char-style' : 'character-style',
        'para-style' : 'paragraph-style',
        # graphics => gr
        'picture___' : 'pict',
        'obj-class_' : 'obj_class',
        'mac-pic___' : 'mac-pict',
        # section => sc
        'section___' : 'section-new',
        'sect-defin' : 'section-reset',
        'sect-note_' : 'endnotes-in-section',
        # list=> ls
        'list-text_' : 'list-text',
        'list______' : 'list',
        'list-lev-d' : 'list-level-definition',
        'list-cardi' : 'list-cardinal-numbering',
        'list-decim' : 'list-decimal-numbering',
        'list-up-al' : 'list-uppercase-alphabetic-numbering',
        'list-up-ro' : 'list-uppercae-roman-numbering',
        'list-ord__' : 'list-ordinal-numbering',
        'list-ordte' : 'list-ordinal-text-numbering',
        'list-bulli' : 'list-bullet',
        'list-simpi' : 'list-simple',
        'list-conti' : 'list-continue',
        'list-hang_' : 'list-hang',
        # 'list-tebef' : 'list-text-before',
        'list-id___' : 'list-id',
        'list-start' : 'list-start',
        'nest-level' : 'nest-level',
        # was also listed as 'level'; this later entry is the one that
        # took effect
        'list-level' : 'list-level',
        # notes => nt
        'footnote__' : 'footnote',
        'type______' : 'type',
        # anchor => an
        'toc_______' : 'anchor-toc',
        'book-mk-st' : 'bookmark-start',
        'book-mk-en' : 'bookmark-end',
        'index-mark' : 'anchor-index',
        'place_____' : 'place',
        # field => fd
        'field_____' : 'field',
        'field-inst' : 'field-instruction',
        'field-rslt' : 'field-result',
        'datafield_' : 'data-field',
        # info-tables => it
        'font-table' : 'font-table',
        'colr-table' : 'color-table',
        'lovr-table' : 'list-override-table',
        'listtable_' : 'list-table',
        'revi-table' : 'revision-table',
        # character info => ci
        'hidden____' : 'hidden',
        'italics___' : 'italics',
        'bold______' : 'bold',
        'strike-thr' : 'strike-through',
        'shadow____' : 'shadow',
        'outline___' : 'outline',
        'small-caps' : 'small-caps',
        'dbl-strike' : 'double-strike-through',
        'emboss____' : 'emboss',
        'engrave___' : 'engrave',
        'subscript_' : 'subscript',
        'superscrip' : 'superscript',
        'plain_____' : 'plain',
        'font-style' : 'font-style',
        'font-color' : 'font-color',
        'font-size_' : 'font-size',
        'font-up___' : 'superscript',
        'font-down_' : 'subscript',
        'red_______' : 'red',
        'blue______' : 'blue',
        'green_____' : 'green',
        'caps______' : 'caps',
        # table => tb
        'row-def___' : 'row-definition',
        'cell______' : 'cell',
        'row_______' : 'row',
        'in-table__' : 'in-table',
        'columns___' : 'columns',
        'row-pos-le' : 'row-position-left',
        'cell-posit' : 'cell-position',
        # preamble => pr
        # underline
        'underlined' : 'underlined',
        # border => bd
        'bor-t-r-hi' : 'border-table-row-horizontal-inside',
        'bor-t-r-vi' : 'border-table-row-vertical-inside',
        'bor-t-r-to' : 'border-table-row-top',
        'bor-t-r-le' : 'border-table-row-left',
        'bor-t-r-bo' : 'border-table-row-bottom',
        'bor-t-r-ri' : 'border-table-row-right',
        'bor-cel-bo' : 'border-cell-bottom',
        'bor-cel-to' : 'border-cell-top',
        'bor-cel-le' : 'border-cell-left',
        'bor-cel-ri' : 'border-cell-right',
        'bor-par-to' : 'border-paragraph-top',
        'bor-par-le' : 'border-paragraph-left',
        'bor-par-ri' : 'border-paragraph-right',
        # NOTE(review): 'bor-par-bo' was listed for both
        # 'border-paragraph-bottom' and 'border-paragraph-box'; the box
        # entry came last and is the one that took effect. One of the two
        # presumably needs a different key -- check against the tokens
        # that are actually produced.
        'bor-par-bo' : 'border-paragraph-box',
        'bor-for-ev' : 'border-for-every-paragraph',
        'bor-outsid' : 'border-outisde',
        'bor-none__' : 'border',
        # border type => bt
        # NOTE(review): several keys below are nine characters long, while
        # the lookup in __in_individual_style_func uses the ten characters
        # line[6:16], so they cannot match there -- confirm the intended
        # spelling.
        'bdr-single' : 'single',
        'bdr-doubtb' : 'double-thickness-border',
        'bdr-shadow' : 'shadowed-border',
        'bdr-double' : 'double-border',
        'bdr-dotted' : 'dotted-border',
        'bdr-dashed' : 'dashed',
        'bdr-hair__' : 'hairline',
        'bdr-inset_' : 'inset',
        'bdr-das-sm' : 'dash-small',
        'bdr-dot-sm' : 'dot-dash',
        'bdr-dot-do' : 'dot-dot-dash',
        'bdr-outset' : 'outset',
        'bdr-trippl' : 'tripple',
        'bdr-thsm__' : 'thick-thin-small',
        'bdr-htsm__' : 'thin-thick-small',
        'bdr-hthsm_' : 'thin-thick-thin-small',
        'bdr-thm__' : 'thick-thin-medium',
        'bdr-htm__' : 'thin-thick-medium',
        'bdr-hthm_' : 'thin-thick-thin-medium',
        'bdr-thl__' : 'thick-thin-large',
        'bdr-hthl_' : 'think-thick-think-large',
        'bdr-wavy_' : 'wavy',
        'bdr-d-wav' : 'double-wavy',
        'bdr-strip' : 'striped',
        'bdr-embos' : 'emboss',
        'bdr-engra' : 'engrave',
        'bdr-frame' : 'frame',
        'bdr-li-wid' : 'line-width',
        # tabs
        'tab-center' : 'center',
        'tab-right_' : 'right',
        'tab-dec___' : 'decimal',
        'leader-dot' : 'leader-dot',
        'leader-hyp' : 'leader-hyphen',
        'leader-und' : 'leader-underline',
    }
    # tokens that describe tabs, and the method that handles each of them
    self.__tabs_dict = {
        'cw<pf<tab-stop__' : self.__tab_stop_func,
        'cw<pf<tab-center' : self.__tab_type_func,
        'cw<pf<tab-right_' : self.__tab_type_func,
        'cw<pf<tab-dec___' : self.__tab_type_func,
        'cw<pf<leader-dot' : self.__tab_leader_func,
        'cw<pf<leader-hyp' : self.__tab_leader_func,
        'cw<pf<leader-und' : self.__tab_leader_func,
        'cw<pf<tab-bar-st' : self.__tab_bar_func,
    }
    # readable names for the tab types and tab leaders
    self.__tab_type_dict = {
        'cw<pf<tab-center' : 'center',
        'cw<pf<tab-right_' : 'right',
        'cw<pf<tab-dec___' : 'decimal',
        'cw<pf<leader-dot' : 'leader-dot',
        'cw<pf<leader-hyp' : 'leader-hyphen',
        'cw<pf<leader-und' : 'leader-underline',
    }
    # tokens that may be missing from the token dictionary without that
    # being reported as a bug
    self.__ignore_list = [
        'list-tebef',
    ]
    self.__tabs_list = self.__tabs_dict.keys()
    self.__tab_type = 'left'
    self.__leader_found = 0
def __in_individual_style_func(self, line):
"""
Required:
line
Returns:
nothing
Logic:
Check if the token marks the end of the individual style. (Action
is the value of the state dictionary, and the only key that will
match in this function is the end of the individual style.)
If the end of the individual style is not found, check if the line
is a control word. If it is, extract the relelvant info and look
up this info in the tokens dictionary. I want to change
abbreviated names for longer, more readable ones.
Write an error message if no key is found for the info.
If the line is text, add the text to a text string. The text
string will be the name of the style.
"""
action = self.__state_dict.get(self.__token_info)
if action:
action(line)
# have to parse border lines with external module
elif line[0:5] == 'cw<bd':
border_dict = self.__border_obj.parse_border(line)
keys = border_dict.keys()
for key in keys:
self.__enter_dict_entry(key, border_dict[key])
elif self.__token_info in self.__tabs_list:
action = self.__tabs_dict.get(self.__token_info)
if action != None:
action(line)
elif line[0:2] == 'cw':
#cw<pf<widow-cntl<nu<true
info = line[6:16]
att = self.__token_dict.get(info)
if att == None :
if info not in self.__ignore_list:
if self.__run_level > 3:
msg = 'no value for key %s\n' % info
raise self.__bug_handler, msg
else:
value = line[20:-1]
self.__enter_dict_entry(att, value)
elif line[0:2] == 'tx':
self.__text_string += line[17:-1]
def __tab_stop_func(self, line):
"""
Requires:
line -- line to parse
Returns:
nothing
Logic:
Try to add the number to dictionary entry tabs-left, or tabs-right, etc.
If the dictionary entry doesn't exist, create one.
"""
type = 'tabs-%s' % self.__tab_type
try:
if self.__leader_found:
self.__styles_dict['par'][self.__styles_num]['tabs']\
+= '%s:' % self.__tab_type
self.__styles_dict['par'][self.__styles_num]['tabs']\
+= '%s;' % line[20:-1]
else:
self.__styles_dict['par'][self.__styles_num]['tabs']\
+= '%s:' % self.__tab_type
self.__styles_dict['par'][self.__styles_num]['tabs']\
+= '%s;' % line[20:-1]
except KeyError:
self.__enter_dict_entry('tabs', '')
self.__styles_dict['par'][self.__styles_num]['tabs']\
+= '%s:' % self.__tab_type
self.__styles_dict['par'][self.__styles_num]['tabs'] += '%s;' % line[20:-1]
self.__tab_type = 'left'
self.__leader_found = 0
def __tab_type_func(self, line):
"""
"""
type = self.__tab_type_dict.get(self.__token_info)
if type != None:
self.__tab_type = type
else:
if self.__run_level > 3:
msg = 'no entry for %s\n' % self.__token_info
raise self.__bug_handler, msg
def __tab_leader_func(self, line):
"""
Requires:
line --line to parse
Returns:
nothing
Logic:
Try to add the string of the tab leader to dictionary entry
tabs-left, or tabs-right, etc. If the dictionary entry doesn't
exist, create one.
"""
self.__leader_found = 1
leader = self.__tab_type_dict.get(self.__token_info)
if leader != None:
leader += '^'
type = 'tabs-%s' % self.__tab_type
try:
self.__styles_dict['par'][self.__styles_num]['tabs'] += ':%s;' % leader
except KeyError:
self.__enter_dict_entry('tabs', '')
self.__styles_dict['par'][self.__styles_num]['tabs'] += '%s;' % leader
else:
if self.__run_level > 3:
msg = 'no entry for %s\n' % self.__token_info
raise self.__bug_handler, msg
def __tab_bar_func(self, line):
"""
Requires:
line -- line to parse
Returns:
nothing
Logic:
Try to add the string of the tab bar to dictionary entry tabs-bar.
If the dictionary entry doesn't exist, create one.
"""
# self.__add_dict_entry('tabs-bar', line[20:-1])
try:
self.__styles_dict['par'][self.__styles_num]['tabs']\
+= '%s:' % 'bar'
self.__styles_dict['par'][self.__styles_num]['tabs']\
+= '%s;' % line[20:-1]
except KeyError:
self.__enter_dict_entry('tabs', '')
self.__styles_dict['par'][self.__styles_num]['tabs']\
+= '%s:' % 'bar'
self.__styles_dict['par'][self.__styles_num]['tabs']\
+= '%s;' % line[20:-1]
self.__tab_type = 'left'
def __enter_dict_entry(self, att, value):
"""
Required:
att -- the attribute
value -- the value
Returns:
nothing
Logic:
Try to add the attribute value directly to the styles dictionary.
If a keyerror is found, that means I have to build the "branches"
of the dictionary before I can add the key value pair.
"""
try:
self.__styles_dict[self.__type_of_style][self.__styles_num][att] = value
except KeyError:
self.__add_dict_entry(att, value)
def __add_dict_entry(self, att, value):
"""
Required:
att --the attribute
value --the value
Returns:
nothing
Logic:
I have to build the branches of the dictionary before I can add
the leaves. (I am comparing a dictionary to a tree.) To achieve
this, I first make a temporary dictionary by extracting either the
inside dictionary of the keyword par or char. This temporary
dictionary is called type_dict.
Next, create a second, smaller dictionary with just the attribute and value.
Add the small dictionary to the type dictionary.
Add this type dictionary to the main styles dictionary.
"""
if self.__type_of_style == 'par':
type_dict =self.__styles_dict['par']
elif self.__type_of_style == 'char':
type_dict = self.__styles_dict['char']
else:
if self.__run_level > 3:
msg = self.__type_of_style + 'error\n'
raise self.__bug_handler, msg
smallest_dict = {}
smallest_dict[att] = value
type_dict[self.__styles_num] = smallest_dict
self.__styles_dict[self.__type_of_style] = type_dict
def __para_style_func(self, line):
"""
Required:
line
Returns:
nothing
Logic:
Set the type of style to paragraph.
Extract the number for a line such as "cw<ss<para-style<nu<15".
"""
self.__type_of_style = 'par'
self.__styles_num = line[20:-1]
"""
self.__enter_dict_entry('tabs-left', '')
self.__enter_dict_entry('tabs-right', '')
self.__enter_dict_entry('tabs-center', '')
self.__enter_dict_entry('tabs-decimal', '')
self.__enter_dict_entry('tabs-bar', '')
"""
def __char_style_func(self, line):
"""
Required:
line
Returns:
nothing
Logic:
Set the type of style to character.
Extract the number for a line such as "cw<ss<char-style<nu<15".
"""
self.__type_of_style = 'char'
self.__styles_num = line[20:-1]
def __found_beg_ind_style_func(self, line):
"""
Required:
line
Returns:
nothing
Logic:
Get rid of the last semicolon in the text string. Add the text
string as the value with 'name' as the key in the style
dictionary.
"""
self.__state = 'in_individual_style'
def __found_end_ind_style_func(self, line):
    """
    Required:
        line -- not used; accepted so all handlers share one signature
    Returns:
        nothing
    Logic:
        The collected text string is the name of the style. Get rid of
        the last character (the semicolon) and of any space before or
        after (added 2005-04-29), store the result under 'name' for the
        current style, and empty the text string for the next style.
    """
    self.__enter_dict_entry('name', self.__text_string[:-1].strip())
    self.__text_string = ''
def __found_end_styles_table_func(self, line):
    """
    Required:
        line -- not used; accepted so all handlers share one signature
    Returns:
        nothing
    Logic:
        The style table is complete. Set the state to after the styles
        table, replace style numbers in 'next-style' and 'based-on-style'
        with style names (__fix_based_on), then write the style table out
        (__print_style_table).
    """
    self.__state = 'after_styles_table'
    # names must be resolved before the table is written
    self.__fix_based_on()
    self.__print_style_table()
def __fix_based_on(self):
"""
Requires:
nothing
Returns:
nothing
Logic:
The styles dictionary may contain a pair of key values such as
'next-style' => '15'. I want to change the 15 to the name of the
style. I accomplish this by simply looking up the value of 15 in
the styles table.
Use two loops. First, check all the paragraph styles. Then check
all the characer styles.
The inner loop: first check 'next-style', then check 'based-on-style'.
Make sure values exist for the keys to avoid the nasty keyerror message.
"""
types = ['par', 'char']
for type in types:
keys = self.__styles_dict[type].keys()
for key in keys:
styles = ['next-style', 'based-on-style']
for style in styles:
value = self.__styles_dict[type][key].get(style)
if value != None:
temp_dict = self.__styles_dict[type].get(value)
if temp_dict:
changed_value = self.__styles_dict[type][value].get('name')
if changed_value:
self.__styles_dict[type][key][style] = \
changed_value
else:
if value == 0 or value == '0':
pass
else:
if self.__run_level > 4:
msg = '%s %s is based on %s\n' % (type, key, value)
msg = 'There is no style with %s\n' % value
raise self.__bug_handler, msg
del self.__styles_dict[type][key][style]
def __print_style_table(self):
"""
Required:
nothing
Returns:
nothing
Logic:
This function prints out the style table.
I use three nested for loops. The outer loop prints out the
paragraphs styles, then the character styles.
The next loop iterates through the style numbers.
The most inside loop iterates over the pairs of attributes and
values, and prints them out.
"""
types = ['par', 'char']
for type in types:
if type == 'par':
prefix = 'paragraph'
else:
prefix = 'character'
self.__write_obj.write(
'mi<tg<open______<%s-styles\n' % prefix
)
style_numbers = self.__styles_dict[type].keys()
for num in style_numbers:
self.__write_obj.write(
'mi<tg<empty-att_<%s-style-in-table<num>%s' % (prefix, num)
)
attributes = self.__styles_dict[type][num].keys()
for att in attributes:
this_value = self.__styles_dict[type][num][att]
self.__write_obj.write(
'<%s>%s' % (att, this_value)
)
self.__write_obj.write('\n')
self.__write_obj.write(
'mi<tg<close_____<%s-styles\n' % prefix
)
def __found_styles_table_func(self, line):
"""
Required:
line
Returns:
nothing
Logic:
Change the state to in the style table when the marker has been found.
"""
self.__state = 'in_styles_table'
def __before_styles_func(self, line):
"""
Required:
line
Returns:
nothing.
Logic:
Check the line info in the state dictionary. When the beginning of
the styles table is found, change the state to in the styles
table.
"""
action = self.__state_dict.get(self.__token_info)
if not action:
self.__write_obj.write(line)
else:
action(line)
def __in_styles_func(self, line):
"""
Required:
line
Returns:
nothing
Logic:
Check the line for the beginning of an individaul style. If it is
not found, simply print out the line.
"""
action = self.__state_dict.get(self.__token_info)
if action == None:
self.__write_obj.write(line)
else:
action(line)
def __para_style_in_body_func(self, line, type):
"""
Required:
line-- the line
type -- whether a character or paragraph
Returns:
nothing
Logic:
Determine the prefix by whether the type is "par" or "char".
Extract the number from a line such as "cw<ss<para-style<nu<15".
Look up that number in the styles dictionary and put a name for a number
"""
if type == 'par':
prefix = 'para'
else:
prefix = 'char'
num = line[20:-1]
# may be invalid RTF--a style down below not defined above!
try:
value = self.__styles_dict[type][num]['name']
except KeyError:
value = None
if value:
self.__write_obj.write(
'cw<ss<%s-style<nu<%s\n' % (prefix, value)
)
else:
self.__write_obj.write(
'cw<ss<%s_style<nu<not-defined\n' % prefix
)
def __after_styles_func(self, line):
"""
Required:
line
Returns:
nothing
Logic:
Determine if a line with either character of paragraph style info
has been found. If so, then use the appropriate method to parse
the line. Otherwise, write the line to a file.
"""
action, type = self.__body_dict.get(self.__token_info, (None, None))
if action:
action(line, type)
else:
self.__write_obj.write(line)
def convert_styles(self):
    """
    Requires:
        nothing
    Returns:
        nothing (changes the original file)
    Logic:
        Read one line in at a time and hand it to the handler registered
        for the current state.
        Before the style table, the handlers look for the beginning of
        the style table.
        In the style table, they build the style dictionary and write out
        the tags.
        After the style table, they look for lines with style info and
        substitute the number with the name of the style.
        If no handler is registered for the current state, report it on
        stderr and copy the line through unchanged.
        Finally replace the original file with the result, keeping a
        debugging copy when requested.
    """
    self.__initiate_values()
    read_obj = open(self.__file, 'r')
    try:
        self.__write_obj = open(self.__write_to, 'w')
        try:
            line = 1
            while line:
                # At end of file readline returns '', which is still handed
                # to the handler once before the loop stops.
                line = read_obj.readline()
                self.__token_info = line[:16]
                action = self.__state_dict.get(self.__state)
                if action is None:
                    sys.stderr.write('no matching state in module styles.py\n')
                    sys.stderr.write(self.__state + '\n')
                    # Nothing can handle the line; keep its content rather
                    # than fail by calling None.
                    self.__write_obj.write(line)
                else:
                    action(line)
        finally:
            # close even if a handler raises
            self.__write_obj.close()
    finally:
        read_obj.close()
    copy_obj = copy.Copy(bug_handler = self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "styles.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)

View File

@ -0,0 +1,543 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import sys, os, tempfile
from libprs500.ebooks.rtf2xml import copy, border_parse
"""
States.
1. 'not_in_table'
1. 'cw<tb<row-def___' start a row definition
2. 'mi<mk<in-table__' start table
2. 'in_table'
1. 'mi<mk<pard-start', start of a row, cell
2. 'mi<mk<not-in-tbl', end the table.
3. 'cw<tb<row-def___' start a row definition
3. in_row_definition
1. 'mi<mk<not-in-tbl' : end the row definition. If in table, end the table.
2. 'mi<mk<pard-start' : end the row definition
if already in the table, start a row and cell.
3. 'cw<tb<row_______' : end the row definition, end the row
4. 'cw...' use another method to handle the control word
control word might be added to dictionary.
5. 'mi<mk<in-table__' If already in table, do nothing. Otherwise
start the table.
4. 'in_row'
1. 'mi<mk<pard-start', start cell
2. 'mi<mk<not-in-tbl' end table,
3. 'cw<tb<row_______' close row,
5. 'in_cell'
1. 'mi<mk<not-in-tbl', end table
2. 'cw<tb<cell______', end cell
"""
class Table:
    """
    Make tables.
    Logic:
        Read one line at a time. self.__state is a stack of state names;
        its bottom (and default) entry is 'not_in_table'. Each line is
        handed to the handler of the state on top of the stack. While not
        in a table, look for either a 'mi<mk<in-table__' /
        'cw<tb<in-table__' token or a row definition.
    """
    # Tokens that end a table wherever they are met inside one.
    __end_of_table_tokens = (
        'mi<mk<not-in-tbl',
        'mi<mk<sect-start',
        'mi<mk<sect-close',
        'mi<mk<body-close',
    )

    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            run_level = 1,):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--exception class handed on to the copy helper
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'-- stored; not read by this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        # NOTE(review): tempfile.mktemp only picks a name and is documented
        # as unsafe; kept because the file is created later, in make_table.
        self.__write_to = tempfile.mktemp()

    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__state_dict = {
            'in_table': self.__in_table_func,
            'in_row_def': self.__in_row_def_func,
            'not_in_table': self.__not_in_table_func,
            'in_cell': self.__in_cell_func,
            'in_row': self.__in_row_func,
        }
        self.__not_in_table_dict = {
            'cw<tb<row-def___': self.__found_row_def_func,
            'cw<tb<in-table__': self.__start_table_func,
            'mi<mk<in-table__' : self.__start_table_func,
        }
        # can't use this dictionary. When in row_definition, many tokens
        # require multiple definitions
        self.__in_row_definition_dict = {
            'mi<mk<not-in-tbl' : self.__end_row_table_func,
            'mi<mk<pard-start' : self.__end_row_def_func,
        }
        self.__in_row_dict = {
            'mi<mk<not-in-tbl' : self.__close_table,
            'mi<mk<pard-start' : self.__start_cell_func,
            'cw<tb<row_______' : self.__end_row_func,
            'cw<tb<cell______' : self.__empty_cell,
        }
        # set the default state
        self.__state = ['not_in_table']
        # set empty data for all tables
        self.__table_data = []
        # just in case there is no table data
        self.__row_dict = {}
        self.__cell_list = []
        self.__cell_widths = []

    def __write_tag_with_atts(self, tag_start, att_dict):
        """
        Write tag_start, then one '<key>value' pair for every entry of
        att_dict, then a newline.
        """
        self.__write_obj.write(tag_start)
        for key in att_dict.keys():
            self.__write_obj.write('<%s>%s' % (key, att_dict[key]))
        self.__write_obj.write('\n')

    def __in_table_func(self, line):
        """
        Requires:
            line -- line to parse
        Logic:
            Look for the end of the table. If found, close out the table.
            Look for 'mi<mk<pard-start', which marks the beginning of a
            row. Start a row and start a cell.
            A row definition starts collecting row data; a cell token
            starts a row holding an empty cell.
            The line itself is always written out.
        """
        token = self.__token_info
        if token in self.__end_of_table_tokens:
            self.__close_table(line)
        elif token == 'mi<mk<pard-start':
            self.__start_row_func(line)
            self.__start_cell_func(line)
        elif token == 'cw<tb<row-def___':
            self.__found_row_def_func(line)
        elif token == 'cw<tb<cell______':
            self.__start_row_func(line)
            self.__empty_cell(line)
        self.__write_obj.write(line)

    def __not_in_table_func(self, line):
        """
        Requires:
            line -- the line of text read in from document
        Returns:
            nothing
        Logic:
            The state is not in a table, so look for the tokens that mark
            the start of a table or of a row definition. If one is found,
            use the registered method to change states. In every case
            output the line.
        """
        action = self.__not_in_table_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)

    def __close_table(self, line):
        """
        Requires:
            line -- line to parse (not used)
        Returns:
            nothing
        Logic:
            Write the end marker for the table.
            Set the state to ['not_in_table'].
            Record the statistics of the table just finished in the last
            entry of the table data. The "average" values are modes (the
            most frequent value).
        """
        self.__write_obj.write('mi<mk<table-end_\n')
        self.__state = ['not_in_table']
        table_info = self.__table_data[-1]
        table_info['number-of-columns'] = self.__max_number_cells_in_row
        table_info['number-of-rows'] = self.__rows_in_table
        table_info['average-cells-per-row'] = \
            self.__mode(self.__list_of_cells_in_row)
        table_info['average-cell-width'] = self.__mode(self.__cell_widths)

    def __found_row_def_func(self, line):
        """
        Requires:
            line don't need this except for consistency with other methods.
        Returns:
            nothing
        Logic:
            A row definition has been found. Push 'in_row_def' and reset
            the data collected per row definition, to use later in writing
            attributes for the table.
        """
        self.__state.append('in_row_def')
        self.__last_cell_position = 0
        self.__row_dict = {}
        # one (still empty) dictionary for the first cell
        self.__cell_list = [{}]
        self.__cell_widths = []

    def __start_table_func(self, line):
        """
        Requires:
            line -- line to parse (not used)
        Returns:
            nothing
        Logic:
            Initialize the counters for a new table and add an empty
            entry for it to the table data.
            Write out the table marker.
            Add 'in_table' to the state list.
        """
        self.__rows_in_table = 0
        self.__cells_in_table = 0
        self.__cells_in_row = 0
        self.__max_number_cells_in_row = 0
        self.__table_data.append({})
        self.__list_of_cells_in_row = []
        self.__write_obj.write('mi<mk<tabl-start\n')
        self.__state.append('in_table')

    def __end_row_table_func(self, line):
        """
        Requires:
            line --just for consistency
        Returns:
            nothing
        Logic:
            Close the table.
        """
        # __close_table is a bound method: self must not be passed again.
        self.__close_table(line)

    def __end_row_def_func(self, line):
        """
        Requires:
            line --just for consistency
        Returns:
            nothing
        Logic:
            change the state.
            get rid of the last {} in the cell list
            figure out the number of cells based on the
            self.__row_dict['widths'] ('122, 122')
        """
        if len(self.__state) > 0 and self.__state[-1] == 'in_row_def':
            self.__state.pop()
        # an empty {} is added at the *end* of each cell position. Get rid
        # of the extra one
        self.__cell_list.pop()
        widths = self.__row_dict.get('widths')
        if widths:
            self.__row_dict['number-of-cells'] = len(widths.split(','))

    def __in_row_def_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            In the text that defines a row. If a control word is found,
            handle the control word with another method.
            Check for tokens that end this state: while in the row
            definition, certain tokens can end a row or end a table.
            If a paragraph definition (pard-start) is found and you are
            already in a table, start a row and a cell.
            The line itself is always written out.
        """
        token = self.__token_info
        if token == 'cw<tb<row_______':
            # write tags
            self.__end_row_func(line)
            # change the state
            self.__end_row_def_func(line)
        elif line[0:2] == 'cw':
            self.__handle_row_token(line)
        elif token == 'mi<mk<not-in-tbl' and 'in_table' in self.__state:
            self.__end_row_def_func(line)
            self.__close_table(line)
        elif token == 'mi<mk<pard-start':
            self.__end_row_def_func(line)
            # if already in the table, start a row, then cell.
            # (compare the length of the state list, not the list itself)
            if len(self.__state) > 0 and self.__state[-1] == 'in_table':
                self.__start_row_func(line)
                self.__start_cell_func(line)
        elif token == 'mi<mk<in-table__':
            self.__end_row_def_func(line)
            # if not in table, start a new table
            if len(self.__state) > 0 and self.__state[-1] != 'in_table':
                self.__start_table_func(line)
        self.__write_obj.write(line)

    def __handle_row_token(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            the tokens in the row definition contain the following
            information:
                1. row borders.
                2. cell borders for all cells in the row.
                3. cell positions for all cells in the row.
            Put all information about row borders into the row dictionary.
            Put all information about cell borders into the dictionary in
            the last item in the cell list. ([{border:something,
            width:something}, {border:something, width:something}])
            cw<bd<bor-t-r-to<nu<bdr-hair__|bdr-li-wid:0.50
        """
        if line[3:5] == 'bd':
            border_obj = border_parse.BorderParse()
            the_dict = border_obj.parse_border(line)
            # border-cell-top-hairline: one cell key makes the whole
            # border belong to the cell
            in_cell = 0
            for key in the_dict.keys():
                if key[0:11] == 'border-cell':
                    in_cell = 1
                    break
            if in_cell:
                self.__cell_list[-1].update(the_dict)
            else:
                self.__row_dict.update(the_dict)
        # cw<tb<cell-posit<nu<216.00
        elif self.__token_info == 'cw<tb<cell-posit':
            self.__found_cell_position(line)
        # cw<tb<row-pos-le<nu<-5.40
        elif self.__token_info == 'cw<tb<row-pos-le':
            self.__row_dict['left-row-position'] = line[20:-1]
        elif self.__token_info == 'cw<tb<row-header':
            self.__row_dict['header'] = 'true'

    def __start_cell_func(self, line):
        """
        Required:
            line -- the line of text (not used)
        Returns:
            nothing
        Logic:
            Append 'in_cell' to the states.
            If the cell list contains dictionaries, take the first one off
            the list and write its keys and values as attributes of the
            cell tag.
            Otherwise, write a plain cell tag.
            Count the cell.
        """
        self.__state.append('in_cell')
        if len(self.__cell_list) > 0:
            self.__write_tag_with_atts(
                'mi<tg<open-att__<cell', self.__cell_list[0])
            self.__cell_list.pop(0)
        else:
            self.__write_obj.write('mi<tg<open______<cell\n')
        self.__cells_in_table += 1
        self.__cells_in_row += 1

    def __start_row_func(self, line):
        """
        Required:
            line -- the line of text (not used)
        Returns:
            nothing
        Logic:
            Append 'in_row' to the states.
            Write the row tag with the keys and values of the row
            dictionary as attributes.
            Reset the cell count of the row and count the row.
        """
        self.__state.append('in_row')
        self.__write_tag_with_atts('mi<tg<open-att__<row', self.__row_dict)
        self.__cells_in_row = 0
        self.__rows_in_table += 1

    def __found_cell_position(self, line):
        """
        needs:
            line: current line
        returns:
            nothing
        logic:
            Calculate the cell width.
            If the cell is the first cell, the left row position is taken
            into account. (This value is often negative.)
            Next, set the new last_cell_position to the current cell
            position.
            Record the width in the row dictionary, in the dictionary of
            the current cell and in the list of cell widths, and add an
            empty dictionary for the next cell.
        """
        # cw<tb<cell-posit<nu<216.00
        new_cell_position = round(float(line[20:-1]), 2)
        left_position = 0
        if self.__last_cell_position == 0:
            left_position = float(
                self.__row_dict.get('left-row-position', 0))
        width = '%.2f' % (
            new_cell_position - self.__last_cell_position - left_position)
        self.__last_cell_position = new_cell_position
        if self.__row_dict.get('widths'):
            self.__row_dict['widths'] += ', %s' % width
        else:
            self.__row_dict['widths'] = width
        self.__cell_list[-1]['width'] = width
        self.__cell_list.append({})
        self.__cell_widths.append(width)

    def __in_cell_func(self, line):
        """
        Required:
            line
        Returns:
            nothing
        Logic:
            In the middle of a cell.
            Look for the close of the table. If found, close the cell, the
            row and the table, then print out the line.
            Look for the close of the cell. If found, close out the cell;
            the token itself is not printed.
            Otherwise, print out the line.
        """
        # cw<tb<cell______<nu<true
        # mi<mk<sect-start
        if self.__token_info in self.__end_of_table_tokens:
            self.__end_cell_func(line)
            self.__end_row_func(line)
            self.__close_table(line)
            self.__write_obj.write(line)
        elif self.__token_info == 'cw<tb<cell______':
            self.__end_cell_func(line)
        else:
            self.__write_obj.write(line)

    def __end_cell_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            End the cell. Pop 'in_cell' off self.__state. Print out the
            closing marks.
        """
        if len(self.__state) > 1 and self.__state[-1] == 'in_cell':
            self.__state.pop()
        self.__write_obj.write('mi<mk<close_cell\n')
        self.__write_obj.write('mi<tg<close_____<cell\n')
        self.__write_obj.write('mi<mk<closecell_\n')

    def __in_row_func(self, line):
        """
        Required:
            line
        Returns:
            nothing
        Logic:
            In a row but not in a cell.
            If a token that ends the table is found, end the row and close
            the table. Otherwise use the method registered for the token,
            if any (start a cell, end the row, write an empty cell).
            The line itself is always written out.
        """
        if self.__token_info in self.__end_of_table_tokens:
            self.__end_row_func(line)
            self.__close_table(line)
        else:
            action = self.__in_row_dict.get(self.__token_info)
            if action:
                action(line)
        self.__write_obj.write(line)

    def __end_row_func(self, line):
        """
        Requires:
            line (not used)
        Returns:
            nothing
        Logic:
            If in a row, leave it and write the closing tag. Otherwise no
            row was opened: write an empty row tag and count it as a row.
            Update the maximum number of cells in a row and record the
            number of cells of this row.
        """
        if len(self.__state) > 1 and self.__state[-1] == 'in_row':
            self.__state.pop()
            self.__write_obj.write('mi<tg<close_____<row\n')
        else:
            self.__write_obj.write('mi<tg<empty_____<row\n')
            self.__rows_in_table += 1
        if self.__cells_in_row > self.__max_number_cells_in_row:
            self.__max_number_cells_in_row = self.__cells_in_row
        self.__list_of_cells_in_row.append(self.__cells_in_row)

    def __empty_cell(self, line):
        """
        Required:
            line -- line of text (not used)
        Returns:
            nothing
        Logic:
            Write an empty tag with attributes if there are attributes.
            Otherwise, write an empty tag with cell as element.
            Count the cell.
        """
        if len(self.__cell_list) > 0:
            # NOTE(review): reads the last dictionary and leaves it on the
            # list, whereas __start_cell_func consumes the first one --
            # confirm this is intended.
            self.__write_tag_with_atts(
                'mi<tg<empty-att_<cell', self.__cell_list[-1])
        else:
            self.__write_obj.write('mi<tg<empty_____<cell\n')
        self.__cells_in_table += 1
        self.__cells_in_row += 1

    def __mode(self, the_list):
        """
        Required:
            the_list -- a list of hashable values
        Returns:
            the value that occurs the most; the earliest one in the list
            on a tie; 'not-defined' for an empty list
        Logic:
            Count each item once, then pick the first item whose count is
            greater than every count seen before it.
        """
        counts = {}
        for item in the_list:
            counts[item] = counts.get(item, 0) + 1
        mode = 'not-defined'
        highest = 0
        for item in the_list:
            if counts[item] > highest:
                mode = item
                highest = counts[item]
        return mode

    def make_table(self):
        """
        Requires:
            nothing
        Returns:
            A list with one dictionary of values per table found.
        Logic:
            Read one line in at a time. Determine what action to take
            based on the state on top of the state stack. If no handler is
            registered for that state, report it on stderr and copy the
            line through unchanged.
            Finally replace the original file with the result, keeping a
            debugging copy when requested.
        """
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                while 1:
                    line = read_obj.readline()
                    if not line:
                        break
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state[-1])
                    if action is None:
                        sys.stderr.write(
                            'No matching state in module table.py\n')
                        sys.stderr.write(self.__state[-1] + '\n')
                        # Nothing can handle the line; keep its content
                        # rather than fail by calling None.
                        self.__write_obj.write(line)
                    else:
                        action(line)
            finally:
                # close even if a handler raises
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "table.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        return self.__table_data

View File

@ -0,0 +1,85 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import os, tempfile
from libprs500.ebooks.rtf2xml import copy
# note to self. This is the first module in which I use tempfile. A good idea?
"""
"""
class TableInfo:
    """
    Insert table data for tables.
    Logic:
        Every table start marker in the file gets an opening table tag in
        front of it, carrying the data collected for that table as
        attributes; every table end marker gets a closing table tag in
        front of it.
    """
    def __init__(self,
                in_file,
                bug_handler,
                table_data,
                copy=None,
                run_level = 1,):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--exception class raised when data is missing
            'table_data' -- a list with one dictionary for each table, in
                the order the tables occur in the file
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'-- above 3, missing table data raises the bug
                handler instead of being tolerated
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__table_data = table_data
        self.__run_level = run_level
        # NOTE(review): tempfile.mktemp only picks a name and is documented
        # as unsafe; kept because the file is created later, in insert_info.
        self.__write_to = tempfile.mktemp()
        # self.__write_to = 'table_info.data'

    def __insert_tags(self, read_obj, write_obj):
        """
        Copy every line from read_obj to write_obj, writing a table tag in
        front of each table start and table end marker.
        """
        while 1:
            line = read_obj.readline()
            if not line:
                break
            if line == 'mi<mk<tabl-start\n':
                if len(self.__table_data) > 0:
                    table_dict = self.__table_data[0]
                    write_obj.write('mi<tg<open-att__<table')
                    for key in table_dict.keys():
                        write_obj.write('<%s>%s' % (key, table_dict[key]))
                    write_obj.write('\n')
                    # slice instead of pop so the caller's list is left
                    # untouched
                    self.__table_data = self.__table_data[1:]
                else:
                    # this shouldn't happen!
                    if self.__run_level > 3:
                        msg = 'Not enough data for each table\n'
                        raise self.__bug_handler(msg)
                    write_obj.write('mi<tg<open______<table\n')
            elif line == 'mi<mk<table-end_\n':
                write_obj.write('mi<tg<close_____<table\n')
            write_obj.write(line)

    def insert_info(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Raises:
            the bug handler, if a table has no data and the run level is
            greater than 3
        Logic:
            Write the tagged version of the file to a temporary file,
            then replace the original file with it, keeping a debugging
            copy when requested.
        """
        read_obj = open(self.__file, 'r')
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                self.__insert_tags(read_obj, self.__write_obj)
            finally:
                # close even if the bug handler is raised
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "table_info.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -0,0 +1,116 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
import os, re, tempfile
from libprs500.ebooks.rtf2xml import copy
class Tokenize:
    """Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            run_level = 1,
        ):
        """
        Required:
            'in_file'--file to tokenize (replaced by the result)
            'bug_handler'--exception class handed on to the copy helper
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'-- accepted for a uniform interface; not used
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__special_tokens = [ '_', '~', "'", '{', '}' ]
        # NOTE(review): tempfile.mktemp only picks a name and is documented
        # as unsafe; kept because the file is created later, in tokenize.
        self.__write_to = tempfile.mktemp()

    def __from_ms_to_utf8(self,match_obj):
        """
        Turn the number of a matched \\uN control word into a hexadecimal
        character reference ('&#x...;'). Negative numbers are shifted up
        by 65536 first.
        """
        uni_char = int(match_obj.group(1))
        if uni_char < 0:
            uni_char += 65536
        return '&#x%X;' % uni_char

    def __neg_unicode_func(self, match_obj):
        """
        Negate the matched number, add 65536 and return the result as a
        hexadecimal character reference. Not used at the moment.
        """
        uni_char = int(match_obj.group(1)) * -1 + 65536
        return '&#x%X;' % uni_char

    def __sub_line_reg(self,line):
        """
        Required:
            line -- one line of RTF without its newline
        Returns:
            the line, prepared for splitting into tokens
        Logic:
            Pad control symbols with a space so each becomes a token of
            its own, escape the XML special characters, turn escaped and
            plain brackets into uniform tokens, convert \\uN and \\'xx, and
            turn a backslash at the end of the line into \\par.
            Requires __compile_expressions to have been called.
        """
        line = line.replace("\\\\", "\\backslash ")
        # "\\~" is padded exactly once; padding it a second time would put
        # an extra space into the text after every non-breaking space
        line = line.replace("\\~", "\\~ ")
        line = line.replace("\\;", "\\; ")
        line = line.replace("&", "&amp;")
        line = line.replace("<", "&lt;")
        line = line.replace(">", "&gt;")
        line = line.replace("\\_", "\\_ ")
        line = line.replace("\\:", "\\: ")
        line = line.replace("\\-", "\\- ")
        # turn escaped brackets into generic tokens to eliminate special
        # cases and make processing easier
        line = line.replace("\\{", "\\ob ")
        line = line.replace("\\}", "\\cb ")
        # put a backslash in front of plain brackets to eliminate special
        # cases and make processing easier
        line = line.replace("{", "\\{")
        line = line.replace("}", "\\}")
        line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
        # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
        # The backslash of each replacement is escaped so that the template
        # holds no unknown escape (\m, \p); those are errors in newer
        # versions of re. The text produced is unchanged.
        line = re.sub(self.__ms_hex_exp, r"\\mshex0\g<1> ", line)
        ##line = line.replace("\\backslash", "\\\\")
        # this is for older RTF
        line = re.sub(self.__par_exp, r'\\par ', line)
        return line

    def __compile_expressions(self):
        """
        Compile the regular expressions used for tokenizing.
        """
        self.__ms_hex_exp = re.compile(r"\\\'(..)")
        self.__utf_exp = re.compile(r"\\u(-?\d{3,6})")
        self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
        self.__par_exp = re.compile(r'\\$')
        self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
        ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")

    def __create_tokens(self):
        """
        Read the input file one line at a time, make the basic
        substitutions, split the line into tokens and write every
        non-empty token on a line of its own to the temporary file.
        """
        self.__compile_expressions()
        read_obj = open(self.__file, 'r')
        try:
            write_obj = open(self.__write_to, 'w')
            try:
                for line in read_obj:
                    line = line.replace("\n", "")
                    line = self.__sub_line_reg(line)
                    for token in re.split(self.__splitexp, line):
                        if token != "":
                            write_obj.write(token + "\n")
            finally:
                # close even if a line cannot be processed
                write_obj.close()
        finally:
            read_obj.close()

    def tokenize(self):
        """Main method for handling other methods. Create the tokens in a
        temporary file, then replace the original file with the result,
        keeping a debugging copy when requested."""
        self.__create_tokens()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "tokenize.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@ -93,7 +93,7 @@ class Distribution(object):
self.command = cmd.strip()
if os == 'debian':
self.command += '\n'+prefix + 'cp -R /usr/share/pycentral/fonttools/site-packages/FontTools* /usr/lib/python2.5/site-packages/'
self.command += '\n'+prefix+'easy_install -U TTFQuery libprs500 \n'+prefix+'easy_install -f http://sourceforge.net/project/showfiles.php?group_id=68617 rtf2xml\n'+prefix+'libprs500_postinstall'
self.command += '\n'+prefix+'easy_install -U TTFQuery libprs500 \n'+prefix+'libprs500_postinstall'
try:
self.manual = Markup(self.MANUAL_MAP[os])
except KeyError: