Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-06-23 15:30:45 -04:00)

Move rtf2xml into libprs500 tree and clean it up

parent 7227fe1c4f
commit e1899e9f1c
@@ -122,7 +122,7 @@ def main(args=sys.argv, logger=None):
 
 
 def generate_xml(rtfpath):
-    from rtf2xml.ParseRtf import ParseRtf
+    from libprs500.ebooks.rtf2xml.ParseRtf import ParseRtf
     tdir = tempfile.mkdtemp(prefix=__appname__+'_')
     ofile = os.path.join(tdir, 'index.xml')
     cwd = os.getcwdu()
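The body of generate_xml past these lines is outside this hunk; a minimal sketch of driving the relocated module, assuming only the options documented in ParseRtf's own docstring below (file names are illustrative):

    from libprs500.ebooks.rtf2xml.ParseRtf import ParseRtf

    parse_obj = ParseRtf(in_file='in.rtf', out_file='out.xml', run_level=1)
    parse_obj.parse_rtf()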
563  src/libprs500/ebooks/rtf2xml/ParseRtf.py  Executable file
@@ -0,0 +1,563 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
# $Revision: 1.41 $
|
||||
# $Date: 2006/03/24 23:50:07 $
|
||||
import sys,os
|
||||
from libprs500.ebooks.rtf2xml import headings_to_sections, \
|
||||
line_endings, footnote, fields_small, default_encoding, \
|
||||
make_lists, preamble_div, header, colors, group_borders, \
|
||||
check_encoding, add_brackets, table, combine_borders, \
|
||||
fields_large, process_tokens, hex_2_utf8, tokenize, \
|
||||
delete_info, sections, check_brackets, styles, \
|
||||
paragraph_def, convert_to_tags, output, copy, \
|
||||
list_numbers, info, pict, table_info, fonts, paragraphs, \
|
||||
body_styles, preamble_rest, group_styles, \
|
||||
inline, correct_unicode
|
||||
from libprs500.ebooks.rtf2xml.old_rtf import OldRtf
|
||||
|
||||
"""
|
||||
Here is an example script using the ParseRTF module directly
|
||||
#!/usr/bin/env python
|
||||
|
||||
def Handle_Main():
|
||||
# Handles options and creates a parse object
|
||||
parse_obj =ParseRtf.ParseRtf(
|
||||
in_file = 'in.rtf',
|
||||
# All values from here on are optional
|
||||
# determine the output file
|
||||
out_file = 'out.xml',
|
||||
# determine the run level. The default is 1.
|
||||
run_level = 3,
|
||||
# The name of a debug directory, if you are running at
|
||||
# run level 3 or higher.
|
||||
debug = 'debug_dir',
|
||||
# Convert RTF caps to real caps.
|
||||
# Default is 1.
|
||||
convert_caps = 1,
|
||||
# Indent resulting XML.
|
||||
# Default is 0 (no indent).
|
||||
indent = 1,
|
||||
# Form lists from RTF. Default is 1.
|
||||
form_lists = 1,
|
||||
# Convert headings to sections. Default is 0.
|
||||
headings_to_sections = 1,
|
||||
# Group paragraphs with the same style name. Default is 1.
|
||||
group_styles = 1,
|
||||
# Group borders. Default is 1.
|
||||
group_borders = 1,
|
||||
# Write or do not write empty paragraphs. Default is 0.
|
||||
empty_paragraphs = 0,
|
||||
)
|
||||
try:
|
||||
parse_obj.parse_rtf()
|
||||
except ParseRtf.InvalidRtfException, msg:
|
||||
sys.stderr.write(msg)
|
||||
except ParseRtf.RtfInvalidCodeException, msg:
|
||||
sys.stderr.write(msg)
|
||||
"""
|
||||
class InvalidRtfException(Exception):
|
||||
"""
|
||||
handle invalid RTF
|
||||
"""
|
||||
pass
|
||||
class RtfInvalidCodeException(Exception):
|
||||
"""
|
||||
handle bugs in program
|
||||
"""
|
||||
pass
|
||||
|
||||
class ParseRtf:
|
||||
"""
|
||||
Main class for controlling the rest of the parsing.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
out_file = '',
|
||||
out_dir = None,
|
||||
dtd = '',
|
||||
debug = 0,
|
||||
deb_dir=None,
|
||||
convert_symbol = None,
|
||||
convert_wingdings = None,
|
||||
convert_zapf = None,
|
||||
convert_caps = None,
|
||||
run_level = 1,
|
||||
indent = None,
|
||||
replace_illegals = 1,
|
||||
form_lists = 1,
|
||||
headings_to_sections = 1,
|
||||
group_styles = 1,
|
||||
group_borders = 1,
|
||||
empty_paragraphs = 1,
|
||||
no_dtd = 0,
|
||||
char_data = '',
|
||||
):
|
||||
"""
|
||||
Requires:
|
||||
'file' --file to parse
|
||||
'char_data' --file containing character maps
|
||||
'dtd' --path to dtd
|
||||
Possible parameters, but not necessary:
|
||||
'output' --a file to output the parsed file. (Default is standard
|
||||
output.)
|
||||
'temp_dir' --directory for temporary output (If not provided, the
|
||||
script tries to output to the directory where the script is executed.)
|
||||
'deb_dir' --debug directory. If a debug_dir is provided, the script
|
||||
will copy each run through as a file to examine in the debug_dir
|
||||
'perl_script'--use perl to make tokens. This runs just a bit faster.
|
||||
(I will probably phase this out.)
|
||||
'check_brackets' -- make sure the brackets match up after each run
|
||||
through a file. Only for debugging.
|
||||
Returns: Nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__out_file = out_file
|
||||
self.__out_dir = out_dir
|
||||
self.__temp_dir = out_dir
|
||||
self.__dtd_path = dtd
|
||||
self.__check_file(in_file,"file_to_parse")
|
||||
self.__char_data = char_data
|
||||
self.__debug_dir = debug
|
||||
self.__check_dir(self.__temp_dir)
|
||||
self.__copy = self.__check_dir(self.__debug_dir)
|
||||
self.__convert_caps = convert_caps
|
||||
self.__convert_symbol = convert_symbol
|
||||
self.__convert_wingdings = convert_wingdings
|
||||
self.__convert_zapf = convert_zapf
|
||||
self.__run_level = run_level
|
||||
self.__exit_level = 0
|
||||
self.__indent = indent
|
||||
self.__replace_illegals = replace_illegals
|
||||
self.__form_lists = form_lists
|
||||
self.__headings_to_sections = headings_to_sections
|
||||
self.__group_styles = group_styles
|
||||
self.__group_borders = group_borders
|
||||
self.__empty_paragraphs = empty_paragraphs
|
||||
self.__no_dtd = no_dtd
|
||||
|
||||
def __check_file(self, the_file, type):
|
||||
"""Check to see if files exist"""
|
||||
if the_file == None:
|
||||
if type == "file_to_parse":
|
||||
message = "You must provide a file for the script to work"
|
||||
msg = message
|
||||
raise RtfInvalidCodeException, msg
|
||||
elif os.path.exists(the_file):
|
||||
pass # do nothing
|
||||
else:
|
||||
message = "The file '%s' cannot be found" % the_file
|
||||
msg = message
|
||||
raise RtfInvalidCodeException, msg
|
||||
def __check_dir(self, the_dir):
|
||||
"""Check to see if directory exists"""
|
||||
if not the_dir :
|
||||
return
|
||||
dir_exists = os.path.isdir(the_dir)
|
||||
if not dir_exists:
|
||||
message = "%s is not a directory" % the_dir
|
||||
msg = message
|
||||
raise RtfInvalidCodeException, msg
|
||||
return 1
|
||||
def parse_rtf(self):
|
||||
"""
|
||||
Parse the file by calling on other classes.
|
||||
Requires:
|
||||
Nothing
|
||||
Returns:
|
||||
A parsed file in XML, either to standard output or to a file,
|
||||
depending on the value of 'output' when the instance was created.
|
||||
"""
|
||||
self.__temp_file = self.__make_temp_file(self.__file)
|
||||
# if the self.__deb_dir is true, then create a copy object,
|
||||
# set the directory to write to, remove files, and copy
|
||||
# the new temporary file to this directory
|
||||
if self.__debug_dir:
|
||||
copy_obj = copy.Copy(
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
copy_obj.set_dir(self.__debug_dir)
|
||||
copy_obj.remove_files()
|
||||
copy_obj.copy_file(self.__temp_file, "original_file")
|
||||
# new as of 2005-08-02. Do I want this?
|
||||
if self.__debug_dir or self.__run_level > 2:
|
||||
self.__check_brack_obj = check_brackets.CheckBrackets\
|
||||
(file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
# convert Macintosh line endings to Unix line endings
|
||||
line_obj = line_endings.FixLineEndings(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,
|
||||
replace_illegals = self.__replace_illegals,
|
||||
)
|
||||
return_value = line_obj.fix_endings()
|
||||
self.__return_code(return_value)
|
||||
tokenize_obj = tokenize.Tokenize(
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
in_file = self.__temp_file,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,)
|
||||
tokenize_obj.tokenize()
|
||||
process_tokens_obj = process_tokens.ProcessTokens(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,
|
||||
exception_handler = InvalidRtfException,
|
||||
)
|
||||
try:
|
||||
return_value = process_tokens_obj.process_tokens()
|
||||
except InvalidRtfException, msg:
|
||||
try:
|
||||
os.remove(self.__temp_file)
|
||||
except OSError:
|
||||
pass
|
||||
check_encoding_obj = check_encoding.CheckEncoding(
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
check_encoding_obj.check_encoding(self.__file)
|
||||
sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file)
|
||||
raise InvalidRtfException, msg
|
||||
delete_info_obj = delete_info.DeleteInfo(
|
||||
in_file = self.__temp_file,
|
||||
copy = self.__copy,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
run_level = self.__run_level,)
|
||||
# found destination means {\*\destination
|
||||
# if found, the RTF should be newer RTF
|
||||
found_destination = delete_info_obj.delete_info()
|
||||
self.__bracket_match('delete_data_info')
|
||||
# put picts in a separate file
|
||||
pict_obj = pict.Pict(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
orig_file = self.__file,
|
||||
out_file = self.__out_file,
|
||||
run_level = self.__run_level,
|
||||
)
|
||||
pict_obj.process_pict()
|
||||
self.__bracket_match('pict_data_info')
|
||||
correct_uni_obj = correct_unicode.CorrectUnicode(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,
|
||||
exception_handler = InvalidRtfException,
|
||||
)
|
||||
correct_uni_obj.correct_unicode()
|
||||
self.__bracket_match('correct_unicode_info')
|
||||
combine_obj = combine_borders.CombineBorders(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,)
|
||||
combine_obj.combine_borders()
|
||||
self.__bracket_match('combine_borders_info')
|
||||
footnote_obj = footnote.Footnote(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,
|
||||
)
|
||||
footnote_obj.separate_footnotes()
|
||||
self.__bracket_match('separate_footnotes_info')
|
||||
header_obj = header.Header(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,
|
||||
)
|
||||
header_obj.separate_headers()
|
||||
self.__bracket_match('separate_headers_info')
|
||||
list_numbers_obj = list_numbers.ListNumbers(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,
|
||||
)
|
||||
list_numbers_obj.fix_list_numbers()
|
||||
self.__bracket_match('list_number_info')
|
||||
preamble_div_obj = preamble_div.PreambleDiv(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,
|
||||
)
|
||||
list_of_lists = preamble_div_obj.make_preamble_divisions()
|
||||
self.__bracket_match('make_preamble_divisions')
|
||||
encode_obj = default_encoding.DefaultEncoding(
|
||||
in_file = self.__temp_file,
|
||||
run_level = self.__run_level,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
platform, code_page, default_font_num = encode_obj.find_default_encoding()
|
||||
hex2utf_obj = hex_2_utf8.Hex2Utf8(
|
||||
in_file = self.__temp_file,
|
||||
copy = self.__copy,
|
||||
area_to_convert = 'preamble',
|
||||
char_file = self.__char_data,
|
||||
default_char_map = code_page,
|
||||
run_level = self.__run_level,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
invalid_rtf_handler = InvalidRtfException,
|
||||
)
|
||||
hex2utf_obj.convert_hex_2_utf8()
|
||||
self.__bracket_match('hex_2_utf_preamble')
|
||||
fonts_obj = fonts.Fonts(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
default_font_num = default_font_num,
|
||||
run_level = self.__run_level,
|
||||
)
|
||||
special_font_dict = fonts_obj.convert_fonts()
|
||||
self.__bracket_match('fonts_info')
|
||||
color_obj = colors.Colors(
|
||||
in_file = self.__temp_file,
|
||||
copy = self.__copy,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
run_level = self.__run_level,
|
||||
)
|
||||
color_obj.convert_colors()
|
||||
self.__bracket_match('colors_info')
|
||||
style_obj = styles.Styles(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,
|
||||
)
|
||||
style_obj.convert_styles()
|
||||
self.__bracket_match('styles_info')
|
||||
info_obj = info.Info(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,
|
||||
)
|
||||
info_obj.fix_info()
|
||||
default_font = special_font_dict.get('default-font')
|
||||
preamble_rest_obj = preamble_rest.Preamble(
|
||||
file = self.__temp_file, copy = self.__copy,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
platform = platform, default_font = default_font,
|
||||
code_page = code_page)
|
||||
preamble_rest_obj.fix_preamble()
|
||||
self.__bracket_match('preamble_rest_info')
|
||||
old_rtf_obj = OldRtf(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
run_level = self.__run_level,
|
||||
)
|
||||
# RTF can actually have destination groups and old RTF.
|
||||
# BAH!
|
||||
old_rtf = old_rtf_obj.check_if_old_rtf()
|
||||
if old_rtf:
|
||||
if self.__run_level > 5:
|
||||
msg = 'older RTF\n'
|
||||
msg += 'self.__run_level is "%s"\n' % self.__run_level
|
||||
raise RtfInvalidCodeException, msg
|
||||
if self.__run_level > 1:
|
||||
sys.stderr.write('File could be older RTF...\n')
|
||||
if found_destination:
|
||||
if self.__run_level > 1:
|
||||
sys.stderr.write(
|
||||
'File also has newer RTF.\n'
|
||||
'Will do the best to convert.\n'
|
||||
)
|
||||
add_brackets_obj = add_brackets.AddBrackets(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,
|
||||
)
|
||||
add_brackets_obj.add_brackets()
|
||||
fields_small_obj = fields_small.FieldsSmall(
|
||||
in_file = self.__temp_file,
|
||||
copy = self.__copy,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
run_level = self.__run_level,)
|
||||
fields_small_obj.fix_fields()
|
||||
self.__bracket_match('fix_small_fields_info')
|
||||
fields_large_obj = fields_large.FieldsLarge(
|
||||
in_file = self.__temp_file,
|
||||
copy = self.__copy,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
run_level = self.__run_level)
|
||||
fields_large_obj.fix_fields()
|
||||
self.__bracket_match('fix_large_fields_info')
|
||||
sections_obj = sections.Sections(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,)
|
||||
sections_obj.make_sections()
|
||||
self.__bracket_match('sections_info')
|
||||
paragraphs_obj = paragraphs.Paragraphs(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
write_empty_para = self.__empty_paragraphs,
|
||||
run_level = self.__run_level,)
|
||||
paragraphs_obj.make_paragraphs()
|
||||
self.__bracket_match('paragraphs_info')
|
||||
default_font = special_font_dict['default-font']
|
||||
paragraph_def_obj = paragraph_def.ParagraphDef(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
default_font = default_font,
|
||||
run_level = self.__run_level,)
|
||||
list_of_styles = paragraph_def_obj.make_paragraph_def()
|
||||
body_styles_obj = body_styles.BodyStyles(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
list_of_styles = list_of_styles,
|
||||
run_level = self.__run_level,)
|
||||
body_styles_obj.insert_info()
|
||||
self.__bracket_match('body_styles_info')
|
||||
self.__bracket_match('paragraph_def_info')
|
||||
table_obj = table.Table(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,)
|
||||
table_data = table_obj.make_table()
|
||||
self.__bracket_match('table_info')
|
||||
table_info_obj = table_info.TableInfo(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
table_data = table_data,
|
||||
run_level = self.__run_level,)
|
||||
table_info_obj.insert_info()
|
||||
self.__bracket_match('table__data_info')
|
||||
if self.__form_lists:
|
||||
make_list_obj = make_lists.MakeLists(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
headings_to_sections = self.__headings_to_sections,
|
||||
run_level = self.__run_level,
|
||||
list_of_lists = list_of_lists,
|
||||
)
|
||||
make_list_obj.make_lists()
|
||||
self.__bracket_match('form_lists_info')
|
||||
if self.__headings_to_sections:
|
||||
headings_to_sections_obj = headings_to_sections.HeadingsToSections(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,)
|
||||
headings_to_sections_obj.make_sections()
|
||||
self.__bracket_match('headings_to_sections_info')
|
||||
if self.__group_styles:
|
||||
group_styles_obj = group_styles.GroupStyles(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
wrap = 1,
|
||||
run_level = self.__run_level,)
|
||||
group_styles_obj.group_styles()
|
||||
self.__bracket_match('group_styles_info')
|
||||
if self.__group_borders:
|
||||
group_borders_obj = group_borders.GroupBorders(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
wrap = 1,
|
||||
run_level = self.__run_level,)
|
||||
group_borders_obj.group_borders()
|
||||
self.__bracket_match('group_borders_info')
|
||||
inline_obj = inline.Inline(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,)
|
||||
inline_obj.form_tags()
|
||||
self.__bracket_match('inline_info')
|
||||
hex2utf_obj.update_values(file = self.__temp_file,
|
||||
area_to_convert = 'body',
|
||||
copy = self.__copy,
|
||||
char_file = self.__char_data,
|
||||
convert_caps = self.__convert_caps,
|
||||
convert_symbol = self.__convert_symbol,
|
||||
convert_wingdings = self.__convert_wingdings,
|
||||
convert_zapf = self.__convert_zapf,
|
||||
symbol = 1,
|
||||
wingdings = 1,
|
||||
dingbats = 1,
|
||||
)
|
||||
hex2utf_obj.convert_hex_2_utf8()
|
||||
header_obj.join_headers()
|
||||
footnote_obj.join_footnotes()
|
||||
tags_obj = convert_to_tags.ConvertToTags(
|
||||
in_file = self.__temp_file,
|
||||
copy = self.__copy,
|
||||
dtd_path = self.__dtd_path,
|
||||
indent = self.__indent,
|
||||
run_level = self.__run_level,
|
||||
no_dtd = self.__no_dtd,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
tags_obj.convert_to_tags()
|
||||
output_obj = output.Output(
|
||||
file = self.__temp_file,
|
||||
orig_file = self.__file,
|
||||
output_dir = self.__out_dir,
|
||||
out_file = self.__out_file,
|
||||
)
|
||||
output_obj.output()
|
||||
os.remove(self.__temp_file)
|
||||
return self.__exit_level
|
||||
def __bracket_match(self, file_name):
|
||||
if self.__run_level > 2:
|
||||
good_br, msg = self.__check_brack_obj.check_brackets()
|
||||
if good_br:
|
||||
pass
|
||||
# sys.stderr.write( msg + ' in ' + file_name + "\n")
|
||||
else:
|
||||
msg += " in file '" + file_name + "'\n"
|
||||
raise RtfInvalidCodeException, msg
|
||||
def __return_code(self, num):
|
||||
if num == None:
|
||||
return
|
||||
if int(num) > self.__exit_level:
|
||||
self.__exit_level = num
|
||||
def __make_temp_file(self,file):
|
||||
"""Make a temporary file to parse"""
|
||||
write_file="rtf_write_file"
|
||||
read_obj = open(file,'r')
|
||||
write_obj = open(write_file, 'w')
|
||||
line = "dummy"
|
||||
while line:
|
||||
line = read_obj.read(1000)
|
||||
write_obj.write(line )
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
return write_file
|
||||
"""
|
||||
mi<tg<open______<style-sheet\n
|
||||
mi<tg<close_____<style-sheet\n
|
||||
mi<tg<open-att__<footnote<num>1\n
|
||||
mi<tg<empty-att_<page-definition<margin>33\n
|
||||
mi<tg<empty_____<para\n
|
||||
"""
|
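Every intermediate token line is dispatched on its first sixteen characters throughout the package (the line[:16] "token_info" convention used above); a small illustration of splitting one of the sample lines, with a helper name that is ours rather than the module's:

    def split_token(line):
        # 'mi<tg<open-att__<footnote<num>1\n' -> ('mi<tg<open-att__', 'footnote<num>1')
        return line[:16], line[17:-1]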
3  src/libprs500/ebooks/rtf2xml/__init__.py  Executable file
@@ -0,0 +1,3 @@
'''
modules for rtf2xml
'''
205  src/libprs500/ebooks/rtf2xml/add_brackets.py  Executable file
@@ -0,0 +1,205 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy, check_brackets
|
||||
# note to self. This is the first module in which I use tempfile. A good idea?
|
||||
"""
|
||||
"""
|
||||
class AddBrackets:
|
||||
"""
|
||||
Add brackets for old RTF.
|
||||
Logic:
|
||||
"""
|
||||
def __init__(self, in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__run_level = run_level
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
"""
|
||||
self.__state_dict = {
|
||||
'before_body' : self.__before_body_func,
|
||||
'in_body' : self.__in_body_func,
|
||||
'after_control_word' : self.__after_control_word_func,
|
||||
'in_ignore' : self.__ignore_func,
|
||||
}
|
||||
self.__state = 'before_body'
|
||||
self.__inline = {}
|
||||
self.__temp_group = []
|
||||
self.__open_bracket = 0
|
||||
self.__found_brackets = 0
|
||||
self.__accept = [
|
||||
'cw<ci<bold______',
|
||||
'cw<ci<annotation' ,
|
||||
'cw<ci<blue______' ,
|
||||
'cw<ci<bold______' ,
|
||||
'cw<ci<caps______' ,
|
||||
'cw<ci<char-style' ,
|
||||
'cw<ci<dbl-strike' ,
|
||||
'cw<ci<emboss____' ,
|
||||
'cw<ci<engrave___' ,
|
||||
'cw<ci<font-color' ,
|
||||
'cw<ci<font-down_' ,
|
||||
'cw<ci<font-size_' ,
|
||||
'cw<ci<font-style' ,
|
||||
'cw<ci<font-up___',
|
||||
'cw<ci<footnot-mk',
|
||||
'cw<ci<green_____' ,
|
||||
'cw<ci<hidden____',
|
||||
'cw<ci<italics___' ,
|
||||
'cw<ci<outline___',
|
||||
'cw<ci<red_______' ,
|
||||
'cw<ci<shadow____',
|
||||
'cw<ci<small-caps' ,
|
||||
'cw<ci<strike-thr',
|
||||
'cw<ci<subscript_' ,
|
||||
'cw<ci<superscrip',
|
||||
'cw<ci<underlined' ,
|
||||
'cw<ul<underlined' ,
|
||||
]
|
||||
def __before_body_func(self, line):
|
||||
"""
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<body-open_':
|
||||
self.__state = 'in_body'
|
||||
self.__write_obj.write(line)
|
||||
def __in_body_func(self, line):
|
||||
"""
|
||||
"""
|
||||
if line == 'cb<nu<clos-brack<0001\n' and self.__open_bracket:
|
||||
self.__write_obj.write(
|
||||
'cb<nu<clos-brack<0003\n'
|
||||
)
|
||||
self.__write_obj.write(line)
|
||||
elif self.__token_info == 'ob<nu<open-brack':
|
||||
self.__found_brackets = 1
|
||||
self.__state = 'in_ignore'
|
||||
self.__ignore_count = self.__ob_count
|
||||
self.__write_obj.write(line)
|
||||
elif self.__token_info in self.__accept:
|
||||
self.__temp_group.append(line)
|
||||
self.__state = 'after_control_word'
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __after_control_word_func(self, line):
|
||||
"""
|
||||
"""
|
||||
if self.__token_info in self.__accept:
|
||||
self.__temp_group.append(line)
|
||||
else:
|
||||
self.__change_permanent_group()
|
||||
self.__write_group()
|
||||
self.__write_obj.write(line)
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__state = 'in_ignore'
|
||||
self.__ignore_count = self.__ob_count
|
||||
else:
|
||||
self.__state = 'in_body'
|
||||
def __write_group(self):
|
||||
"""
|
||||
"""
|
||||
if self.__open_bracket:
|
||||
self.__write_obj.write(
|
||||
'cb<nu<clos-brack<0003\n'
|
||||
)
|
||||
self.__open_bracket = 0
|
||||
inline_string = ''
|
||||
the_keys = self.__inline.keys()
|
||||
for the_key in the_keys:
|
||||
value = self.__inline[the_key]
|
||||
if value != 'false':
|
||||
inline_string += '%s<nu<%s\n' % (the_key, value)
|
||||
if inline_string:
|
||||
self.__write_obj.write('ob<nu<open-brack<0003\n')
|
||||
self.__write_obj.write(inline_string)
|
||||
self.__open_bracket = 1
|
||||
self.__temp_group = []
|
||||
def __change_permanent_group(self):
|
||||
"""
|
||||
use temp group to change permanent group
|
||||
"""
|
||||
for line in self.__temp_group:
|
||||
token_info = line[:16]
|
||||
if token_info in self.__accept:
|
||||
att = line[20:-1]
|
||||
self.__inline[token_info] = att
|
||||
def __ignore_func(self, line):
|
||||
"""
|
||||
Don't add any brackets while inside of brackets RTF has already
|
||||
added.
|
||||
"""
|
||||
self.__write_obj.write(line)
|
||||
if self.__token_info == 'cb<nu<clos-brack'and\
|
||||
self.__cb_count == self.__ignore_count:
|
||||
self.__state = 'in_body'
|
||||
def __check_brackets(self, in_file):
|
||||
self.__check_brack_obj = check_brackets.CheckBrackets\
|
||||
(file = in_file)
|
||||
good_br = self.__check_brack_obj.check_brackets()[0]
|
||||
if not good_br:
|
||||
return 1
|
||||
def add_brackets(self):
|
||||
"""
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('No matching state in module add_brackets.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
bad_brackets = self.__check_brackets(self.__write_to)
|
||||
if not bad_brackets:
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "add_brackets.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
else:
|
||||
if self.__run_level > 0:
|
||||
sys.stderr.write(
|
||||
'Sorry, but this file has a mix of old and new RTF.\n'
|
||||
'Some characteristics cannot be converted.\n')
|
||||
os.remove(self.__write_to)
|
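In effect, for old-style RTF the pass wraps a run of bare character-formatting control words in an explicit group numbered 0003; an illustrative before and after token stream (the surrounding tokens are invented for the example):

    # before
    cw<ci<bold______<nu<true
    tx<nu<__________<some text
    # after
    ob<nu<open-brack<0003
    cw<ci<bold______<nu<true
    tx<nu<__________<some text
    cb<nu<clos-brack<0003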
81  src/libprs500/ebooks/rtf2xml/body_styles.py  Executable file
@@ -0,0 +1,81 @@
#########################################################################
#                                                                       #
#                                                                       #
#   copyright 2002 Paul Henry Tremblay                                  #
#                                                                       #
#   This program is distributed in the hope that it will be useful,     #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
#   General Public License for more details.                            #
#                                                                       #
#   You should have received a copy of the GNU General Public License   #
#   along with this program; if not, write to the Free Software         #
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA            #
#   02111-1307 USA                                                      #
#                                                                       #
#                                                                       #
#########################################################################
import os, tempfile
from libprs500.ebooks.rtf2xml import copy
"""
Simply write the list of strings after style table
"""
class BodyStyles:
    """
    Insert the styles collected from the body after the style table.
    Logic:
    """
    def __init__(self,
            in_file,
            list_of_styles,
            bug_handler,
            copy=None,
            run_level = 1,):
        """
        Required:
            'file'--file to parse
            'list_of_styles' -- the list of style strings to insert.
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --where to output temporary results (default is
            directory from which the script is run.)
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__list_of_styles = list_of_styles
        self.__run_level = run_level
        self.__write_to = tempfile.mktemp()
        # self.__write_to = 'table_info.data'
    def insert_info(self):
        """
        """
        read_obj = open(self.__file, 'r')
        self.__write_obj = open(self.__write_to, 'w')
        line_to_read = 1
        while line_to_read:
            line_to_read = read_obj.readline()
            line = line_to_read
            if line == 'mi<tg<close_____<style-table\n':
                if len(self.__list_of_styles) > 0:
                    self.__write_obj.write('mi<tg<open______<styles-in-body\n')
                    the_string = ''.join(self.__list_of_styles)
                    self.__write_obj.write(the_string)
                    self.__write_obj.write('mi<tg<close_____<styles-in-body\n')
                else:
                    # this shouldn't happen!
                    if self.__run_level > 3:
                        msg = 'Not enough data for each table\n'
                        raise self.__bug_handler, msg
                # why was this line even here?
                # self.__write_obj.write('mi<tg<open______<table\n')
            self.__write_obj.write(line)
        read_obj.close()
        self.__write_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "body_styles.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
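For example, if list_of_styles holds a few style strings collected earlier, the close of the style table comes out as (the collected lines themselves are elided):

    mi<tg<open______<styles-in-body
    ...the collected style lines...
    mi<tg<close_____<styles-in-body
    mi<tg<close_____<style-table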
189  src/libprs500/ebooks/rtf2xml/border_parse.py  Executable file
@@ -0,0 +1,189 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys
|
||||
class BorderParse:
|
||||
"""
|
||||
Parse a border line and return a dictionary of attributes and values
|
||||
"""
|
||||
def __init__(self):
|
||||
# cw<bd<bor-t-r-hi<nu<true
|
||||
self.__border_dict = {
|
||||
'bor-t-r-hi' : 'border-table-row-horizontal-inside',
|
||||
'bor-t-r-vi' : 'border-table-row-vertical-inside',
|
||||
'bor-t-r-to' : 'border-table-row-top',
|
||||
'bor-t-r-le' : 'border-table-row-left',
|
||||
'bor-t-r-bo' : 'border-table-row-bottom',
|
||||
'bor-t-r-ri' : 'border-table-row-right',
|
||||
'bor-cel-bo' : 'border-cell-bottom',
|
||||
'bor-cel-to' : 'border-cell-top',
|
||||
'bor-cel-le' : 'border-cell-left',
|
||||
'bor-cel-ri' : 'border-cell-right',
|
||||
'bor-par-bo' : 'border-paragraph-bottom',
|
||||
'bor-par-to' : 'border-paragraph-top',
|
||||
'bor-par-le' : 'border-paragraph-left',
|
||||
'bor-par-ri' : 'border-paragraph-right',
|
||||
'bor-par-bx' : 'border-paragraph-box',
|
||||
'bor-for-ev' : 'border-for-every-paragraph',
|
||||
'bor-outsid' : 'border-outside',
|
||||
'bor-none__' : 'border',
|
||||
# border type => bt
|
||||
'bdr-li-wid' : 'line-width',
|
||||
'bdr-sp-wid' : 'padding',
|
||||
'bdr-color_' : 'color',
|
||||
}
|
||||
self.__border_style_dict = {
|
||||
'bdr-single' : 'single',
|
||||
'bdr-doubtb' : 'double-thickness-border',
|
||||
'bdr-shadow' : 'shadowed-border',
|
||||
'bdr-double' : 'double-border',
|
||||
'bdr-dotted' : 'dotted-border',
|
||||
'bdr-dashed' : 'dashed',
|
||||
'bdr-hair__' : 'hairline',
|
||||
'bdr-inset_' : 'inset',
|
||||
'bdr-das-sm' : 'dash-small',
|
||||
'bdr-dot-sm' : 'dot-dash',
|
||||
'bdr-dot-do' : 'dot-dot-dash',
|
||||
'bdr-outset' : 'outset',
|
||||
'bdr-trippl' : 'tripple',
|
||||
'bdr-thsm__' : 'thick-thin-small',
|
||||
'bdr-htsm__' : 'thin-thick-small',
|
||||
'bdr-hthsm_' : 'thin-thick-thin-small',
|
||||
'bdr-thm___' : 'thick-thin-medium',
|
||||
'bdr-htm___' : 'thin-thick-medium',
|
||||
'bdr-hthm__' : 'thin-thick-thin-medium',
|
||||
'bdr-thl___' : 'thick-thin-large',
|
||||
'bdr-hthl__' : 'thin-thick-thin-large',
|
||||
'bdr-wavy__' : 'wavy',
|
||||
'bdr-d-wav_' : 'double-wavy',
|
||||
'bdr-strip_' : 'striped',
|
||||
'bdr-embos_' : 'emboss',
|
||||
'bdr-engra_' : 'engrave',
|
||||
'bdr-frame_' : 'frame',
|
||||
}
|
||||
def parse_border(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line with border definition in it
|
||||
Returns:
|
||||
?
|
||||
Logic:
|
||||
"""
|
||||
border_dict = {}
|
||||
border_style_dict = {}
|
||||
border_style_list = []
|
||||
border_type = self.__border_dict.get(line[6:16])
|
||||
if not border_type:
|
||||
sys.stderr.write(
|
||||
'module is border_parse.py\n'
|
||||
'function is parse_border\n'
|
||||
'token does not have a dictionary value\n'
|
||||
'token is "%s"' % line
|
||||
)
|
||||
return border_dict
|
||||
att_line = line[20:-1]
|
||||
atts = att_line.split('|')
|
||||
# cw<bd<bor-cel-ri<nu<
|
||||
# border has no value--should be no lines
|
||||
if len(atts) == 1 and atts[0] == '':
|
||||
border_dict[border_type] = 'none'
|
||||
return border_dict
|
||||
# border-paragraph-right
|
||||
for att in atts:
|
||||
values = att.split(':')
|
||||
if len(values) ==2:
|
||||
att = values[0]
|
||||
value = values[1]
|
||||
else:
|
||||
value = 'true'
|
||||
style_att = self.__border_style_dict.get(att)
|
||||
if style_att:
|
||||
att = '%s-%s' % (border_type, att)
|
||||
border_style_dict[att] = value
|
||||
border_style_list.append(style_att)
|
||||
else:
|
||||
att = self.__border_dict.get(att)
|
||||
if not att:
|
||||
sys.stderr.write(
|
||||
'module is border_parse_def.py\n'
|
||||
'function is parse_border\n'
|
||||
'token does not have an att value\n'
|
||||
'line is "%s"' % line
|
||||
)
|
||||
att = '%s-%s' % (border_type, att)
|
||||
border_dict[att] = value
|
||||
new_border_dict = self.__determine_styles(border_type, border_style_list)
|
||||
border_dict.update(new_border_dict)
|
||||
return border_dict
|
||||
def __determine_styles(self, border_type, border_style_list):
|
||||
new_border_dict = {}
|
||||
att = '%s-style' % border_type
|
||||
if 'shadowed-border' in border_style_list:
|
||||
new_border_dict[att] = 'shadowed'
|
||||
elif 'engraved' in border_style_list:
|
||||
new_border_dict[att] = 'engraved'
|
||||
elif 'emboss' in border_style_list:
|
||||
new_border_dict[att] = 'emboss'
|
||||
elif 'striped' in border_style_list:
|
||||
new_border_dict[att] = 'striped'
|
||||
elif 'thin-thick-thin-small' in border_style_list:
|
||||
new_border_dict[att] = 'thin-thick-thin-small'
|
||||
elif 'thick-thin-large' in border_style_list:
|
||||
new_border_dict[att] = 'thick-thin-large'
|
||||
elif 'thin-thick-thin-medium' in border_style_list:
|
||||
new_border_dict[att] = 'thin-thick-thin-medium'
|
||||
elif 'thin-thick-medium' in border_style_list:
|
||||
new_border_dict[att] = 'thin-thick-medium'
|
||||
elif 'thick-thin-medium' in border_style_list:
|
||||
new_border_dict[att] = 'thick-thin-medium'
|
||||
elif 'thick-thin-small' in border_style_list:
|
||||
new_border_dict[att] = 'thick-thin-small'
|
||||
|
||||
elif 'double-wavy' in border_style_list:
|
||||
new_border_dict[att] = 'double-wavy'
|
||||
elif 'dot-dot-dash' in border_style_list:
|
||||
new_border_dict[att] = 'dot-dot-dash'
|
||||
elif 'dot-dash' in border_style_list:
|
||||
new_border_dict[att] = 'dot-dash'
|
||||
elif 'dotted-border' in border_style_list:
|
||||
new_border_dict[att] = 'dotted'
|
||||
elif 'wavy' in border_style_list:
|
||||
new_border_dict[att] = 'wavy'
|
||||
elif 'dash-small' in border_style_list:
|
||||
new_border_dict[att] = 'dash-small'
|
||||
elif 'dashed' in border_style_list:
|
||||
new_border_dict[att] = 'dashed'
|
||||
elif 'frame' in border_style_list:
|
||||
new_border_dict[att] = 'frame'
|
||||
elif 'inset' in border_style_list:
|
||||
new_border_dict[att] = 'inset'
|
||||
elif 'outset' in border_style_list:
|
||||
new_border_dict[att] = 'outset'
|
||||
elif 'tripple-border' in border_style_list:
|
||||
new_border_dict[att] = 'tripple'
|
||||
elif 'double-border' in border_style_list:
|
||||
new_border_dict[att] = 'double'
|
||||
elif 'double-thickness-border' in border_style_list:
|
||||
new_border_dict[att] = 'double-thickness'
|
||||
elif 'hairline' in border_style_list:
|
||||
new_border_dict[att] = 'hairline'
|
||||
elif 'single' in border_style_list:
|
||||
new_border_dict[att] = 'single'
|
||||
else:
|
||||
new_border_dict[att] = border_style_list[0]
|
||||
return new_border_dict
|
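A worked example of parse_border on the sample token quoted in the comments of colors.py below:

    parser = BorderParse()
    parser.parse_border(
        'cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2\n')
    # -> {'border-paragraph-top-line-width': '0.50',
    #     'border-paragraph-top-padding': '1.00',
    #     'border-paragraph-top-color': '2',
    #     'border-paragraph-top-style': 'hairline'}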
3075  src/libprs500/ebooks/rtf2xml/char_set.py  Executable file
(File diff suppressed because it is too large)
61  src/libprs500/ebooks/rtf2xml/check_brackets.py  Executable file
@@ -0,0 +1,61 @@
#########################################################################
#                                                                       #
#                                                                       #
#   copyright 2002 Paul Henry Tremblay                                  #
#                                                                       #
#   This program is distributed in the hope that it will be useful,     #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
#   General Public License for more details.                            #
#                                                                       #
#   You should have received a copy of the GNU General Public License   #
#   along with this program; if not, write to the Free Software         #
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA            #
#   02111-1307 USA                                                      #
#                                                                       #
#                                                                       #
#########################################################################
class CheckBrackets:
    """Check that brackets match up"""
    def __init__(self, bug_handler = None, file=None):
        self.__file=file
        self.__bug_handler = bug_handler
        self.__bracket_count=0
        self.__ob_count = 0
        self.__cb_count = 0
        self.__open_bracket_num = []
    def open_brack(self, line):
        num = line[-5:-1]
        self.__open_bracket_num.append(num)
        self.__bracket_count += 1
    def close_brack(self, line):
        num = line[-5:-1]
        ##self.__open_bracket_num.append(num)
        try:
            last_num = self.__open_bracket_num.pop()
        except:
            return 0
        if num != last_num:
            return 0
        self.__bracket_count -= 1
        return 1
    def check_brackets(self):
        read_obj = open(self.__file, 'r')
        line = 'dummy'
        line_count = 0
        while line:
            line_count += 1
            line = read_obj.readline()
            self.__token_info = line[:16]
            if self.__token_info == 'ob<nu<open-brack':
                self.open_brack(line)
            if self.__token_info == 'cb<nu<clos-brack':
                right_count = self.close_brack(line)
                if not right_count:
                    return (0, "closed bracket doesn't match, line %s" % line_count)
        read_obj.close()
        if self.__bracket_count != 0:
            msg = 'At end of file open and closed brackets don\'t match\n'
            msg = msg + 'total number of brackets is %s' % self.__bracket_count
            return (0, msg)
        return (1, "brackets match!")
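Used on its own, the checker returns a (status, message) pair; a minimal sketch, with an illustrative file name:

    checker = CheckBrackets(file='intermediate_tokens.data')
    good_br, msg = checker.check_brackets()
    if not good_br:
        print msg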
29  src/libprs500/ebooks/rtf2xml/check_encoding.py  Executable file
@@ -0,0 +1,29 @@
#!/usr/bin/env python
import sys
class CheckEncoding:
    def __init__(self, bug_handler):
        self.__bug_handler = bug_handler
    def __get_position_error(self, line, encoding, line_num):
        char_position = 0
        for char in line:
            char_position += 1
            try:
                char.decode(encoding)
            except UnicodeError, msg:
                sys.stderr.write('line: %s char: %s\n' % (line_num, char_position))
                sys.stderr.write(str(msg) + '\n')
    def check_encoding(self, path, encoding='us-ascii'):
        read_obj = open(path, 'r')
        line_to_read = 1
        line_num = 0
        while line_to_read:
            line_num += 1
            line_to_read = read_obj.readline()
            line = line_to_read
            try:
                line.decode(encoding)
            except UnicodeError:
                self.__get_position_error(line, encoding, line_num)
if __name__ == '__main__':
    # no bug handler is needed when run standalone
    check_encoding_obj = CheckEncoding(None)
    check_encoding_obj.check_encoding(sys.argv[1])
247  src/libprs500/ebooks/rtf2xml/colors.py  Executable file
@@ -0,0 +1,247 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile, re
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class Colors:
|
||||
"""
|
||||
Change lines with color info from color numbers to the actual color names.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__copy = copy
|
||||
self.__bug_handler = bug_handler
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__run_level = run_level
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
"""
|
||||
self.__color_dict = {}
|
||||
self.__state = 'before_color_table'
|
||||
self.__state_dict = {
|
||||
'before_color_table': self.__before_color_func,
|
||||
'in_color_table' : self.__in_color_func,
|
||||
'after_color_table' : self.__after_color_func,
|
||||
'cw<ci<red_______' : self.__default_color_func,
|
||||
'cw<ci<green_____' : self.__default_color_func,
|
||||
'cw<ci<blue______' : self.__blue_func,
|
||||
'tx<nu<__________' : self.__do_nothing_func,
|
||||
}
|
||||
self.__color_string = '#'
|
||||
self.__color_num = 1
|
||||
self.__line_color_exp = re.compile(r'bdr-color_:(\d+)')
|
||||
# cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2
|
||||
def __before_color_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Check to see if the line marks the beginning of the color table.
|
||||
If so, change states.
|
||||
Always print out the line.
|
||||
"""
|
||||
# mi<mk<clrtbl-beg
|
||||
if self.__token_info == 'mi<mk<clrtbl-beg':
|
||||
self.__state = 'in_color_table'
|
||||
self.__write_obj.write(line)
|
||||
def __default_color_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
get the hex number from the line and add it to the color string.
|
||||
"""
|
||||
hex_num = line[-3:-1]
|
||||
self.__color_string += hex_num
|
||||
def __blue_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Get the hex number from the line and add it to the color string.
|
||||
Add a key -> value pair to the color dictionary, with the number
|
||||
as the key, and the hex number as the value. Write an empty tag
|
||||
with the hex number and number as attributes. Add one to the color
|
||||
number. Reset the color string to '#'
|
||||
"""
|
||||
hex_num = line[-3:-1]
|
||||
self.__color_string += hex_num
|
||||
self.__color_dict[self.__color_num] = self.__color_string
|
||||
self.__write_obj.write(
|
||||
'mi<tg<empty-att_'
|
||||
'<color-in-table<num>%s<value>%s\n' % (self.__color_num, self.__color_string)
|
||||
)
|
||||
self.__color_num += 1
|
||||
self.__color_string = '#'
|
||||
def __in_color_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Check if the end of the color table has been reached. If so,
|
||||
change the state to after the color table.
|
||||
Otherwise, get a function by passing self.__token_info to the
|
||||
state dictionary.
|
||||
"""
|
||||
#mi<mk<clrtbl-beg
|
||||
#cw<ci<red_______<nu<00
|
||||
if self.__token_info == 'mi<mk<clrtbl-end':
|
||||
self.__state = 'after_color_table'
|
||||
else:
|
||||
action = self.__state_dict.get(self.__token_info)
|
||||
if action == None:
|
||||
sys.stderr.write('in module colors.py\n'
|
||||
'function is self.__in_color_func\n'
|
||||
'no action for %s' % self.__token_info
|
||||
)
|
||||
action(line)
|
||||
def __after_color_func(self, line):
|
||||
"""
|
||||
Check the line to see if it contains color info. If it does, extract the
|
||||
number and look up the hex value in the color dictionary. If the color
|
||||
dictionary has no key for the number, print out an error message.
|
||||
Otherwise, print out the line.
|
||||
Added Oct 10, 2003
|
||||
If the number is 0, that indicates no color
|
||||
"""
|
||||
#cw<ci<font-color<nu<2
|
||||
if self.__token_info == 'cw<ci<font-color':
|
||||
hex_num = int(line[20:-1])
|
||||
hex_num = self.__figure_num(hex_num)
|
||||
if hex_num:
|
||||
self.__write_obj.write(
|
||||
'cw<ci<font-color<nu<%s\n' % hex_num
|
||||
)
|
||||
elif line[0:5] == 'cw<bd':
|
||||
the_index = line.find('bdr-color_')
|
||||
if the_index > -1:
|
||||
line = re.sub(self.__line_color_exp, self.__sub_from_line_color, line)
|
||||
self.__write_obj.write(line)
|
||||
"""
|
||||
if num == 0:
|
||||
hex_num = 'false'
|
||||
else:
|
||||
hex_num = self.__color_dict.get(num)
|
||||
if hex_num == None:
|
||||
if self.__run_level > 0:
|
||||
sys.stderr.write(
|
||||
'module is colors.py\n'
|
||||
'function is self.__after_color_func\n'
|
||||
'no value in self.__color_dict for key %s\n' % num
|
||||
)
|
||||
if self.__run_level > 3:
|
||||
sys.stderr.write(
|
||||
'run level is %s\n'
|
||||
'Script will now quit\n'
|
||||
% self.__run_level)
|
||||
else:
|
||||
self.__write_obj.write(
|
||||
'cw<ci<font-color<nu<%s\n' % hex_num
|
||||
)
|
||||
"""
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
# cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2
|
||||
def __sub_from_line_color(self, match_obj):
|
||||
num = match_obj.group(1)
|
||||
try:
|
||||
num = int(num)
|
||||
except ValueError:
|
||||
if self.__run_level > 3:
|
||||
msg = 'can\'t make integer from string\n'
|
||||
raise self.__bug_handler, msg
|
||||
else:
|
||||
return 'bdr-color_:no-value'
|
||||
hex_num = self.__figure_num(num)
|
||||
return_value = 'bdr-color_:%s' % hex_num
|
||||
return return_value
|
||||
def __figure_num(self, num):
|
||||
if num == 0:
|
||||
hex_num = 'false'
|
||||
else:
|
||||
hex_num = self.__color_dict.get(num)
|
||||
if hex_num == None:
|
||||
if self.__run_level > 3:
|
||||
msg = 'no value in self.__color_dict for key %s\n' % num
|
||||
raise self.__bug_handler, msg
|
||||
if hex_num == None:
|
||||
hex_num = '0'
|
||||
return hex_num
|
||||
def __do_nothing_func(self, line):
|
||||
"""
|
||||
Bad RTF will have text in the color table
|
||||
"""
|
||||
pass
|
||||
def convert_colors(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing (changes the original file)
|
||||
Logic:
|
||||
Read one line in at a time. Determine what action to take based on
|
||||
the state. If the state is before the color table, look for the
|
||||
beginning of the color table.
|
||||
If the state is in the color table, create the color dictionary
|
||||
and print out the tags.
|
||||
If the state if afer the color table, look for lines with color
|
||||
info, and substitute the number with the hex number.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no matching state in module colors.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "color.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
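An illustrative fragment of what the pass does with the first color-table entry and a later reference to it (the token values are made up for the example):

    # while reading the color table
    cw<ci<red_______<nu<00
    cw<ci<green_____<nu<80
    cw<ci<blue______<nu<ff
    # emitted in their place
    mi<tg<empty-att_<color-in-table<num>1<value>#0080ff
    # after the table, a reference like
    cw<ci<font-color<nu<1
    # is rewritten as
    cw<ci<font-color<nu<#0080ff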
92  src/libprs500/ebooks/rtf2xml/combine_borders.py  Executable file
@@ -0,0 +1,92 @@
#########################################################################
#                                                                       #
#                                                                       #
#   copyright 2002 Paul Henry Tremblay                                  #
#                                                                       #
#   This program is distributed in the hope that it will be useful,     #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
#   General Public License for more details.                            #
#                                                                       #
#   You should have received a copy of the GNU General Public License   #
#   along with this program; if not, write to the Free Software         #
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA            #
#   02111-1307 USA                                                      #
#                                                                       #
#                                                                       #
#########################################################################
import os, tempfile
from libprs500.ebooks.rtf2xml import copy
class CombineBorders:
    """Combine borders in RTF tokens to make later processing easier"""
    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            run_level = 1,
            ):
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = tempfile.mktemp()
        self.__state = 'default'
        self.__bord_pos = 'default'
        self.__bord_att = []
    def found_bd(self, line):
        #cw<bd<bor-t-r-vi
        self.__state = 'border'
        self.__bord_pos = line[6:16]
    def __default_func(self, line):
        #cw<bd<bor-t-r-vi
        if self.__first_five == 'cw<bd':
            self.found_bd(line)
            return ''
        return line
    def end_border(self, line, write_obj):
        joiner = "|"
        border_string = joiner.join(self.__bord_att)
        self.__bord_att = []
        write_obj.write('cw<bd<%s<nu<%s\n' % (self.__bord_pos,
            border_string))
        self.__state = 'default'
        self.__bord_string = ''
        if self.__first_five == 'cw<bd':
            self.found_bd(line)
        else:
            write_obj.write(line)
    def add_to_border_desc(self, line):
        #cw<bt<bdr-hair__<nu<true
        #cw<bt<bdr-linew<nu<0.50
        #tx<__________<some text
        border_desc = line[6:16]
        num = line[20:-1]
        if num == 'true':
            num = ''
        else:
            num = ':' + num
        self.__bord_att.append(border_desc + num)
    def __border_func(self, line, write_obj):
        if self.__first_five != 'cw<bt':
            self.end_border(line, write_obj)
        else:
            self.add_to_border_desc(line)
    def combine_borders(self):
        read_obj = open(self.__file, 'r')
        write_obj = open(self.__write_to, 'w')
        line_to_read = 'dummy'
        while line_to_read:
            line_to_read = read_obj.readline()
            line = line_to_read
            self.__first_five = line[0:5]
            if self.__state == 'border':
                self.__border_func(line, write_obj)
            else:
                to_print = self.__default_func(line)
                write_obj.write(to_print)
        read_obj.close()
        write_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "combine_borders.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
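An illustrative before and after for one border definition (the exact payload of the incoming cw<bd token is invented here); the combined line is what border_parse.py splits back apart later in the pipeline:

    # before
    cw<bd<bor-par-to<nu<true
    cw<bt<bdr-hair__<nu<true
    cw<bt<bdr-li-wid<nu<0.50
    tx<nu<__________<some text
    # after
    cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50
    tx<nu<__________<some text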
151  src/libprs500/ebooks/rtf2xml/configure_txt.py  Executable file
@@ -0,0 +1,151 @@
|
||||
import os, sys
|
||||
class Configure:
|
||||
def __init__( self,
|
||||
configuration_file,
|
||||
bug_handler,
|
||||
debug_dir = None,
|
||||
show_config_file = None,
|
||||
):
|
||||
"""
|
||||
Requires:
|
||||
file --file to be read
|
||||
output --file to output to
|
||||
Returns:
|
||||
Nothing. Outputs a file
|
||||
Logic:
|
||||
"""
|
||||
self.__configuration_file = configuration_file
|
||||
self.__debug_dir = debug_dir
|
||||
self.__bug_handler = bug_handler
|
||||
self.__show_config_file = show_config_file
|
||||
def get_configuration(self, type):
|
||||
self.__configuration_file = self.__get_file_name()
|
||||
return_dict = {}
|
||||
return_dict['config-location'] = self.__configuration_file
|
||||
if self.__show_config_file and self.__configuration_file:
|
||||
sys.stderr.write('configuration file is "%s"\n' % self.__configuration_file)
|
||||
if self.__show_config_file and not self.__configuration_file:
|
||||
sys.stderr.write('No configuration file found; using default values\n')
|
||||
if self.__configuration_file:
|
||||
read_obj = open(self.__configuration_file, 'r')
|
||||
line_to_read = 1
|
||||
line_num = 0
|
||||
while line_to_read:
|
||||
line_num += 1
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
line = line.strip()
|
||||
if line[0:1] == '#':
|
||||
continue
|
||||
if not line:
|
||||
continue
|
||||
fields = line.split('=')
|
||||
if len(fields) != 2:
|
||||
msg = line
|
||||
msg += ('Error in configuration.txt, line %s\n' % line_num)
|
||||
msg += ('Options take the form of option = value.\n')
|
||||
msg += ('Please correct the configuration file "%s" before continuing\n'
|
||||
% self.__configuration_file)
|
||||
raise self.__bug_handler, msg
|
||||
att = fields[0]
|
||||
value = fields[1]
|
||||
att = att.strip()
|
||||
value = value.strip()
|
||||
return_dict[att] = value
|
||||
return_dict = self.__parse_dict(return_dict)
|
||||
if return_dict == 1:
|
||||
msg = ('Please correct the configuration file "%s" before continuing\n'
|
||||
% self.__configuration_file)
|
||||
raise self.__bug_handler, msg
|
||||
return return_dict
|
||||
def __get_file_name(self):
|
||||
home_var = os.environ.get('HOME')
|
||||
if home_var:
|
||||
home_config = os.path.join(home_var, '.rtf2xml')
|
||||
if os.path.isfile(home_config):
|
||||
return home_config
|
||||
home_var = os.environ.get('USERPROFILE')
|
||||
if home_var:
|
||||
home_config = os.path.join(home_var, '.rtf2xml')
|
||||
if os.path.isfile(home_config):
|
||||
return home_config
|
||||
script_file = os.path.join(sys.path[0], '.rtf2xml')
|
||||
if os.path.isfile(script_file):
|
||||
return script_file
|
||||
return self.__configuration_file
|
||||
def __parse_dict(self, return_dict):
|
||||
allowable = [
|
||||
'configuration-directory',
|
||||
'smart-output', # = false
|
||||
'level', # = 1
|
||||
'convert-symbol',# = true
|
||||
'convert-wingdings',# = true
|
||||
'convert-zapf-dingbats', # = true
|
||||
'convert-caps',# true
|
||||
'indent', # = 1
|
||||
'group-styles',
|
||||
'group-borders',
|
||||
'headings-to-sections',
|
||||
'lists',
|
||||
'raw-dtd-path',
|
||||
'write-empty-paragraphs',
|
||||
'config-location',
|
||||
'script-name',
|
||||
]
|
||||
the_keys = return_dict.keys()
|
||||
for the_key in the_keys:
|
||||
if the_key not in allowable:
|
||||
sys.stderr.write('option "%s" is not a legal option.\n'
|
||||
% the_key)
|
||||
return 1
|
||||
configuration_dir = return_dict.get('configuration-directory')
|
||||
if configuration_dir == None:
|
||||
return_dict['configure-directory'] = None
|
||||
else:
|
||||
if not os.path.isdir(configuration_dir):
|
||||
sys.stderr.write('The directory "%s" does not appear to be a directory.\n'
|
||||
% configuration_dir)
|
||||
return 1
|
||||
else:
|
||||
return_dict['configure-directory'] = configuration_dir
|
||||
smart_output = return_dict.get('smart-output')
|
||||
if not smart_output:
|
||||
return_dict['smart-output'] = 0
|
||||
elif smart_output != 'true' and smart_output != 'false':
|
||||
sys.stderr.write('"smart-output" must be true or false.\n')
|
||||
return 1
|
||||
elif smart_output == 'false':
|
||||
return_dict['smart-output'] = 0
|
||||
int_options = ['level', 'indent']
|
||||
for int_option in int_options:
|
||||
value = return_dict.get(int_option)
|
||||
if not value:
|
||||
if int_option == 'level':
|
||||
return_dict['level'] = 1
|
||||
else:
|
||||
return_dict['indent'] = 0
|
||||
else:
|
||||
try:
|
||||
int_num = int(return_dict[int_option])
|
||||
return_dict[int_option] = int_num
|
||||
except:
|
||||
sys.stderr.write('"%s" must be a number\n' % int_option)
|
||||
sys.stderr.write('You chose "%s"\n' % return_dict[int_option])
|
||||
return 1
|
||||
fonts = ['convert-symbol', 'convert-wingdings', 'convert-zapf-dingbats',
|
||||
'convert-caps'
|
||||
]
|
||||
for font in fonts:
|
||||
value = return_dict.get(font)
|
||||
if not value:
|
||||
return_dict[font] = 0
|
||||
elif value != 'true' and value != 'false':
|
||||
sys.stderr.write(
|
||||
'"%s" must be true or false.\n' % font)
|
||||
elif value == 'false':
|
||||
return_dict[font] = 0
|
||||
return_dict['xslt-processor'] = None
|
||||
return_dict['no-namespace'] = None
|
||||
return_dict['format'] = 'raw'
|
||||
return_dict['no-pyxml'] = 'true'
|
||||
return return_dict
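For reference, a hypothetical ~/.rtf2xml file in the option = value form that get_configuration above parses; the keys come from the allowable list, but the values shown are only examples, not defaults from this commit.
# sample ~/.rtf2xml (hypothetical)
smart-output = true
level = 1
indent = 1
convert-caps = true
convert-symbol = true
convert-wingdings = true
convert-zapf-dingbats = true
group-styles = true
group-borders = true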
|
242
src/libprs500/ebooks/rtf2xml/convert_to_tags.py
Executable file
@ -0,0 +1,242 @@
|
||||
import os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
public_dtd = 'rtf2xml1.0.dtd'
|
||||
class ConvertToTags:
|
||||
"""
|
||||
Convert file to XML
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
dtd_path,
|
||||
no_dtd,
|
||||
indent = None,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__dtd_path = dtd_path
|
||||
self.__no_dtd = no_dtd
|
||||
self.__indent = indent
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Set values, including those for the dictionary.
|
||||
"""
|
||||
self.__state = 'default'
|
||||
self.__new_line = 0
|
||||
self.__block = ('doc', 'preamble', 'rtf-definition', 'font-table',
|
||||
'font-in-table', 'color-table', 'color-in-table', 'style-sheet',
|
||||
'paragraph-styles', 'paragraph-style-in-table', 'character-styles',
|
||||
'character-style-in-table', 'list-table', 'doc-information', 'title',
|
||||
'author', 'operator', 'creation-time', 'revision-time',
|
||||
'editing-time', 'time', 'number-of-pages', 'number-of-words',
|
||||
'number-of-characters', 'page-definition', 'section-definition',
|
||||
'headers-and-footers', 'section', 'para', 'body',
|
||||
'paragraph-definition', 'cell', 'row', 'table', 'revision-table',
|
||||
'style-group', 'border-group','styles-in-body', 'paragraph-style-in-body',
|
||||
'list-in-table', 'level-in-table', 'override-table','override-list',
|
||||
)
|
||||
self.__two_new_line = ('section', 'body', 'table', 'row', 'list-table')
|
||||
self.__state_dict = {
|
||||
'default' : self.__default_func,
|
||||
'mi<tg<open______' : self.__open_func,
|
||||
'mi<tg<close_____' : self.__close_func,
|
||||
'mi<tg<open-att__' : self.__open_att_func,
|
||||
'mi<tg<empty-att_' : self.__empty_att_func,
|
||||
'tx<nu<__________' : self.__text_func,
|
||||
'tx<ut<__________' : self.__text_func,
|
||||
'mi<tg<empty_____' : self.__empty_func,
|
||||
}
|
||||
def __open_func(self, line):
|
||||
"""
|
||||
Print the opening tag and newlines when needed.
|
||||
"""
|
||||
#mi<tg<open______<style-sheet
|
||||
info = line[17:-1]
|
||||
self.__new_line = 0
|
||||
if info in self.__block:
|
||||
self.__write_new_line()
|
||||
if info in self.__two_new_line:
|
||||
self.__write_extra_new_line()
|
||||
self.__write_obj.write('<%s>' % info)
|
||||
def __empty_func(self, line):
|
||||
"""
|
||||
Print out empty tag and newlines when needed.
|
||||
"""
|
||||
info = line[17:-1]
|
||||
self.__write_obj.write(
|
||||
'<%s/>' % info)
|
||||
self.__new_line = 0
|
||||
if info in self.__block:
|
||||
self.__write_new_line()
|
||||
if info in self.__two_new_line:
|
||||
self.__write_extra_new_line()
|
||||
def __open_att_func(self, line):
|
||||
"""
|
||||
Process lines for open tags that have attributes.
|
||||
The important info is between [17:-1]. Take this info and split it
with the delimiter '<'. The first token in this group is the element
name. The rest are attributes, separated from their values by '>'. So
read each token one at a time, and split them by '>'.
|
||||
"""
|
||||
#mi<tg<open-att__<footnote<num>
|
||||
info = line[17:-1]
|
||||
tokens = info.split("<")
|
||||
element_name = tokens[0]
|
||||
tokens = tokens[1:]
|
||||
self.__write_obj.write('<%s' % element_name)
|
||||
for token in tokens:
|
||||
groups = token.split('>')
|
||||
try:
|
||||
val = groups[0]
|
||||
att = groups[1]
|
||||
att = att.replace('"', '&quot;')
att = att.replace("'", '&quot;')
|
||||
self.__write_obj.write(
|
||||
' %s="%s"' % (val, att)
|
||||
)
|
||||
except:
|
||||
if self.__run_level > 3:
|
||||
msg = 'index out of range\n'
|
||||
raise self.__bug_handler, msg
|
||||
self.__write_obj.write('>')
|
||||
self.__new_line = 0
|
||||
if element_name in self.__block:
|
||||
self.__write_new_line()
|
||||
if element_name in self.__two_new_line:
|
||||
self.__write_extra_new_line()
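A self-contained sketch (not the author's code) of the parsing described in the docstring above; the token 'mi<tg<open-att__<footnote<num>1' is an assumed example built from the comment in the source.
# Illustrative only: turn an open-att token line into an XML open tag,
# the same way __open_att_func does above.
def open_tag_from_token(line):
    info = line[17:-1]                    # e.g. 'footnote<num>1'
    tokens = info.split('<')
    element, pairs = tokens[0], tokens[1:]
    out = '<' + element
    for pair in pairs:
        name, value = pair.split('>', 1)  # attribute name, then its value
        out += ' %s="%s"' % (name, value)
    return out + '>'
# open_tag_from_token('mi<tg<open-att__<footnote<num>1\n')  ->  '<footnote num="1">'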
|
||||
def __empty_att_func(self, line):
|
||||
"""
|
||||
Same as the __open_att_func, except a '/' is placed at the end of the tag.
|
||||
"""
|
||||
#mi<tg<open-att__<footnote<num>
|
||||
info = line[17:-1]
|
||||
tokens = info.split("<")
|
||||
element_name = tokens[0]
|
||||
tokens = tokens[1:]
|
||||
self.__write_obj.write('<%s' % element_name)
|
||||
for token in tokens:
|
||||
groups = token.split('>')
|
||||
val = groups[0]
|
||||
att = groups[1]
|
||||
att = att.replace('"', '&quot;')
att = att.replace("'", '&quot;')
|
||||
self.__write_obj.write(
|
||||
' %s="%s"' % (val, att))
|
||||
self.__write_obj.write('/>')
|
||||
self.__new_line = 0
|
||||
if element_name in self.__block:
|
||||
self.__write_new_line()
|
||||
if element_name in self.__two_new_line:
|
||||
self.__write_extra_new_line()
|
||||
def __close_func(self, line):
|
||||
"""
|
||||
Print out the closed tag and new lines, if appropriate.
|
||||
"""
|
||||
#mi<tg<close_____<style-sheet\n
|
||||
info = line[17:-1]
|
||||
self.__write_obj.write(
|
||||
'</%s>' % info)
|
||||
self.__new_line = 0
|
||||
if info in self.__block:
|
||||
self.__write_new_line()
|
||||
if info in self.__two_new_line:
|
||||
self.__write_extra_new_line()
|
||||
def __text_func(self, line):
|
||||
"""
|
||||
Simply print out the information between [17:-1]
|
||||
"""
|
||||
#tx<nu<__________<Normal;
|
||||
# change this!
|
||||
self.__write_obj.write(line[17:-1])
|
||||
def __write_extra_new_line(self):
|
||||
"""
|
||||
Print out extra new lines if the new lines have not exceeded two. If
|
||||
the new lines are greater than two, do nothing.
|
||||
"""
|
||||
if not self.__indent:
|
||||
return
|
||||
if self.__new_line < 2:
|
||||
self.__write_obj.write('\n')
|
||||
def __default_func(self, line):
|
||||
pass
|
||||
def __write_new_line(self):
|
||||
"""
|
||||
Print out a new line if a new line has not already been printed out.
|
||||
"""
|
||||
if not self.__indent:
|
||||
return
|
||||
if not self.__new_line:
|
||||
self.__write_obj.write('\n')
|
||||
self.__new_line += 1
|
||||
def __write_dec(self):
|
||||
"""
|
||||
Write the XML declaration at the top of the document.
|
||||
"""
|
||||
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
|
||||
self.__new_line = 0
|
||||
self.__write_new_line()
|
||||
if self.__no_dtd:
|
||||
pass
|
||||
elif self.__dtd_path:
|
||||
self.__write_obj.write(
|
||||
'<!DOCTYPE doc SYSTEM "%s">' % self.__dtd_path
|
||||
)
|
||||
elif self.__dtd_path == '':
|
||||
# don't print dtd if further transformations are going to take
|
||||
# place
|
||||
pass
|
||||
else:
|
||||
self.__write_obj.write(
|
||||
'<!DOCTYPE doc PUBLIC "publicID" '
|
||||
'"http://rtf2xml.sourceforge.net/dtd/%s">' % public_dtd
|
||||
)
|
||||
self.__new_line = 0
|
||||
self.__write_new_line()
|
||||
def convert_to_tags(self):
|
||||
"""
|
||||
Read in the file one line at a time. Get the important info, between
|
||||
[:16]. Check if this info matches a dictionary entry. If it does, call
|
||||
the appropriate function.
|
||||
The functions that are called:
|
||||
a text function for text
|
||||
an open function for open tags
|
||||
an open with attribute function for tags with attributes
|
||||
an empty with attribute function for tags that are empty but have
|
||||
attributes.
|
||||
a closed function for closed tags.
|
||||
an empty tag function.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
self.__write_dec()
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__token_info)
|
||||
if action != None:
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "convert_to_tags.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
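A hypothetical driver for the class above; the file name and the bug-handler exception are assumptions, not part of the diff.
# Hypothetical usage of ConvertToTags on an intermediate token file.
class BugHandler(Exception):
    pass

converter = ConvertToTags(
    in_file='index.tokens',        # assumed intermediate file name
    bug_handler=BugHandler,
    dtd_path='',                   # '' suppresses the DOCTYPE (see __write_dec above)
    no_dtd=0,
    indent=1,
)
converter.convert_to_tags()        # rewrites index.tokens in place as XML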
|
88
src/libprs500/ebooks/rtf2xml/copy.py
Executable file
@ -0,0 +1,88 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
class Copy:
|
||||
"""Copy each changed file to a directory for debugging purposes"""
|
||||
__dir = ""
|
||||
def __init__(self, bug_handler, file = None, deb_dir = None, ):
|
||||
self.__file = file
|
||||
self.__bug_handler = bug_handler
|
||||
def set_dir(self, deb_dir):
|
||||
"""Set the temporary directory to write files to"""
|
||||
if deb_dir is None:
|
||||
message = "No directory has been provided to write to in copy.py"
|
||||
raise self.__bug_handler, message
|
||||
check = os.path.isdir(deb_dir)
|
||||
if not check:
|
||||
message = "%(deb_dir)s is not a directory" % vars()
|
||||
raise self.__bug_handler , message
|
||||
Copy.__dir = deb_dir
|
||||
def remove_files(self ):
|
||||
"""Remove files from directory"""
|
||||
self.__remove_the_files(Copy.__dir)
|
||||
"""
|
||||
list_of_files = os.listdir(Copy.__dir)
|
||||
list_of_files = os.listdir(the_dir)
|
||||
for file in list_of_files:
|
||||
rem_file = os.path.join(Copy.__dir,file)
|
||||
if os.path.isdir(rem_file):
|
||||
self.remove_files(rem_file)
|
||||
else:
|
||||
os.remove(rem_file)
|
||||
"""
|
||||
def __remove_the_files(self, the_dir):
|
||||
"""Remove files from directory"""
|
||||
list_of_files = os.listdir(the_dir)
|
||||
for file in list_of_files:
|
||||
rem_file = os.path.join(Copy.__dir,file)
|
||||
if os.path.isdir(rem_file):
|
||||
self.__remove_the_files(rem_file)
|
||||
else:
|
||||
try:
|
||||
os.remove(rem_file)
|
||||
except OSError:
|
||||
pass
|
||||
def copy_file(self, file, new_file):
|
||||
"""
|
||||
Copy the file to a new name
|
||||
If the platform is linux, use the faster linux command
|
||||
of cp. Otherwise, use a safe python method.
|
||||
"""
|
||||
write_file = os.path.join(Copy.__dir,new_file)
|
||||
platform = sys.platform
|
||||
if platform[:5] == 'linux':
|
||||
command = 'cp %(file)s %(write_file)s' % vars()
|
||||
os.system(command)
|
||||
else:
|
||||
read_obj = open(file,'r')
|
||||
write_obj = open(write_file, 'w')
|
||||
line = "dummy"
|
||||
while line:
|
||||
line = read_obj.read(1000)
|
||||
write_obj.write(line )
|
||||
read_obj.close()
|
||||
write_obj.close()
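An equivalent, portable sketch (not the author's code) that does the same job with the standard library instead of shelling out to cp:
import os, shutil

def copy_file_portable(src, debug_dir, new_name):
    # Same effect as Copy.copy_file above, on every platform.
    shutil.copyfile(src, os.path.join(debug_dir, new_name))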
|
||||
def rename(self, source, dest):
|
||||
read_obj = open(source, 'r')
|
||||
write_obj = open(dest, 'w')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
write_obj.write(line)
|
||||
read_obj.close()
|
||||
write_obj.close()
|
94
src/libprs500/ebooks/rtf2xml/correct_unicode.py
Executable file
@ -0,0 +1,94 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, re, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class CorrectUnicode:
|
||||
"""
|
||||
Corrects sequences such as \u201c\'F0\'BE,
where \'F0\'BE has to be eliminated.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
exception_handler,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__exception_handler = exception_handler
|
||||
self.__bug_handler = bug_handler
|
||||
self.__state = 'outside'
|
||||
self.__utf_exp = re.compile(r'&#x(.*?);')
|
||||
def __process_token(self, line):
|
||||
if self.__state == 'outside':
|
||||
if line[:5] == 'tx<ut':
|
||||
self.__handle_unicode(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
elif self.__state == 'after':
|
||||
if line[:5] == 'tx<hx':
|
||||
pass
|
||||
elif line[:5] == 'tx<ut':
|
||||
self.__handle_unicode(line)
|
||||
else:
|
||||
self.__state = 'outside'
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
raise self.__bug_handler, 'shouldn\'t happen'
|
||||
def __handle_unicode(self, line):
|
||||
token = line[16:]
|
||||
match_obj = re.search(self.__utf_exp, token)
|
||||
if match_obj:
|
||||
uni_char = match_obj.group(1)
|
||||
dec_num = int(uni_char, 16)
|
||||
if dec_num > 57343 and dec_num < 63743:
|
||||
self.__state = 'outside'
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
self.__state = 'after'
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
self.__state = 'outside'
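A minimal stand-alone sketch of the decision made above: the numeric test 57343 < n < 63743 is (roughly) Word's private-use range, for which the &#x...; character is dropped and the \'hh hex fallback is kept instead.
import re

UTF_REF = re.compile(r'&#x(.*?);')      # same pattern as self.__utf_exp above

def keep_unicode_token(token):
    # True  -> write the &#x...; character and skip the \'hh fallback.
    # False -> private-use character: drop it and let the fallback through.
    m = UTF_REF.search(token)
    if not m:
        return True
    return not (57343 < int(m.group(1), 16) < 63743)

# keep_unicode_token('&#x201C;')  ->  True   (left curly quote)
# keep_unicode_token('&#xF0BE;')  ->  False  (private-use area)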
|
||||
def correct_unicode(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing (changes the original file)
|
||||
Logic:
|
||||
Read one line in at a time.
|
||||
"""
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
self.__process_token(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "correct_unicode.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
61
src/libprs500/ebooks/rtf2xml/default_encoding.py
Executable file
@ -0,0 +1,61 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
class DefaultEncoding:
|
||||
"""
|
||||
Find the default encoding for the doc
|
||||
"""
|
||||
def __init__(self, in_file, bug_handler, run_level = 1,):
|
||||
"""
|
||||
Required:
|
||||
'file'
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
def find_default_encoding(self):
|
||||
platform = 'Windows'
|
||||
default_num = 'not-defined'
|
||||
code_page = 'ansicpg1252'
|
||||
read_obj = open(self.__file, 'r')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'mi<mk<rtfhed-end':
|
||||
break
|
||||
if self.__token_info == 'cw<ri<ansi-codpg':
|
||||
#cw<ri<ansi-codpg<nu<10000
|
||||
num = line[20:-1]
|
||||
if not num:
|
||||
num = '1252'
|
||||
code_page = 'ansicpg' + num
|
||||
if self.__token_info == 'cw<ri<macintosh_':
|
||||
platform = 'Macintosh'
|
||||
if self.__token_info == 'cw<ri<deflt-font':
|
||||
default_num = line[20:-1]
|
||||
#cw<ri<deflt-font<nu<0
|
||||
#action = self.__state_dict.get(self.__state)
|
||||
#if action == None:
|
||||
#print self.__state
|
||||
#action(line)
|
||||
read_obj.close()
|
||||
if platform == 'Macintosh':
|
||||
code_page = 'mac_roman'
|
||||
return platform, code_page, default_num
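A hypothetical call site for the class above; the token-file name and the bug handler are assumptions.
# Hypothetical usage of DefaultEncoding.
enc = DefaultEncoding(in_file='index.tokens', bug_handler=Exception)
platform, code_page, default_font_num = enc.find_default_encoding()
# e.g. ('Windows', 'ansicpg1252', '0') for a file declaring \ansicpg1252 and \deff0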
|
219
src/libprs500/ebooks/rtf2xml/delete_info.py
Executable file
@ -0,0 +1,219 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class DeleteInfo:
"""Delete unnecessary destination groups"""
|
||||
def __init__(self,
|
||||
in_file ,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__bracket_count=0
|
||||
self.__ob_count = 0
|
||||
self.__cb_count = 0
|
||||
self.__after_asterisk = 0
|
||||
self.__delete = 0
|
||||
self.__initiate_allow()
|
||||
self.__ob = 0
|
||||
self.__write_cb = 0
|
||||
self.__run_level = run_level
|
||||
self.__found_delete = 0
|
||||
self.__list = 0
|
||||
def __initiate_allow(self):
|
||||
"""
|
||||
Initiate a list of destination groups which should be printed out.
|
||||
"""
|
||||
self.__allowable = ('cw<ss<char-style',
|
||||
'cw<it<listtable_',
|
||||
'cw<it<revi-table',
|
||||
'cw<ls<list-lev-d',
|
||||
'cw<fd<field-inst',
|
||||
'cw<an<book-mk-st',
|
||||
'cw<an<book-mk-en',
|
||||
'cw<an<annotation',
|
||||
'cw<cm<comment___',
|
||||
'cw<it<lovr-table',
|
||||
# 'cw<ls<list______',
|
||||
)
|
||||
self.__not_allowable = (
|
||||
'cw<un<unknown___',
|
||||
'cw<un<company___',
|
||||
'cw<ls<list-level',
|
||||
'cw<fd<datafield_',
|
||||
)
|
||||
self.__state = 'default'
|
||||
self.__state_dict = {
|
||||
'default' : self.__default_func,
|
||||
'after_asterisk' : self.__asterisk_func,
|
||||
'delete' : self.__delete_func,
|
||||
'list' : self.__list_func,
|
||||
}
|
||||
def __default_func(self,line):
|
||||
"""Handle lines when in no special state. Look for an asterisk to
|
||||
begin a special state. Otherwise, print out line."""
|
||||
##cw<ml<asterisk__<nu<true
|
||||
if self.__token_info == 'cw<ml<asterisk__':
|
||||
self.__state = 'after_asterisk'
|
||||
self.__delete_count = self.__ob_count
|
||||
elif self.__token_info == 'ob<nu<open-brack':
|
||||
# write previous bracket, if exists
|
||||
if self.__ob:
|
||||
self.__write_obj.write(self.__ob)
|
||||
self.__ob = line
|
||||
return 0
|
||||
else:
|
||||
# write previous bracket, since didn't find asterisk
|
||||
if self.__ob:
|
||||
self.__write_obj.write(self.__ob)
|
||||
self.__ob = 0
|
||||
return 1
|
||||
def __delete_func(self,line):
|
||||
"""Handle lines when in delete state. Don't print out lines
|
||||
unless the state has ended."""
|
||||
if self.__delete_count == self.__cb_count:
|
||||
self.__state = 'default'
|
||||
if self.__write_cb:
|
||||
self.__write_cb = 0
|
||||
return 1
|
||||
return 0
|
||||
def __asterisk_func(self,line):
|
||||
"""
|
||||
Determine whether to delete info in group
|
||||
Note on self.__cb flag.
|
||||
If you find that you are in a delete group, and the previous
token is not an open bracket (self.__ob = 0), that means
that the delete group is nested inside another acceptable
destination group. In this case, you have already written
|
||||
the open bracket, so you will need to write the closed one
|
||||
as well.
|
||||
"""
|
||||
# Test for {\*}, in which case don't enter
|
||||
# delete state
|
||||
self.__after_asterisk = 0 # only enter this function once
|
||||
self.__found_delete = 1
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
if self.__delete_count == self.__cb_count:
|
||||
self.__state = 'default'
|
||||
self.__ob = 0
|
||||
# changed this because haven't printed out start
|
||||
return 0
|
||||
else:
|
||||
# not sure what happens here!
|
||||
# believe I have a '{\*}
|
||||
if self.__run_level > 3:
|
||||
msg = 'flag problem\n'
|
||||
raise self.__bug_handler, msg
|
||||
return 1
|
||||
elif self.__token_info in self.__allowable :
|
||||
if self.__ob:
|
||||
self.__write_obj.write(self.__ob)
|
||||
self.__ob = 0
|
||||
self.__state = 'default'
|
||||
else:
|
||||
pass
|
||||
return 1
|
||||
elif self.__token_info == 'cw<ls<list______':
|
||||
self.__ob = 0
|
||||
self.__found_list_func(line)
|
||||
elif self.__token_info in self.__not_allowable:
|
||||
if not self.__ob:
|
||||
self.__write_cb = 1
|
||||
self.__ob = 0
|
||||
self.__state = 'delete'
|
||||
self.__cb_count = 0
|
||||
return 0
|
||||
else:
|
||||
if self.__run_level > 5:
|
||||
msg = 'After an asterisk, and found neither an allowable nor a non-allowable token\n'
|
||||
msg += 'token is "%s"\n' % self.__token_info
|
||||
raise self.__bug_handler, msg
|
||||
if not self.__ob:
|
||||
self.__write_cb = 1
|
||||
self.__ob = 0
|
||||
self.__state = 'delete'
|
||||
self.__cb_count = 0
|
||||
return 0
|
||||
def __found_list_func(self, line):
|
||||
"""
|
||||
print out control words in this group
|
||||
"""
|
||||
self.__state = 'list'
|
||||
def __list_func(self, line):
|
||||
"""
|
||||
Check to see if the group has ended.
|
||||
Return 1 for all control words.
|
||||
Return 0 otherwise.
|
||||
"""
|
||||
if self.__delete_count == self.__cb_count and self.__token_info ==\
|
||||
'cb<nu<clos-brack':
|
||||
self.__state = 'default'
|
||||
if self.__write_cb:
|
||||
self.__write_cb = 0
|
||||
return 1
|
||||
return 0
|
||||
elif line[0:2] == 'cw':
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
def delete_info(self):
|
||||
"""Main method for handling other methods. Read one line in at
|
||||
a time, and determine whether to print the line based on the state."""
|
||||
line_to_read = 'dummy'
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
while line_to_read:
|
||||
#ob<nu<open-brack<0001
|
||||
to_print =1
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if not action:
|
||||
sys.stderr.write('No action in dictionary state is "%s" \n'
|
||||
% self.__state)
|
||||
to_print = action(line)
|
||||
"""
|
||||
if self.__after_asterisk:
|
||||
to_print = self.__asterisk_func(line)
|
||||
elif self.__list:
|
||||
self.__in_list_func(line)
|
||||
elif self.__delete:
|
||||
to_print = self.__delete_func(line)
|
||||
else:
|
||||
to_print = self.__default_func(line)
|
||||
"""
|
||||
if to_print:
|
||||
self.__write_obj.write(line)
|
||||
self.__write_obj.close()
|
||||
read_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "delete_info.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
return self.__found_delete
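A toy illustration (not from the diff) of the bracket bookkeeping used above: open and close bracket tokens carry a four-digit count, and a deleted group ends when a close bracket arrives with the count recorded at the asterisk.
def group_finished(delete_count, token):
    # delete_count is the 4-digit id captured when the asterisk was seen,
    # e.g. '0002'; tokens look like 'cb<nu<clos-brack<0002'.
    # (The real code slices [-5:-1] because its lines still end in '\n'.)
    return token[:16] == 'cb<nu<clos-brack' and token[-4:] == delete_count

# group_finished('0002', 'cb<nu<clos-brack<0002')  ->  True
# group_finished('0002', 'cb<nu<clos-brack<0003')  ->  False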
|
795
src/libprs500/ebooks/rtf2xml/field_strings.py
Executable file
@ -0,0 +1,795 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, re
|
||||
class FieldStrings:
|
||||
"""
|
||||
This module is given a string. It processes the field instruction string and
|
||||
returns a list of three values.
|
||||
"""
|
||||
def __init__(self, bug_handler, run_level = 1):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__run_level = run_level
|
||||
self.__bug_handler = bug_handler
|
||||
self.__initiate_values()
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing.
|
||||
Returns:
|
||||
nothing.
|
||||
Logic:
|
||||
initiate values for rest of class.
|
||||
self.__field_instruction_dict:
|
||||
The dictionary for all field names.
|
||||
"""
|
||||
self.__field_instruction_dict = {
|
||||
# number type (arabic, etc.) and number format (\# " ")
|
||||
'EDITTIME' : (self.__num_type_and_format_func, 'editing-time'),
|
||||
'NUMCHARS' : (self.__num_type_and_format_func, 'number-of-characters-in-doc'),
|
||||
'NUMPAGES' : (self.__num_type_and_format_func, 'number-of-pages-in-doc'),
|
||||
'NUMWORDS' : (self.__num_type_and_format_func, 'number-of-words-in-doc'),
|
||||
'REVNUM' : (self.__num_type_and_format_func, 'revision-number'),
|
||||
'SECTIONPAGES' : (self.__num_type_and_format_func, 'num-of-pages-in-section'),
|
||||
'SECTION' : (self.__num_type_and_format_func, 'insert-section-number'),
|
||||
'QUOTE' : (self.__num_type_and_format_func, 'quote'),
|
||||
# number formatting (\# "")
|
||||
'PAGE' : (self.__default_inst_func, 'insert-page-number'),
|
||||
'page' : (self.__default_inst_func, 'insert-page-number'),
|
||||
# date format (\@ "")
|
||||
'CREATEDATE' : (self.__date_func, 'insert-date'),
|
||||
'PRINTDATE' : (self.__date_func, 'insert-date'),
|
||||
# PRINTDATE?
|
||||
'SAVEDATE' : (self.__date_func, 'last-saved'),
|
||||
'TIME' : (self.__date_func, 'insert-time'),
|
||||
# numbers?
|
||||
# these fields take four switches
|
||||
'AUTHOR' : (self.__simple_info_func, 'user-name'),
|
||||
'COMMENTS' : (self.__simple_info_func, 'comments'),
|
||||
'FILENAME' : (self.__simple_info_func, 'file-name'),
|
||||
'filename' : (self.__simple_info_func, 'file-name'),
|
||||
'KEYWORDS' : (self.__simple_info_func, 'keywords'),
|
||||
'LASTSAVEDBY' : (self.__simple_info_func, 'last-saved-by'),
|
||||
'SUBJECT' : (self.__simple_info_func, 'subject'),
|
||||
'TEMPLATE' : (self.__simple_info_func, 'based-on-template'),
|
||||
'TITLE' : (self.__simple_info_func, 'document-title'),
|
||||
'USERADDRESS' : (self.__simple_info_func, 'user-address'),
|
||||
'USERINITIALS' : (self.__simple_info_func, 'user-initials'),
|
||||
'USERNAME' : (self.__simple_info_func, 'user-name'),
|
||||
'EQ' : (self.__equation_func, 'equation'),
|
||||
'HYPERLINK' : (self.__hyperlink_func, 'hyperlink'),
|
||||
'INCLUDEPICTURE': (self.__include_pict_func, 'include-picture'),
|
||||
'INCLUDETEXT' : (self.__include_text_func, 'include-text-from-file'),
|
||||
'INDEX' : (self.__index_func, 'index'),
|
||||
'NOTEREF' : (self.__note_ref_func, 'reference-to-note'),
|
||||
'PAGEREF' : (self.__page_ref_func, 'reference-to-page'),
|
||||
'REF' : (self.__ref_func, 'reference'),
|
||||
'ref' : (self.__ref_func, 'reference'),
|
||||
'SEQ' : (self.__sequence_func, 'numbering-sequence'),
|
||||
'SYMBOL' : (self.__symbol_func, 'symbol'),
|
||||
'TA' : (self.__ta_func, 'anchor-for-table-of-authorities'),
|
||||
'TOA' : (self.__toc_table_func, 'table-of-authorities'),
|
||||
'TOC' : (self.__toc_table_func, 'table-of-contents'),
|
||||
# no switches
|
||||
'AUTONUMOUT' : (self.__no_switch_func, 'auto-num-out?'),
|
||||
'COMPARE' : (self.__no_switch_func, 'compare'),
|
||||
'DOCVARIABLE' : (self.__no_switch_func, 'document-variable'),
|
||||
'GOTOBUTTON' : (self.__no_switch_func, 'go-button'),
|
||||
'NEXT' : (self.__no_switch_func, 'next'),
|
||||
'NEXTIF' : (self.__no_switch_func, 'next-if'),
|
||||
'SKIPIF' : (self.__no_switch_func, 'skip-if'),
|
||||
'IF' : (self.__no_switch_func, 'if'),
|
||||
'MERGEFIELD' : (self.__no_switch_func, 'merge-field'),
|
||||
'MERGEREC' : (self.__no_switch_func, 'merge-record'),
|
||||
'MERGESEQ' : (self.__no_switch_func, 'merge-sequence'),
|
||||
'PLACEHOLDER' : (self.__no_switch_func, 'place-holder'),
|
||||
'PRIVATE' : (self.__no_switch_func, 'private'),
|
||||
'RD' : (self.__no_switch_func, 'referenced-document'),
|
||||
'SET' : (self.__no_switch_func, 'set'),
|
||||
# default instructions (haven't written a method for them)
|
||||
'ADVANCE' : (self.__default_inst_func, 'advance'),
|
||||
'ASK' : (self.__default_inst_func, 'prompt-user'),
|
||||
'AUTONUMLGL' : (self.__default_inst_func, 'automatic-number'),
|
||||
'AUTONUM' : (self.__default_inst_func, 'automatic-number'),
|
||||
'AUTOTEXTLIST' : (self.__default_inst_func, 'auto-list-text'),
|
||||
'AUTOTEXT' : (self.__default_inst_func, 'auto-text'),
|
||||
'BARCODE' : (self.__default_inst_func, 'barcode'),
|
||||
'CONTACT' : (self.__default_inst_func, 'contact'),
|
||||
'DATABASE' : (self.__default_inst_func, 'database'),
|
||||
'DATE' : (self.__default_inst_func, 'date'),
|
||||
'date' : (self.__default_inst_func, 'date'),
|
||||
'DOCPROPERTY' : (self.__default_inst_func, 'document-property'),
|
||||
'FILESIZE' : (self.__default_inst_func, 'file-size'),
|
||||
'FILLIN' : (self.__default_inst_func, 'fill-in'),
|
||||
'INFO' : (self.__default_inst_func, 'document-info'),
|
||||
'LINK' : (self.__default_inst_func, 'link'),
|
||||
'PA' : (self.__default_inst_func, 'page'),
|
||||
'PRINT' : (self.__default_inst_func, 'print'),
|
||||
'STYLEREF' : (self.__default_inst_func, 'style-reference'),
|
||||
'USERPROPERTY' : (self.__default_inst_func, 'user-property'),
|
||||
'FORMCHECKBOX' : (self.__default_inst_func, 'form-checkbox'),
|
||||
'FORMTEXT' : (self.__default_inst_func, 'form-text'),
|
||||
# buttons
|
||||
'MACROBUTTON' : (self.__default_inst_func, 'macro-button'),
|
||||
}
|
||||
self.__number_dict = {
|
||||
'Arabic' : 'arabic',
|
||||
'alphabetic' : 'alphabetic',
|
||||
'ALPHABETIC' : 'capital-alphabetic',
|
||||
'roman' : 'roman',
|
||||
'ROMAN' : 'capital-roman',
|
||||
'Ordinal' : 'ordinal',
|
||||
'CardText' : 'cardinal-text',
|
||||
'OrdText' : 'ordinal-text',
|
||||
'Hex' : 'hexidecimal',
|
||||
'DollarText' : 'dollar-text',
|
||||
'Upper' : 'upper-case',
|
||||
'Lower' : 'lower-case',
|
||||
'FirstCap' : 'first-cap',
|
||||
'Caps' : 'caps',
|
||||
}
|
||||
self.__text_format_dict = {
|
||||
'Upper' : 'upper',
|
||||
'Lower' : 'lower',
|
||||
'FirstCap' : 'first-cap',
|
||||
'Caps' : 'caps',
|
||||
}
|
||||
self.__symbol_num_exp = re.compile(r'SYMBOL (.*?) ')
|
||||
self.__symbol_font_exp = re.compile(r'\\f "(.*?)"')
|
||||
self.__symbol_size_exp = re.compile(r'\\s (\d+)')
|
||||
##self.__toc_figure_exp = re.compile(r'\\c "Figure"')
|
||||
# \\@ "dddd, MMMM d, yyyy"
|
||||
self.__date_exp = re.compile(r'\\@\s{1,}"(.*?)"')
|
||||
self.__num_type_exp = re.compile(r'\\\*\s{1,}(Arabic|alphabetic|ALPHABETIC|roman|ROMAN|Ordinal|CardText|OrdText|Hex|DollarText|Upper|Lower|FirstCap|Caps)')
|
||||
self.__format_text_exp = re.compile(r'\\\*\s{1,}(Upper|Lower|FirstCap|Caps)')
|
||||
self.__merge_format_exp = re.compile(r'\\\*\s{1,}MERGEFORMAT')
|
||||
self.__ta_short_field_exp = re.compile(r'\\s\s{1,}"(.*?)"')
|
||||
self.__ta_long_field_exp = re.compile(r'\\l\s{1,}"(.*?)"')
|
||||
self.__ta_category_exp = re.compile(r'\\c\s{1,}(\d+)')
|
||||
# indices
|
||||
self.__index_insert_blank_line_exp = re.compile(r'\\h\s{1,}""')
|
||||
self.__index_insert_letter_exp = re.compile(r'\\h\s{1,}"()"')
|
||||
self.__index_columns_exp = re.compile(r'\\c\s{1,}"(.*?)"')
|
||||
self.__bookmark_exp = re.compile(r'\\b\s{1,}(.*?)\s')
|
||||
self.__d_separator = re.compile(r'\\d\s{1,}(.*?)\s')
|
||||
self.__e_separator = re.compile(r'\\e\s{1,}(.*?)\s')
|
||||
self.__l_separator = re.compile(r'\\l\s{1,}(.*?)\s')
|
||||
self.__p_separator = re.compile(r'\\p\s{1,}(.*?)\s')
|
||||
self.__index_sequence = re.compile(r'\\s\s{1,}(.*?)\s')
|
||||
self.__index_entry_typ_exp = re.compile(r'\\f\s{1,}"(.*?)"')
|
||||
self.__quote_exp = re.compile(r'"(.*?)"')
|
||||
self.__filter_switch = re.compile(r'\\c\s{1,}(.*?)\s')
|
||||
self.__link_switch = re.compile(r'\\l\s{1,}(.*?)\s')
|
||||
def process_string(self, my_string, type):
|
||||
"""
|
||||
Requires:
|
||||
my_string --the string to parse.
|
||||
type -- the type of string.
|
||||
Returns:
|
||||
Returns a string for a field instruction attribute.
|
||||
Logic:
|
||||
This handles all "large" fields, which means everything except
|
||||
toc entries, index entries, and bookmarks
|
||||
Split the string by spaces, and get the first item in the
|
||||
resulting list. This item is the field's type. Check for the
|
||||
action in the field instructions dictionary for further parsing.
|
||||
If no action is found, print out an error message.
|
||||
"""
|
||||
changed_string = ''
|
||||
lines = my_string.split('\n')
|
||||
for line in lines:
|
||||
if line[0:2] == 'tx':
|
||||
changed_string += line[17:]
|
||||
fields = changed_string.split()
|
||||
field_name = fields[0]
|
||||
action, name = self.__field_instruction_dict.get(field_name, (None, None))
|
||||
match_obj = re.search(self.__merge_format_exp, changed_string)
|
||||
if match_obj and name:
|
||||
name += '<update>dynamic'
|
||||
elif name:
|
||||
name += '<update>static'
|
||||
else:
|
||||
pass
|
||||
# no name--not in list above
|
||||
if action:
|
||||
the_list = action(field_name, name, changed_string)
|
||||
else:
|
||||
# change -1 to 0--for now, I want users to report bugs
|
||||
msg = 'no key for "%s" "%s"\n' % (field_name, changed_string)
|
||||
sys.stderr.write(msg)
|
||||
if self.__run_level > 3:
|
||||
msg = 'no key for "%s" "%s"\n' % (field_name, changed_string)
|
||||
raise self.__bug_handler, msg
|
||||
the_list = self.__fall_back_func(field_name, line)
|
||||
return the_list
|
||||
return the_list
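A hypothetical round trip through process_string above; the token line is an assumed example, and the second argument is not used by the code shown in this diff.
# Hypothetical usage of FieldStrings.process_string.
fs = FieldStrings(bug_handler=Exception)
tokens = 'tx<nu<__________<PAGE \\* MERGEFORMAT'
inner, para, tag_info = fs.process_string(tokens, 'large-field')
# tag_info  ->  'insert-page-number<update>dynamic'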
|
||||
def __default_inst_func(self, field_name, name, line):
|
||||
"""
|
||||
Requires:
|
||||
field_name -- the first word in the string
|
||||
name -- the changed name according to the dictionary
|
||||
line -- the string to be parsed
|
||||
Returns:
|
||||
The name of the field.
|
||||
Logic:
|
||||
I only need the changed name for the field.
|
||||
"""
|
||||
return [None, None, name]
|
||||
def __fall_back_func(self, field_name, line):
|
||||
"""
|
||||
Requires:
|
||||
field_name -- the first word in the string
|
||||
name -- the changed name according to the dictionary
|
||||
line -- the string to be parsed
|
||||
Returns:
|
||||
The name of the field.
|
||||
Logic:
|
||||
Used for fields not found in dict
|
||||
"""
|
||||
the_string = field_name
|
||||
the_string += '<update>none'
|
||||
return [None, None, the_string]
|
||||
def __equation_func(self, field_name, name, line):
|
||||
"""
|
||||
Required:
field_name -- the first word in the string
name -- the changed name according to the dictionary
line -- the string to be parsed
Returns:
|
||||
The name of the field
|
||||
Logic:
|
||||
"""
|
||||
return [None, None, name]
|
||||
def __no_switch_func(self, field_name, name, line):
|
||||
"""
|
||||
Required:
|
||||
field_name -- the first word in the string
name -- the changed name according to the dictionary
line -- the string to be parsed
Returns:
|
||||
The name of the field
|
||||
Logic:
|
||||
"""
|
||||
return [None, None, name]
|
||||
def __num_type_and_format_func(self, field_name, name, line):
|
||||
"""
|
||||
Required:
|
||||
field_name -- the first word in the string
|
||||
name --the changed name according to the dictionary
|
||||
line -- the string to be parsed
|
||||
Returns:
|
||||
list of None, None, and part of a tag
|
||||
Logic:
|
||||
parse num_type
|
||||
parse num_format
|
||||
"""
|
||||
the_string = name
|
||||
num_format = self.__parse_num_format(line)
|
||||
if num_format:
|
||||
the_string += '<number-format>%s' % num_format
|
||||
num_type = self.__parse_num_type(line)
|
||||
if num_type:
|
||||
the_string += '<number-type>%s' % num_type
|
||||
# Only QUOTE takes a (mandatory?) argument
|
||||
if field_name == 'QUOTE':
|
||||
match_group = re.search(r'QUOTE\s{1,}"(.*?)"', line)
|
||||
if match_group:
|
||||
arg = match_group.group(1)
|
||||
the_string += '<argument>%s' % arg
|
||||
return [None, None, the_string]
|
||||
def __num_format_func(self, field_name, name, line):
|
||||
"""
|
||||
Required:
|
||||
field_name -- the first word in the string
|
||||
name --the changed name according to the dictionary
|
||||
line -- the string to be parsed
|
||||
Returns:
|
||||
list of None, None, and part of a tag
|
||||
Logic:
|
||||
"""
|
||||
the_string = name
|
||||
num_format = self.__parse_num_format(line)
|
||||
if num_format:
|
||||
the_string += '<number-format>%s' % num_format
|
||||
return [None, None, the_string]
|
||||
def __parse_num_format(self, the_string):
|
||||
"""
|
||||
Required:
|
||||
the_string -- the string to parse
|
||||
Returns:
|
||||
a string if the_string contains number formatting information
|
||||
None, otherwise
|
||||
Logic:
|
||||
"""
|
||||
match_group = re.search(self.__date_exp, the_string)
|
||||
if match_group:
|
||||
return match_group.group(1)
|
||||
def __parse_num_type(self, the_string):
|
||||
"""
|
||||
Required:
|
||||
the_string -- the string to parse
|
||||
Returns:
|
||||
a string if the_string contains number type information
|
||||
None, otherwise
|
||||
Logic:
|
||||
the_string might look like:
|
||||
USERNAME \\* Arabic \\* MERGEFORMAT
|
||||
Get the \\* Arabic part. Use a dictionary to convert "Arabic" to
a more-readable word for the value of the key "number-type"
(e.g. <field number-type="arabic">).
|
||||
"""
|
||||
match_group = re.search(self.__num_type_exp, the_string)
|
||||
if match_group:
|
||||
name = match_group.group(1)
|
||||
changed_name = self.__number_dict.get(name)
|
||||
if changed_name:
|
||||
return changed_name
|
||||
else:
|
||||
sys.stderr.write('module is fields_string\n')
|
||||
sys.stderr.write('method is __parse_num_type\n')
|
||||
sys.stderr.write('no dictionary entry for %s\n' % name)
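A stand-alone sketch of the lookup the docstring above describes; the regex matches the source, while the dictionary below is only a subset of self.__number_dict.
import re

NUM_TYPE = re.compile(r'\\\*\s{1,}(Arabic|alphabetic|ALPHABETIC|roman|ROMAN|Ordinal|'
                      r'CardText|OrdText|Hex|DollarText|Upper|Lower|FirstCap|Caps)')
NUMBER_NAMES = {'Arabic': 'arabic', 'ROMAN': 'capital-roman', 'Ordinal': 'ordinal'}  # subset

def number_type(instruction):
    m = NUM_TYPE.search(instruction)
    return NUMBER_NAMES.get(m.group(1)) if m else None

# number_type(r'USERNAME \* Arabic \* MERGEFORMAT')  ->  'arabic'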
|
||||
def __date_func(self, field_name, name, line):
|
||||
"""
|
||||
Required:
|
||||
field_name -- the first word in the string
name -- the changed name according to the dictionary
line -- the string to be parsed
|
||||
Returns:
|
||||
list of None, None, and part of a tag
|
||||
Logic:
|
||||
"""
|
||||
the_string = name
|
||||
match_group = re.search(self.__date_exp, line)
|
||||
if match_group:
|
||||
the_string += '<date-format>%s' % match_group.group(1)
|
||||
return [None, None, the_string]
|
||||
def __simple_info_func(self, field_name, name, line):
|
||||
"""
|
||||
Required:
field_name -- the first word in the string
name -- the changed name according to the dictionary
line -- the string to be parsed
Returns:
|
||||
The name of the field
|
||||
Logic:
|
||||
These fields can only have the following switches:
|
||||
1. Upper
|
||||
2. Lower
|
||||
3. FirstCap
|
||||
4. Caps
|
||||
"""
|
||||
the_string = name
|
||||
match_group = re.search(self.__format_text_exp, line)
|
||||
if match_group:
|
||||
name = match_group.group(1)
|
||||
changed_name = self.__text_format_dict.get(name)
|
||||
if changed_name:
|
||||
the_string += '<format>%s' % changed_name
|
||||
else:
|
||||
sys.stderr.write('module is fields_string\n')
|
||||
sys.stderr.write('method is __simple_info_func\n')
|
||||
sys.stderr.write('no dictionary entry for %s\n' % name)
|
||||
return [None, None, the_string]
|
||||
def __hyperlink_func(self, field_name, name, line):
|
||||
"""
|
||||
Required:
field_name -- the first word in the string
name -- the changed name according to the dictionary
line -- the string to be parsed
Returns:
|
||||
The name of the field
|
||||
Logic:
|
||||
self.__link_switch = re.compile(r'\\l\s{1,}(.*?)\s')
|
||||
"""
|
||||
self.__link_switch = re.compile(r'\\l\s{1,}(.*?)\s')
|
||||
the_string = name
|
||||
match_group = re.search(self.__link_switch, line)
|
||||
if match_group:
|
||||
link = match_group.group(1)
|
||||
link = link.replace('"', "&quot;")
|
||||
the_string += '<link>%s' % link
|
||||
# \l "txt" "link"
|
||||
# want "file name" so must get rid of \c "txt"
|
||||
line = re.sub(self.__link_switch, '', line)
|
||||
match_group = re.search(self.__quote_exp, line)
|
||||
if match_group:
|
||||
arg = match_group.group(1)
|
||||
the_string += '<argument>%s' % arg
|
||||
else:
|
||||
pass
|
||||
index = line.find('\\m')
|
||||
if index > -1:
|
||||
the_string += '<html2-image-map>true'
|
||||
index = line.find('\\n')
|
||||
if index > -1:
|
||||
the_string += '<new-window>true'
|
||||
index = line.find('\\h')
|
||||
if index > -1:
|
||||
the_string += '<no-history>true'
|
||||
return [None, None, the_string]
|
||||
def __include_text_func(self, field_name, name, line):
|
||||
"""
|
||||
Required:
field_name -- the first word in the string
name -- the changed name according to the dictionary
line -- the string to be parsed
Returns:
|
||||
The name of the field
|
||||
Logic:
|
||||
"""
|
||||
the_string = name
|
||||
match_group = re.search(self.__format_text_exp, line)
|
||||
if match_group:
|
||||
name = match_group.group(1)
|
||||
changed_name = self.__text_format_dict.get(name)
|
||||
if changed_name:
|
||||
the_string += '<format>%s' % changed_name
|
||||
else:
|
||||
sys.stderr.write('module is fields_string\n')
|
||||
sys.stderr.write('method is __include_text_func\n')
|
||||
sys.stderr.write('no dictionary entry for %s\n' % name)
|
||||
match_group = re.search(self.__filter_switch, line)
|
||||
if match_group:
|
||||
arg = match_group.group(1)
|
||||
the_string += '<filter>%s' % arg
|
||||
# \c "txt" "file name"
|
||||
# want "file name" so must get rid of \c "txt"
|
||||
line = re.sub(self.__filter_switch, '', line)
|
||||
match_group = re.search(self.__quote_exp, line)
|
||||
if match_group:
|
||||
arg = match_group.group(1)
|
||||
arg = arg.replace('"', "&quot;")
|
||||
the_string += '<argument>%s' % arg
|
||||
else:
|
||||
sys.stderr.write('Module is field_strings\n')
|
||||
sys.stderr.write('method is include_text_func\n')
|
||||
sys.stderr.write('no argument for include text\n')
|
||||
index = line.find('\\!')
|
||||
if index > -1:
|
||||
the_string += '<no-field-update>true'
|
||||
return [None, None, the_string]
|
||||
def __include_pict_func(self, field_name, name, line):
|
||||
"""
|
||||
Required:
field_name -- the first word in the string
name -- the changed name according to the dictionary
line -- the string to be parsed
Returns:
|
||||
The name of the field
|
||||
Logic:
|
||||
"""
|
||||
the_string = name
|
||||
match_group = re.search(self.__filter_switch, line)
|
||||
if match_group:
|
||||
arg = match_group.group(1)
|
||||
arg = arg.replace('"', "&quot;")
|
||||
the_string += '<filter>%s' % arg
|
||||
# \c "txt" "file name"
|
||||
# want "file name" so must get rid of \c "txt"
|
||||
line = re.sub(self.__filter_switch, '', line)
|
||||
match_group = re.search(self.__quote_exp, line)
|
||||
if match_group:
|
||||
arg = match_group.group(1)
|
||||
the_string += '<argument>%s' % arg
|
||||
else:
|
||||
sys.stderr.write('Module is field_strings\n')
|
||||
sys.stderr.write('method is include_pict_func\n')
|
||||
sys.stderr.write('no argument for include pict\n')
|
||||
index = line.find('\\d')
|
||||
if index > -1:
|
||||
the_string += '<external>true'
|
||||
return [None, None, the_string]
|
||||
def __ref_func(self, field_name, name, line):
|
||||
"""
|
||||
Requires:
|
||||
field_name -- the first word in the string
|
||||
name -- the changed name according to the dictionary
|
||||
line -- the string to be parsed
|
||||
Returns:
|
||||
The name of the field.
|
||||
Logic:
|
||||
A page reference field looks like this:
|
||||
PAGEREF _Toc440880424 \\h
|
||||
I want to extract the second line of info, which is used as an
|
||||
anchor in the resulting XML file.
|
||||
"""
|
||||
the_string = name
|
||||
match_group = re.search(self.__format_text_exp, line)
|
||||
if match_group:
|
||||
name = match_group.group(1)
|
||||
changed_name = self.__text_format_dict.get(name)
|
||||
if changed_name:
|
||||
the_string += '<format>%s' % changed_name
|
||||
else:
|
||||
sys.stderr.write('module is fields_string\n')
|
||||
sys.stderr.write('method is __ref_func\n')
|
||||
sys.stderr.write('no dictionary entry for %s\n' % name)
|
||||
line = re.sub(self.__merge_format_exp, '', line)
|
||||
words = line.split()
|
||||
words = words[1:] # get rid of field name
|
||||
for word in words:
|
||||
if word[0:1] != '\\':
|
||||
the_string += '<bookmark>%s' % word
|
||||
index = line.find('\\f')
|
||||
if index > -1:
|
||||
the_string += '<include-note-number>true'
|
||||
index = line.find('\\h')
|
||||
if index > -1:
|
||||
the_string += '<hyperlink>true'
|
||||
index = line.find('\\n')
|
||||
if index > -1:
|
||||
the_string += '<insert-number>true'
|
||||
index = line.find('\\r')
|
||||
if index > -1:
|
||||
the_string += '<insert-number-relative>true'
|
||||
index = line.find('\\p')
|
||||
if index > -1:
|
||||
the_string += '<paragraph-relative-position>true'
|
||||
index = line.find('\\t')
|
||||
if index > -1:
|
||||
the_string += '<suppress-non-delimeter>true'
|
||||
index = line.find('\\w')
|
||||
if index > -1:
|
||||
the_string += '<insert-number-full>true'
|
||||
return [None, None, the_string]
|
||||
def __toc_table_func(self, field_name, name, line):
|
||||
"""
|
||||
Requires:
|
||||
field_name -- the name of the first word in the string
|
||||
name --the changed name, according to the dictionary.
|
||||
line --the string to be parsed.
|
||||
Returns:
|
||||
A string for a TOC table field.
|
||||
Logic:
|
||||
If the string contains Figure, it is a table of figures.
|
||||
Otherwise, it is a plain old table of contents.
|
||||
"""
|
||||
the_string = name
|
||||
index = line.find('\\c "Figure"')
|
||||
if index > -1:
|
||||
the_string = the_string.replace('table-of-contents', 'table-of-figures')
|
||||
# don't really need the first value in this list, I don't believe
|
||||
return [name, None, the_string]
|
||||
def __sequence_func(self, field_name, name, line):
|
||||
"""
|
||||
Requires:
|
||||
field_name --the name of the first word in the string.
|
||||
name --the changed name according to the dictionary.
|
||||
line -- the string to parse.
|
||||
Returns:
|
||||
A string with a value for the type and label attributes
|
||||
Logic:
|
||||
The type of sequence--whether figure, graph, my-name, or
|
||||
whatever--is represented by the second word in the string. Extract
|
||||
and return.
|
||||
SEQ Figure \\* ARABIC
|
||||
"""
|
||||
fields = line.split()
|
||||
label = fields[1]
|
||||
my_string = '%s<label>%s' % (name, label)
|
||||
return [None, None, my_string]
|
||||
def __ta_func(self, field_name, name, line):
|
||||
"""
|
||||
Requires:
|
||||
field_name --the name of the first word in the string.
|
||||
name --the changed name according to the dictionary.
|
||||
line -- the string to parse.
|
||||
Returns:
|
||||
A string with a value for the type and label attributes
|
||||
Logic:
|
||||
"""
|
||||
the_string = name
|
||||
match_group = re.search(self.__ta_short_field_exp, line)
|
||||
if match_group:
|
||||
short_name = match_group.group(1)
|
||||
the_string += '<short-field>%s' % short_name
|
||||
match_group = re.search(self.__ta_long_field_exp, line)
|
||||
if match_group:
|
||||
long_name = match_group.group(1)
|
||||
the_string += '<long-field>%s' % long_name
|
||||
match_group = re.search(self.__ta_category_exp, line)
|
||||
if match_group:
|
||||
category = match_group.group(1)
|
||||
the_string += '<category>%s' % category
|
||||
index = line.find('\\b')
|
||||
if index > -1:
|
||||
the_string += '<bold>true'
|
||||
index = line.find('\\i')
|
||||
if index > -1:
|
||||
the_string += '<italics>true'
|
||||
return [None, None, the_string]
|
||||
def __index_func(self, field_name, name, line):
|
||||
"""
|
||||
Requires:
|
||||
field_name --the name of the first word in the string.
|
||||
name --the changed name according to the dictionary.
|
||||
line -- the string to parse.
|
||||
Returns:
|
||||
A string with a value for the type and label attributes
|
||||
Logic:
|
||||
"""
|
||||
# self.__index_insert_blank_line_exp = re.compile(r'\\h\s{1,}""')
|
||||
# self.__index_insert_letter_exp = re.compile(r'\\h\s{1,}(".*?")')
|
||||
the_string = name
|
||||
match_group = re.search(self.__index_insert_blank_line_exp, line)
|
||||
if match_group:
|
||||
the_string += '<insert-blank-line>true'
|
||||
else:
|
||||
match_group = re.search(self.__index_insert_letter_exp, line)
|
||||
if match_group:
|
||||
insert_letter = match_group.group(1)
|
||||
the_string += '<insert-letter>%s' % insert_letter
|
||||
match_group = re.search(self.__index_columns_exp, line)
|
||||
if match_group:
|
||||
columns = match_group.group(1)
|
||||
the_string += '<number-of-columns>%s' % columns
|
||||
# self.__bookmark_exp = re.compile(r'\\b\s{1,}(.*?)\s')
|
||||
match_group = re.search(self.__bookmark_exp, line)
|
||||
if match_group:
|
||||
bookmark = match_group.group(1)
|
||||
the_string += '<use-bookmark>%s' % bookmark
|
||||
match_group = re.search(self.__d_separator, line)
|
||||
if match_group:
|
||||
separator = match_group.group(1)
|
||||
separator = separator.replace('"', '&quot;')
|
||||
the_string += '<sequence-separator>%s' % separator
|
||||
# self.__e_separator = re.compile(r'\\e\s{1,}(.*?)\s')
|
||||
match_group = re.search(self.__e_separator, line)
|
||||
if match_group:
|
||||
separator = match_group.group(1)
|
||||
separator = separator.replace('"', '&quot;')
|
||||
the_string += '<page-separator>%s' % separator
|
||||
# self.__index_sequence = re.compile(r'\\s\s{1,}(.*?)\s')
|
||||
match_group = re.search(self.__index_sequence, line)
|
||||
if match_group:
|
||||
sequence = match_group.group(1)
|
||||
sequence = sequence.replace('"', '&quot;')
|
||||
the_string += '<use-sequence>%s' % sequence
|
||||
# self.__index_entry_typ_exp = re.compile(r'\\f\s{1,}"(.*?)"')
|
||||
match_group = re.search(self.__index_entry_typ_exp, line)
|
||||
if match_group:
|
||||
entry_type = match_group.group(1)
|
||||
the_string += '<entry-type>%s' % entry_type
|
||||
# self.__p_separator = re.compile(r'\\p\s{1,}(.*?)\s')
|
||||
match_group = re.search(self.__p_separator, line)
|
||||
if match_group:
|
||||
limit = match_group.group(1)
|
||||
the_string += '<limit-to-letters>%s' % limit
|
||||
match_group = re.search(self.__l_separator, line)
|
||||
if match_group:
|
||||
separator = match_group.group(1)
|
||||
separator = separator.replace('"', '&quot;')
|
||||
the_string += '<multi-page-separator>%s' % separator
|
||||
index = line.find('\\a')
|
||||
if index > -1:
|
||||
the_string += '<accented>true'
|
||||
index = line.find('\\r')
|
||||
if index > -1:
|
||||
the_string += '<sub-entry-on-same-line>true'
|
||||
index = line.find('\\t')
|
||||
if index > -1:
|
||||
the_string += '<enable-yomi-text>true'
|
||||
return [None, None, the_string]
|
||||
def __page_ref_func(self, field_name, name, line):
|
||||
"""
|
||||
Requires:
|
||||
field_name --first name in the string.
|
||||
name -- the changed name according to the dictionary.
|
||||
line -- the string to parse.
|
||||
Returns:
|
||||
A string .
|
||||
Logic:
|
||||
"""
|
||||
the_string = name
|
||||
num_format = self.__parse_num_format(line)
|
||||
if num_format:
|
||||
the_string += '<number-format>%s' % num_format
|
||||
num_type = self.__parse_num_type(line)
|
||||
if num_type:
|
||||
the_string += '<number-type>%s' % num_type
|
||||
line = re.sub(self.__merge_format_exp, '', line)
|
||||
words = line.split()
|
||||
words = words[1:] # get rid of field name
|
||||
for word in words:
|
||||
if word[0:1] != '\\':
|
||||
the_string += '<bookmark>%s' % word
|
||||
index = line.find('\\h')
|
||||
if index > -1:
|
||||
the_string += '<hyperlink>true'
|
||||
index = line.find('\\p')
|
||||
if index > -1:
|
||||
the_string += '<paragraph-relative-position>true'
|
||||
return [None, None, the_string]
|
||||
def __note_ref_func(self, field_name, name, line):
|
||||
"""
|
||||
Requires:
|
||||
field_name --first name in the string.
|
||||
name -- the changed name according to the dictionary.
|
||||
line -- the string to parse.
|
||||
Returns:
|
||||
A string .
|
||||
Logic:
|
||||
"""
|
||||
the_string = name
|
||||
line = re.sub(self.__merge_format_exp, '', line)
|
||||
words = line.split()
|
||||
words = words[1:] # get rid of field name
|
||||
for word in words:
|
||||
if word[0:1] != '\\':
|
||||
the_string += '<bookmark>%s' % word
|
||||
index = line.find('\\h')
|
||||
if index > -1:
|
||||
the_string += '<hyperlink>true'
|
||||
index = line.find('\\p')
|
||||
if index > -1:
|
||||
the_string += '<paragraph-relative-position>true'
|
||||
index = line.find('\\f')
|
||||
if index > -1:
|
||||
the_string += '<include-note-number>true'
|
||||
return [None, None, the_string]
|
||||
def __symbol_func(self, field_name, name, line):
|
||||
"""
|
||||
Requires:
|
||||
field_name --first name in the string.
|
||||
name -- the changed name according to the dictionary.
|
||||
line -- the string to parse.
|
||||
Returns:
|
||||
A string containing font size, font style, and a hexadecimal value.
|
||||
Logic:
|
||||
The SYMBOL field is one of Microsoft's many quirky ways of
|
||||
entering text. The string this method has to parse looks like
|
||||
this:
|
||||
SYMBOL 97 \\f "Symbol" \\s 12
|
||||
The first word merely tells us that we have encountered a SYMBOL
|
||||
field.
|
||||
The next value is the Microsoft decimal value. Change this to
|
||||
hexadecimal.
|
||||
The pattern '\\f "some font' tells us the font.
|
||||
The pattern '\\s some size' tells us the font size.
|
||||
Extract all of this information. Store this information in a
|
||||
string, and make this string the last item in a list. The first
|
||||
item in the list is the simple word 'symbol', which tells me that
|
||||
I don't really have a field, but UTF-8 data.
|
||||
"""
|
||||
num = ''
|
||||
font = ''
|
||||
font_size = ''
|
||||
changed_line = ''
|
||||
search_obj = re.search(self.__symbol_num_exp, line)
|
||||
if search_obj:
|
||||
num = search_obj.group(1)
|
||||
num = int(num)
|
||||
num = '%X' % num
|
||||
search_obj = re.search(self.__symbol_font_exp, line)
|
||||
if search_obj:
|
||||
font = search_obj.group(1)
|
||||
changed_line += 'cw<ci<font-style<nu<%s\n' % font
|
||||
search_obj = re.search(self.__symbol_size_exp, line)
|
||||
if search_obj:
|
||||
font_size = search_obj.group(1)
|
||||
font_size = int(font_size)
|
||||
font_size = '%.2f' % font_size
|
||||
changed_line += 'cw<ci<font-size_<nu<%s\n' % font_size
|
||||
changed_line += 'tx<hx<__________<\'%s\n' % num
|
||||
return ['Symbol', None, changed_line]
|
358
src/libprs500/ebooks/rtf2xml/fields_large.py
Executable file
@ -0,0 +1,358 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import field_strings, copy
|
||||
class FieldsLarge:
|
||||
"""
|
||||
=========================
|
||||
Logic
|
||||
=========================
|
||||
Make tags for fields.
|
||||
-Fields reflect text that Microsoft Word automatically generates.
|
||||
-Each field contains (or should contain) an inner group called field instructions.
|
||||
-Fields can be nested.
|
||||
--------------
|
||||
Logic
|
||||
--------------
|
||||
1. As soon as a field is found, make a new text string by appending an empty text string to the field list. Collect all the lines in this string until the field instructions are found.
|
||||
2. Collect all the tokens and text in the field instructions. When the end of the field instructions is found, process the string of text with the field_strings module. Append the processed string to the field instructions list.
|
||||
3. Continue collecting tokens. Check for paragraphs or sections. If either is found, add to the paragraph or section list.
|
||||
4. Continue collecting tokens and text until either the beginning of a new field is found, or the end of this field is found.
|
||||
5. If a new field is found, repeat steps 1-3.
|
||||
6. If the end of the field is found, process the last text string of the field list.
|
||||
7. If the field list is empty (after removing the last text string), there are no more fields. Print out the final string. If the list contains other strings, add the processed string to the last string in the field list.
|
||||
============================
|
||||
Examples
|
||||
============================
|
||||
This line of RTF:
|
||||
{\field{\*\fldinst { CREATEDATE \\* MERGEFORMAT }}{\fldrslt {
|
||||
\lang1024 1/11/03 10:34 PM}}}
|
||||
Becomes:
|
||||
<field type = "insert-time">
|
||||
10:34 PM
|
||||
</field>
|
||||
The simple field in the above example contains no paragraph or section breaks.
|
||||
This line of RTF:
|
||||
{{\field{\*\fldinst SYMBOL 97 \\f "Symbol" \\s 12}{\fldrslt\f3\fs24}}}
|
||||
Becomes:
|
||||
<para><inline font-size="18"><inline font-style="Symbol">Χ</inline></inline></para>
|
||||
The RTF in the example above should be represented as UTF-8 rather than a field.
|
||||
This RTF:
|
||||
{\field\fldedit{\*\fldinst { TOC \\o "1-3" }}{\fldrslt {\lang1024
|
||||
Heading one\tab }{\field{\*\fldinst {\lang1024 PAGEREF _Toc440880424
|
||||
\\h }{\lang1024 {\*\datafield
|
||||
{\lang1024 1}}}{\lang1024 \par }\pard\plain
|
||||
\s18\li240\widctlpar\tqr\tldot\tx8630\aspalpha\aspnum\faauto\adjustright\rin0\lin240\itap0
|
||||
\f4\lang1033\cgrid {\lang1024 Heading 2\tab }{\field{\*\fldinst
|
||||
{\lang1024 PAGEREF _Toc440880425 \\h }{\lang1024 {\*\datafield
|
||||
{\lang1024 1}}}{\lang1024 \par }\pard\plain
|
||||
\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
|
||||
\f4\lang1033\cgrid }}\pard\plain
|
||||
\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
|
||||
\f4\lang1033\cgrid {\fs28 \u214\'85 \par }{\fs36 {\field{\*\fldinst
|
||||
SYMBOL 67 \\f "Symbol" \\s 18}{\fldrslt\f3\fs36}}}
|
||||
Becomes:
|
||||
<field-block type="table-of-contents">
|
||||
<paragraph-definition language="1033" nest-level="0"
|
||||
font-style="Times" name="toc 1" adjust-right="true"
|
||||
widow-control="true">
|
||||
<para><inline language="1024">Heading one	</inline><field
|
||||
type="reference-to-page" ref="_Toc440880424"><inline
|
||||
language="1024">1</inline></field></para>
|
||||
</paragraph-definition>
|
||||
<paragraph-definition language="1033" nest-level="0" left-indent="12"
|
||||
font-style="Times" name="toc 2" adjust-right="true"
|
||||
widow-control="true">
|
||||
<para><inline language="1024">Heading 2	</inline><field
|
||||
type="reference-to-page" ref="_Toc440880425"><inline
|
||||
language="1024">1</inline></field></para>
|
||||
</paragraph-definition>
|
||||
</field-block>
|
||||
"""
|
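A minimal usage sketch for this class, assuming a tokenized intermediate file produced by the earlier pipeline stages; the file name and the BugHandler class below are illustrative stand-ins, not part of this module:

    from libprs500.ebooks.rtf2xml import fields_large

    class BugHandler(Exception):
        pass    # hypothetical stand-in for the pipeline's bug-handler exception

    fixer = fields_large.FieldsLarge(
        in_file='tokens.data',    # illustrative path to the tokenized file
        bug_handler=BugHandler,
        copy=True,                # keep 'fields_large.data' for debugging
        run_level=1,
    )
    fixer.fix_fields()            # rewrites 'tokens.data' in place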
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
"""
|
||||
self.__text_string = ''
|
||||
self.__field_instruction_string = ''
|
||||
self.__marker = 'mi<mk<inline-fld\n'
|
||||
self.__state = 'before_body'
|
||||
self.__string_obj = field_strings.FieldStrings(run_level = self.__run_level,
|
||||
bug_handler= self.__bug_handler,)
|
||||
self.__state_dict = {
|
||||
'before_body' : self.__before_body_func,
|
||||
'in_body' : self.__in_body_func,
|
||||
'field' : self.__in_field_func,
|
||||
'field_instruction' : self.__field_instruction_func,
|
||||
}
|
||||
self.__in_body_dict = {
|
||||
'cw<fd<field_____' : self.__found_field_func,
|
||||
}
|
||||
self.__field_dict = {
|
||||
'cw<fd<field-inst' : self.__found_field_instruction_func,
|
||||
'cw<fd<field_____' : self.__found_field_func,
|
||||
'cw<pf<par-end___' : self.__par_in_field_func,
|
||||
'cw<sc<section___' : self.__sec_in_field_func,
|
||||
}
|
||||
self.__field_count = [] # keep track of the brackets
|
||||
self.__field_instruction = [] # field instruction strings
|
||||
self.__symbol = 0 # whether or not the field is really UTF-8
|
||||
# (these fields cannot be nested.)
|
||||
self.__field_instruction_string = '' # string that collects field instruction
|
||||
self.__par_in_field = [] # paragraphs in field?
|
||||
self.__sec_in_field = [] # sections in field?
|
||||
self.__field_string = [] # list of field strings
|
||||
def __before_body_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing (changes the state and writes a line)
|
||||
Logic:
|
||||
Check for the beginning of the body. If found, change the state.
|
||||
Always write out the line.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<body-open_':
|
||||
self.__state = 'in_body'
|
||||
self.__write_obj.write(line)
|
||||
def __in_body_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing. (Writes a line to the output file, or performs other actions.)
|
||||
Logic:
|
||||
Check for the beginning of a field. Always output the line.
|
||||
"""
|
||||
action = self.__in_body_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
self.__write_obj.write(line)
|
||||
def __found_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Set the values for parsing the field. Four lists have to have
|
||||
items appended to them.
|
||||
"""
|
||||
self.__state = 'field'
|
||||
self.__cb_count = 0
|
||||
ob_count = self.__ob_count
|
||||
self.__field_string.append('')
|
||||
self.__field_count.append(ob_count)
|
||||
self.__sec_in_field.append(0)
|
||||
self.__par_in_field.append(0)
|
||||
def __in_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing.
|
||||
Logic:
|
||||
Check for the end of the field; a paragraph break; a section break;
|
||||
the beginning of another field; or the beginning of the field
|
||||
instruction.
|
||||
"""
|
||||
if self.__cb_count == self.__field_count[-1]:
|
||||
self.__field_string[-1] += line
|
||||
self.__end_field_func()
|
||||
else:
|
||||
action = self.__field_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
else:
|
||||
self.__field_string[-1] += line
|
||||
def __par_in_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Add the line to the current field string and set the last item in the
|
||||
paragraph in field list to true.
|
||||
"""
|
||||
self.__field_string[-1] += line
|
||||
self.__par_in_field[-1] = 1
|
||||
def __sec_in_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Add the line to the current field string and set the last item in the
|
||||
section in field list to true.
|
||||
"""
|
||||
self.__field_string[-1] += line
|
||||
self.__sec_in_field[-1] = 1
|
||||
def __found_field_instruction_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Change the state to field instruction. Set the open bracket count of
|
||||
the beginning of this field so you know when it ends. Set the closed
|
||||
bracket count to 0 so you don't prematurely exit this state.
|
||||
"""
|
||||
self.__state = 'field_instruction'
|
||||
self.__field_instruction_count = self.__ob_count
|
||||
self.__cb_count = 0
|
||||
def __field_instruction_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Collect all the lines until the end of the field is reached.
|
||||
Process these lines with the field_strings module.
|
||||
Check if the field instruction is 'Symbol' (really UTF-8).
|
||||
"""
|
||||
if self.__cb_count == self.__field_instruction_count:
|
||||
# The closing bracket should be written, since the opening bracket
|
||||
# was written
|
||||
self.__field_string[-1] += line
|
||||
my_list = self.__string_obj.process_string(
|
||||
self.__field_instruction_string, 'field_instruction')
|
||||
instruction = my_list[2]
|
||||
self.__field_instruction.append(instruction)
|
||||
if my_list[0] == 'Symbol':
|
||||
self.__symbol = 1
|
||||
self.__state = 'field'
|
||||
self.__field_instruction_string = ''
|
||||
else:
|
||||
self.__field_instruction_string += line
|
||||
def __end_field_func(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
Nothing
|
||||
Logic:
|
||||
Pop the last values in the instructions list, the fields list, the
|
||||
paragraph list, and the section list.
|
||||
If the field is a symbol, do not write the tags <field></field>,
|
||||
since this field is really just UTF-8.
|
||||
If the field contains paragraph or section breaks, it is a
|
||||
field-block rather than just a field.
|
||||
Write the paragraph or section markers for later parsing of the
|
||||
file.
|
||||
If the field list contains more strings, add the latest
|
||||
(processed) string to the last string in the list. Otherwise,
|
||||
write the string to the output file.
|
||||
"""
|
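# Hedged illustration with invented values: for a simple inline field whose
# processed instruction were, say, 'reference-to-page<ref>_Toc1', the else
# branch below would emit roughly:
#   mi<mk<inline-fld
#   mi<tg<open-att__<field<type>reference-to-page<ref>_Toc1
#   ...the collected field tokens...
#   mi<tg<close_____<field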
||||
last_bracket = self.__field_count.pop()
|
||||
instruction = self.__field_instruction.pop()
|
||||
inner_field_string = self.__field_string.pop()
|
||||
sec_in_field = self.__sec_in_field.pop()
|
||||
par_in_field = self.__par_in_field.pop()
|
||||
# add a closing bracket, since the closing bracket is not included in
|
||||
# the field string
|
||||
if self.__symbol:
|
||||
inner_field_string = '%scb<nu<clos-brack<%s\n' % \
|
||||
(instruction, last_bracket)
|
||||
elif sec_in_field or par_in_field:
|
||||
inner_field_string = \
|
||||
'mi<mk<fldbkstart\n'\
|
||||
'mi<tg<open-att__<field-block<type>%s\n%s'\
|
||||
'mi<mk<fldbk-end_\n' \
|
||||
'mi<tg<close_____<field-block\n'\
|
||||
'mi<mk<fld-bk-end\n' \
|
||||
% ( instruction, inner_field_string)
|
||||
# write a marker to show an inline field for later parsing
|
||||
else:
|
||||
inner_field_string = \
|
||||
'%s' \
|
||||
'mi<tg<open-att__<field<type>%s\n%s'\
|
||||
'mi<tg<close_____<field\n'\
|
||||
% (self.__marker, instruction, inner_field_string)
|
||||
if sec_in_field:
|
||||
inner_field_string = 'mi<mk<sec-fd-beg\n' + inner_field_string + \
|
||||
'mi<mk<sec-fd-end\n'
|
||||
if par_in_field:
|
||||
inner_field_string = 'mi<mk<par-in-fld\n' + inner_field_string
|
||||
if len(self.__field_string) == 0:
|
||||
self.__write_field_string(inner_field_string)
|
||||
else:
|
||||
self.__field_string[-1] += inner_field_string
|
||||
self.__symbol = 0
|
||||
def __write_field_string(self, the_string):
|
||||
self.__state = 'in_body'
|
||||
self.__write_obj.write(the_string)
|
||||
def fix_fields(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing (changes the original file)
|
||||
Logic:
|
||||
Read one line in at a time. Determine what action to take based on
|
||||
the state. If the state is before the body, look for the
|
||||
beginning of the body.
|
||||
If the state is body, send the line to the body method.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no matching state in module fields_large.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "fields_large.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
448
src/libprs500/ebooks/rtf2xml/fields_small.py
Executable file
@ -0,0 +1,448 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile, re
|
||||
from libprs500.ebooks.rtf2xml import field_strings, copy
|
||||
class FieldsSmall:
|
||||
"""
|
||||
=================
|
||||
Purpose
|
||||
=================
|
||||
Write tags for bookmarks, index and toc entry fields in a tokenized file.
|
||||
This module does not handle toc or index tables. (This module won't be any
|
||||
use to you unless you use it as part of the other modules.)
|
||||
-----------
|
||||
Method
|
||||
-----------
|
||||
Look for the beginning of a bookmark, index, or toc entry. When such a token
|
||||
is found, store the opening bracket count in a variable. Collect all the text
|
||||
until the closing bracket entry is found. Send the string to the module
|
||||
field_strings to process it. Write the processed string to the output
|
||||
file.
|
||||
"""
|
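The same calling pattern applies here; a hedged sketch with an illustrative file name and a stand-in bug-handler class:

    from libprs500.ebooks.rtf2xml import fields_small

    class BugHandler(Exception):
        pass    # hypothetical stand-in for the pipeline's bug-handler exception

    small = fields_small.FieldsSmall(
        in_file='tokens.data',    # illustrative path to the tokenized file
        bug_handler=BugHandler,
        copy=True,                # keep 'fields_small.data' for debugging
    )
    small.fix_fields()            # tags bookmarks, index and toc entries in place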
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__run_level = run_level
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
"""
|
||||
self.__string_obj = field_strings.FieldStrings(bug_handler = self.__bug_handler)
|
||||
self.__state = 'before_body'
|
||||
self.__text_string = ''
|
||||
self.__marker = 'mi<mk<inline-fld\n'
|
||||
self.__state_dict = {
|
||||
'before_body' : self.__before_body_func,
|
||||
'body' : self.__body_func,
|
||||
'bookmark' : self.__bookmark_func,
|
||||
'toc_index' : self.__toc_index_func,
|
||||
}
|
||||
self.__body_dict = {
|
||||
'cw<an<book-mk-st' : (self.__found_bookmark_func, 'start'),
|
||||
'cw<an<book-mk-en' : (self.__found_bookmark_func, 'end'),
|
||||
'cw<an<toc_______' : (self.__found_toc_index_func, 'toc'),
|
||||
'cw<an<index-mark' : (self.__found_toc_index_func, 'index'),
|
||||
}
|
||||
ob = 'ob<nu<open-brack.....'
|
||||
cb = 'cb<nu<clos-brack'
|
||||
bk_st = 'cw<an<book-mk-st<nu<true'
|
||||
tx = 'tx<nu<__________<(.*?)'
|
||||
reg_st = ob + bk_st + tx + cb
|
||||
self.__book_start = re.compile(r'%s' % reg_st)
|
||||
def __before_body_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Look for the beginning of the body. When found, change the state
|
||||
to body. Always print out the line.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<body-open_':
|
||||
self.__state = 'body'
|
||||
self.__write_obj.write(line)
|
||||
def __body_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function handles all the lines in the body of the document.
|
||||
Look for a bookmark, index or toc entry and take the appropriate action.
|
||||
"""
|
||||
action, tag = \
|
||||
self.__body_dict.get(self.__token_info, (None, None))
|
||||
if action:
|
||||
action(line, tag)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __found_bookmark_func(self, line, tag):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function is called when a bookmark is found. The opening
|
||||
bracket count is stored in the beginning bracket count. The state
|
||||
is changed to 'bookmark.'
|
||||
"""
|
||||
self.__beg_bracket_count = self.__ob_count
|
||||
self.__cb_count = 0
|
||||
self.__state = 'bookmark'
|
||||
self.__type_of_bookmark = tag
|
||||
def __bookmark_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function handles all lines within a bookmark. It adds each
|
||||
line to a string until the end of the bookmark is found. It
|
||||
processes the string, and
|
||||
prints out the result.
|
||||
"""
|
||||
if self.__beg_bracket_count == self.__cb_count:
|
||||
self.__state = 'body'
|
||||
type = 'bookmark-%s' % self.__type_of_bookmark
|
||||
# change here
|
||||
"""
|
||||
my_string = self.__string_obj.process_string(
|
||||
self.__text_string, type)
|
||||
"""
|
||||
my_string = self.__parse_bookmark_func(
|
||||
self.__text_string, type)
|
||||
self.__write_obj.write(self.__marker)
|
||||
self.__write_obj.write(my_string)
|
||||
self.__text_string = ''
|
||||
self.__write_obj.write(line)
|
||||
elif line[0:2] == 'tx':
|
||||
self.__text_string += line[17:-1]
|
||||
def __parse_index_func(self, my_string):
|
||||
"""
|
||||
Requires:
|
||||
my_string --string to parse
|
||||
type --type of string
|
||||
Returns:
|
||||
A string for a toc instruction field.
|
||||
Logic:
|
||||
This method is meant for *both* index and toc entries.
|
||||
I want to eliminate paragraph endings, and I want to divide the
|
||||
entry into a main entry and (if it exists) a sub entry.
|
||||
Split the string by newlines. Read one token at a time. If the
|
||||
token is a special colon, end the main entry element and start the
|
||||
sub entry element.
|
||||
If the token is a paragraph ending, ignore it, since I don't want
|
||||
paragraphs within toc or index entries.
|
||||
"""
|
||||
my_string, see_string = self.__index_see_func(my_string)
|
||||
my_string, bookmark_string = self.__index_bookmark_func( my_string)
|
||||
italics, bold = self.__index__format_func(my_string)
|
||||
found_sub = 0
|
||||
my_changed_string = 'mi<tg<empty-att_<field<type>index-entry'
|
||||
my_changed_string += '<update>static'
|
||||
if see_string:
|
||||
my_changed_string += '<additional-text>%s' % see_string
|
||||
if bookmark_string:
|
||||
my_changed_string += '<bookmark>%s' % bookmark_string
|
||||
if italics:
|
||||
my_changed_string += '<italics>true'
|
||||
if bold:
|
||||
my_changed_string += '<bold>true'
|
||||
main_entry = ''
|
||||
sub_entry = ''
|
||||
lines = my_string.split('\n')
|
||||
for line in lines:
|
||||
token_info = line[:16]
|
||||
if token_info == 'cw<ml<colon_____':
|
||||
found_sub = 1
|
||||
elif token_info[0:2] == 'tx':
|
||||
if found_sub:
|
||||
sub_entry += line[17:]
|
||||
else:
|
||||
main_entry += line[17:]
|
||||
my_changed_string += '<main-entry>%s' % main_entry
|
||||
if found_sub:
|
||||
my_changed_string += '<sub-entry>%s' % sub_entry
|
||||
my_changed_string += '\n'
|
||||
return my_changed_string
|
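To make the main-entry/sub-entry split concrete, here is a hedged illustration of the kind of token string this method receives and roughly what it emits; the token values are invented for the example:

    # Two text tokens separated by the special colon token (invented values).
    tokens = (
        'tx<nu<__________<cats\n'
        'cw<ml<colon_____<nu<true\n'
        'tx<nu<__________<Siamese\n'
    )
    # __parse_index_func would fold this into roughly one line of the form:
    #   mi<tg<empty-att_<field<type>index-entry<update>static
    #     <main-entry>cats<sub-entry>Siamese
    # (emitted as a single line terminated by a newline)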
||||
def __index_see_func(self, my_string):
|
||||
in_see = 0
|
||||
bracket_count = 0
|
||||
see_string = ''
|
||||
changed_string = ''
|
||||
lines = my_string.split('\n')
|
||||
end_bracket_count = sys.maxint
|
||||
for line in lines:
|
||||
token_info = line[:16]
|
||||
if token_info == 'ob<nu<open-brack':
|
||||
bracket_count += 1
|
||||
if token_info == 'cb<nu<clos-brack':
|
||||
bracket_count -= 1
|
||||
if in_see:
|
||||
if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
|
||||
in_see = 0
|
||||
else:
|
||||
if token_info == 'tx<nu<__________':
|
||||
see_string += line[17:]
|
||||
else:
|
||||
if token_info == 'cw<in<index-see_':
|
||||
end_bracket_count = bracket_count - 1
|
||||
in_see = 1
|
||||
changed_string += '%s\n' % line
|
||||
return changed_string, see_string
|
||||
def __index_bookmark_func(self, my_string):
|
||||
"""
|
||||
Requires:
|
||||
my_string -- the entire index entry string
|
||||
Returns:
|
||||
bookmark_string -- the text string of the bookmark
|
||||
index_string -- string minus the bookmark_string
|
||||
"""
|
||||
# cw<an<place_____<nu<true
|
||||
in_bookmark = 0
|
||||
bracket_count = 0
|
||||
bookmark_string = ''
|
||||
index_string = ''
|
||||
lines = my_string.split('\n')
|
||||
end_bracket_count = sys.maxint
|
||||
for line in lines:
|
||||
token_info = line[:16]
|
||||
if token_info == 'ob<nu<open-brack':
|
||||
bracket_count += 1
|
||||
if token_info == 'cb<nu<clos-brack':
|
||||
bracket_count -= 1
|
||||
if in_bookmark:
|
||||
if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
|
||||
in_bookmark = 0
|
||||
index_string += '%s\n' % line
|
||||
else:
|
||||
if token_info == 'tx<nu<__________':
|
||||
bookmark_string += line[17:]
|
||||
else:
|
||||
index_string += '%s\n' % line
|
||||
else:
|
||||
if token_info == 'cw<an<place_____':
|
||||
end_bracket_count = bracket_count - 1
|
||||
in_bookmark = 1
|
||||
index_string += '%s\n' % line
|
||||
return index_string, bookmark_string
|
||||
def __index__format_func(self, my_string):
|
||||
italics = 0
|
||||
bold = 0
|
||||
lines = my_string.split('\n')
|
||||
for line in lines:
|
||||
token_info = line[:16]
|
||||
if token_info == 'cw<in<index-bold':
|
||||
bold = 1
|
||||
if token_info == 'cw<in<index-ital':
|
||||
italics = 1
|
||||
return italics, bold
|
||||
def __parse_toc_func(self, my_string):
|
||||
"""
|
||||
Requires:
|
||||
my_string -- the entire toc entry string
|
||||
Returns:
|
||||
modified string
|
||||
Logic:
|
||||
"""
|
||||
toc_level = 0
|
||||
toc_suppress = 0
|
||||
my_string, book_start_string, book_end_string =\
|
||||
self.__parse_bookmark_for_toc(my_string)
|
||||
main_entry = ''
|
||||
my_changed_string = 'mi<tg<empty-att_<field<type>toc-entry'
|
||||
my_changed_string += '<update>static'
|
||||
if book_start_string:
|
||||
my_changed_string += '<bookmark-start>%s' % book_start_string
|
||||
if book_end_string:
|
||||
my_changed_string += '<bookmark-end>%s' % book_end_string
|
||||
lines = my_string.split('\n')
|
||||
for line in lines:
|
||||
token_info = line[:16]
|
||||
if token_info[0:2] == 'tx':
|
||||
main_entry += line[17:]
|
||||
if token_info == 'cw<tc<toc-level_':
|
||||
toc_level = line[20:]
|
||||
if token_info == 'cw<tc<toc-sup-nu':
|
||||
toc_suppress = 1
|
||||
if toc_level:
|
||||
my_changed_string += '<toc-level>%s' % toc_level
|
||||
if toc_suppress:
|
||||
my_changed_string += '<toc-suppress-number>true'
|
||||
my_changed_string += '<main-entry>%s' % main_entry
|
||||
my_changed_string += '\n'
|
||||
return my_changed_string
|
||||
def __parse_bookmark_for_toc(self, my_string):
|
||||
"""
|
||||
Requires:
|
||||
the_string --string of toc, with new lines
|
||||
Returns:
|
||||
the_string -- string minus bookmarks
|
||||
bookmark_string -- bookmarks
|
||||
Logic:
|
||||
"""
|
||||
in_bookmark = 0
|
||||
bracket_count = 0
|
||||
book_start_string = ''
|
||||
book_end_string = ''
|
||||
book_type = 0
|
||||
toc_string = ''
|
||||
lines = my_string.split('\n')
|
||||
end_bracket_count = sys.maxint
|
||||
for line in lines:
|
||||
token_info = line[:16]
|
||||
if token_info == 'ob<nu<open-brack':
|
||||
bracket_count += 1
|
||||
if token_info == 'cb<nu<clos-brack':
|
||||
bracket_count -= 1
|
||||
if in_bookmark:
|
||||
if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
|
||||
in_bookmark = 0
|
||||
toc_string += '%s\n' % line
|
||||
else:
|
||||
if token_info == 'tx<nu<__________':
|
||||
if book_type == 'start':
|
||||
book_start_string += line[17:]
|
||||
elif book_type == 'end':
|
||||
book_end_string += line[17:]
|
||||
else:
|
||||
toc_string += '%s\n' % line
|
||||
else:
|
||||
if token_info == 'cw<an<book-mk-st' or token_info =='cw<an<book-mk-en':
|
||||
if token_info == 'cw<an<book-mk-st':
|
||||
book_type = 'start'
|
||||
if token_info == 'cw<an<book-mk-en':
|
||||
book_type = 'end'
|
||||
end_bracket_count = bracket_count - 1
|
||||
in_bookmark = 1
|
||||
toc_string += '%s\n' % line
|
||||
return toc_string, book_start_string, book_end_string
|
||||
def __parse_bookmark_func(self, my_string, type):
|
||||
"""
|
||||
Requires:
|
||||
my_string --string to parse
|
||||
type --type of string
|
||||
Returns:
|
||||
A string formated for a field instruction.
|
||||
Logic:
|
||||
The type is the name (either bookmark-end or bookmark-start). The
|
||||
id is the complete text string.
|
||||
"""
|
||||
my_changed_string = ('mi<tg<empty-att_<field<type>%s'
|
||||
'<number>%s<update>none\n' % (type, my_string))
|
||||
return my_changed_string
|
||||
def __found_toc_index_func(self, line, tag):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function is called when a toc or index entry is found. The opening
|
||||
bracket count is stored in the beginning bracket count. The state
|
||||
is changed to 'toc_index.'
|
||||
"""
|
||||
self.__beg_bracket_count = self.__ob_count
|
||||
self.__cb_count = 0
|
||||
self.__state = 'toc_index'
|
||||
self.__tag = tag
|
||||
def __toc_index_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function handles all lines within a toc or index entry. It
|
||||
adds each line to a string until the end of the entry is found. It
|
||||
processes the string, and
|
||||
prints out the result.
|
||||
"""
|
||||
if self.__beg_bracket_count == self.__cb_count:
|
||||
self.__state = 'body'
|
||||
type = self.__tag
|
||||
if type == 'index':
|
||||
my_string = self.__parse_index_func(
|
||||
self.__text_string)
|
||||
elif type == 'toc':
|
||||
my_string = self.__parse_toc_func(
|
||||
self.__text_string)
|
||||
self.__write_obj.write(self.__marker)
|
||||
self.__write_obj.write(my_string)
|
||||
self.__text_string = ''
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
self.__text_string += line
|
||||
def fix_fields(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing (changes the original file)
|
||||
Logic:
|
||||
Read one line in at a time. Determine what action to take based on
|
||||
the state. If the state is before the body, look for the
|
||||
beginning of the body.
|
||||
The other two states are toc_index (for toc and index entries) and
|
||||
bookmark.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file)
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = '1'
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no matching state in module fields_small.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "fields_small.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
223
src/libprs500/ebooks/rtf2xml/fonts.py
Executable file
@ -0,0 +1,223 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class Fonts:
|
||||
"""
|
||||
Change lines with font info from font numbers to the actual font names.
|
||||
"""
|
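In effect the class builds a number-to-name table while reading the font table, then rewrites later font tokens through it; a small sketch of that substitution with made-up values:

    font_table = {'0': 'Times', '3': 'Symbol'}    # font number -> font name
    line = 'cw<ci<font-style<nu<3\n'              # token carrying a font number
    font_num = line[20:-1]                        # -> '3'
    line = 'cw<ci<font-style<nu<%s\n' % font_table[font_num]
    # -> 'cw<ci<font-style<nu<Symbol\n'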
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
default_font_num,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
'default_font_num'--the default font number
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__default_font_num = default_font_num
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__run_level = run_level
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
"""
|
||||
self.__special_font_dict = {
|
||||
'Symbol' : 0,
|
||||
'Wingdings' : 0,
|
||||
'Zapf Dingbats' : 0,
|
||||
}
|
||||
self.__special_font_list = [
|
||||
'Symbol', 'Wingdings', 'Zapf Dingbats'
|
||||
]
|
||||
self.__state = 'default'
|
||||
self.__state_dict = {
|
||||
'default' : self.__default_func,
|
||||
'font_table' : self.__font_table_func,
|
||||
'after_font_table' : self.__after_font_table_func,
|
||||
'font_in_table' : self.__font_in_table_func,
|
||||
}
|
||||
self.__font_table = {}
|
||||
# individual font written
|
||||
self.__wrote_ind_font = 0
|
||||
def __default_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Handle all lines before the font table. Check for the beginning of the
|
||||
font table. If found, change the state. Print out all lines.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<fonttb-beg':
|
||||
self.__state = 'font_table'
|
||||
self.__write_obj.write(line)
|
||||
def __font_table_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
If the self.__token_info indicates that you have reached the end of
|
||||
the font table, then change the state to after the font table.
|
||||
If the self.__token_info indicates that there is a font in the
|
||||
table, change the state to font in table. Reset the number of the
|
||||
font to the default font (in case there is no number provided, in
|
||||
which case RTF assumes the number will be the default font.) Reset
|
||||
the text string (for the font name) to ''
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<fonttb-end':
|
||||
self.__state = 'after_font_table'
|
||||
elif self.__token_info == 'mi<mk<fontit-beg':
|
||||
self.__state = 'font_in_table'
|
||||
self.__font_num = self.__default_font_num
|
||||
self.__text_line = ''
|
||||
##self.__write_obj.write(line)
|
||||
def __font_in_table_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Check for four conditions:
|
||||
The line contains font-info. In this case, store the number in
|
||||
self.__font_num.
|
||||
The line contains text. In this case, add to the text string
|
||||
self.__text_string.
|
||||
The line marks the end of the individual font in the table. In
|
||||
this case, add a new key-> value pair to the font-table
|
||||
dictionary. Also create an empty tag with the name and number
|
||||
as attributes.
|
||||
Premature end of font table
|
||||
"""
|
||||
#cw<ci<font-style<nu<4
|
||||
#tx<nu<__________<Times;
|
||||
if self.__token_info == 'mi<mk<fontit-end':
|
||||
self.__wrote_ind_font = 1
|
||||
self.__state = 'font_table'
|
||||
self.__text_line = self.__text_line[:-1] # get rid of last ';'
|
||||
self.__font_table[self.__font_num] = self.__text_line
|
||||
self.__write_obj.write(
|
||||
'mi<tg<empty-att_'
|
||||
'<font-in-table<name>%s<num>%s\n' % (self.__text_line, self.__font_num)
|
||||
)
|
||||
elif self.__token_info == 'cw<ci<font-style':
|
||||
self.__font_num = line[20:-1]
|
||||
elif self.__token_info == 'tx<nu<__________' or \
|
||||
self.__token_info == 'tx<ut<__________':
|
||||
self.__text_line += line[17:-1]
|
||||
elif self.__token_info == 'mi<mk<fonttb-end':
|
||||
self.__found_end_font_table_func()
|
||||
self.__state = 'after_font_table'
|
||||
def __found_end_font_table_func(self):
|
||||
"""
|
||||
Required:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
If no individual fonts have been written, write one out
|
||||
"""
|
||||
if not self.__wrote_ind_font:
|
||||
self.__write_obj.write(
|
||||
'mi<tg<empty-att_'
|
||||
'<font-in-table<name>Times<num>0\n' )
|
||||
def __after_font_table_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Check the self.__token_info. If this matches a token with font
|
||||
info, then extract the number from the line, and look up the font
|
||||
name in the font dictionary. If no name exists for that number,
|
||||
print out an error. Otherwise print out the same line, except with
|
||||
the name rather than the number.
|
||||
If the line does not contain font info, simply print it out to the
|
||||
file.
|
||||
"""
|
||||
if self.__token_info == 'cw<ci<font-style':
|
||||
font_num = line[20:-1]
|
||||
font_name = self.__font_table.get(font_num)
|
||||
if font_name == None:
|
||||
if self.__run_level > 3:
|
||||
msg = 'no value for %s in self.__font_table\n' % font_num
|
||||
raise self.__bug_handler, msg
|
||||
else:
|
||||
# self.__special_font_dict
|
||||
if font_name in self.__special_font_list:
|
||||
self.__special_font_dict[font_name] = 1
|
||||
self.__write_obj.write(
|
||||
'cw<ci<font-style<nu<%s\n' % font_name
|
||||
)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def convert_fonts(self):
|
||||
"""
|
||||
Required:
|
||||
nothing
|
||||
Returns:
|
||||
a dictionary indicating which special fonts were found, plus the default font name
|
||||
Logic:
|
||||
Read one line in at a time. Determine what action to take based on
|
||||
the state. If the state is font_table, look for individual fonts
|
||||
and add the number and font name to a dictionary. Also create a
|
||||
tag for each individual font in the font table.
|
||||
If the state is after the font table, look for lines with font
|
||||
info. Substitute a font name for a font number.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no matching state in module fonts.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
default_font_name = self.__font_table.get(self.__default_font_num)
|
||||
if not default_font_name:
|
||||
default_font_name = 'Not Defined'
|
||||
self.__special_font_dict['default-font'] = default_font_name
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "fonts.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
return self.__special_font_dict
|
268
src/libprs500/ebooks/rtf2xml/footnote.py
Executable file
@ -0,0 +1,268 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class Footnote:
|
||||
"""
|
||||
Two public methods are available. The first separates all of the
|
||||
footnotes from the body and puts them at the bottom of the text, where
|
||||
they are easier to process. The second joins those footnotes to the
|
||||
proper places in the body.
|
||||
"""
|
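A minimal sketch of the intended calling order, with an illustrative file name and a stand-in bug-handler class; footnotes are separated before the body is processed and joined again afterwards:

    from libprs500.ebooks.rtf2xml import footnote

    class BugHandler(Exception):
        pass    # hypothetical stand-in for the pipeline's bug-handler exception

    foot_obj = footnote.Footnote(
        in_file='tokens.data',    # illustrative path to the tokenized file
        bug_handler=BugHandler,
        copy=True,
    )
    foot_obj.separate_footnotes()    # move footnotes to the bottom of the file
    # ... other modules process the body here ...
    foot_obj.join_footnotes()        # put the footnotes back in their places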
||||
def __init__(self,
|
||||
in_file ,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__found_a_footnote = 0
|
||||
def __first_line_func(self, line):
|
||||
"""
|
||||
Print the tag info for footnotes. Check whether footnote is an
|
||||
endnote and make the tag according to that.
|
||||
"""
|
||||
if self.__token_info == 'cw<nt<type______':
|
||||
self.__write_to_foot_obj.write(
|
||||
'mi<tg<open-att__<footnote<type>endnote<num>%s\n' % self.__footnote_count)
|
||||
else:
|
||||
self.__write_to_foot_obj.write(
|
||||
'mi<tg<open-att__<footnote<num>%s\n' % self.__footnote_count)
|
||||
self.__first_line = 0
|
||||
def __in_footnote_func(self, line):
|
||||
"""Handle all tokens that are part of footnote"""
|
||||
if self.__first_line:
|
||||
self.__first_line_func(line)
|
||||
if self.__token_info == 'cw<ci<footnot-mk':
|
||||
num = str(self.__footnote_count)
|
||||
self.__write_to_foot_obj.write(line)
|
||||
self.__write_to_foot_obj.write(
|
||||
'tx<nu<__________<%s\n' % num
|
||||
)
|
||||
if self.__cb_count == self.__footnote_bracket_count:
|
||||
self.__in_footnote = 0
|
||||
self.__write_obj.write(line)
|
||||
self.__write_to_foot_obj.write(
|
||||
'mi<mk<foot___clo\n')
|
||||
self.__write_to_foot_obj.write(
|
||||
'mi<tg<close_____<footnote\n')
|
||||
self.__write_to_foot_obj.write(
|
||||
'mi<mk<footnt-clo\n')
|
||||
else:
|
||||
self.__write_to_foot_obj.write(line)
|
||||
def __found_footnote(self, line):
|
||||
""" Found a footnote"""
|
||||
self.__found_a_footnote = 1
|
||||
self.__in_footnote = 1
|
||||
self.__first_line = 1
|
||||
self.__footnote_count += 1
|
||||
# temporarily set this to zero so I can enter loop
|
||||
self.__cb_count = 0
|
||||
self.__footnote_bracket_count = self.__ob_count
|
||||
self.__write_obj.write(
|
||||
'mi<mk<footnt-ind<%04d\n' % self.__footnote_count)
|
||||
self.__write_to_foot_obj.write(
|
||||
'mi<mk<footnt-ope<%04d\n' % self.__footnote_count)
|
||||
def __default_sep(self, line):
|
||||
"""Handle all tokens that are not footnote tokens"""
|
||||
if self.__token_info == 'cw<nt<footnote__':
|
||||
self.__found_footnote(line)
|
||||
self.__write_obj.write(line)
|
||||
if self.__token_info == 'cw<ci<footnot-mk':
|
||||
num = str(self.__footnote_count + 1)
|
||||
self.__write_obj.write(
|
||||
'tx<nu<__________<%s\n' % num
|
||||
)
|
||||
def __initiate_sep_values(self):
|
||||
"""
|
||||
initiate counters for separate_footnotes method.
|
||||
"""
|
||||
self.__bracket_count=0
|
||||
self.__ob_count = 0
|
||||
self.__cb_count = 0
|
||||
self.__footnote_bracket_count = 0
|
||||
self.__in_footnote = 0
|
||||
self.__first_line = 0 #have not processed the first line of footnote
|
||||
self.__footnote_count = 0
|
||||
def separate_footnotes(self):
|
||||
"""
|
||||
Separate all the footnotes in an RTF file and put them at the bottom,
|
||||
where they are easier to process. Each time a footnote is found,
|
||||
print all of its contents to a temporary file. Close both the main and
|
||||
temporary file. Print the footnotes from the temporary file to the
|
||||
bottom of the main file.
|
||||
"""
|
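# Hedged illustration: footnote n leaves 'mi<mk<footnt-ind<000n' behind in the
# body, while its text is appended at the bottom of the file between
# 'mi<mk<footnt-ope<000n' and 'mi<mk<footnt-clo'; join_footnotes later pairs
# these markers up again.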
||||
self.__initiate_sep_values()
|
||||
read_obj = open(self.__file)
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
self.__footnote_holder = tempfile.mktemp()
|
||||
self.__write_to_foot_obj = open(self.__footnote_holder, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
# keep track of opening and closing brackets
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
# In the middle of footnote text
|
||||
if self.__in_footnote:
|
||||
self.__in_footnote_func(line)
|
||||
# not in the middle of footnote text
|
||||
else:
|
||||
self.__default_sep(line)
|
||||
self.__write_obj.close()
|
||||
read_obj.close()
|
||||
self.__write_to_foot_obj.close()
|
||||
read_obj = open(self.__footnote_holder, 'r')
|
||||
write_obj = open(self.__write_to, 'a')
|
||||
write_obj.write(
|
||||
'mi<mk<sect-close\n'
|
||||
'mi<mk<body-close\n'
|
||||
'mi<tg<close_____<section\n'
|
||||
'mi<tg<close_____<body\n'
|
||||
'mi<tg<close_____<doc\n'
|
||||
'mi<mk<footnt-beg\n')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
write_obj.write(line)
|
||||
write_obj.write(
|
||||
'mi<mk<footnt-end\n')
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
os.remove(self.__footnote_holder)
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "footnote_separate.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
def update_info(self, file, copy):
|
||||
"""
|
||||
Unused method
|
||||
"""
|
||||
self.__file = file
|
||||
self.__copy = copy
|
||||
def __get_foot_body_func(self, line):
|
||||
"""
|
||||
Process lines in main body and look for beginning of footnotes.
|
||||
"""
|
||||
# mi<mk<footnt-end
|
||||
if self.__token_info == 'mi<mk<footnt-beg':
|
||||
self.__state = 'foot'
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __get_foot_foot_func(self, line):
|
||||
"""
|
||||
Copy footnotes from bottom of file to a separate, temporary file.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<footnt-end':
|
||||
self.__state = 'body'
|
||||
else:
|
||||
self.__write_to_foot_obj.write(line)
|
||||
def __get_footnotes(self):
|
||||
"""
|
||||
Private method to remove footnotes from main file. Read one line from
|
||||
the main file at a time. If the state is 'body', call on the private
|
||||
__get_foot_foot_func. Otherwise, call on the __get_foot_body_func.
|
||||
These two functions do the work of separating the footnotes from the
|
||||
body.
|
||||
"""
|
||||
read_obj = open(self.__file)
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
# self.__write_to = "footnote_info.data"
|
||||
self.__write_to_foot_obj = open(self.__footnote_holder, 'w')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
self.__token_info = line[:16]
|
||||
if self.__state == 'body':
|
||||
self.__get_foot_body_func(line)
|
||||
elif self.__state == 'foot':
|
||||
self.__get_foot_foot_func(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
self.__write_to_foot_obj.close()
|
||||
def __get_foot_from_temp(self, num):
|
||||
"""
|
||||
Private method for joining footnotes to body. This method reads from
|
||||
the temporary file until the proper footnote marker is found. It
|
||||
collects all the tokens until the end of the footnote, and returns
|
||||
them as a string.
|
||||
"""
|
||||
look_for = 'mi<mk<footnt-ope<' + num + '\n'
|
||||
found_foot = 0
|
||||
string_to_return = ''
|
||||
line = 1
|
||||
while line:
|
||||
line = self.__read_from_foot_obj.readline()
|
||||
if found_foot:
|
||||
if line == 'mi<mk<footnt-clo\n':
|
||||
return string_to_return
|
||||
string_to_return = string_to_return + line
|
||||
else:
|
||||
if line == look_for:
|
||||
found_foot = 1
|
||||
def __join_from_temp(self):
|
||||
"""
|
||||
Private method for rejoining footnotes to body. Read from the
|
||||
newly-created, temporary file that contains the body text but no
|
||||
footnotes. Each time a footnote marker is found, call the private
|
||||
method __get_foot_from_temp(). This method will return a string to
|
||||
print out to the third file.
|
||||
If no footnote marker is found, simply print out the token (line).
|
||||
"""
|
||||
self.__read_from_foot_obj = open(self.__footnote_holder, 'r')
|
||||
read_obj = open(self.__write_to, 'r')
|
||||
self.__write_obj = open(self.__write_to2, 'w')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
if line[:16] == 'mi<mk<footnt-ind':
|
||||
line = self.__get_foot_from_temp(line[17:-1])
|
||||
self.__write_obj.write(line)
|
||||
read_obj.close()
|
||||
def join_footnotes(self):
|
||||
"""
|
||||
Join the footnotes from the bottom of the file and put them in their
|
||||
former places. First, remove the footnotes from the bottom of the
|
||||
input file, outputting them to a temporary file. This creates two new
|
||||
files, one without footnotes, and one of just footnotes. Open both
|
||||
these files to read. When a marker is found in the main file, find the
|
||||
corresponding marker in the footnote file. Output the mix of body and
|
||||
footnotes to a third file.
|
||||
"""
|
||||
if not self.__found_a_footnote:
|
||||
return
|
||||
self.__write_to2 = tempfile.mktemp()
|
||||
self.__state = 'body'
|
||||
self.__get_footnotes()
|
||||
self.__join_from_temp()
|
||||
self.__write_obj.close()
|
||||
self.__read_from_foot_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to2, "footnote_joined.data")
|
||||
copy_obj.rename(self.__write_to2, self.__file)
|
||||
os.remove(self.__write_to2)
|
||||
os.remove(self.__footnote_holder)
|
67
src/libprs500/ebooks/rtf2xml/get_char_map.py
Executable file
@ -0,0 +1,67 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
|
||||
class GetCharMap:
|
||||
"""
|
||||
|
||||
Return the character map for the given value
|
||||
|
||||
"""
|
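The mapping file itself is not part of this change; judging from how get_char_map below reads it, a section looks roughly like the invented excerpt in the comments, and a call would be:

    # Invented excerpt of one section of the mapping file; fields are
    # colon-separated, and the method keys field 1 to field 3:
    #   <my-map>
    #   x:91:x:left-single-quote
    #   x:92:x:right-single-quote
    #   </my-map>
    class BugHandler(Exception):
        pass    # hypothetical stand-in

    char_file = open('char_set')    # illustrative file name
    mapper = GetCharMap(bug_handler=BugHandler, char_file=char_file)
    char_map = mapper.get_char_map('my-map')
    # -> {'91': 'left-single-quote\n', '92': 'right-single-quote\n'}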
||||
|
||||
def __init__(self, bug_handler, char_file):
|
||||
"""
|
||||
|
||||
Required:
|
||||
|
||||
'char_file'--the file with the mappings
|
||||
|
||||
|
||||
|
||||
Returns:
|
||||
|
||||
nothing
|
||||
|
||||
"""
|
||||
self.__char_file = char_file
|
||||
self.__bug_handler = bug_handler
|
||||
|
||||
def get_char_map(self, map):
|
||||
found_map = 0
|
||||
map_dict = {}
|
||||
self.__char_file.seek(0)
|
||||
for line in self.__char_file.readlines():
|
||||
if not line.strip(): continue
|
||||
begin_element = '<%s>' % map
|
||||
end_element = '</%s>' % map
|
||||
if not found_map:
|
||||
if begin_element in line:
|
||||
found_map = 1
|
||||
else:
|
||||
if end_element in line:
|
||||
break
|
||||
fields = line.split(':')
|
||||
fields[1] = fields[1].replace('\\colon', ':')
|
||||
map_dict[fields[1]] = fields[3]
|
||||
|
||||
|
||||
if not found_map:
|
||||
msg = 'no map found\n'
|
||||
msg += 'map is "%s"\n'%(map,)
|
||||
raise self.__bug_handler, msg
|
||||
return map_dict
|
||||
|
332
src/libprs500/ebooks/rtf2xml/get_options.py
Executable file
@ -0,0 +1,332 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
"""
|
||||
Gets options for main part of script
|
||||
"""
|
||||
import sys, os
|
||||
from libprs500.ebooks.rtf2xml import options_trem, configure_txt
|
||||
class GetOptions:
|
||||
def __init__(self,
|
||||
system_arguments,
|
||||
rtf_dir,
|
||||
bug_handler,
|
||||
configuration_file = None,
|
||||
):
|
||||
self.__system_arguments = system_arguments
|
||||
self.__rtf_dir = rtf_dir
|
||||
self.__configuration_file = configuration_file
|
||||
self.__bug_handler = bug_handler
|
||||
def get_options(self):
|
||||
"""
|
||||
return valid, output, help, show_warnings, debug, file
|
||||
"""
|
||||
return_options = self.__get_config_options()
|
||||
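# Hedged reading of the options_trem convention: each entry appears to map an
# option name to [1 if the option takes a value else 0, optional short flag],
# e.g. 'output': [1, 'o'].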
options_dict = {
|
||||
'dir' : [1],
|
||||
'help' : [0, 'h'],
|
||||
'show-warnings' : [0],
|
||||
'caps' : [0,],
|
||||
'no-caps' : [0],
|
||||
'symbol' : [0 ],
|
||||
'no-symbol' : [0],
|
||||
'wingdings' : [0],
|
||||
'no-wingdings' : [0],
|
||||
'zapf' : [0],
|
||||
'no-zapf' : [0],
|
||||
'font' : [0],
|
||||
'no-font' : [0],
|
||||
'dtd' : [1],
|
||||
'no-dtd' : [0],
|
||||
'version' : [0],
|
||||
'output' : [1, 'o'],
|
||||
'no-namespace' : [0],
|
||||
'level' : [1],
|
||||
'indent' : [1],
|
||||
'no-lists' : [0],
|
||||
'lists' : [0],
|
||||
'group-styles' : [0],
|
||||
'no-group-styles' : [0],
|
||||
'group-borders' : [0],
|
||||
'no-group-borders' : [0],
|
||||
'headings-to-sections' : [0],
|
||||
'no-headings-to-sections' : [0],
|
||||
'empty-para' : [0],
|
||||
'no-empty-para' : [0],
|
||||
'format' : [1, 'f'],
|
||||
'config' : [0],
|
||||
}
|
||||
options_obj = options_trem.ParseOptions(
|
||||
system_string = self.__system_arguments,
|
||||
options_dict = options_dict
|
||||
)
|
||||
options, arguments = options_obj.parse_options()
|
||||
if options == 0:
|
||||
return_options['valid'] = 0
|
||||
return return_options
|
||||
the_keys = options.keys()
|
||||
return_options['help'] = 0
|
||||
if 'help' in the_keys:
|
||||
return_options['help'] = 1
|
||||
return return_options
|
||||
return_options['config'] = 0
|
||||
if 'config' in the_keys:
|
||||
return_options['config'] = 1
|
||||
return return_options
|
||||
return_options['version'] = 0
|
||||
if 'version' in the_keys:
|
||||
return_options['version'] = 1
|
||||
return return_options
|
||||
# unused
|
||||
return_options['out-dir'] = 0
|
||||
if 'dir' in the_keys:
|
||||
out_dir = options['dir']
|
||||
if not os.path.isdir(out_dir):
|
||||
sys.stderr.write('Your output must be an existing directory.\n')
|
||||
return_options['valid'] = 0
|
||||
else:
|
||||
return_options['dir'] = options['dir']
|
||||
return_options['out-file'] = 0
|
||||
if 'output' in the_keys:
|
||||
#out_file = options['output']
|
||||
return_options['out-file'] = options['output']
|
||||
else:
|
||||
pass
|
||||
"""
|
||||
sys.stderr.write(
|
||||
'You must provide an ouput file with the \'o\' option\n')
|
||||
return_options['valid'] = 0
|
||||
"""
|
||||
if 'level' in the_keys:
|
||||
return_options['level'] = options['level']
|
||||
the_level = return_options.get('level')
|
||||
if the_level:
|
||||
try:
|
||||
return_options['level'] = int(the_level)
|
||||
except ValueError:
|
||||
sys.stderr.write('The options "--level" must be a number.\n')
|
||||
return_options['valid'] = 0
|
||||
return return_options
|
||||
if 'dtd' in the_keys:
|
||||
#dtd = options['dtd']
|
||||
return_options['raw-dtd-path'] = options['dtd']
|
||||
acceptable = ['sdoc', 'raw', 'tei']
|
||||
if 'format' in the_keys:
|
||||
format = options['format']
|
||||
if format not in acceptable:
|
||||
sys.stderr.write('--format must take \'sdoc\', \'raw\', or '
|
||||
'\'tei\'\n')
|
||||
return_options['valid'] = 0
|
||||
return return_options
|
||||
else:
|
||||
return_options['format'] = options['format']
|
||||
# a hack! python chokes on external dtd
|
||||
# Was able to fix this
|
||||
# format = return_options.get('format')
|
||||
# if format != 'raw' and format != None:
|
||||
# return_options['raw-dtd-path'] = ''
|
||||
return_options['show-warnings'] = 0
|
||||
if 'show-warnings' in the_keys:
|
||||
return_options['show-warnings'] = 1
|
||||
if 'no-font' in the_keys:
|
||||
return_options['convert-symbol'] = 0
|
||||
return_options['convert-zapf'] = 0
|
||||
return_options['convert-wingdings'] = 0
|
||||
if 'font' in the_keys:
|
||||
return_options['convert-symbol'] = 1
|
||||
return_options['convert-zapf'] = 1
|
||||
return_options['convert-wingdings'] = 1
|
||||
if 'symbol' in the_keys:
|
||||
return_options['convert-symbol'] = 1
|
||||
if 'no-symbol' in the_keys:
|
||||
return_options['convert-symbol'] = 0
|
||||
if 'wingdings' in the_keys:
|
||||
return_options['convert-wingdings'] = 1
|
||||
if 'no-wingdings' in the_keys:
|
||||
return_options['convert-wingdings'] = 0
|
||||
if 'zapf' in the_keys:
|
||||
return_options['convert-zapf'] = 1
|
||||
if 'no-zapf' in the_keys:
|
||||
return_options['convert-zapf'] = 0
|
||||
if 'caps' in the_keys:
|
||||
return_options['convert-caps'] = 1
|
||||
if 'no-caps' in the_keys:
|
||||
return_options['convert-caps'] = 0
|
||||
if 'no-dtd' in the_keys:
|
||||
return_options['no-dtd'] = 1
|
||||
else:
|
||||
return_options['no-dtd'] = 0
|
||||
return_options['no-ask'] = 0
|
||||
if 'no-ask' in the_keys:
|
||||
return_options['no-ask'] = 1
|
||||
sys.stderr.write('You can also permanently set the no-ask option in the rtf2xml file.\n')
|
||||
if 'no-namespace' in the_keys:
|
||||
return_options['no-namespace'] = 1
|
||||
if 'headings-to-sections' in the_keys:
|
||||
return_options['headings-to-sections'] = 1
|
||||
elif 'no-headings-to-sections' in the_keys:
|
||||
return_options['headings-to-sections'] = 0
|
||||
if 'no-lists' in the_keys:
|
||||
return_options['form-lists'] = 0
|
||||
elif 'lists' in the_keys:
|
||||
return_options['form-lists'] = 1
|
||||
if 'group-styles' in the_keys:
|
||||
return_options['group-styles'] = 1
|
||||
elif 'no-group-styles' in the_keys:
|
||||
return_options['group-styles'] = 0
|
||||
if 'group-borders' in the_keys:
|
||||
return_options['group-borders'] = 1
|
||||
elif 'no-group-borders' in the_keys:
|
||||
return_options['group-borders'] = 0
|
||||
if 'empty-para' in the_keys:
|
||||
return_options['empty-paragraphs'] = 1
|
||||
elif 'no-empty-para' in the_keys:
|
||||
return_options['empty-paragraphs'] = 0
|
||||
if len(arguments) == 0:
|
||||
sys.stderr.write(
|
||||
'You must provide a file to convert.\n')
|
||||
return_options['valid'] = 0
|
||||
return return_options
|
||||
elif len(arguments) > 1:
|
||||
sys.stderr.write(
|
||||
'You can only convert one file at a time.\n')
|
||||
return_options['valid'] = 0
|
||||
else:
|
||||
return_options['in-file'] = arguments[0]
|
||||
# check for out file
|
||||
smart_output = return_options.get('smart-output')
|
||||
if smart_output == 'false':
|
||||
smart_output = 0
|
||||
if smart_output and not return_options['out-file']:
|
||||
in_file = return_options['in-file']
|
||||
the_file_name, ext = os.path.splitext(in_file)
|
||||
if ext != '.rtf':
|
||||
sys.stderr.write(
|
||||
'Sorry, but this file does not have an "rtf" extension, so \n'
|
||||
'the script will not attempt to convert it.\n'
|
||||
'If it is in fact an rtf file, use the "-o" option.\n'
|
||||
)
|
||||
return_options['valid'] = 0
|
||||
else:
|
||||
return_options['out-file'] = '%s.xml' % the_file_name
|
||||
if not smart_output and not return_options['out-file']:
|
||||
"""
|
||||
sys.stderr.write(
|
||||
'Please provide a file to output with the -o option.\n'
|
||||
'Or set \'<smart-output value = "true"/>\'.\n'
|
||||
'in the configuration file.\n'
|
||||
)
|
||||
return_options['valid'] = 0
|
||||
"""
|
||||
pass
|
||||
if 'indent' in the_keys:
|
||||
try:
|
||||
value = int(options['indent'])
|
||||
return_options['indent'] = value
|
||||
except ValueError:
|
||||
sys.stderr.write('--indent must take an integer.\n')
|
||||
return_options['valid'] = 0
|
||||
# check for format and pyxml
|
||||
"""
|
||||
the_format = return_options.get('format')
|
||||
if the_format != 'raw':
|
||||
no_pyxml = return_options.get('no-pyxml')
|
||||
if no_pyxml:
|
||||
sys.stderr.write('You want to convert your file to "%s".\n'
|
||||
'Sorry, but you must have pyxml installed\n'
|
||||
'in order to convert your document to anything but raw XML.\n'
|
||||
'Please do not use the --format option.\n\n'
|
||||
% the_format
|
||||
)
|
||||
return_options['valid'] = 0
|
||||
xslt_proc = return_options.get('xslt-processor')
|
||||
if xslt_proc == None and not no_pyxml:
|
||||
sys.stderr.write('You want to convert your file to "%s".\n'
|
||||
'Sorry, but you must have an xslt processor set up\n'
|
||||
'in order to convert your document to anything but raw XML.\n'
|
||||
'Please use --format raw.\n\n'
|
||||
% the_format
|
||||
)
|
||||
return_options['valid'] = 0
|
||||
"""
|
||||
return return_options
|
||||
def __get_config_options(self):
|
||||
configure_obj = configure_txt.Configure(
|
||||
bug_handler = self.__bug_handler,
|
||||
configuration_file = self.__configuration_file)
|
||||
options_dict = configure_obj.get_configuration(type = 'normal')
|
||||
if options_dict == 1:
|
||||
sys.exit(1)
|
||||
options_dict['valid'] = 1
|
||||
convert_caps = options_dict.get('convert-caps')
|
||||
if convert_caps == 'false':
|
||||
options_dict['convert-caps'] = 0
|
||||
convert_symbol = options_dict.get('convert-symbol')
|
||||
if convert_symbol == 'false':
|
||||
options_dict['convert-symbol'] = 0
|
||||
convert_wingdings = options_dict.get('convert-wingdings')
|
||||
if convert_wingdings == 'false':
|
||||
options_dict['convert-wingdings'] = 0
|
||||
convert_zapf = options_dict.get('convert-zapf-dingbats')
|
||||
if convert_zapf == 'false':
|
||||
options_dict['convert-zapf'] = 0
|
||||
elif convert_zapf == 'true':
|
||||
options_dict['convert-zapf'] = 1
|
||||
headings_to_sections = options_dict.get('headings-to-sections')
|
||||
if headings_to_sections == 'true':
|
||||
options_dict['headings-to-sections'] = 1
|
||||
elif headings_to_sections == '1':
|
||||
options_dict['headings-to-sections'] = 1
|
||||
elif headings_to_sections == 'false':
|
||||
options_dict['headings-to-sections'] = 0
|
||||
elif headings_to_sections == '0':
|
||||
options_dict['headings-to-sections'] = 0
|
||||
else:
|
||||
options_dict['headings-to-sections'] = 0
|
||||
write_empty_paragraphs = options_dict.get('write-empty-paragraphs')
|
||||
if write_empty_paragraphs == 'true':
|
||||
options_dict['empty-paragraphs'] = 1
|
||||
elif write_empty_paragraphs == '1':
|
||||
options_dict['empty-paragraphs'] = 1
|
||||
elif write_empty_paragraphs == 'false':
|
||||
options_dict['empty-paragraphs'] = 0
|
||||
elif write_empty_paragraphs == '0':
|
||||
options_dict['empty-paragraphs'] = 0
|
||||
else:
|
||||
options_dict['empty-paragraphs'] = 1
|
||||
form_lists = options_dict.get('lists')
|
||||
if form_lists == 'true' or form_lists == '1':
|
||||
options_dict['form-lists'] = 1
|
||||
elif form_lists == 'false' or form_lists == '0':
|
||||
options_dict['form-lists'] = 0
|
||||
else:
|
||||
options_dict['form-lists'] = 0
|
||||
group_styles = options_dict.get('group-styles')
|
||||
if group_styles == 'true' or group_styles == '1':
|
||||
options_dict['group-styles'] = 1
|
||||
elif group_styles == 'false' or group_styles == '0':
|
||||
options_dict['group-styles'] = 0
|
||||
else:
|
||||
options_dict['group-styles'] = 0
|
||||
group_borders = options_dict.get('group-borders')
|
||||
if group_borders == 'true' or group_borders == '1':
|
||||
options_dict['group-borders'] = 1
|
||||
elif group_borders == 'false' or group_borders == '0':
|
||||
options_dict['group-borders'] = 0
|
||||
else:
|
||||
options_dict['group-borders'] = 0
|
||||
return options_dict
|
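A rough sketch of driving GetOptions directly. The argument list, the rtf_dir path, and the bare Exception bug handler are placeholders, and whether a missing configuration file is tolerated depends on configure_txt, so treat this only as an illustration:

import sys
from libprs500.ebooks.rtf2xml import get_options

options_obj = get_options.GetOptions(
    system_arguments=['rtf2xml', '--level', '1', '-o', 'out.xml', 'in.rtf'],
    rtf_dir='/usr/share/rtf2xml',      # placeholder support directory
    bug_handler=Exception,             # placeholder bug handler
    configuration_file=None,
)
opts = options_obj.get_options()
if not opts.get('valid'):
    sys.exit(1)
# The pass returns a flat dictionary of normalized option values.
print opts.get('in-file'), opts.get('out-file'), opts.get('level')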
292
src/libprs500/ebooks/rtf2xml/group_borders.py
Executable file
292
src/libprs500/ebooks/rtf2xml/group_borders.py
Executable file
@ -0,0 +1,292 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile, re
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class GroupBorders:
|
||||
"""
|
||||
Group paragraphs that share the same border attributes.
|
||||
Use the border information in each paragraph definition to decide whether
|
||||
consecutive paragraphs belong to the same group, and wrap each such run
|
||||
in a border-group element.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
wrap = 0,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__wrap = wrap
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Required:
|
||||
Nothing
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
The self.__end_list is a list of tokens that force an open border group
|
||||
to close.
|
||||
"""
|
||||
self.__state = "default"
|
||||
self.__left_indent = 0
|
||||
self.__border_num = 0
|
||||
self.__list_type = 'not-defined'
|
||||
self.__pard_def = ""
|
||||
self.__all_lists = []
|
||||
self.__list_chunk = ''
|
||||
self.__state_dict={
|
||||
'default' : self.__default_func,
|
||||
'in_pard' : self.__in_pard_func,
|
||||
'after_pard' : self.__after_pard_func,
|
||||
}
|
||||
# section end
|
||||
self.__end_list = [
|
||||
# section end
|
||||
'mi<mk<sect-close',
|
||||
'mi<mk<sect-start',
|
||||
# table begin
|
||||
'mi<mk<tabl-start',
|
||||
# field block begin
|
||||
'mi<mk<fldbk-end_',
|
||||
'mi<mk<fldbkstart',
|
||||
# cell end
|
||||
'mi<mk<close_cell',
|
||||
# item end
|
||||
'mi<tg<item_end__',
|
||||
# footnote end
|
||||
'mi<mk<foot___clo',
|
||||
'mi<mk<footnt-ope',
|
||||
# heading end
|
||||
'mi<mk<header-beg',
|
||||
'mi<mk<header-end',
|
||||
'mi<mk<head___clo',
|
||||
# lists
|
||||
'mi<tg<item_end__',
|
||||
'mi<tg<item_end__',
|
||||
'mi<mk<list_start'
|
||||
# body close
|
||||
#
|
||||
# style-group
|
||||
'mi<mk<style-grp_',
|
||||
'mi<mk<style_grp_',
|
||||
'mi<mk<style_gend',
|
||||
'mi<mk<stylegend_',
|
||||
# don't use
|
||||
# 'mi<mk<body-close',
|
||||
# 'mi<mk<par-in-fld',
|
||||
# 'cw<tb<cell______',
|
||||
# 'cw<tb<row-def___',
|
||||
# 'cw<tb<row_______',
|
||||
# 'mi<mk<sec-fd-beg',
|
||||
]
|
||||
# <name>Normal<
|
||||
self.__name_regex = re.compile(r'(<name>[^<]+)')
|
||||
self.__border_regex = re.compile(r'border-paragraph')
|
||||
self.__found_appt = 0
|
||||
self.__line_num = 0
|
||||
self.__border_regex = re.compile(r'(<border-paragraph[^<]+|<border-for-every-paragraph[^<]+)')
|
||||
self.__last_border_string = ''
|
||||
def __in_pard_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- the line of current text.
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
You are in a border group, but in the middle of a paragraph definition.
|
||||
Don't do anything until you find the end of the paragraph definition.
|
||||
"""
|
||||
if self.__token_info == 'mi<tg<close_____' \
|
||||
and line[17:-1] == 'paragraph-definition':
|
||||
self.__state = 'after_pard'
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __after_pard_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- the line of current text.
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
"""
|
||||
if self.__token_info == 'mi<tg<open-att__' \
|
||||
and line[17:37] == 'paragraph-definition':
|
||||
# found paragraph definition
|
||||
self.__pard_after_par_def_func(line)
|
||||
elif self.__token_info == 'mi<tg<close_____' \
|
||||
and line[17:-1] == 'paragraph-definition':
|
||||
sys.stderr.write('Wrong flag in __after_pard_func\n')
|
||||
if self.__run_level > 2:
|
||||
msg = 'wrong flag'
|
||||
raise self.__bug_handler, msg
|
||||
elif self.__token_info in self.__end_list:
|
||||
self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
|
||||
self.__write_end_border_tag()
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__list_chunk = ''
|
||||
self.__state = 'default'
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
self.__list_chunk += line
|
||||
def __close_pard_(self, line):
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
|
||||
self.__write_end_wrap()
|
||||
self.__list_chunk = ''
|
||||
self.__state = 'default'
|
||||
def __pard_after_par_def_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- the line of current text.
|
||||
id -- the id of the current list
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
"""
|
||||
is_border = self.__is_border_func(line)
|
||||
if not is_border:
|
||||
self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
|
||||
self.__write_end_border_tag()
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__write_obj.write(line)
|
||||
self.__state = 'default'
|
||||
self.__list_chunk = ''
|
||||
else:
|
||||
border_string, pard_string = self.__parse_pard_with_border(line)
|
||||
if self.__last_border_string == border_string:
|
||||
# just keep going
|
||||
self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__list_chunk = ''
|
||||
self.__state = 'in_pard'
|
||||
self.__write_obj.write(pard_string)
|
||||
else:
|
||||
# different name for the paragraph definition
|
||||
self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
|
||||
self.__write_end_border_tag()
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__write_start_border_tag(border_string)
|
||||
self.__write_obj.write(pard_string)
|
||||
self.__state = 'in_pard'
|
||||
self.__last_border_string = border_string
|
||||
self.__list_chunk = ''
|
||||
def __default_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
self, line
|
||||
Returns:
|
||||
Nothing
|
||||
Logic
|
||||
Look for the start of a paragraph definition. If one is found, check if
|
||||
it contains a border. If it does, open a border group. Change the state to
|
||||
in_pard.
|
||||
"""
|
||||
if self.__token_info == 'mi<tg<open-att__' \
|
||||
and line[17:37] == 'paragraph-definition':
|
||||
contains_border = self.__is_border_func(line)
|
||||
if contains_border:
|
||||
border_string, pard_string = self.__parse_pard_with_border(line)
|
||||
self.__write_start_border_tag(border_string)
|
||||
self.__write_obj.write(pard_string)
|
||||
self.__last_border_string = border_string
|
||||
self.__state = 'in_pard'
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __write_start_border_tag(self, the_string):
|
||||
self.__write_obj.write('mi<mk<start-brdg\n' )
|
||||
self.__border_num += 1
|
||||
num = '%04d' % self.__border_num
|
||||
num_string = 's%s' % num
|
||||
the_string += '<num>%s' % num_string
|
||||
self.__write_obj.write('mi<tg<open-att__<border-group%s\n' % the_string)
|
||||
def __write_end_border_tag(self):
|
||||
self.__write_obj.write('mi<mk<end-brdg__\n' )
|
||||
self.__write_obj.write('mi<tg<close_____<border-group\n')
|
||||
def __is_border_func(self, line):
|
||||
line = re.sub(self.__name_regex, '', line)
|
||||
index = line.find('border-paragraph')
|
||||
if index > -1:
|
||||
return 1
|
||||
return 0
|
||||
def __parse_pard_with_border(self, line):
|
||||
border_string = ''
|
||||
pard_string = ''
|
||||
tokens = re.split(self.__border_regex, line)
|
||||
for token in tokens:
|
||||
if token[0:17] == '<border-paragraph':
|
||||
border_string += token
|
||||
else:
|
||||
pard_string += token
|
||||
return border_string, pard_string
|
||||
def __write_pard_with_border(self, line):
|
||||
border_string = ''
|
||||
pard_string = ''
|
||||
tokens = re.split(self.__border_regex, line)
|
||||
for token in tokens:
|
||||
if token[0:17] == '<border-paragraph':
|
||||
border_string += token
|
||||
else:
|
||||
pard_string += token
|
||||
self.__write_start_border_tag(border_string)
|
||||
self.__write_obj.write(pard_string)
|
||||
def __get_style_name(self, line):
|
||||
if self.__token_info == 'mi<mk<style-name':
|
||||
self.__style_name = line[17:-1]
|
||||
def group_borders(self):
|
||||
"""
|
||||
Required:
|
||||
nothing
|
||||
Returns:
|
||||
original file will be changed
|
||||
Logic:
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
self.__get_style_name(line)
|
||||
action = self.__state_dict.get(self.__state)
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "group_borders.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
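Like the other passes in this package, GroupBorders rewrites its input token file in place through a temporary file. A hedged sketch of invoking it on an intermediate file; the file name and the bare Exception bug handler are illustrative only:

from libprs500.ebooks.rtf2xml import group_borders

borders_obj = group_borders.GroupBorders(
    in_file='tokens.data',   # placeholder intermediate token file
    bug_handler=Exception,   # placeholder bug handler
    copy=1,                  # also write a debug copy, group_borders.data
    run_level=1,
    wrap=1,
)
borders_obj.group_borders()  # tokens.data is rewritten with border-group tags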
241
src/libprs500/ebooks/rtf2xml/group_styles.py
Executable file
241
src/libprs500/ebooks/rtf2xml/group_styles.py
Executable file
@ -0,0 +1,241 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile, re
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class GroupStyles:
|
||||
"""
|
||||
Group paragraphs that share the same style name.
|
||||
Use the style name recorded with each paragraph definition to decide whether
|
||||
consecutive paragraphs belong together, and optionally wrap each such run
|
||||
in a style-group element.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
wrap = 0,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__wrap = wrap
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Required:
|
||||
Nothing
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
The self.__end_list is a list of tokens that force an open style group
|
||||
to close.
|
||||
"""
|
||||
self.__state = "default"
|
||||
self.__left_indent = 0
|
||||
self.__list_type = 'not-defined'
|
||||
self.__pard_def = ""
|
||||
self.__all_lists = []
|
||||
self.__list_chunk = ''
|
||||
self.__state_dict={
|
||||
'default' : self.__default_func,
|
||||
'in_pard' : self.__in_pard_func,
|
||||
'after_pard' : self.__after_pard_func,
|
||||
}
|
||||
# section end
|
||||
self.__end_list = [
|
||||
# section end
|
||||
'mi<mk<sect-close',
|
||||
'mi<mk<sect-start',
|
||||
# table begin
|
||||
'mi<mk<tabl-start',
|
||||
# field block begin
|
||||
'mi<mk<fldbk-end_',
|
||||
'mi<mk<fldbkstart',
|
||||
# cell end
|
||||
'mi<mk<close_cell',
|
||||
# item end
|
||||
'mi<tg<item_end__',
|
||||
# footnote end
|
||||
'mi<mk<foot___clo',
|
||||
'mi<mk<footnt-ope',
|
||||
# heading end
|
||||
'mi<mk<header-beg',
|
||||
'mi<mk<header-end',
|
||||
'mi<mk<head___clo',
|
||||
# lists
|
||||
'mi<tg<item_end__',
|
||||
'mi<tg<item_end__',
|
||||
'mi<mk<list_start'
|
||||
# body close
|
||||
# don't use
|
||||
# 'mi<mk<body-close',
|
||||
# 'mi<mk<par-in-fld',
|
||||
# 'cw<tb<cell______',
|
||||
# 'cw<tb<row-def___',
|
||||
# 'cw<tb<row_______',
|
||||
# 'mi<mk<sec-fd-beg',
|
||||
]
|
||||
self.__name_regex = re.compile(r'<name>')
|
||||
self.__found_appt = 0
|
||||
self.__line_num = 0
|
||||
def __in_pard_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- the line of current text.
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
You are in a style group, but in the middle of a paragraph definition.
|
||||
Don't do anything until you find the end of the paragraph definition.
|
||||
"""
|
||||
if self.__token_info == 'mi<tg<close_____' \
|
||||
and line[17:-1] == 'paragraph-definition':
|
||||
self.__state = 'after_pard'
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __after_pard_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- the line of current text.
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
"""
|
||||
if self.__token_info == 'mi<tg<open-att__' \
|
||||
and line[17:37] == 'paragraph-definition':
|
||||
# found paragraph definition
|
||||
self.__pard_after_par_def_func(line)
|
||||
elif self.__token_info == 'mi<tg<close_____' \
|
||||
and line[17:-1] == 'paragraph-definition':
|
||||
sys.stderr.write('Wrong flag in __after_pard_func\n')
|
||||
if self.__run_level > 2:
|
||||
msg = 'wrong flag'
|
||||
raise self.__bug_handler, msg
|
||||
elif self.__token_info in self.__end_list:
|
||||
self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
|
||||
self.__write_end_wrap()
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__list_chunk = ''
|
||||
self.__state = 'default'
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
self.__list_chunk += line
|
||||
def __close_pard_(self, line):
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
|
||||
self.__write_end_wrap()
|
||||
self.__list_chunk = ''
|
||||
self.__state = 'default'
|
||||
def __write_start_wrap(self, name):
|
||||
if self.__wrap:
|
||||
self.__write_obj.write('mi<mk<style-grp_<%s\n' % name)
|
||||
self.__write_obj.write('mi<tg<open-att__<style-group<name>%s\n' % name)
|
||||
self.__write_obj.write('mi<mk<style_grp_<%s\n' % name)
|
||||
def __write_end_wrap(self):
|
||||
if self.__wrap:
|
||||
self.__write_obj.write('mi<mk<style_gend\n' )
|
||||
self.__write_obj.write('mi<tg<close_____<style-group\n')
|
||||
self.__write_obj.write('mi<mk<stylegend_\n' )
|
||||
def __pard_after_par_def_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- the line of current text.
|
||||
id -- the id of the current list
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
"""
|
||||
if self.__last_style_name == self.__style_name:
|
||||
# just keep going
|
||||
if self.__wrap:
|
||||
self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__list_chunk = ''
|
||||
self.__state = 'in_pard'
|
||||
if self.__wrap:
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
# different name for the paragraph definition
|
||||
self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
|
||||
self.__write_end_wrap()
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__write_start_wrap(self.__style_name)
|
||||
self.__write_obj.write(line)
|
||||
self.__state = 'in_pard'
|
||||
self.__last_style_name = self.__style_name
|
||||
self.__list_chunk = ''
|
||||
def __default_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
self, line
|
||||
Returns:
|
||||
Nothing
|
||||
Logic
|
||||
Look for the start of a paragraph definition. If one is found, open a
|
||||
style group for its style name. Change the state to
|
||||
in_pard.
|
||||
"""
|
||||
if self.__token_info == 'mi<tg<open-att__' \
|
||||
and line[17:37] == 'paragraph-definition':
|
||||
self.__state = 'in_pard'
|
||||
self.__last_style_name = self.__style_name
|
||||
self.__write_start_wrap(self.__last_style_name)
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __get_style_name(self, line):
|
||||
if self.__token_info == 'mi<mk<style-name':
|
||||
self.__style_name = line[17:-1]
|
||||
def group_styles(self):
|
||||
"""
|
||||
Required:
|
||||
nothing
|
||||
Returns:
|
||||
original file will be changed
|
||||
Logic:
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
self.__get_style_name(line)
|
||||
action = self.__state_dict.get(self.__state)
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "group_styles.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
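All of these grouping passes key off the first sixteen characters of each intermediate line (the token info) and treat the remainder as payload. A small standalone illustration of that convention; the sample lines are made up, but follow the same shape as the markers written above:

# Sample lines in the intermediate format; the first 16 characters are the
# token info, the remainder (after the '<') is the payload.
sample_lines = [
    'mi<mk<style-name<heading 1\n',
    'mi<tg<open-att__<paragraph-definition<name>Normal\n',
    'mi<tg<close_____<paragraph-definition\n',
]
for line in sample_lines:
    token_info = line[:16]
    payload = line[17:-1]
    print token_info, '->', payload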
265
src/libprs500/ebooks/rtf2xml/header.py
Executable file
265
src/libprs500/ebooks/rtf2xml/header.py
Executable file
@ -0,0 +1,265 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class Header:
|
||||
"""
|
||||
Two public methods are available. The first separates all of the headers
|
||||
and footers from the body and puts them at the bottom of the text, where
|
||||
they are easier to process. The second joins those headers and footers to
|
||||
the proper places in the body.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file ,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__found_a_header = 0
|
||||
def __in_header_func(self, line):
|
||||
"""
|
||||
Handle all tokens that are part of header
|
||||
"""
|
||||
if self.__cb_count == self.__header_bracket_count:
|
||||
self.__in_header = 0
|
||||
self.__write_obj.write(line)
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<mk<head___clo\n')
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<tg<close_____<header-or-footer\n')
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<mk<header-clo\n')
|
||||
else:
|
||||
self.__write_to_head_obj.write(line)
|
||||
def __found_header(self, line):
|
||||
"""
|
||||
Found a header
|
||||
"""
|
||||
# but this could be header or footer
|
||||
self.__found_a_header = 1
|
||||
self.__in_header = 1
|
||||
self.__header_count += 1
|
||||
# temporarily set this to zero so I can enter loop
|
||||
self.__cb_count = 0
|
||||
self.__header_bracket_count = self.__ob_count
|
||||
self.__write_obj.write(
|
||||
'mi<mk<header-ind<%04d\n' % self.__header_count)
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<mk<header-ope<%04d\n' % self.__header_count)
|
||||
info = line[6:16]
|
||||
type = self.__head_dict.get(info)
|
||||
if type:
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<tg<open-att__<header-or-footer<type>%s\n' % (type)
|
||||
)
|
||||
else:
|
||||
sys.stderr.write('module is header\n')
|
||||
sys.stderr.write('method is __found_header\n')
|
||||
sys.stderr.write('no dict entry\n')
|
||||
sys.stderr.write('line is %s' % line)
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<tg<open-att__<header-or-footer<type>none\n'
|
||||
)
|
||||
def __default_sep(self, line):
|
||||
"""Handle all tokens that are not header tokens"""
|
||||
if self.__token_info[3:5] == 'hf':
|
||||
self.__found_header(line)
|
||||
self.__write_obj.write(line)
|
||||
def __initiate_sep_values(self):
|
||||
"""
|
||||
initiate counters for separate_footnotes method.
|
||||
"""
|
||||
self.__bracket_count=0
|
||||
self.__ob_count = 0
|
||||
self.__cb_count = 0
|
||||
self.__header_bracket_count = 0
|
||||
self.__in_header = 0
|
||||
self.__header_count = 0
|
||||
self.__head_dict = {
|
||||
'head-left_' : ('header-left'),
|
||||
'head-right' : ('header-right'),
|
||||
'foot-left_' : ('footer-left'),
|
||||
'foot-right' : ('footer-right'),
|
||||
'head-first' : ('header-first' ),
|
||||
'foot-first' : ('footer-first' ),
|
||||
'header____' : ('header' ),
|
||||
'footer____' : ('footer' ),
|
||||
}
|
||||
def separate_headers(self):
|
||||
"""
|
||||
Separate all the headers and footers in an RTF file and put them at the
|
||||
bottom, where they are easier to process. Each time a header or footer is
|
||||
found, print all of its contents to a temporary file. Close both the main
|
||||
and temporary file. Print the headers from the temporary file to the
|
||||
bottom of the main file.
|
||||
"""
|
||||
self.__initiate_sep_values()
|
||||
read_obj = open(self.__file)
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
self.__header_holder = tempfile.mktemp()
|
||||
self.__write_to_head_obj = open(self.__header_holder, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
# keep track of opening and closing brackets
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
# In the middle of header text
|
||||
if self.__in_header:
|
||||
self.__in_header_func(line)
|
||||
# not in the middle of header text
|
||||
else:
|
||||
self.__default_sep(line)
|
||||
self.__write_obj.close()
|
||||
read_obj.close()
|
||||
self.__write_to_head_obj.close()
|
||||
read_obj = open(self.__header_holder, 'r')
|
||||
write_obj = open(self.__write_to, 'a')
|
||||
write_obj.write(
|
||||
'mi<mk<header-beg\n')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
write_obj.write(line)
|
||||
write_obj.write(
|
||||
'mi<mk<header-end\n')
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
os.remove(self.__header_holder)
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "header_separate.info")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
def update_info(self, file, copy):
|
||||
"""
|
||||
Unused method
|
||||
"""
|
||||
self.__file = file
|
||||
self.__copy = copy
|
||||
def __get_head_body_func(self, line):
|
||||
"""
|
||||
Process lines in main body and look for beginning of headers.
|
||||
"""
|
||||
# mi<mk<header-beg
|
||||
if self.__token_info == 'mi<mk<header-beg':
|
||||
self.__state = 'head'
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __get_head_head_func(self, line):
|
||||
"""
|
||||
Copy headers and footers from bottom of file to a separate, temporary file.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<header-end':
|
||||
self.__state = 'body'
|
||||
else:
|
||||
self.__write_to_head_obj.write(line)
|
||||
def __get_headers(self):
|
||||
"""
|
||||
Private method to remove headers from the main file. Read one line from
|
||||
the main file at a time. If the state is 'body', call on the private
|
||||
__get_head_body_func. Otherwise, call on the __get_head_head_func.
|
||||
These two functions do the work of separating the headers from the
|
||||
body.
|
||||
"""
|
||||
read_obj = open(self.__file)
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
# self.__write_to = "footnote_info.data"
|
||||
self.__write_to_head_obj = open(self.__header_holder, 'w')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
self.__token_info = line[:16]
|
||||
if self.__state == 'body':
|
||||
self.__get_head_body_func(line)
|
||||
elif self.__state == 'head':
|
||||
self.__get_head_head_func(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
self.__write_to_head_obj.close()
|
||||
def __get_head_from_temp(self, num):
|
||||
"""
|
||||
Private method for joining headers and footers to body. This method
|
||||
reads from the temporary file until the proper footnote marker is
|
||||
found. It collects all the tokens until the end of the footnote, and
|
||||
returns them as a string.
|
||||
"""
|
||||
look_for = 'mi<mk<header-ope<' + num + '\n'
|
||||
found_head = 0
|
||||
string_to_return = ''
|
||||
line = 1
|
||||
while line:
|
||||
line = self.__read_from_head_obj.readline()
|
||||
if found_head:
|
||||
if line == 'mi<mk<header-clo\n':
|
||||
return string_to_return
|
||||
string_to_return = string_to_return + line
|
||||
else:
|
||||
if line == look_for:
|
||||
found_head = 1
|
||||
def __join_from_temp(self):
|
||||
"""
|
||||
Private method for rejoining headers to the body. Read from the
|
||||
newly-created, temporary file that contains the body text but no
|
||||
headers. Each time a header marker is found, call the private
|
||||
method __get_head_from_temp(). This method will return a string to
|
||||
print out to the third file.
|
||||
If no header marker is found, simply print out the token (line).
|
||||
"""
|
||||
self.__read_from_head_obj = open(self.__header_holder, 'r')
|
||||
read_obj = open(self.__write_to, 'r')
|
||||
self.__write_obj = open(self.__write_to2, 'w')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
if line[:16] == 'mi<mk<header-ind':
|
||||
line = self.__get_head_from_temp(line[17:-1])
|
||||
self.__write_obj.write(line)
|
||||
read_obj.close()
|
||||
def join_headers(self):
|
||||
"""
|
||||
Join the footnotes from the bottom of the file and put them in their
|
||||
former places. First, remove the footnotes from the bottom of the
|
||||
input file, outputting them to a temporary file. This creates two new
|
||||
files, one without footnotes, and one of just footnotes. Open both
|
||||
these files to read. When a marker is found in the main file, find the
|
||||
corresponding marker in the footnote file. Output the mix of body and
|
||||
footnotes to a third file.
|
||||
"""
|
||||
if not self.__found_a_header:
|
||||
return
|
||||
self.__write_to2 = tempfile.mktemp()
|
||||
self.__state = 'body'
|
||||
self.__get_headers()
|
||||
self.__join_from_temp()
|
||||
self.__write_obj.close()
|
||||
self.__read_from_head_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "header_join.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
os.remove(self.__header_holder)
|
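A sketch of the intended round trip for the Header pass: separate_headers() moves header and footer tokens to the bottom of the intermediate file, and join_headers() later puts them back at their markers. The file name and the bare Exception bug handler are placeholders:

from libprs500.ebooks.rtf2xml import header

header_obj = header.Header(
    in_file='tokens.data',   # placeholder intermediate token file
    bug_handler=Exception,   # placeholder bug handler
    copy=None,
    run_level=1,
)
header_obj.separate_headers()   # headers/footers moved to the bottom
# ... other passes would run over the body here ...
header_obj.join_headers()       # headers/footers restored to their markers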
215
src/libprs500/ebooks/rtf2xml/headings_to_sections.py
Executable file
215
src/libprs500/ebooks/rtf2xml/headings_to_sections.py
Executable file
@ -0,0 +1,215 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, tempfile, re
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class HeadingsToSections:
|
||||
"""
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__write_to = tempfile.mktemp()
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Required:
|
||||
Nothing
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
The self.__end_list is a list of tokens that force all open sections
|
||||
to close.
|
||||
"""
|
||||
self.__state = "default"
|
||||
self.__all_sections = []
|
||||
self.__chunk = ''
|
||||
self.__state_dict={
|
||||
'default' : self.__default_func,
|
||||
'in_table' : self.__in_table_func,
|
||||
'in_list' : self.__in_list_func,
|
||||
'after_body' : self.__after_body_func,
|
||||
}
|
||||
self.__list_depth = 0
|
||||
self.__end_list = [
|
||||
'mi<mk<body-close',
|
||||
# changed 2004-04-26
|
||||
# 'mi<mk<par-in-fld',
|
||||
'mi<mk<sect-close', # right before close of section
|
||||
'mi<mk<sect-start', # right before section start
|
||||
# this should be sect-close!
|
||||
# 'mi<mk<header-beg',
|
||||
# 'mi<mk<header-end',
|
||||
# 'mi<mk<head___clo',
|
||||
#
|
||||
# changed 2004-04-26
|
||||
# 'mi<mk<fldbk-end_',
|
||||
# 'mi<mk<sec-fd-beg',
|
||||
]
|
||||
self.__headings = [
|
||||
'heading 1', 'heading 2', 'heading 3', 'heading 4',
|
||||
'heading 5', 'heading 6', 'heading 7', 'heading 8',
|
||||
'heading 9'
|
||||
]
|
||||
self.__section_num = [0]
|
||||
self.__id_regex = re.compile(r'\<list-id\>(\d+)')
|
||||
def __close_lists(self):
|
||||
"""
|
||||
Required:
|
||||
Nothing
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
Reverse the list of dictionaries. Iterate through the list and
|
||||
get the indent for each list. If the current indent is less than
|
||||
or equal to the indent in the dictionary, close that level.
|
||||
Keep track of how many levels you close. Reduce the list by that
|
||||
many levels.
|
||||
Reverse the list again.
|
||||
"""
|
||||
current_indent = self.__left_indent
|
||||
self.__all_lists.reverse()
|
||||
num_levels_closed = 0
|
||||
for the_dict in self.__all_lists:
|
||||
list_indent = the_dict.get('left-indent')
|
||||
if current_indent <= list_indent:
|
||||
self.__write_end_item()
|
||||
self.__write_end_list()
|
||||
num_levels_closed += 1
|
||||
self.__all_lists = self.__all_lists[num_levels_closed:]
|
||||
self.__all_lists.reverse()
|
||||
def __close_sections(self, current_level):
|
||||
self.__all_sections.reverse()
|
||||
num_levels_closed = 0
|
||||
for level in self.__all_sections:
|
||||
if current_level <= level:
|
||||
self.__write_end_section()
|
||||
num_levels_closed += 1
|
||||
self.__all_sections = self.__all_sections[num_levels_closed:]
|
||||
self.__all_sections.reverse()
|
||||
def __write_start_section(self, current_level, name):
|
||||
section_num = ''
|
||||
for the_num in self.__section_num:
|
||||
section_num += '%s.' % the_num
|
||||
section_num = section_num[:-1]
|
||||
num_in_level = len(self.__all_sections)
|
||||
num_in_level = self.__section_num[num_in_level]
|
||||
level = len(self.__all_sections)
|
||||
self.__write_obj.write(
|
||||
'mi<mk<sect-start\n'
|
||||
)
|
||||
self.__write_obj.write (
|
||||
'mi<tg<open-att__<section<num>%s<num-in-level>%s<level>%s'
|
||||
'<type>%s\n'
|
||||
% (section_num, num_in_level, level, name)
|
||||
)
|
||||
def __write_end_section(self):
|
||||
self.__write_obj.write('mi<mk<sect-close\n')
|
||||
self.__write_obj.write('mi<tg<close_____<section\n')
|
||||
def __default_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
self, line
|
||||
Returns:
|
||||
Nothing
|
||||
Logic
|
||||
Track section, table, and list markers. When a style name matching one of
|
||||
the heading styles is found, close any deeper sections and open a new
|
||||
section.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<sect-start':
|
||||
self.__section_num[0] += 1
|
||||
self.__section_num = self.__section_num[0:1]
|
||||
if self.__token_info == 'mi<mk<tabl-start':
|
||||
self.__state = 'in_table'
|
||||
elif self.__token_info == 'mi<mk<list_start':
|
||||
self.__state = 'in_list'
|
||||
self.__list_depth += 1
|
||||
elif self.__token_info in self.__end_list:
|
||||
self.__close_sections(0)
|
||||
elif self.__token_info == 'mi<mk<style-name':
|
||||
name = line[17:-1]
|
||||
if name in self.__headings:
|
||||
self.__handle_heading(name)
|
||||
if self.__token_info == 'mi<mk<body-close':
|
||||
self.__state = 'after_body'
|
||||
self.__write_obj.write(line)
|
||||
def __handle_heading(self, name):
|
||||
num = self.__headings.index(name) + 1
|
||||
self.__close_sections(num)
|
||||
self.__all_sections.append(num)
|
||||
level_depth = len(self.__all_sections) + 1
|
||||
self.__section_num = self.__section_num[:level_depth]
|
||||
if len(self.__section_num) < level_depth:
|
||||
self.__section_num.append(1)
|
||||
else:
|
||||
self.__section_num[-1] += 1
|
||||
self.__write_start_section(num, name)
|
||||
def __in_table_func(self, line):
|
||||
if self.__token_info == 'mi<mk<table-end_':
|
||||
self.__state = 'default'
|
||||
self.__write_obj.write(line)
|
||||
def __in_list_func(self, line):
|
||||
if self.__token_info == 'mi<mk<list_close':
|
||||
self.__list_depth -= 1
|
||||
elif self.__token_info == 'mi<mk<list_start':
|
||||
self.__list_depth += 1
|
||||
if self.__list_depth == 0:
|
||||
self.__state = 'default'
|
||||
self.__write_obj.write(line)
|
||||
def __after_body_func(self, line):
|
||||
self.__write_obj.write(line)
|
||||
def make_sections(self):
|
||||
"""
|
||||
Required:
|
||||
nothing
|
||||
Returns:
|
||||
original file will be changed
|
||||
Logic:
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "sections_to_headings.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
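The nested section numbers written by __write_start_section are just the per-level counters joined with dots. A standalone illustration of that numbering scheme (not part of the module itself):

def format_section_num(counters):
    # Mirrors the loop in __write_start_section: join the counters with dots
    # and drop the trailing dot.
    section_num = ''
    for the_num in counters:
        section_num += '%s.' % the_num
    return section_num[:-1]

print format_section_num([2])         # top-level section      -> '2'
print format_section_num([2, 1, 3])   # 'heading 3' under 2.1  -> '2.1.3'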
579
src/libprs500/ebooks/rtf2xml/hex_2_utf8.py
Executable file
579
src/libprs500/ebooks/rtf2xml/hex_2_utf8.py
Executable file
@ -0,0 +1,579 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile, cStringIO
|
||||
from libprs500.ebooks.rtf2xml import get_char_map, copy
|
||||
from libprs500.ebooks.rtf2xml.char_set import char_set
|
||||
class Hex2Utf8:
|
||||
"""
|
||||
Convert Microsoft hexadecimal numbers to utf-8
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
area_to_convert,
|
||||
char_file,
|
||||
default_char_map,
|
||||
bug_handler,
|
||||
invalid_rtf_handler,
|
||||
copy=None,
|
||||
temp_dir=None,
|
||||
symbol = None,
|
||||
wingdings = None,
|
||||
caps = None,
|
||||
convert_caps = None,
|
||||
dingbats = None,
|
||||
run_level = 1,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'
|
||||
'area_to_convert'--the area of file to convert
|
||||
'char_file'--the file containing the character mappings
|
||||
'default_char_map'--name of default character map
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
'symbol'--whether to load the symbol character map
|
||||
'wingdings'--whether to load the wingdings character map
|
||||
'caps'--whether to load the caps character map
|
||||
'convert_to_caps'--whether to convert caps to utf-8
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__copy = copy
|
||||
if area_to_convert != 'preamble' and area_to_convert != 'body':
|
||||
msg = (
|
||||
'Developer error! Wrong flag.\n'
|
||||
'in module "hex_2_utf8.py\n'
|
||||
'"area_to_convert" must be "body" or "preamble"\n'
|
||||
)
|
||||
raise self.__bug_handler, msg
|
||||
self.__char_file = char_file
|
||||
self.__area_to_convert = area_to_convert
|
||||
self.__default_char_map = default_char_map
|
||||
self.__symbol = symbol
|
||||
self.__wingdings = wingdings
|
||||
self.__dingbats = dingbats
|
||||
self.__caps = caps
|
||||
self.__convert_caps = 0
|
||||
self.__convert_symbol = 0
|
||||
self.__convert_wingdings = 0
|
||||
self.__convert_zapf = 0
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__bug_handler = bug_handler
|
||||
self.__invalid_rtf_handler = invalid_rtf_handler
|
||||
def update_values( self,
|
||||
file,
|
||||
area_to_convert,
|
||||
char_file,
|
||||
convert_caps,
|
||||
convert_symbol,
|
||||
convert_wingdings,
|
||||
convert_zapf,
|
||||
copy=None,
|
||||
temp_dir=None,
|
||||
symbol = None,
|
||||
wingdings = None,
|
||||
caps = None,
|
||||
dingbats = None,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'
|
||||
'area_to_convert'--the area of file to convert
|
||||
'char_file'--the file containing the character mappings
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
'symbol'--whether to load the symbol character map
|
||||
'wingdings'--whether to load the wingdings character map
|
||||
'caps'--whether to load the caps character map
|
||||
'convert_to_caps'--whether to convert caps to utf-8
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file=file
|
||||
self.__copy = copy
|
||||
if area_to_convert != 'preamble' and area_to_convert != 'body':
|
||||
msg = (
|
||||
'in module "hex_2_utf8.py\n'
|
||||
'"area_to_convert" must be "body" or "preamble"\n'
|
||||
)
|
||||
raise self.__bug_handler, msg
|
||||
self.__area_to_convert = area_to_convert
|
||||
self.__symbol = symbol
|
||||
self.__wingdings = wingdings
|
||||
self.__dingbats = dingbats
|
||||
self.__caps = caps
|
||||
self.__convert_caps = convert_caps
|
||||
self.__convert_symbol = convert_symbol
|
||||
self.__convert_wingdings = convert_wingdings
|
||||
self.__convert_zapf = convert_zapf
|
||||
# new!
|
||||
# no longer try to convert these
|
||||
# self.__convert_symbol = 0
|
||||
# self.__convert_wingdings = 0
|
||||
# self.__convert_zapf = 0
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Required:
|
||||
Nothing
|
||||
Set values, including those for the dictionaries.
|
||||
The file that contains the maps is broken down into many different
|
||||
sets. For example, for the Symbol font, there is the standard part for
|
||||
hexadecimal numbers, and the part for Microsoft characters. Read
|
||||
each part in, and then combine them.
|
||||
"""
|
||||
# the default encoding system, the lower map for characters 0 through
|
||||
# 128, and the encoding system for Microsoft characters.
|
||||
# New on 2004-05-8: the self.__char_map is not in the directory with other
|
||||
# modules
|
||||
self.__char_file = cStringIO.StringIO(char_set)
|
||||
char_map_obj = get_char_map.GetCharMap(
|
||||
char_file = self.__char_file,
|
||||
bug_handler = self.__bug_handler,
|
||||
)
|
||||
up_128_dict = char_map_obj.get_char_map(map=self.__default_char_map)
|
||||
bt_128_dict = char_map_obj.get_char_map(map = 'bottom_128')
|
||||
ms_standard_dict = char_map_obj.get_char_map(map = 'ms_standard')
|
||||
self.__def_dict = {}
|
||||
self.__def_dict.update(up_128_dict)
|
||||
self.__def_dict.update(bt_128_dict)
|
||||
self.__def_dict.update(ms_standard_dict)
|
||||
self.__current_dict = self.__def_dict
|
||||
self.__current_dict_name = 'default'
|
||||
self.__in_caps = 0
|
||||
self.__special_fonts_found = 0
|
||||
if self.__symbol:
|
||||
symbol_base_dict = char_map_obj.get_char_map(map = 'SYMBOL')
|
||||
ms_symbol_dict = char_map_obj.get_char_map(map = 'ms_symbol')
|
||||
self.__symbol_dict = {}
|
||||
self.__symbol_dict.update(symbol_base_dict)
|
||||
self.__symbol_dict.update(ms_symbol_dict)
|
||||
if self.__wingdings:
|
||||
wingdings_base_dict = char_map_obj.get_char_map(map = 'wingdings')
|
||||
ms_wingdings_dict = char_map_obj.get_char_map(map = 'ms_wingdings')
|
||||
self.__wingdings_dict = {}
|
||||
self.__wingdings_dict.update(wingdings_base_dict)
|
||||
self.__wingdings_dict.update(ms_wingdings_dict)
|
||||
if self.__dingbats:
|
||||
dingbats_base_dict = char_map_obj.get_char_map(map = 'dingbats')
|
||||
ms_dingbats_dict = char_map_obj.get_char_map(map = 'ms_dingbats')
|
||||
self.__dingbats_dict = {}
|
||||
self.__dingbats_dict.update(dingbats_base_dict)
|
||||
self.__dingbats_dict.update(ms_dingbats_dict)
|
||||
# load dictionary for caps, and make a string for the replacement
|
||||
self.__caps_uni_dict = char_map_obj.get_char_map(map='caps_uni')
|
||||
# # print self.__caps_uni_dict
|
||||
# don't think I'll need this
|
||||
##keys = self.__caps_uni_dict.keys()
|
||||
##self.__caps_uni_replace = '|'.join(keys)
|
||||
self.__preamble_state_dict = {
|
||||
'preamble' : self.__preamble_func,
|
||||
'body' : self.__body_func,
|
||||
'mi<mk<body-open_' : self.__found_body_func,
|
||||
'tx<hx<__________' : self.__hex_text_func,
|
||||
}
|
||||
self.__body_state_dict = {
|
||||
'preamble' : self.__preamble_for_body_func,
|
||||
'body' : self.__body_for_body_func,
|
||||
}
|
||||
self.__in_body_dict = {
|
||||
'mi<mk<body-open_' : self.__found_body_func,
|
||||
'tx<ut<__________' : self.__utf_to_caps_func,
|
||||
'tx<hx<__________' : self.__hex_text_func,
|
||||
'tx<mc<__________' : self.__hex_text_func,
|
||||
'tx<nu<__________' : self.__text_func,
|
||||
'mi<mk<font______' : self.__start_font_func,
|
||||
'mi<mk<caps______' : self.__start_caps_func,
|
||||
'mi<mk<font-end__' : self.__end_font_func,
|
||||
'mi<mk<caps-end__' : self.__end_caps_func,
|
||||
}
|
||||
self.__caps_list = ['false']
|
||||
self.__font_list = ['not-defined']
|
||||
def __hex_text_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
'line' -- the line
|
||||
Logic:
|
||||
get the hex_num and look it up in the default dictionary. If the
|
||||
token is in the dictionary, then check if the value starts with a
|
||||
"&". If it does, then tag the result as utf text. Otherwise, tag it
|
||||
as normal text.
|
||||
If the hex_num is not in the dictionary, then a mistake has been
|
||||
made.
|
||||
"""
|
||||
hex_num = line[17:-1]
|
||||
converted = self.__current_dict.get(hex_num)
|
||||
if converted != None:
|
||||
# tag as utf-8
|
||||
if converted[0:1] == "&":
|
||||
font = self.__current_dict_name
|
||||
if self.__convert_caps\
|
||||
and self.__caps_list[-1] == 'true'\
|
||||
and font != 'Symbol'\
|
||||
and font != 'Wingdings'\
|
||||
and font != 'Zapf Dingbats':
|
||||
converted = self.__utf_token_to_caps_func(converted)
|
||||
self.__write_obj.write(
|
||||
'tx<ut<__________<%s\n' % converted
|
||||
)
|
||||
# tag as normal text
|
||||
else:
|
||||
font = self.__current_dict_name
|
||||
if self.__convert_caps\
|
||||
and self.__caps_list[-1] == 'true'\
|
||||
and font != 'Symbol'\
|
||||
and font != 'Wingdings'\
|
||||
and font != 'Zapf Dingbats':
|
||||
converted = converted.upper()
|
||||
self.__write_obj.write(
|
||||
'tx<nu<__________<%s\n' % converted
|
||||
)
|
||||
# error
|
||||
else:
|
||||
token = hex_num.replace("'", '')
|
||||
the_num = 0
|
||||
if token:
|
||||
the_num = int(token, 16)
|
||||
if the_num > 10:
|
||||
self.__write_obj.write('mi<tg<empty-att_<udef_symbol<num>%s<description>not-in-table\n' %
|
||||
hex_num)
|
||||
if self.__run_level > 4:
|
||||
# msg = 'no dictionary entry for %s\n'
|
||||
# msg += 'the hexidecimal num is "%s"\n' % (hex_num)
|
||||
# msg += 'dictionary is %s\n' % self.__current_dict_name
|
||||
msg = 'Character "&#x%s;" does not appear to be valid (or is a control character)\n' % token
|
||||
raise self.__bug_handler, msg
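# Illustrative sketch (added by the editor, not part of the original
# module): how __hex_text_func above processes one hex token, assuming
# the default character map contains an entry "'e9" -> '&#x00E9;' (the
# actual mapping lives in the char_map tables and is an assumption here).
#
#   line      = "tx<hx<__________<'e9\n"
#   hex_num   = line[17:-1]               # => "'e9"
#   converted = current_dict.get("'e9")   # => '&#x00E9;' (assumed)
#   # the value starts with '&', so it is written back as utf text:
#   # "tx<ut<__________<&#x00E9;\n"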
|
||||
def __found_body_func(self, line):
|
||||
self.__state = 'body'
|
||||
self.__write_obj.write(line)
|
||||
def __body_func(self, line):
|
||||
"""
|
||||
When parsing preamble
|
||||
"""
|
||||
self.__write_obj.write(line)
|
||||
def __preamble_func(self, line):
|
||||
action = self.__preamble_state_dict.get(self.__token_info)
|
||||
if action != None:
|
||||
action(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __convert_preamble(self):
|
||||
self.__state = 'preamble'
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__preamble_state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('error no state found in hex_2_utf8 %s\n' % self.__state)
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "preamble_utf_convert.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
def __preamble_for_body_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Used when parsing the body.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<body-open_':
|
||||
self.__found_body_func(line)
|
||||
self.__write_obj.write(line)
|
||||
def __body_for_body_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Used when parsing the body.
|
||||
"""
|
||||
action = self.__in_body_dict.get(self.__token_info)
|
||||
if action != None:
|
||||
action(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __start_font_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
add font face to font_list
|
||||
"""
|
||||
face = line[17:-1]
|
||||
self.__font_list.append(face)
|
||||
if face == 'Symbol' and self.__convert_symbol:
|
||||
self.__current_dict_name = 'Symbol'
|
||||
self.__current_dict = self.__symbol_dict
|
||||
elif face == 'Wingdings' and self.__convert_wingdings:
|
||||
self.__current_dict_name = 'Wingdings'
|
||||
self.__current_dict = self.__wingdings_dict
|
||||
elif face == 'Zapf Dingbats' and self.__convert_zapf:
|
||||
self.__current_dict_name = 'Zapf Dingbats'
|
||||
self.__current_dict = self.__dingbats_dict
|
||||
else:
|
||||
self.__current_dict_name = 'default'
|
||||
self.__current_dict = self.__def_dict
|
||||
def __end_font_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
pop font_list
|
||||
"""
|
||||
if len(self.__font_list) > 1:
|
||||
self.__font_list.pop()
|
||||
else:
|
||||
sys.stderr.write('module is hex_2_utf8\n')
|
||||
sys.stderr.write('method is end_font_func\n')
|
||||
sys.stderr.write('self.__font_list should be greater than one?\n')
|
||||
face = self.__font_list[-1]
|
||||
if face == 'Symbol' and self.__convert_symbol:
|
||||
self.__current_dict_name = 'Symbol'
|
||||
self.__current_dict = self.__symbol_dict
|
||||
elif face == 'Wingdings' and self.__convert_wingdings:
|
||||
self.__current_dict_name = 'Wingdings'
|
||||
self.__current_dict = self.__wingdings_dict
|
||||
elif face == 'Zapf Dingbats' and self.__convert_zapf:
|
||||
self.__current_dict_name = 'Zapf Dingbats'
|
||||
self.__current_dict = self.__dingbats_dict
|
||||
else:
|
||||
self.__current_dict_name = 'default'
|
||||
self.__current_dict = self.__def_dict
|
||||
def __start_special_font_func_old(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
change the dictionary to use in conversion
|
||||
"""
|
||||
# for error checking
|
||||
if self.__token_info == 'mi<mk<font-symbo':
|
||||
self.__current_dict.append(self.__symbol_dict)
|
||||
self.__special_fonts_found += 1
|
||||
self.__current_dict_name = 'Symbol'
|
||||
elif self.__token_info == 'mi<mk<font-wingd':
|
||||
self.__special_fonts_found += 1
|
||||
self.__current_dict.append(self.__wingdings_dict)
|
||||
self.__current_dict_name = 'Wingdings'
|
||||
elif self.__token_info == 'mi<mk<font-dingb':
|
||||
self.__current_dict.append(self.__dingbats_dict)
|
||||
self.__special_fonts_found += 1
|
||||
self.__current_dict_name = 'Zapf Dingbats'
|
||||
def __end_special_font_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
pop the last dictionary, which should be a special font
|
||||
"""
|
||||
if len(self.__current_dict) < 2:
|
||||
sys.stderr.write('module is hex_2_utf8\n')
|
||||
sys.stderr.write('method is __end_special_font_func\n')
|
||||
sys.stderr.write('less than two dictionaries --can\'t pop\n')
|
||||
self.__special_fonts_found -= 1
|
||||
else:
|
||||
self.__current_dict.pop()
|
||||
self.__special_fonts_found -= 1
|
||||
self.__dict_name = 'default'
|
||||
def __start_caps_func_old(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
A marker that marks the start of caps has been found. Set
|
||||
self.__in_caps to 1
|
||||
"""
|
||||
self.__in_caps = 1
|
||||
def __start_caps_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
A marker that marks the start of caps has been found. Set
|
||||
self.__in_caps to 1
|
||||
"""
|
||||
self.__in_caps = 1
|
||||
value = line[17:-1]
|
||||
self.__caps_list.append(value)
|
||||
def __end_caps_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
A marker that marks the end of caps has been found.
|
||||
set self.__in_caps to 0
|
||||
"""
|
||||
if len(self.__caps_list) > 1:
|
||||
self.__caps_list.pop()
|
||||
else:
|
||||
sys.stderr.write('Module is hex_2_utf8\n')
|
||||
sys.stderr.write('method is __end_caps_func\n')
|
||||
sys.stderr.write('caps list should be more than one?\n')
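# Illustrative sketch (added by the editor, not part of the original
# module): the caps markers as handled by the two methods above. The
# sample marker lines follow the token layout used throughout this file.
#
#   "mi<mk<caps______<true\n"  -> __start_caps_func appends line[17:-1]
#                                 ('true') to self.__caps_list
#   "mi<mk<caps-end__\n"       -> __end_caps_func pops the list again
#
# While the top of the list is 'true' and convert_caps is set, plain
# text tokens are upper-cased (unless a special font map such as Symbol
# is current).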
|
||||
def __text_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
if in caps, convert. Otherwise, print out.
|
||||
"""
|
||||
text = line[17:-1]
|
||||
if self.__current_dict_name == 'Symbol'\
|
||||
or self.__current_dict_name == 'Wingdings'\
|
||||
or self.__current_dict_name == 'Zapf Dingbats':
|
||||
the_string = ''
|
||||
for letter in text:
|
||||
hex_num = hex(ord(letter))
|
||||
hex_num = str(hex_num)
|
||||
hex_num = hex_num.upper()
|
||||
hex_num = hex_num[2:]
|
||||
hex_num = '\'%s' % hex_num
|
||||
converted = self.__current_dict.get(hex_num)
|
||||
if converted == None:
|
||||
sys.stderr.write('module is hex_2_utf8\n')
|
||||
sys.stderr.write('method is __text_func\n')
|
||||
sys.stderr.write('no hex value for "%s"\n' % hex_num)
|
||||
else:
|
||||
the_string += converted
|
||||
self.__write_obj.write('tx<nu<__________<%s\n' % the_string)
|
||||
else:
|
||||
if self.__caps_list[-1] == 'true' \
|
||||
and self.__convert_caps\
|
||||
and self.__current_dict_name != 'Symbol'\
|
||||
and self.__current_dict_name != 'Wingdings'\
|
||||
and self.__current_dict_name != 'Zapf Dingbats':
|
||||
text = text.upper()
|
||||
self.__write_obj.write('tx<nu<__________<%s\n' % text)
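# Illustrative sketch (added by the editor, not part of the original
# module): the special-font branch of __text_func above, assuming the
# SYMBOL map has an entry for "'61" (the Greek-alpha slot of the Symbol
# font; the real value comes from the char_map tables).
#
#   line    = "tx<nu<__________<a\n"
#   text    = line[17:-1]         # => 'a'
#   hex_num = hex(ord('a'))       # => '0x61' -> upper -> slice -> '61'
#   key     = "'61"               # looked up in the current (Symbol) map
#   # the converted entity is concatenated and written as normal text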
|
||||
def __utf_to_caps_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
returns
|
||||
nothing
|
||||
Logic
|
||||
Get the text, and use another method to convert
|
||||
"""
|
||||
utf_text = line[17:-1]
|
||||
if self.__caps_list[-1] == 'true' and self.__convert_caps:
|
||||
# utf_text = utf_text.upper()
|
||||
utf_text = self.__utf_token_to_caps_func(utf_text)
|
||||
self.__write_obj.write('tx<ut<__________<%s\n' % utf_text)
|
||||
def __utf_token_to_caps_func(self, char_entity):
|
||||
"""
|
||||
Required:
|
||||
utf_text -- such as &xxx;
|
||||
Returns:
|
||||
token converted to the capital equivalent
|
||||
Logic:
|
||||
RTF often stores text with the improper case. For example, a
capital umlaut o (Ö) is stored as ö. This function swaps the
|
||||
case by looking up the value in a dictionary.
|
||||
"""
|
||||
hex_num = char_entity[3:]
|
||||
length = len(hex_num)
|
||||
if length == 3:
|
||||
hex_num = '00%s' % hex_num
|
||||
elif length == 4:
|
||||
hex_num = '0%s' % hex_num
|
||||
new_char_entity = '&#x%s' % hex_num
|
||||
converted = self.__caps_uni_dict.get(new_char_entity)
|
||||
if not converted:
|
||||
# bullets and other entities don't have capital equivalents
|
||||
return char_entity
|
||||
else:
|
||||
return converted
|
||||
def __convert_body(self):
|
||||
self.__state = 'body'
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__body_state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('error no state found in hex_2_utf8 %s\n' % self.__state)
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "body_utf_convert.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
def convert_hex_2_utf8(self):
|
||||
self.__initiate_values()
|
||||
if self.__area_to_convert == 'preamble':
|
||||
self.__convert_preamble()
|
||||
else:
|
||||
self.__convert_body()
|
||||
"""
|
||||
how to swap case for non-capitals
|
||||
my_string.swapcase()
|
||||
An example of how to use a hash for the caps function
|
||||
(but I shouldn't need this, since utf text is separate
|
||||
from regular text?)
|
||||
sub_dict = {
|
||||
"а" : "some other value"
|
||||
}
|
||||
def my_sub_func(matchobj):
|
||||
info = matchobj.group(0)
|
||||
value = sub_dict.get(info)
|
||||
return value
|
||||
return "f"
|
||||
line = "а more text"
|
||||
reg_exp = re.compile(r'(?P<name>а|б)')
|
||||
line2 = re.sub(reg_exp, my_sub_func, line)
|
||||
print line2
|
||||
"""
|
255
src/libprs500/ebooks/rtf2xml/info.py
Executable file
@ -0,0 +1,255 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class Info:
|
||||
"""
|
||||
Make tags for document-information
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
"""
|
||||
self.__text_string = ''
|
||||
self.__state = 'before_info_table'
|
||||
self.__state_dict = {
|
||||
'before_info_table': self.__before_info_table_func,
|
||||
'after_info_table': self.__after_info_table_func,
|
||||
'in_info_table' : self.__in_info_table_func,
|
||||
'collect_text' : self.__collect_text_func,
|
||||
'collect_tokens' : self.__collect_tokens_func,
|
||||
}
|
||||
self.__info_table_dict = {
|
||||
'cw<di<title_____' : (self.__found_tag_with_text_func, 'title'),
|
||||
'cw<di<author____' : (self.__found_tag_with_text_func, 'author'),
|
||||
'cw<di<keywords__' : (self.__found_tag_with_text_func, 'keywords'),
|
||||
'cw<di<doc-notes_' : (self.__found_tag_with_text_func, 'doc-notes'),
|
||||
'cw<di<subject___' : (self.__found_tag_with_text_func, 'subject'),
|
||||
'cw<di<operator__' : (self.__found_tag_with_text_func, 'operator'),
|
||||
'cw<di<create-tim' : (self.__found_tag_with_tokens_func, 'creation-time'),
|
||||
'cw<di<revis-time' : (self.__found_tag_with_tokens_func, 'revision-time'),
|
||||
'cw<di<edit-time_' : (self.__single_field_func, 'editing-time'),
|
||||
'cw<di<num-of-wor' : (self.__single_field_func, 'number-of-words'),
|
||||
'cw<di<num-of-chr' : (self.__single_field_func, 'number-of-characters'),
|
||||
'cw<di<num-of-pag' : (self.__single_field_func, 'number-of-pages'),
|
||||
}
|
||||
self.__token_dict = {
|
||||
'year______' : 'year',
|
||||
'month_____' : 'month',
|
||||
'day_______' : 'day',
|
||||
'minute____' : 'minute',
|
||||
'revis-time' : 'revision-time',
|
||||
'num-of-wor' : 'number-of-words',
|
||||
'num-of-chr' : 'number-of-characters',
|
||||
'num-of-pag' : 'number-of-pages',
|
||||
}
|
||||
def __before_info_table_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Check for the beginning of the information table. When found, set
|
||||
the state to the information table. Always write the line.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<doc-in-beg':
|
||||
self.__state = 'in_info_table'
|
||||
self.__write_obj.write(line)
|
||||
def __in_info_table_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing.
|
||||
Logic:
|
||||
Check for the end of information. If not found, check if the
|
||||
token has a special value in the info table dictionary. If it
|
||||
does, execute that function.
|
||||
Otherwise, output the line to the file.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<doc-in-end':
|
||||
self.__state = 'after_info_table'
|
||||
else:
|
||||
action, tag = self.__info_table_dict.get(self.__token_info, (None, None))
|
||||
if action:
|
||||
action(line, tag)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __found_tag_with_text_func(self, line, tag):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
tag --what kind of line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function marks the beginning of information fields that have
|
||||
text that must be collected. Set the type of information field
|
||||
with the tag option. Set the state to collecting text
|
||||
"""
|
||||
self.__tag = tag
|
||||
self.__state = 'collect_text'
|
||||
def __collect_text_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
If the end of the information field is found, write the text
|
||||
string to the file.
|
||||
Otherwise, if the line contains text, add it to the text string.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<docinf-end':
|
||||
self.__state = 'in_info_table'
|
||||
self.__write_obj.write(
|
||||
'mi<tg<open______<%s\n'
|
||||
'tx<nu<__________<%s\n'
|
||||
'mi<tg<close_____<%s\n' % (self.__tag, self.__text_string, self.__tag)
|
||||
)
|
||||
self.__text_string = ''
|
||||
elif line[0:2] == 'tx':
|
||||
self.__text_string += line[17:-1]
|
||||
def __found_tag_with_tokens_func(self, line, tag):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
tag -- type of field
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Some fields have a series of tokens (cw<di<year______<nu<2003)
|
||||
that must be parsed as attributes for the element.
|
||||
Set the state to collect tokens, and set the text string to
|
||||
start an empty element with attributes.
|
||||
"""
|
||||
self.__state = 'collect_tokens'
|
||||
self.__text_string = 'mi<tg<empty-att_<%s' % tag
|
||||
#mi<tg<empty-att_<page-definition<margin>33\n
|
||||
def __collect_tokens_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function collects all the token information and adds it to
|
||||
the text string until the end of the field is found.
|
||||
First check for the end of the information field. If found, write
|
||||
the text string to the file.
|
||||
If not found, get the relevant information from the text string.
|
||||
This information cannot be directly added to the text string,
|
||||
because it exists in abbreviated form. (num-of-wor)
|
||||
I want to check this information in a dictionary to convert it
|
||||
to a longer, readable form. If the key does not exist in the
|
||||
dictionary, print out an error message. Otherwise add the value
|
||||
to the text string.
|
||||
(num-of-wor => number-of-words)
|
||||
"""
|
||||
#cw<di<year______<nu<2003
|
||||
if self.__token_info == 'mi<mk<docinf-end':
|
||||
self.__state = 'in_info_table'
|
||||
self.__write_obj.write(
|
||||
'%s\n' % self.__text_string
|
||||
)
|
||||
self.__text_string = ''
|
||||
else:
|
||||
att = line[6:16]
|
||||
value = line[20:-1]
|
||||
att_changed = self.__token_dict.get(att)
|
||||
if att_changed == None:
|
||||
if self.__run_level > 3:
|
||||
msg = 'no dictionary match for %s\n' % att
|
||||
raise self.__bug_handler, msg
|
||||
else:
|
||||
self.__text_string += '<%s>%s' % (att_changed, value)
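# Illustrative sketch (added by the editor, not part of the original
# module): how a date field is collapsed into one empty-att token by
# the two methods above. The sample values (2003, 12) are made up.
#
#   "cw<di<create-tim<nu<..."   -> text_string = 'mi<tg<empty-att_<creation-time'
#   "cw<di<year______<nu<2003"  -> att 'year______' maps to 'year',
#                                  text_string += '<year>2003'
#   "cw<di<month_____<nu<12"    -> text_string += '<month>12'
#   "mi<mk<docinf-end"          -> writes
#                                  'mi<tg<empty-att_<creation-time<year>2003<month>12\n'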
|
||||
def __single_field_func(self, line, tag):
|
||||
value = line[20:-1]
|
||||
self.__write_obj.write(
|
||||
'mi<tg<empty-att_<%s'
|
||||
'<%s>%s\n' % (tag, tag, value)
|
||||
)
|
||||
def __after_info_table_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to write to file
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
After the end of the information table, simply write the line to
|
||||
the file.
|
||||
"""
|
||||
self.__write_obj.write(line)
|
||||
def fix_info(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing (changes the original file)
|
||||
Logic:
|
||||
Read one line at a time. Determine what action to take based on
the state. If the state is before the information table, look for the
beginning of the information table.
If the state is in the information table, use other methods to
parse the information. If the state is after the
information table, simply write the line to the output file.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no matching state in module info.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "info.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
411
src/libprs500/ebooks/rtf2xml/inline.py
Executable file
@ -0,0 +1,411 @@
|
||||
import sys, os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
"""
|
||||
States.
|
||||
1. default
|
||||
1. an open bracket ends this state.
|
||||
2. Text: print out the text. Print out any groups_in_waiting.
|
||||
3. closed bracket. Close groups
|
||||
2. after an open bracket
|
||||
1. The lack of a control word ends this state.
|
||||
2. paragraph end -- close out all tags
|
||||
3. footnote beg -- close out all tags
|
||||
"""
|
||||
class Inline:
|
||||
"""
|
||||
Make inline tags within lists.
|
||||
Logic:
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy=None,
|
||||
run_level = 1,):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
"""
|
||||
self.__state_dict = {
|
||||
'default': self.__default_func,
|
||||
'after_open_bracket': self.__after_open_bracket_func,
|
||||
}
|
||||
self.__default_dict = {
|
||||
'ob<nu<open-brack': self.__found_open_bracket_func,
|
||||
'tx<nu<__________' : self.__found_text_func,
|
||||
'tx<hx<__________' : self.__found_text_func,
|
||||
'tx<ut<__________' : self.__found_text_func,
|
||||
'mi<mk<inline-fld' : self.__found_text_func,
|
||||
'text' : self.__found_text_func,
|
||||
'cb<nu<clos-brack' : self.__close_bracket_func,
|
||||
'mi<mk<par-end___' : self.__end_para_func,
|
||||
'mi<mk<footnt-ope' : self.__end_para_func,
|
||||
'mi<mk<footnt-ind' : self.__end_para_func,
|
||||
}
|
||||
self.__after_open_bracket_dict = {
|
||||
'cb<nu<clos-brack' : self.__close_bracket_func,
|
||||
'tx<nu<__________' : self.__found_text_func,
|
||||
'tx<hx<__________' : self.__found_text_func,
|
||||
'tx<ut<__________' : self.__found_text_func,
|
||||
'text' : self.__found_text_func,
|
||||
'mi<mk<inline-fld' : self.__found_text_func,
|
||||
'ob<nu<open-brack': self.__found_open_bracket_func,
|
||||
'mi<mk<par-end___' : self.__end_para_func,
|
||||
'mi<mk<footnt-ope' : self.__end_para_func,
|
||||
'mi<mk<footnt-ind' : self.__end_para_func,
|
||||
'cw<fd<field_____' : self.__found_field_func,
|
||||
}
|
||||
self.__state = 'default'
|
||||
self.__brac_count = 0 # do I need this?
|
||||
self.__list_inline_list = []
|
||||
self.__body_inline_list = []
|
||||
self.__groups_in_waiting_list = [0]
|
||||
self.__groups_in_waiting_body = [0]
|
||||
self.__groups_in_waiting = self.__groups_in_waiting_body
|
||||
self.__place = 'non_list'
|
||||
self.__inline_list = self.__body_inline_list
|
||||
self.__in_para = 0 # not in paragraph
|
||||
self.__char_dict = {
|
||||
# character info => ci
|
||||
'annotation' : 'annotation',
|
||||
'blue______' : 'blue',
|
||||
'bold______' : 'bold',
|
||||
'caps______' : 'caps',
|
||||
'char-style' : 'character-style',
|
||||
'dbl-strike' : 'double-strike-through',
|
||||
'emboss____' : 'emboss',
|
||||
'engrave___' : 'engrave',
|
||||
'font-color' : 'font-color',
|
||||
'font-down_' : 'subscript',
|
||||
'font-size_' : 'font-size',
|
||||
'font-style' : 'font-style',
|
||||
'font-up___' : 'superscript',
|
||||
'footnot-mk' : 'footnote-marker',
|
||||
'green_____' : 'green',
|
||||
'hidden____' : 'hidden',
|
||||
'italics___' : 'italics',
|
||||
'outline___' : 'outline',
|
||||
'red_______' : 'red',
|
||||
'shadow____' : 'shadow',
|
||||
'small-caps' : 'small-caps',
|
||||
'strike-thr' : 'strike-through',
|
||||
'subscript_' : 'subscript',
|
||||
'superscrip' : 'superscript',
|
||||
'underlined' : 'underlined',
|
||||
}
|
||||
self.__caps_list = ['false']
|
||||
def __set_list_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line--line of text
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
"""
|
||||
if self.__place == 'in_list':
|
||||
if self.__token_info == 'mi<mk<lst-tx-end':
|
||||
self.__place = 'not_in_list'
|
||||
self.__inline_list = self.__body_inline_list
|
||||
self.__groups_in_waiting = self.__groups_in_waiting_body
|
||||
else:
|
||||
if self.__token_info == 'mi<mk<lst-tx-beg':
|
||||
self.__place = 'in_list'
|
||||
self.__inline_list = self.__list_inline_list
|
||||
self.__groups_in_waiting = self.__groups_in_waiting_list
|
||||
def __default_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line-- line of text
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
"""
|
||||
action = self.__default_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
self.__write_obj.write(line)
|
||||
def __found_open_bracket_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- current line of text
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Change the state to 'after_open_bracket'
|
||||
"""
|
||||
self.__state = 'after_open_bracket'
|
||||
self.__brac_count += 1
|
||||
self.__groups_in_waiting[0] += 1
|
||||
self.__inline_list.append({})
|
||||
self.__inline_list[-1]['contains_inline'] = 0
|
||||
def __after_open_bracket_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line of text
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
If the token is a control word for character info (cw<ci), use another
|
||||
method to add to the dictionary.
|
||||
Use the dictionary to get the appropriate function.
|
||||
Always print out the line.
|
||||
"""
|
||||
if line[0:2] == 'cw':
|
||||
self.__handle_control_word(line)
|
||||
else:
|
||||
action = self.__after_open_bracket_dict.get(self.__token_info)
|
||||
if action:
|
||||
self.__state = 'default' # a non control word?
|
||||
action(line)
|
||||
self.__write_obj.write(line)
|
||||
def __handle_control_word(self, line):
|
||||
"""
|
||||
Required:
|
||||
line --line of text
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Handle the control word for inline groups.
|
||||
Add each name - value to a dictionary.
|
||||
If the font style of Symbol, Wingdings, or Dingbats is found,
|
||||
always mark this. I need this later to convert the text to
|
||||
the right utf.
|
||||
"""
|
||||
# cw<ci<shadow_____<nu<true
|
||||
# self.__char_dict = {
|
||||
char_info = line[6:16]
|
||||
char_value = line[20:-1]
|
||||
name = self.__char_dict.get(char_info)
|
||||
if name:
|
||||
self.__inline_list[-1]['contains_inline'] = 1
|
||||
self.__inline_list[-1][name] = char_value
|
||||
"""
|
||||
if name == 'font-style':
|
||||
if char_value == 'Symbol':
|
||||
self.__write_obj.write('mi<mk<font-symbo\n')
|
||||
elif char_value == 'Wingdings':
|
||||
self.__write_obj.write('mi<mk<font-wingd\n')
|
||||
elif char_value == 'Zapf Dingbats':
|
||||
self.__write_obj.write('mi<mk<font-dingb\n')
|
||||
"""
|
||||
def __close_bracket_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line of text
|
||||
Returns:
|
||||
Nothing
|
||||
Logic:
|
||||
If there are no inline groups, do nothing.
|
||||
Get the keys of the last dictionary in the inline_groups.
|
||||
If 'contains_inline' in the keys, write a close tag.
|
||||
If the_dict contains font information, write a mk tag.
|
||||
"""
|
||||
if len(self.__inline_list) == 0:
|
||||
# nothing to add
|
||||
return
|
||||
the_dict = self.__inline_list[-1]
|
||||
the_keys = the_dict.keys()
|
||||
# always close out
|
||||
if self.__place == 'in_list':
|
||||
if 'contains_inline' in the_keys and the_dict['contains_inline'] == 1\
|
||||
and self.__groups_in_waiting[0] == 0:
|
||||
self.__write_obj.write('mi<tg<close_____<inline\n')
|
||||
if 'font-style' in the_keys:
|
||||
self.__write_obj.write('mi<mk<font-end__\n')
|
||||
if 'caps' in the_keys:
|
||||
self.__write_obj.write('mi<mk<caps-end__\n')
|
||||
else:
|
||||
# close out only if in a paragraph
|
||||
if 'contains_inline' in the_keys and the_dict['contains_inline'] == 1\
|
||||
and self.__in_para and self.__groups_in_waiting[0] == 0:
|
||||
self.__write_obj.write('mi<tg<close_____<inline\n')
|
||||
if 'font-style' in the_keys:
|
||||
self.__write_obj.write('mi<mk<font-end__\n')
|
||||
if 'caps' in the_keys:
|
||||
self.__write_obj.write('mi<mk<caps-end__\n')
|
||||
self.__inline_list.pop()
|
||||
if self.__groups_in_waiting[0] != 0:
|
||||
self.__groups_in_waiting[0] -= 1
|
||||
def __found_text_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line--line of text
|
||||
Return:
|
||||
nothing
|
||||
Logic:
|
||||
Two cases:
|
||||
1. in a list. Simply write inline
|
||||
2. Not in a list
|
||||
Text can mark the start of a paragraph.
|
||||
If already in a paragraph, check to see if any groups are waiting
|
||||
to be added. If so, use another method to write these groups.
|
||||
"""
|
||||
if self.__place == 'in_list':
|
||||
self.__write_inline()
|
||||
else:
|
||||
if not self.__in_para:
|
||||
self.__in_para = 1
|
||||
self.__start_para_func(line)
|
||||
else:
|
||||
if self.__groups_in_waiting[0] != 0:
|
||||
self.__write_inline()
|
||||
def __write_inline(self):
|
||||
"""
|
||||
Required:
|
||||
nothing
|
||||
Returns
|
||||
Nothing
|
||||
Logic:
|
||||
Method for writing inline when text is found.
|
||||
Only write those groups that are "waiting", or that have no
|
||||
tags yet.
|
||||
First, slice the list self.__inline_list to get just the groups
|
||||
in waiting.
|
||||
Iterate through this slice, which contains only dictionaries.
|
||||
Get the keys in each dictionary. If 'font-style' is in the keys,
|
||||
write a marker tag. (I will use this marker tag later when converting
|
||||
hex text to utf8.)
|
||||
Write a tag for the inline values.
|
||||
"""
|
||||
if self.__groups_in_waiting[0] != 0:
|
||||
last_index = -1 * self.__groups_in_waiting[0]
|
||||
inline_list = self.__inline_list[last_index:]
|
||||
if len(inline_list) <= 0:
|
||||
if self.__run_level > 3:
|
||||
msg = 'self.__inline_list is %s\n' % self.__inline_list
|
||||
raise self.__bug_handler, msg
|
||||
self.__write_obj.write('error\n')
|
||||
self.__groups_in_waiting[0] = 0
|
||||
return
|
||||
for the_dict in inline_list:
|
||||
if the_dict['contains_inline']:
|
||||
the_keys = the_dict.keys()
|
||||
if 'font-style' in the_keys:
|
||||
face = the_dict['font-style']
|
||||
self.__write_obj.write('mi<mk<font______<%s\n' % face)
|
||||
if 'caps' in the_keys:
|
||||
value = the_dict['caps']
|
||||
self.__write_obj.write('mi<mk<caps______<%s\n' % value)
|
||||
self.__write_obj.write('mi<tg<open-att__<inline')
|
||||
for the_key in the_keys:
|
||||
if the_key != 'contains_inline':
|
||||
self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
|
||||
self.__write_obj.write('\n')
|
||||
self.__groups_in_waiting[0] = 0
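# Illustrative sketch (added by the editor, not part of the original
# module): what __write_inline above emits when two groups are waiting,
# one carrying bold and one carrying italics (sample values only).
#
#   groups_in_waiting[0] == 2, so the slice __inline_list[-2:] is walked:
#     {'contains_inline': 1, 'bold': 'true'}    -> 'mi<tg<open-att__<inline<bold>true\n'
#     {'contains_inline': 1, 'italics': 'true'} -> 'mi<tg<open-att__<inline<italics>true\n'
#   afterwards groups_in_waiting[0] is reset to 0.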
|
||||
def __end_para_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line of text
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Slice from the end the groups in waiting.
|
||||
Iterate through the list. If the dictionary contains info, write
|
||||
a closing tag.
|
||||
"""
|
||||
if not self.__in_para:
|
||||
return
|
||||
if self.__groups_in_waiting[0] == 0:
|
||||
inline_list = self.__inline_list
|
||||
else:
|
||||
last_index = -1 * self.__groups_in_waiting[0]
|
||||
inline_list = self.__inline_list[0:last_index]
|
||||
for the_dict in inline_list:
|
||||
contains_info = the_dict.get('contains_inline')
|
||||
if contains_info:
|
||||
the_keys = the_dict.keys()
|
||||
if 'font-style' in the_keys:
|
||||
self.__write_obj.write('mi<mk<font-end__\n')
|
||||
if 'caps' in the_keys:
|
||||
self.__write_obj.write('mi<mk<caps-end__\n')
|
||||
self.__write_obj.write('mi<tg<close_____<inline\n')
|
||||
self.__in_para = 0
|
||||
def __start_para_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line of text
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Iterate through the self.__inline_list to get each dict.
|
||||
If the dict contains inline info, get the keys.
|
||||
Iterate through the keys and print out the key and value.
|
||||
"""
|
||||
for the_dict in self.__inline_list:
|
||||
contains_info = the_dict.get('contains_inline')
|
||||
if contains_info :
|
||||
the_keys = the_dict.keys()
|
||||
if 'font-style' in the_keys:
|
||||
face = the_dict['font-style']
|
||||
self.__write_obj.write('mi<mk<font______<%s\n' % face)
|
||||
if 'caps' in the_keys:
|
||||
value = the_dict['caps']
|
||||
self.__write_obj.write('mi<mk<caps______<%s\n' % value)
|
||||
self.__write_obj.write('mi<tg<open-att__<inline')
|
||||
for the_key in the_keys:
|
||||
if the_key != 'contains_inline':
|
||||
self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
|
||||
self.__write_obj.write('\n')
|
||||
self.__groups_in_waiting[0] = 0
|
||||
def __found_field_func(self, line):
|
||||
"""
|
||||
Just a default function to make sure I don't prematurely exit
|
||||
default state
|
||||
"""
|
||||
pass
|
||||
def form_tags(self):
|
||||
"""
|
||||
Requires:
|
||||
area--area to parse (list or non-list)
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Read one line in at a time. Determine what action to take based on
|
||||
the state.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
token = line[0:-1]
|
||||
self.__token_info = ''
|
||||
if token == 'tx<mc<__________<rdblquote'\
|
||||
or token == 'tx<mc<__________<ldblquote'\
|
||||
or token == 'tx<mc<__________<lquote'\
|
||||
or token == 'tx<mc<__________<rquote'\
|
||||
or token == 'tx<mc<__________<emdash'\
|
||||
or token == 'tx<mc<__________<endash'\
|
||||
or token == 'tx<mc<__________<bullet':
|
||||
self.__token_info = 'text'
|
||||
else:
|
||||
self.__token_info = line[:16]
|
||||
self.__set_list_func(line)
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('No matching state in module inline.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "inline.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
67
src/libprs500/ebooks/rtf2xml/line_endings.py
Executable file
@ -0,0 +1,67 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, tempfile, re
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class FixLineEndings:
|
||||
"""Fix line endings"""
|
||||
def __init__(self,
|
||||
bug_handler,
|
||||
in_file = None,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
replace_illegals = 1,
|
||||
):
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__replace_illegals = replace_illegals
|
||||
def fix_endings(self):
|
||||
##tempFileName = tempfile.mktemp()
|
||||
illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
|
||||
#nums = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16, 17, 18, 19]
|
||||
"""
|
||||
read_obj = open(self.__file, 'r')
|
||||
line = read_obj.read(1000)
|
||||
regexp = re.compile(r"\r")
|
||||
macintosh = regexp.search(line)
|
||||
read_obj.close()
|
||||
"""
|
||||
# always check since I have to get rid of illegal characters
|
||||
macintosh = 1
|
||||
if macintosh:
|
||||
line = 1
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__write_to, 'w')
|
||||
while line:
|
||||
line = read_obj.read(1000)
|
||||
# line = re.sub(regexp,"\n",line)
|
||||
line = line.replace ('\r', '\n')
|
||||
if self.__replace_illegals:
|
||||
line = re.sub(illegal_regx, '', line)
|
||||
# for num in nums:
|
||||
# line = line.replace(chr(num), '')
|
||||
write_obj.write(line )
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "line_endings.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
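# Illustrative usage sketch (added by the editor, not part of the
# original module). The file name is hypothetical, and any exception
# class can serve as the bug_handler as far as this module is concerned.
#
#   from libprs500.ebooks.rtf2xml import line_endings
#   fixer = line_endings.FixLineEndings(
#       bug_handler = Exception,   # assumed stand-in handler
#       in_file = 'sample.rtf',    # hypothetical path, rewritten in place
#       copy = 1,                  # keep a line_endings.data debug copy
#   )
#   fixer.fix_endings()            # \r -> \n, illegal control chars removed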
|
193
src/libprs500/ebooks/rtf2xml/list_numbers.py
Executable file
@ -0,0 +1,193 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class ListNumbers:
|
||||
"""
|
||||
RTF puts list numbers outside of the paragraph. The public method
|
||||
in this class puts the list numbers inside the paragraphs.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__write_to = tempfile.mktemp()
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
initiate values for fix_list_numbers.
|
||||
Required:
|
||||
Nothing
|
||||
Return:
|
||||
Nothing
|
||||
"""
|
||||
self.__state = "default"
|
||||
self.__list_chunk = ''
|
||||
self.__previous_line = ''
|
||||
self.__list_text_ob_count = ''
|
||||
self.__state_dict={
|
||||
'default' : self.__default_func,
|
||||
'after_ob' : self.__after_ob_func,
|
||||
'list_text' : self.__list_text_func,
|
||||
'after_list_text' : self.__after_list_text_func
|
||||
}
|
||||
def __after_ob_func(self, line):
|
||||
"""
|
||||
Handle the line immediately after an open bracket.
|
||||
Required:
|
||||
self, line
|
||||
Returns:
|
||||
Nothing
|
||||
"""
|
||||
if self.__token_info == 'cw<ls<list-text_':
|
||||
self.__state = 'list_text'
|
||||
self.__list_chunk = self.__list_chunk + \
|
||||
self.__previous_line + line
|
||||
self.__list_text_ob = self.__ob_count
|
||||
self.__cb_count = 0
|
||||
else:
|
||||
self.__write_obj.write(self.__previous_line)
|
||||
self.__write_obj.write(line)
|
||||
self.__state = 'default'
|
||||
def __after_list_text_func(self, line):
|
||||
"""
|
||||
Look for an open bracket or a line of text, and then print out the
|
||||
self.__list_chunk. Print out the line.
|
||||
"""
|
||||
if line[0:2] == 'ob' or line[0:2] == 'tx':
|
||||
self.__state = 'default'
|
||||
self.__write_obj.write('mi<mk<lst-txbeg_\n')
|
||||
self.__write_obj.write('mi<mk<para-beg__\n')
|
||||
self.__write_obj.write('mi<mk<lst-tx-beg\n')
|
||||
self.__write_obj.write(
|
||||
# 'mi<tg<open-att__<list-text<type>%s\n' % self.__list_type)
|
||||
'mi<tg<open-att__<list-text\n')
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__write_obj.write('mi<tg<close_____<list-text\n')
|
||||
self.__write_obj.write('mi<mk<lst-tx-end\n')
|
||||
self.__list_chunk = ''
|
||||
self.__write_obj.write(line)
|
||||
def __determine_list_type(self, chunk):
|
||||
"""
|
||||
Determine if the list is ordered or itemized
|
||||
"""
|
||||
lines = chunk.split('\n')
|
||||
text_string = ''
|
||||
for line in lines:
|
||||
if line[0:5] == 'tx<hx':
|
||||
if line[17:] == '\'B7':
|
||||
return "unordered"
|
||||
elif line[0:5] == 'tx<nu':
|
||||
text_string += line[17:]
|
||||
text_string = text_string.replace('.', '')
|
||||
text_string = text_string.replace('(', '')
|
||||
text_string = text_string.replace(')', '')
|
||||
if text_string.isdigit():
|
||||
return 'ordered'
|
||||
"""
|
||||
sys.stderr.write('module is list_numbers\n')
|
||||
sys.stderr.write('method is __determine type\n')
|
||||
sys.stderr.write('Couldn\'t get type of list\n')
|
||||
"""
|
||||
# must be some type of ordered list -- just a guess!
|
||||
return 'unordered'
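# Illustrative sketch (added by the editor, not part of the original
# module): two sample chunks run through __determine_list_type above
# (token values are made up).
#
#   "tx<hx<__________<'B7"  -> the bullet character, returns 'unordered'
#   "tx<nu<__________<1."   -> text '1.'; stripping '.', '(' and ')'
#                              leaves '1', which is a digit -> 'ordered'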
|
||||
def __list_text_func(self, line):
|
||||
"""
|
||||
Handle lines that are part of the list text. If the end of the list
|
||||
text is found (the closing bracket matches the self.__list_text_ob),
|
||||
then change the state. Always add the line to the self.__list_chunk
|
||||
Required:
|
||||
self, line
|
||||
Returns:
|
||||
Nothing
|
||||
"""
|
||||
if self.__list_text_ob == self.__cb_count:
|
||||
self.__state = 'after_list_text'
|
||||
self.__right_after_list_text = 1
|
||||
self.__list_type = self.__determine_list_type(self.__list_chunk)
|
||||
self.__write_obj.write('mi<mk<list-type_<%s\n' % self.__list_type)
|
||||
if self.__token_info != 'cw<pf<par-def___':
|
||||
self.__list_chunk = self.__list_chunk + line
|
||||
def __default_func(self, line):
|
||||
"""
|
||||
Handle the lines that are not part of any special state. Look for an
|
||||
opening bracket. If an open bracket is found, add this line to a
|
||||
temporary variable, self.__previous_line, which other methods need. Otherwise,
|
||||
print out the line.
|
||||
Required:
|
||||
self, line
|
||||
Returns:
|
||||
Nothing
|
||||
"""
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__state = 'after_ob'
|
||||
self.__previous_line = line
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def fix_list_numbers(self):
|
||||
"""
|
||||
Required:
|
||||
nothing
|
||||
Returns:
|
||||
original file will be changed
|
||||
Logic:
|
||||
Read in one line at a time from the file. Keep track of opening and
|
||||
closing brackets. Determine the method ('action') by passing the
|
||||
state to the self.__state_dict.
|
||||
Simply print out the line to a temp file until an open bracket
|
||||
is found. Check the next line. If it is list-text, then start
|
||||
adding to the self.__list_chunk until the closing bracket is
|
||||
found.
|
||||
Next, look for an open bracket or text. When either is found,
|
||||
print out self.__list_chunk and the line.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "list_numbers.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
431
src/libprs500/ebooks/rtf2xml/list_table.py
Executable file
@ -0,0 +1,431 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
class ListTable:
|
||||
"""
|
||||
Parse the list table line. Make a string. Form a dictionary.
|
||||
Return the string and the dictionary.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
bug_handler,
|
||||
run_level = 1,
|
||||
):
|
||||
self.__bug_handler = bug_handler
|
||||
self.__initiate_values()
|
||||
self.__run_level = run_level
|
||||
def __initiate_values(self):
|
||||
self.__list_table_final = ''
|
||||
self.__state = 'default'
|
||||
self.__final_dict = {}
|
||||
self.__list_dict = {}
|
||||
self.__all_lists = []
|
||||
self.__level_text_string = ''
|
||||
self.__level_text_list = []
|
||||
self.__found_level_text_length = 0
|
||||
self.__level_text_position = None
|
||||
self.__prefix_string = None
|
||||
self.__level_numbers_string = ''
|
||||
self.__state_dict = {
|
||||
'default' : self.__default_func,
|
||||
'level' : self.__level_func,
|
||||
'list' : self.__list_func,
|
||||
'unsure_ob' : self.__after_bracket_func,
|
||||
'level_number' : self.__level_number_func,
|
||||
'level_text' : self.__level_text_func,
|
||||
'list_name' : self.__list_name_func,
|
||||
}
|
||||
self.__main_list_dict = {
|
||||
'cw<ls<ls-tem-id_' : 'list-template-id',
|
||||
'cw<ls<list-hybri' : 'list-hybrid',
|
||||
'cw<ls<lis-tbl-id' : 'list-table-id',
|
||||
}
|
||||
self.__level_dict = {
|
||||
'cw<ls<level-star' : 'list-number-start',
|
||||
'cw<ls<level-spac' : 'list-space',
|
||||
'cw<ls<level-inde' : 'level-indent',
|
||||
'cw<ls<fir-ln-ind' : 'first-line-indent',
|
||||
'cw<ls<left-inden' : 'left-indent',
|
||||
'cw<ls<tab-stop__' : 'tabs',
|
||||
'cw<ls<level-type' : 'numbering-type',
|
||||
'cw<pf<right-inde' : 'right-indent',
|
||||
'cw<pf<left-inden' : 'left-indent',
|
||||
'cw<pf<fir-ln-ind' : 'first-line-indent',
|
||||
'cw<ci<italics___' : 'italics',
|
||||
'cw<ci<bold______' : 'bold',
|
||||
'cw<ss<para-style' : 'paragraph-style-name',
|
||||
}
|
||||
"""
|
||||
all_lists =
|
||||
[{anything here?}
|
||||
[{list-templateid = ""}
|
||||
[{level-indent}],[{level-indent}]
|
||||
]
|
||||
],
|
||||
"""
|
||||
def __parse_lines(self, line):
|
||||
"""
|
||||
Required : line --line to parse
|
||||
Returns: nothing
|
||||
Logic:
|
||||
Split the lines into a list by a new line. Process the line
|
||||
according to the state.
|
||||
"""
|
||||
lines = line.split('\n')
|
||||
self.__ob_count = 0
|
||||
self.__ob_group = 0
|
||||
for line in lines:
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-4:]
|
||||
self.__ob_group += 1
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-4:]
|
||||
self.__ob_group -= 1
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
print self.__state
|
||||
action(line)
|
||||
self.__write_final_string()
|
||||
# self.__add_to_final_line()
|
||||
def __default_func(self, line):
|
||||
"""
|
||||
Requires: line --line to process
|
||||
Return: nothing
|
||||
Logic:
|
||||
This state is used at the start and end of a list. Look for an
|
||||
opening bracket, which marks the change of state.
|
||||
"""
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__state = 'unsure_ob'
|
||||
def __found_list_func(self, line):
|
||||
"""
|
||||
Requires: line -- line to process
|
||||
Returns: nothing
|
||||
Logic:
|
||||
I have found \list.
|
||||
Change the state to list
|
||||
Get the open bracket count so you know when this state ends.
|
||||
Append an empty list to all lists.
|
||||
Create a temporary dictionary. This dictionary has the key of
|
||||
"list-id" and the value of an empty list. Later, this empty list
|
||||
will be filled with all the ids for which the formatting is valid.
|
||||
Append the temporary dictionary to the new list.
|
||||
"""
|
||||
self.__state = 'list'
|
||||
self.__list_ob_count = self.__ob_count
|
||||
self.__all_lists.append([])
|
||||
the_dict = {'list-id': []}
|
||||
self.__all_lists[-1].append(the_dict)
|
||||
def __list_func(self, line):
|
||||
"""
|
||||
Requires: line --line to process
|
||||
Returns: nothing
|
||||
Logic:
|
||||
This method is called when you are in a list, but outside of a level.
|
||||
Check for the end of the list. Otherwise, use the self.__main_list_dict
|
||||
to determine if you need to add a line's values to the main list.
|
||||
"""
|
||||
if self.__token_info == 'cb<nu<clos-brack' and\
|
||||
self.__cb_count == self.__list_ob_count:
|
||||
self.__state = 'default'
|
||||
elif self.__token_info == 'ob<nu<open-brack':
|
||||
self.__state = 'unsure_ob'
|
||||
else:
|
||||
att = self.__main_list_dict.get(self.__token_info)
|
||||
if att:
|
||||
value = line[20:]
|
||||
# dictionary is always the first item in the last list
|
||||
# [{att:value}, [], [att:value, []]
|
||||
self.__all_lists[-1][0][att] = value
|
||||
def __found_level_func(self, line):
|
||||
"""
|
||||
Requires: line -- line to process
|
||||
Returns: nothing
|
||||
Logic:
|
||||
I have found \listlevel.
|
||||
Change the state to level
|
||||
Get the open bracket count so you know when this state ends.
|
||||
Append an empty list to the last list inside all lists.
|
||||
Create a temporary dictionary.
|
||||
Append the temporary dictionary to the new list.
|
||||
self.__all_lists now looks like:
|
||||
[[{list-id:[]}, [{}]]]
|
||||
Where:
|
||||
self.__all_lists[-1] => a list. The first item is a dictionary.
|
||||
The second item is a list containing a dictionary:
|
||||
[{list-id:[]}, [{}]]
|
||||
self.__all_lists[-1][0] => a dictionary of the list attributes
|
||||
self.__all_lists[-1][-1] => a list with just a dictionary
|
||||
self.__all_lists[-1][-1][0] => the dictionary of level attributes
|
||||
"""
|
||||
self.__state = 'level'
|
||||
self.__level_ob_count = self.__ob_count
|
||||
self.__all_lists[-1].append([])
|
||||
the_dict = {}
|
||||
self.__all_lists[-1][-1].append(the_dict)
|
||||
self.__level_dict
|
||||
def __level_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Look for the end of this group.
|
||||
Change states if an open bracket is found.
|
||||
Add attributes to all_dicts if an appropriate token is found.
|
||||
"""
|
||||
if self.__token_info == 'cb<nu<clos-brack' and\
|
||||
self.__cb_count == self.__level_ob_count:
|
||||
self.__state = 'list'
|
||||
elif self.__token_info == 'ob<nu<open-brack':
|
||||
self.__state = 'unsure_ob'
|
||||
else:
|
||||
att = self.__level_dict.get(self.__token_info)
|
||||
if att:
|
||||
value = line[20:]
|
||||
self.__all_lists[-1][-1][0][att] = value
|
||||
def __level_number_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to process
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Check for the end of the group.
|
||||
Otherwise, if the token is hexadecimal, create an attribute.
|
||||
Do so by finding the base-10 value of the number. Then divide
|
||||
this by 2 and round it. Remove the ".0". Sandwich the result to
|
||||
give you something like level1-show-level.
|
||||
The show-level attribute means the numbering for this level.
|
||||
"""
|
||||
if self.__token_info == 'cb<nu<clos-brack' and\
|
||||
self.__cb_count == self.__level_number_ob_count:
|
||||
self.__state = 'level'
|
||||
self.__all_lists[-1][-1][0]['level-numbers'] = self.__level_numbers_string
|
||||
self.__level_numbers_string = ''
|
||||
elif self.__token_info == 'tx<hx<__________':
|
||||
self.__level_numbers_string += "\\'%s" % line[18:]
|
||||
elif self.__token_info == 'tx<nu<__________':
|
||||
self.__level_numbers_string += line[17:]
|
||||
"""
|
||||
num = line[18:]
|
||||
num = int(num, 16)
|
||||
level = str(round((num - 1)/2, 0))
|
||||
level = level[:-2]
|
||||
level = 'level%s-show-level' % level
|
||||
self.__all_lists[-1][-1][0][level] = 'true'
|
||||
"""
|
||||
def __level_text_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to process
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Check for the end of the group.
|
||||
Otherwise, if the text is hexadecimal, call on the method
|
||||
__parse_level_text_length.
|
||||
Otherwise, if the text is regular text, create an attribute.
|
||||
This attribute indicates the punctuation after a certain level.
|
||||
An example is "level1-marker = '.'"
|
||||
Otherwise, check for a level-template-id.
|
||||
"""
|
||||
if self.__token_info == 'cb<nu<clos-brack' and\
|
||||
self.__cb_count == self.__level_text_ob_count:
|
||||
if self.__prefix_string:
|
||||
if self.__all_lists[-1][-1][0]['numbering-type'] == 'bullet':
|
||||
self.__prefix_string = self.__prefix_string.replace('_', '')
|
||||
self.__all_lists[-1][-1][0]['bullet-type'] = self.__prefix_string
|
||||
self.__state = 'level'
|
||||
# self.__figure_level_text_func()
|
||||
self.__level_text_string = ''
|
||||
self.__found_level_text_length = 0
|
||||
elif self.__token_info == 'tx<hx<__________':
|
||||
self.__parse_level_text_length(line)
|
||||
elif self.__token_info == 'tx<nu<__________':
|
||||
text = line[17:]
|
||||
if text and text[-1] == ';':
|
||||
text = text.replace(';', '')
|
||||
if not self.__level_text_position:
|
||||
self.__prefix_string = text
|
||||
else:
|
||||
self.__all_lists[-1][-1][0][self.__level_text_position] = text
|
||||
elif self.__token_info == 'cw<ls<lv-tem-id_':
|
||||
value = line[20:]
|
||||
self.__all_lists[-1][-1][0]['level-template-id'] = value
|
||||
def __parse_level_text_length(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line with hexadecimal number
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Method is used to parse text in the \leveltext group.
|
||||
"""
|
||||
num = line[18:]
|
||||
the_num = int(num, 16)
|
||||
if not self.__found_level_text_length:
|
||||
self.__all_lists[-1][-1][0]['list-text-length'] = str(the_num)
|
||||
self.__found_level_text_length = 1
|
||||
else:
|
||||
the_num += 1
|
||||
the_string = str(the_num)
|
||||
level_marker = 'level%s-suffix' % the_string
|
||||
show_marker = 'show-level%s' % the_string
|
||||
self.__level_text_position = level_marker
|
||||
self.__all_lists[-1][-1][0][show_marker] = 'true'
|
||||
if self.__prefix_string:
|
||||
prefix_marker = 'level%s-prefix' % the_string
|
||||
self.__all_lists[-1][-1][0][prefix_marker] = self.__prefix_string
|
||||
self.__prefix_string = None
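# Illustrative sketch (added by the editor, not part of the original
# module): a \leveltext group such as "\'02\'00." run through the
# method above (the byte values are a made-up but typical "1." pattern).
#
#   first hex  "'02" -> list-text-length = '2'
#   second hex "'00" -> the_num 0 + 1 -> marks 'show-level1' = 'true'
#                       and points level_text_position at 'level1-suffix'
#   text "."         -> __level_text_func stores level1-suffix = '.'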
|
||||
def __list_name_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to process
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Simply check for the end of the group and change states.
|
||||
"""
|
||||
if self.__token_info == 'cb<nu<clos-brack' and\
|
||||
self.__cb_count == self.__list_name_ob_count:
|
||||
self.__state = 'list'
|
||||
def __after_bracket_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing.
|
||||
Logic:
|
||||
The last token found was "{". This method determines what group
|
||||
you are now in.
|
||||
WARNING: this could cause problems. If no group is found, the state will remain
|
||||
unsure_ob, which means no other text will be parsed.
|
||||
"""
|
||||
if self.__token_info == 'cw<ls<level-text':
|
||||
self.__state = 'level_text'
|
||||
self.__level_text_ob_count = self.__ob_count
|
||||
elif self.__token_info == 'cw<ls<level-numb':
|
||||
self.__level_number_ob_count = self.__ob_count
|
||||
self.__state = 'level_number'
|
||||
elif self.__token_info == 'cw<ls<list-tb-le':
|
||||
self.__found_level_func(line)
|
||||
elif self.__token_info == 'cw<ls<list-in-tb':
|
||||
self.__found_list_func(line)
|
||||
elif self.__token_info == 'cw<ls<list-name_':
|
||||
self.__state = 'list_name'
|
||||
self.__list_name_ob_count = self.__ob_count
|
||||
else:
|
||||
if self.__run_level > 3:
|
||||
msg = 'No matching token after open bracket\n'
|
||||
msg += 'token is "%s\n"' % (line)
|
||||
raise self.__bug_handler
|
||||
def __add_to_final_line(self):
|
||||
"""
|
||||
Method no longer used.
|
||||
"""
|
||||
self.__list_table_final = 'mi<mk<listabbeg_\n'
|
||||
self.__list_table_final += 'mi<tg<open______<list-table\n' + \
|
||||
'mi<mk<listab-beg\n' + self.__list_table_final
|
||||
self.__list_table_final += \
|
||||
'mi<mk<listab-end\n' + 'mi<tg<close_____<list-table\n'
|
||||
self.__list_table_final += 'mi<mk<listabend_\n'
|
||||
def __write_final_string(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Write out the list-table start tag.
|
||||
Iterate through self.__all_lists. For each list, write out
|
||||
a list-in-table tag. Get the dictionary of this list
|
||||
(the first item). Print out the key => value pair.
|
||||
Remove the first item (the dictionary) from this list. Now iterate
|
||||
through what is left in the list. Each list will contain one item,
|
||||
a dictionary. Get this dictionary and print out key => value pair.
|
||||
"""
|
||||
not_allow = ['list-id',]
|
||||
id = 0
|
||||
self.__list_table_final = 'mi<mk<listabbeg_\n'
|
||||
self.__list_table_final += 'mi<tg<open______<list-table\n' + \
|
||||
'mi<mk<listab-beg\n' + self.__list_table_final
|
||||
for list in self.__all_lists:
|
||||
id += 1
|
||||
self.__list_table_final += 'mi<tg<open-att__<list-in-table'
|
||||
# self.__list_table_final += '<list-id>%s' % (str(id))
|
||||
the_dict = list[0]
|
||||
the_keys = the_dict.keys()
|
||||
for the_key in the_keys:
|
||||
if the_key in not_allow:
|
||||
continue
|
||||
att = the_key
|
||||
value = the_dict[att]
|
||||
self.__list_table_final += '<%s>%s' % (att, value)
|
||||
self.__list_table_final += '\n'
|
||||
levels = list[1:]
|
||||
level_num = 0
|
||||
for level in levels:
|
||||
level_num += 1
|
||||
self.__list_table_final += 'mi<tg<empty-att_<level-in-table'
|
||||
self.__list_table_final += '<level>%s' % (str(level_num))
|
||||
the_dict2 = level[0]
|
||||
the_keys2 = the_dict2.keys()
|
||||
is_bullet = 0
|
||||
bullet_text = ''
|
||||
for the_key2 in the_keys2:
|
||||
if the_key2 in not_allow:
|
||||
continue
|
||||
test_bullet = the_dict2.get('numbering-type')
|
||||
if test_bullet == 'bullet':
|
||||
is_bullet = 1
|
||||
att2 = the_key2
|
||||
value2 = the_dict2[att2]
|
||||
# sys.stderr.write('%s\n' % att2[0:10])
|
||||
if att2[0:10] == 'show-level' and is_bullet:
|
||||
# sys.stderr.write('No print %s\n' % att2)
|
||||
pass
|
||||
elif att2[-6:] == 'suffix' and is_bullet:
|
||||
# sys.stderr.write('%s\n' % att2)
|
||||
bullet_text += value2
|
||||
elif att2[-6:] == 'prefix' and is_bullet:
|
||||
# sys.stderr.write('%s\n' % att2)
|
||||
bullet_text += value2
|
||||
else:
|
||||
self.__list_table_final += '<%s>%s' % (att2, value2)
|
||||
if is_bullet:
|
||||
pass
|
||||
# self.__list_table_final += '<bullet-type>%s' % (bullet_text)
|
||||
self.__list_table_final += '\n'
|
||||
self.__list_table_final += 'mi<tg<close_____<list-in-table\n'
|
||||
self.__list_table_final += \
|
||||
'mi<mk<listab-end\n' + 'mi<tg<close_____<list-table\n'
|
||||
self.__list_table_final += 'mi<mk<listabend_\n'
|
||||
def parse_list_table(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line with border definition in it
|
||||
Returns:
|
||||
A string and the dictionary of list-table values and attributes.
|
||||
Logic:
|
||||
Call the __parse_lines method, which splits the text string into
|
||||
lines (which will be tokens) and processes them.
|
||||
"""
|
||||
self.__parse_lines(line)
|
||||
return self.__list_table_final, self.__all_lists
|
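# Illustrative sketch: the shape of the second value returned by
# parse_list_table(). The attribute names and values here are hypothetical
# examples; only the layout (one attribute dict per list, followed by one
# [dict] entry per level) follows __write_final_string above.
example_all_lists = [
    [
        {'list-table-id': '101', 'list-id': ['1']},             # list attributes
        [{'numbering-type': 'bullet', 'bullet-type': '*'}],     # level 1
        [{'numbering-type': 'decimal', 'level2-suffix': '.'}],  # level 2
    ],
]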
442
src/libprs500/ebooks/rtf2xml/make_lists.py
Executable file
@ -0,0 +1,442 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile, re
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class MakeLists:
|
||||
"""
|
||||
Form lists.
|
||||
Use RTF's own formatting to determine if a paragraph definition is part of a
|
||||
list.
|
||||
Use indents to determine items and how lists are nested.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
headings_to_sections,
|
||||
list_of_lists,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
no_headings_as_list = 1,
|
||||
write_list_info = 0,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'in_file', 'bug_handler', 'headings_to_sections', 'list_of_lists'
|
||||
Optional:
|
||||
'copy' -- whether to make a copy of the result for debugging
|
||||
'run_level' -- the debugging level
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__run_level = run_level
|
||||
self.__no_headings_as_list = no_headings_as_list
|
||||
self.__headings_to_sections = headings_to_sections
|
||||
self.__copy = copy
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__list_of_lists = list_of_lists
|
||||
self.__write_list_info = write_list_info
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Required:
|
||||
Nothing
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
The self.__end_list is a list of tokens that will force a list to end.
|
||||
Likewise, the self.__end_lines is a list of lines that forces a list to end.
|
||||
"""
|
||||
self.__state = "default"
|
||||
self.__left_indent = 0
|
||||
self.__list_type = 'not-defined'
|
||||
self.__pard_def = ""
|
||||
self.__all_lists = []
|
||||
self.__level = 0
|
||||
self.__list_chunk = ''
|
||||
self.__state_dict={
|
||||
'default' : self.__default_func,
|
||||
'in_pard' : self.__in_pard_func,
|
||||
'after_pard' : self.__after_pard_func,
|
||||
}
|
||||
self.__headings = [
|
||||
'heading 1', 'heading 2', 'heading 3', 'heading 4',
|
||||
'heading 5', 'heading 6', 'heading 7', 'heading 8',
|
||||
'heading 9'
|
||||
]
|
||||
self.__allow_levels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
|
||||
self.__style_name = ''
|
||||
self.__end_list = [
|
||||
'mi<mk<body-close',
|
||||
'mi<mk<par-in-fld',
|
||||
'cw<tb<cell______',
|
||||
'cw<tb<row-def___',
|
||||
'cw<tb<row_______',
|
||||
'mi<mk<sect-close',
|
||||
'mi<mk<sect-start',
|
||||
'mi<mk<header-beg',
|
||||
'mi<mk<header-end',
|
||||
'mi<mk<head___clo',
|
||||
'mi<mk<fldbk-end_',
|
||||
'mi<mk<close_cell',
|
||||
'mi<mk<footnt-ope',
|
||||
'mi<mk<foot___clo',
|
||||
'mi<mk<tabl-start',
|
||||
# 'mi<mk<sec-fd-beg',
|
||||
]
|
||||
self.__end_lines = [
|
||||
'mi<tg<close_____<cell\n',
|
||||
]
|
||||
self.__id_regex = re.compile(r'\<list-id\>(\d+)')
|
||||
self.__lv_regex = re.compile(r'\<list-level\>(\d+)')
|
||||
self.__found_appt = 0
|
||||
self.__line_num = 0
|
||||
def __in_pard_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- the line of current text.
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
You are in a list, but in the middle of a paragraph definition.
|
||||
Don't do anything until you find the end of the paragraph definition.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<pard-end__':
|
||||
self.__state = 'after_pard'
|
||||
self.__write_obj.write(line)
|
||||
def __after_pard_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- the line of current text.
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
You are in a list, but after a paragraph definition. You have to
|
||||
determine if the last paragraph definition ends a list, continues
|
||||
the old one, or starts a new one.
|
||||
Otherwise, look for a paragraph definition. If one is found, determine if
|
||||
the paragraph definition contains a list-id. If it does, use the method
|
||||
self.__list_after_par_def to determine the action.
|
||||
If the paragraph definition does not contain a list-id, use the method
|
||||
close_lists to close out items and lists for a paragraph that is not indented.
|
||||
If a bigger block is found (such as a section or a cell), end all lists.
|
||||
If no special line is found, add each line to a buffer.
|
||||
"""
|
||||
if self.__token_info == 'mi<tg<open-att__' and line[17:37] == 'paragraph-definition':
|
||||
is_heading = self.__is_a_heading()
|
||||
# found paragraph definition and not heading 1
|
||||
search_obj = re.search(self.__id_regex, line)
|
||||
if search_obj and not is_heading: # found list-id
|
||||
search_obj_lv = re.search(self.__lv_regex, line)
|
||||
if search_obj_lv:
|
||||
self.__level = search_obj_lv.group(1)
|
||||
num = search_obj.group(1)
|
||||
self.__list_after_par_def_func(line, num)
|
||||
self.__write_obj.write(line)
|
||||
self.__state = 'in_pard'
|
||||
# heading 1
|
||||
elif is_heading:
|
||||
self.__left_indent = -1000
|
||||
self.__close_lists()
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__list_chunk = ''
|
||||
self.__state = 'default'
|
||||
self.__write_obj.write(line)
|
||||
# Normal with no list id
|
||||
else:
|
||||
self.__close_lists()
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__list_chunk = ''
|
||||
self.__write_obj.write(line)
|
||||
if len(self.__all_lists) == 0:
|
||||
self.__state= 'default'
|
||||
else:
|
||||
self.__state = 'in_pard'
|
||||
# section to end lists
|
||||
elif self.__token_info in self.__end_list :
|
||||
self.__left_indent = -1000
|
||||
self.__close_lists()
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__list_chunk = ''
|
||||
self.__state = 'default'
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
self.__list_chunk += line
|
||||
def __list_after_par_def_func(self, line, id):
|
||||
"""
|
||||
Required:
|
||||
line -- the line of current text.
|
||||
id -- the id of the current list
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
You have found the end of a paragraph definition, and have found
|
||||
another paragraph definition with a list id.
|
||||
If the list-id is different from the last paragraph definition,
|
||||
write the string in the buffer. Close out the lists with another
|
||||
method and start a new list.
|
||||
If the list id is the same as the last one, check the indent on the
|
||||
current paragraph definition. If it is greater than the previous one,
|
||||
do not end the current list or item. Start a new list.
|
||||
"""
|
||||
last_list_id = self.__all_lists[-1]['id']
|
||||
if id != last_list_id:
|
||||
self.__close_lists()
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__write_start_list(id)
|
||||
self.__list_chunk = ''
|
||||
else:
|
||||
last_list_indent = self.__all_lists[-1]['left-indent']
|
||||
if self.__left_indent > last_list_indent:
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__write_start_list(id)
|
||||
else:
|
||||
self.__write_end_item()
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__write_start_item()
|
||||
self.__list_chunk = ''
|
||||
def __close_lists(self):
|
||||
"""
|
||||
Required:
|
||||
Nothing
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
Reverse the list of dictionaries. Iterate through the list and
|
||||
get the indent for each list. If the current indent is less than
|
||||
or equal to the indent in the dictionary, close that level.
|
||||
Keep track of how many levels you close. Reduce the list by that
|
||||
many levels.
|
||||
Reverse the list again.
|
||||
"""
|
||||
if self.__line_num < 25 and self.__found_appt:
|
||||
sys.stderr.write('in closing out lists\n')
|
||||
sys.stderr.write('current_indent is "%s"\n' % self.__left_indent)
|
||||
current_indent = self.__left_indent
|
||||
self.__all_lists.reverse()
|
||||
num_levels_closed = 0
|
||||
for the_dict in self.__all_lists:
|
||||
list_indent = the_dict.get('left-indent')
|
||||
if self.__line_num < 25 and self.__found_appt:
|
||||
sys.stderr.write('last indent is "%s"' % list_indent)
|
||||
if current_indent <= list_indent:
|
||||
self.__write_end_item()
|
||||
self.__write_end_list()
|
||||
num_levels_closed += 1
|
||||
self.__all_lists = self.__all_lists[num_levels_closed:]
|
||||
self.__all_lists.reverse()
|
||||
def __write_end_list(self):
|
||||
"""
|
||||
Required:
|
||||
Nothing
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
Write the end of a list.
|
||||
"""
|
||||
self.__write_obj.write('mi<tg<close_____<list\n')
|
||||
self.__write_obj.write('mi<mk<list_close\n')
|
||||
def __write_start_list(self, id):
|
||||
"""
|
||||
Required:
|
||||
id -- the id of the current list.
|
||||
Return:
|
||||
Nothing
|
||||
Logic:
|
||||
Write the start of a list and add the id and left-indent to the
|
||||
self.__all_lists list.
|
||||
Write cues of when a list starts for later processing.
|
||||
In order to determine the type of list, you have to iterate through
|
||||
the self.__list_of lists. This list looks like:
|
||||
[[{'list-id': [1, 2]}, [{}], [{}]], [{'list-id': [3, 4]}, [{}]]]
|
||||
I need to get the inside lists of the main lists. Then I need to get
|
||||
the first item of what I just got. This is a dictionary. Get the list-id.
|
||||
This is a list. Check to see if the current id is in this list. If
|
||||
so, then get the list-type from the dictionary.
|
||||
"""
|
||||
the_dict = {}
|
||||
the_dict['left-indent'] = self.__left_indent
|
||||
the_dict['id'] = id
|
||||
self.__all_lists.append(the_dict)
|
||||
self.__write_obj.write(
|
||||
'mi<mk<list_start\n'
|
||||
)
|
||||
# bogus levels are sometimes written for empty paragraphs
|
||||
if str(self.__level) not in self.__allow_levels:
|
||||
lev_num = '0'
|
||||
else:
|
||||
lev_num = self.__level
|
||||
self.__write_obj.write(
|
||||
'mi<tg<open-att__<list<list-id>%s<level>%s'
|
||||
% (id, lev_num)
|
||||
)
|
||||
list_dict = {}
|
||||
if self.__list_of_lists: # older RTF won't generate a list_of_lists
|
||||
index_of_list = self.__get_index_of_list(id)
|
||||
if index_of_list != None:# found a matching id
|
||||
list_dict = self.__list_of_lists[index_of_list][0]
|
||||
level = int(self.__level) + 1
|
||||
level_dict = self.__list_of_lists[index_of_list][level][0]
|
||||
list_type = level_dict.get('numbering-type')
|
||||
if list_type == 'bullet':
|
||||
list_type = 'unordered'
|
||||
else:
|
||||
list_type = 'ordered'
|
||||
self.__write_obj.write(
|
||||
'<list-type>%s' % (list_type))
|
||||
else: # no matching id
|
||||
self.__write_obj.write(
|
||||
'<list-type>%s' % (self.__list_type))
|
||||
else:# older RTF
|
||||
self.__write_obj.write(
|
||||
'<list-type>%s' % (self.__list_type))
|
||||
# if you want to dump all the info to the list, rather than
|
||||
# keeping it in the table above, change self.__write_list_info
|
||||
# to true.
|
||||
if self.__list_of_lists and self.__write_list_info and list_dict:
|
||||
not_allow = ['list-id',]
|
||||
the_keys_list = list_dict.keys()
|
||||
for the_key in the_keys_list:
|
||||
if the_key in not_allow:
|
||||
continue
|
||||
self.__write_obj.write('<%s>%s' % (the_key, list_dict[the_key]))
|
||||
the_keys_level = level_dict.keys()
|
||||
for the_key in the_keys_level:
|
||||
self.__write_obj.write('<%s>%s' % (the_key, level_dict[the_key]))
|
||||
self.__write_obj.write('\n')
|
||||
self.__write_obj.write(
|
||||
'mi<mk<liststart_\n'
|
||||
)
|
||||
self.__write_start_item()
|
||||
def __get_index_of_list(self, id):
|
||||
"""
|
||||
Requires:
|
||||
id -- id of current paragraph-definition
|
||||
Returns:
|
||||
an index of where the id occurs in list_of_lists, the
|
||||
dictionary passed to this module.
|
||||
Logic:
|
||||
Iterate through the big lists, the one passed to this module and
|
||||
get the first item, the dictionary. Use a counter to keep
|
||||
track of how many times you iterate with the counter.
|
||||
Once you find a match, return the counter.
|
||||
If no match is found, print out an error message.
|
||||
"""
|
||||
# some RTF use 0 indexed list. Don't know what to do?
|
||||
if id == '0':
|
||||
return
|
||||
the_index = 0
|
||||
for list in self.__list_of_lists:
|
||||
the_dict = list[0]
|
||||
id_in_list = the_dict.get('list-id')
|
||||
if id in id_in_list:
|
||||
return the_index
|
||||
the_index += 1
|
||||
if self.__run_level > 0:
|
||||
sys.stderr.write('Module is make_lists.py\n'
|
||||
'Method is __get_index_of_list\n'
|
||||
'The main list does not appear to have a matching id for %s \n'
|
||||
% (id)
|
||||
)
|
||||
# sys.stderr.write(repr(self.__list_of_lists))
|
||||
# if self.__run_level > 3:
|
||||
# msg = 'level is "%s"\n' % self.__run_level
|
||||
# self.__bug_handler
|
||||
def __write_start_item(self):
|
||||
self.__write_obj.write('mi<mk<item_start\n')
|
||||
self.__write_obj.write('mi<tg<open______<item\n')
|
||||
self.__write_obj.write('mi<mk<itemstart_\n')
|
||||
def __write_end_item(self):
|
||||
self.__write_obj.write('mi<tg<item_end__\n')
|
||||
self.__write_obj.write('mi<tg<close_____<item\n')
|
||||
self.__write_obj.write('mi<tg<item__end_\n')
|
||||
def __default_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
self, line
|
||||
Returns:
|
||||
Nothing
|
||||
Logic
|
||||
Look for the start of a paragraph definition. If one is found, check if
|
||||
it contains a list-id. If it does, start a list. Change the state to
|
||||
in_pard.
|
||||
"""
|
||||
if self.__token_info == 'mi<tg<open-att__' and line[17:37] == 'paragraph-definition':
|
||||
is_a_heading = self.__is_a_heading()
|
||||
if not is_a_heading:
|
||||
search_obj = re.search(self.__id_regex, line)
|
||||
if search_obj:
|
||||
num = search_obj.group(1)
|
||||
self.__state = 'in_pard'
|
||||
search_obj_lv = re.search(self.__lv_regex, line)
|
||||
if search_obj_lv:
|
||||
self.__level = search_obj_lv.group(1)
|
||||
self.__write_start_list(num)
|
||||
self.__write_obj.write(line)
|
||||
def __is_a_heading(self):
|
||||
if self.__style_name in self.__headings:
|
||||
if self.__headings_to_sections:
|
||||
return 1
|
||||
else:
|
||||
if self.__no_headings_as_list:
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
else:
|
||||
return 0
|
||||
def __get_indent(self, line):
|
||||
if self.__token_info == 'mi<mk<left_inden':
|
||||
self.__left_indent = float(line[17:-1])
|
||||
def __get_list_type(self, line):
|
||||
if self.__token_info == 'mi<mk<list-type_': # <ordered
|
||||
self.__list_type = line[17:-1]
|
||||
if self.__list_type == 'item':
|
||||
self.__list_type = "unordered"
|
||||
def __get_style_name(self, line):
|
||||
if self.__token_info == 'mi<mk<style-name':
|
||||
self.__style_name = line[17:-1]
|
||||
def make_lists(self):
|
||||
"""
|
||||
Required:
|
||||
nothing
|
||||
Returns:
|
||||
original file will be changed
|
||||
Logic:
Read each line of the token file, track the current indent, list type and style name, and dispatch the line to the handler for the current state.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
self.__get_indent(line)
|
||||
self.__get_list_type(line)
|
||||
self.__get_style_name(line)
|
||||
action = self.__state_dict.get(self.__state)
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "make_lists.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
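# Illustrative usage sketch, assuming a hypothetical intermediate token file
# and using RuntimeError as a stand-in bug handler; MakeLists rewrites the
# token file in place, as make_lists() above shows.
def _example_make_lists():
    dummy_list_of_lists = []          # normally produced by the list-table module
    list_maker = MakeLists(
        in_file='tokens.data',        # hypothetical file name
        bug_handler=RuntimeError,
        headings_to_sections=0,
        list_of_lists=dummy_list_of_lists,
        run_level=1,
    )
    list_maker.make_lists()           # adds list/item tags to 'tokens.data'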
132
src/libprs500/ebooks/rtf2xml/old_rtf.py
Executable file
@ -0,0 +1,132 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys
|
||||
"""
|
||||
"""
|
||||
class OldRtf:
|
||||
"""
|
||||
Check to see if the RTF is an older version
|
||||
Logic:
Scan the tokens in the body. If an inline formatting token appears at the same bracket level as the body itself (rather than inside its own group), the file follows older RTF conventions.
|
||||
"""
|
||||
def __init__(self, in_file, bug_handler, run_level ):
|
||||
"""
|
||||
Required:
|
||||
'in_file' -- file to parse
|
||||
'bug_handler' -- the exception to raise if the parser hits a bug
|
||||
'run_level' -- the debugging level
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__initiate_values()
|
||||
self.__ob_group = 0
|
||||
def __initiate_values(self):
|
||||
self.__previous_token = ''
|
||||
self.__new_found = 0
|
||||
self.__allowable = [
|
||||
'annotation' ,
|
||||
'blue______' ,
|
||||
'bold______',
|
||||
'caps______',
|
||||
'char-style' ,
|
||||
'dbl-strike' ,
|
||||
'emboss____',
|
||||
'engrave___' ,
|
||||
'font-color',
|
||||
'font-down_' ,
|
||||
'font-size_',
|
||||
'font-style',
|
||||
'font-up___',
|
||||
'footnot-mk' ,
|
||||
'green_____' ,
|
||||
'hidden____',
|
||||
'italics___',
|
||||
'outline___',
|
||||
'red_______',
|
||||
'shadow____' ,
|
||||
'small-caps',
|
||||
'strike-thr',
|
||||
'subscript_',
|
||||
'superscrip' ,
|
||||
'underlined' ,
|
||||
]
|
||||
self.__state = 'before_body'
|
||||
self.__action_dict = {
|
||||
'before_body' : self.__before_body_func,
|
||||
'in_body' : self.__check_tokens_func,
|
||||
'after_pard' : self.__after_pard_func,
|
||||
}
|
||||
self.__is_old = 0
|
||||
self.__found_new = 0
|
||||
def __check_tokens_func(self, line):
|
||||
if self.__inline_info in self.__allowable:
|
||||
if self.__ob_group == self.__base_ob_count:
|
||||
return 'old_rtf'
|
||||
else:
|
||||
self.__found_new += 1
|
||||
elif self.__token_info == 'cw<pf<par-def___':
|
||||
self.__state = 'after_pard'
|
||||
def __before_body_func(self, line):
|
||||
if self.__token_info == 'mi<mk<body-open_':
|
||||
self.__state = 'in_body'
|
||||
self.__base_ob_count = self.__ob_group
|
||||
def __after_pard_func(self, line):
|
||||
if line[0:2] != 'cw':
|
||||
self.__state = 'in_body'
|
||||
def check_if_old_rtf(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
1 if the file is older RTF
|
||||
0 if the file is newer RTF
|
||||
"""
|
||||
|
||||
read_obj = open(self.__file, 'r')
|
||||
line = 1
|
||||
line_num = 0
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
line_num += 1
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'mi<mk<body-close':
|
||||
return 0
|
||||
self.__ob_group = 0
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_group += 1
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__ob_group -= 1
|
||||
self.__cb_count = line[-5:-1]
|
||||
self.__inline_info = line[6:16]
|
||||
if self.__state == 'after_body':
|
||||
return 0
|
||||
action = self.__action_dict.get(self.__state)
|
||||
if not action:
|
||||
sys.stderr.write('No action for state!\n')
|
||||
result = action(line)
|
||||
if result == 'new_rtf':
|
||||
return 0
|
||||
elif result == 'old_rtf':
|
||||
return 1
|
||||
self.__previous_token = line[6:16]
|
||||
return 0
|
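# Illustrative usage sketch with a hypothetical file name; check_if_old_rtf()
# returns 1 for older RTF and 0 otherwise, so callers can branch on it.
def _example_check_old_rtf():
    checker = OldRtf('tokens.data', bug_handler=RuntimeError, run_level=1)
    if checker.check_if_old_rtf():
        print 'older RTF conventions detected'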
274
src/libprs500/ebooks/rtf2xml/options_trem.py
Executable file
@ -0,0 +1,274 @@
|
||||
import sys
|
||||
from libprs500.ebooks import rtf2xml
|
||||
class ParseOptions:
|
||||
"""
|
||||
Requires:
|
||||
system_string --The string from the command line
|
||||
options_dict -- a dictionary with the key equal to the option, and
|
||||
a list describing that option. (See below)
|
||||
Returns:
|
||||
A tuple. The first item in the tuple is a dictionary containing
|
||||
the arguments for each option. The second is a list of the
|
||||
arguments.
|
||||
If invalid options are passed to the module, 0,0 is returned.
|
||||
Examples:
|
||||
Your script has the option '--indents', and '--output=file'.
|
||||
You want to give short option names as well:
|
||||
-i and -o=file
|
||||
Use this:
|
||||
options_dict = {'output': [1, 'o'],
|
||||
'indents': [0, 'i']
|
||||
}
|
||||
options_obj = ParseOptions(
|
||||
system_string = sys.argv,
|
||||
options_dict = options_dict
|
||||
)
|
||||
options, arguments = options_obj.parse_options()
|
||||
print options
|
||||
print arguments
|
||||
The result will be:
|
||||
{indents:None, output:'/home/paul/file'}, ['/home/paul/input']
|
||||
"""
|
||||
def __init__(self, system_string, options_dict):
|
||||
self.__system_string = system_string[1:]
|
||||
long_list = self.__make_long_list_func(options_dict)
|
||||
# # print long_list
|
||||
short_list = self.__make_short_list_func(options_dict)
|
||||
# # print short_list
|
||||
self.__legal_options = long_list + short_list
|
||||
# # print self.__legal_options
|
||||
self.__short_long_dict = self.__make_short_long_dict_func(options_dict)
|
||||
# # print self.__short_long_dict
|
||||
self.__opt_with_args = self.__make_options_with_arg_list(options_dict)
|
||||
# # print self.__opt_with_args
|
||||
self.__options_okay = 1
|
||||
def __make_long_list_func(self, options_dict):
|
||||
"""
|
||||
Required:
|
||||
options_dict -- the dictionary mapping options to a list
|
||||
Returns:
|
||||
a list of legal options
|
||||
"""
|
||||
legal_list = []
|
||||
keys = options_dict.keys()
|
||||
for key in keys:
|
||||
key = '--' + key
|
||||
legal_list.append(key)
|
||||
return legal_list
|
||||
def __make_short_list_func(self, options_dict):
|
||||
"""
|
||||
Required:
|
||||
options_dict --the dictionary mapping options to a list
|
||||
Returns:
|
||||
a list of legal short options
|
||||
"""
|
||||
legal_list = []
|
||||
keys = options_dict.keys()
|
||||
for key in keys:
|
||||
values = options_dict[key]
|
||||
try:
|
||||
legal_list.append('-' + values[1])
|
||||
except IndexError:
|
||||
pass
|
||||
return legal_list
|
||||
def __make_short_long_dict_func(self, options_dict):
|
||||
"""
|
||||
Required:
|
||||
options_dict --the dictionary mapping options to a list
|
||||
Returns:
|
||||
a dictionary with keys of short options and values of long options
|
||||
Logic:
|
||||
read through the options dictionary and pair short options with long options
|
||||
"""
|
||||
short_long_dict = {}
|
||||
keys = options_dict.keys()
|
||||
for key in keys:
|
||||
values = options_dict[key]
|
||||
try:
|
||||
short = '-' + values[1]
|
||||
long = '--' + key
|
||||
short_long_dict[short] = long
|
||||
except IndexError:
|
||||
pass
|
||||
return short_long_dict
|
||||
def __make_options_with_arg_list(self, options_dict):
|
||||
"""
|
||||
Required:
|
||||
options_dict --the dictionary mapping options to a list
|
||||
Returns:
|
||||
a list of options that take arguments.
|
||||
"""
|
||||
opt_with_arg = []
|
||||
keys = options_dict.keys()
|
||||
for key in keys:
|
||||
values = options_dict[key]
|
||||
try:
|
||||
if values[0]:
|
||||
opt_with_arg.append('--' + key)
|
||||
except IndexError:
|
||||
pass
|
||||
return opt_with_arg
|
||||
def __sub_short_with_long(self):
|
||||
"""
|
||||
Required:
|
||||
nothing
|
||||
Returns:
|
||||
a new system string
|
||||
Logic:
|
||||
iterate through the system string and replace short options with long options
|
||||
"""
|
||||
new_string = []
|
||||
sub_list = self.__short_long_dict.keys()
|
||||
for item in self.__system_string:
|
||||
if item in sub_list:
|
||||
item = self.__short_long_dict[item]
|
||||
new_string.append(item)
|
||||
return new_string
|
||||
def __pair_arg_with_option(self):
|
||||
"""
|
||||
Required:
|
||||
nothing
|
||||
Returns
|
||||
nothing (changes value of self.__system_string)
|
||||
Logic:
|
||||
iterate through the system string, and match arguments with options:
|
||||
old_list = ['--foo', 'bar']
|
||||
new_list = ['--foo=bar']
|
||||
"""
|
||||
opt_len = len(self.__system_string)
|
||||
new_system_string = []
|
||||
counter = 0
|
||||
slurp_value = 0
|
||||
for arg in self.__system_string:
|
||||
# previous value was an option with an argument, so this arg is
|
||||
# actually an argument that has already been added
|
||||
counter += 1
|
||||
if slurp_value:
|
||||
slurp_value = 0
|
||||
continue
|
||||
# not an option--an argument
|
||||
if arg[0] != '-':
|
||||
new_system_string.append(arg)
|
||||
# option and argument already paired
|
||||
elif '=' in arg:
|
||||
new_system_string .append(arg)
|
||||
else:
|
||||
# this option takes an argument
|
||||
if arg in self.__opt_with_args:
|
||||
# option is the last in the list
|
||||
if counter + 1 > opt_len:
|
||||
sys.stderr.write('option "%s" must take an argument\n' % arg)
|
||||
new_system_string.append(arg)
|
||||
self.__options_okay = 0
|
||||
else:
|
||||
# the next item in list is also an option
|
||||
if self.__system_string[counter][0] == '-':
|
||||
sys.stderr.write('option "%s" must take an argument\n' % arg)
|
||||
new_system_string.append(arg)
|
||||
self.__options_okay = 0
|
||||
# the next item in the list is the argument
|
||||
else:
|
||||
new_system_string.append(arg + '=' + self.__system_string[counter])
|
||||
slurp_value = 1
|
||||
# this option does not take an argument
|
||||
else:
|
||||
new_system_string.append(arg)
|
||||
return new_system_string
|
||||
def __get_just_options(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
list of options
|
||||
Logic:
|
||||
Iterate through the self.__system string, looking for the last
|
||||
option. The options are everything in the system string before the
|
||||
last option.
|
||||
Check to see that the options contain no arguments.
|
||||
"""
|
||||
highest = 0
|
||||
counter = 0
|
||||
found_options = 0
|
||||
for item in self.__system_string:
|
||||
if item[0] == '-':
|
||||
highest = counter
|
||||
found_options = 1
|
||||
counter += 1
|
||||
if found_options:
|
||||
just_options = self.__system_string[:highest + 1]
|
||||
arguments = self.__system_string[highest + 1:]
|
||||
else:
|
||||
just_options = []
|
||||
arguments = self.__system_string
|
||||
if found_options:
|
||||
for item in just_options:
|
||||
if item[0] != '-':
|
||||
sys.stderr.write('%s is an argument in an option list\n' % item)
|
||||
self.__options_okay = 0
|
||||
return just_options, arguments
|
||||
def __is_legal_option_func(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Check each value in the newly created options list to see if it
|
||||
matches what the user describes as a legal option.
|
||||
"""
|
||||
illegal_options = []
|
||||
for arg in self.__system_string:
|
||||
if '=' in arg:
|
||||
temp_list = arg.split('=')
|
||||
arg = temp_list[0]
|
||||
if arg not in self.__legal_options and arg[0] == '-':
|
||||
illegal_options.append(arg)
|
||||
if illegal_options:
|
||||
self.__options_okay = 0
|
||||
sys.stderr.write('The following options are not permitted:\n')
|
||||
for not_legal in illegal_options:
|
||||
sys.stderr.write('%s\n' % not_legal)
|
||||
def __make_options_dict(self, options):
|
||||
options_dict = {}
|
||||
for item in options:
|
||||
if '=' in item:
|
||||
option, arg = item.split('=')
|
||||
else:
|
||||
option = item
|
||||
arg = None
|
||||
if option[0] == '-':
|
||||
option = option[1:]
|
||||
if option[0] == '-':
|
||||
option = option[1:]
|
||||
options_dict[option] = arg
|
||||
return options_dict
|
||||
def parse_options(self):
|
||||
self.__system_string = self.__sub_short_with_long()
|
||||
# # print 'subbed list is %s' % self.__system_string
|
||||
self.__system_string = self.__pair_arg_with_option()
|
||||
# # print 'list with pairing is %s' % self.__system_string
|
||||
options, arguments = self.__get_just_options()
|
||||
# # print 'options are %s ' % options
|
||||
# # print 'arguments are %s ' % arguments
|
||||
self.__is_legal_option_func()
|
||||
if self.__options_okay:
|
||||
options_dict = self.__make_options_dict(options)
|
||||
# # print options_dict
|
||||
return options_dict, arguments
|
||||
else:
|
||||
return 0,0
|
||||
if __name__ == '__main__':
|
||||
this_dict = {
|
||||
'indents': [0, 'i'],
|
||||
'output': [1, 'o'],
|
||||
'test3': [1, 't'],
|
||||
}
|
||||
test_obj = ParseOptions(system_string = sys.argv,
|
||||
options_dict = this_dict
|
||||
)
|
||||
options, the_args = test_obj.parse_options()
|
||||
print options, the_args
|
||||
"""
|
||||
this_options = ['--foo', '-o']
|
||||
this_opt_with_args = ['--foo']
|
||||
"""
|
147
src/libprs500/ebooks/rtf2xml/output.py
Executable file
@ -0,0 +1,147 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, codecs
|
||||
from libprs500.ebooks import rtf2xml
|
||||
class Output:
|
||||
"""
|
||||
Output file
|
||||
"""
|
||||
def __init__(self,
|
||||
file,
|
||||
orig_file,
|
||||
output_dir = None,
|
||||
out_file = None
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file' -- xml file ready to output
|
||||
orig_file -- original rtf file
|
||||
Optional:
|
||||
output_file -- the file to output to
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = file
|
||||
self.__orig_file = orig_file
|
||||
self.__output_dir = output_dir
|
||||
self.__no_ask = 1
|
||||
self.__out_file = out_file
|
||||
def output(self):
|
||||
"""
|
||||
Required:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Output each line to the screen if no output file is given. Otherwise, output to
|
||||
the file.
|
||||
"""
|
||||
# self.__output_xml(self.__file, self.__out_file)
|
||||
if self.__output_dir:
|
||||
self.__output_to_dir_func()
|
||||
elif self.__out_file:
|
||||
self.__output_xml(self.__file, self.__out_file)
|
||||
else:
|
||||
self.__output_to_standard_func()
|
||||
def __output_to_dir_func(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Create a file within the output directory.
|
||||
Read one file at a time. Output line to the newly-created file.
|
||||
"""
|
||||
base_name = os.path.basename(self.__orig_file)
|
||||
base_name, ext = os.path.splitext(base_name)
|
||||
output_file = '%s.xml' % base_name
|
||||
output_file = os.path.join(self.__output_dir, output_file)
|
||||
# change if user wants to output to a specific file
|
||||
if self.__out_file:
|
||||
output_file = os.path.join(self.__output_dir, self.__out_file)
|
||||
user_response = 'o'
|
||||
if os.path.isfile(output_file):
|
||||
if self.__no_ask:
|
||||
user_response = 'o'
|
||||
else:
|
||||
msg = 'Do you want to over-write %s?\n' % output_file
|
||||
msg += 'Type "o" to over-write.\n'
|
||||
msg += 'Type any other key to print to standard output.\n'
|
||||
sys.stderr.write(msg)
|
||||
user_response = raw_input()
|
||||
if user_response == 'o':
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(output_file, 'w')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
write_obj.write(line)
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
else:
|
||||
self.__output_to_standard_func()
|
||||
def __output_to_file_func(self):
|
||||
"""
|
||||
Required:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Read one line at a time. Output to the output file.
|
||||
"""
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__out_file, 'w')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
write_obj.write(line)
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
def __output_to_standard_func(self):
|
||||
"""
|
||||
Required:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Read one line at a time. Output to standard output.
|
||||
"""
|
||||
read_obj = open(self.__file, 'r')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
sys.stdout.write(line)
|
||||
read_obj.close()
|
||||
def __output_xml(self, in_file, out_file):
|
||||
"""
|
||||
output the ill-formed xml file
|
||||
"""
|
||||
(utf8_encode, utf8_decode, utf8_reader, utf8_writer) = codecs.lookup("utf-8")
|
||||
write_obj = utf8_writer(open(out_file, 'w'))
|
||||
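# Note: the plain open() calls below immediately replace the codecs-wrapped
# objects created above, so the copy is done on byte strings; unicode lines
# are only re-encoded by the explicit encode("utf-8") in the loop.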
write_obj = open(out_file, 'w')
|
||||
read_obj = utf8_writer(open(in_file, 'r'))
|
||||
read_obj = open(in_file, 'r')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
if isinstance(line, type(u"")):
|
||||
line = line.encode("utf-8")
|
||||
write_obj.write(line)
|
||||
read_obj.close()
|
||||
write_obj.close()
|
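# Illustrative usage sketch with hypothetical file names; with out_file set,
# output() writes the converted XML to that file.
def _example_output():
    out = Output(file='converted.xml', orig_file='in.rtf', out_file='out.xml')
    out.output()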
203
src/libprs500/ebooks/rtf2xml/override_table.py
Executable file
@ -0,0 +1,203 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys,os
|
||||
from libprs500.ebooks import rtf2xml
|
||||
class OverrideTable:
|
||||
"""
|
||||
Parse a line of text to make the override table. Return a string
|
||||
(which will convert to XML) and the dictionary containing all the
|
||||
information about the lists. This dictionary is the result of the
|
||||
dictionary that is first passed to this module. This module
|
||||
modifies the dictionary, assigning list numbers to each list.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
list_of_lists,
|
||||
run_level = 1,
|
||||
):
|
||||
self.__list_of_lists = list_of_lists
|
||||
self.__initiate_values()
|
||||
self.__run_level = run_level
|
||||
def __initiate_values(self):
|
||||
self.__override_table_final = ''
|
||||
self.__state = 'default'
|
||||
self.__override_list = []
|
||||
self.__state_dict = {
|
||||
'default' : self.__default_func,
|
||||
'override' : self.__override_func,
|
||||
'unsure_ob' : self.__after_bracket_func,
|
||||
}
|
||||
self.__override_dict = {
|
||||
'cw<ls<lis-tbl-id' : 'list-table-id',
|
||||
'cw<ls<list-id___' : 'list-id',
|
||||
}
|
||||
def __override_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
The group {\listoverride has been found.
|
||||
Check for the end of the group.
|
||||
Otherwise, add appropriate tokens to the override dictionary.
|
||||
"""
|
||||
if self.__token_info == 'cb<nu<clos-brack' and\
|
||||
self.__cb_count == self.__override_ob_count:
|
||||
self.__state = 'default'
|
||||
self.__parse_override_dict()
|
||||
else:
|
||||
att = self.__override_dict.get(self.__token_info)
|
||||
if att:
|
||||
value = line[20:]
|
||||
self.__override_list[-1][att] = value
|
||||
def __parse_override_dict(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
The list of all information about RTF lists has been passed to
|
||||
this module. As of this point, this python list has no id number,
|
||||
which is needed later to identify which lists in the body should
|
||||
be assigned which formatting commands from the list-table.
|
||||
In order to get an id, I have to check to see when the list-table-id
|
||||
from the override_dict (generated in this module) matches the list-table-id
|
||||
in list_of_lists (generated in the list_table.py module). When a match is found,
|
||||
append the lists numbers to the self.__list_of_lists dictionary
|
||||
that contains the empty lists:
|
||||
[[{'list-id': [HERE!]}, [{}]], ...]
|
||||
This is a list, since one list in the table in the preamble of RTF can
|
||||
apply to multiple lists in the body.
|
||||
"""
|
||||
override_dict = self.__override_list[-1]
|
||||
list_id = override_dict.get('list-id')
|
||||
if list_id == None and self.__run_level > 3:
|
||||
msg = 'This override does not appear to have a list-id\n'
|
||||
raise self.__bug_handler, msg
|
||||
current_table_id = override_dict.get('list-table-id')
|
||||
if current_table_id == None and self.__run_level > 3:
|
||||
msg = 'This override does not appear to have a list-table-id\n'
|
||||
raise self.__bug_handler, msg
|
||||
counter = 0
|
||||
for list in self.__list_of_lists:
|
||||
info_dict = list[0]
|
||||
old_table_id = info_dict.get('list-table-id')
|
||||
if old_table_id == current_table_id:
|
||||
self.__list_of_lists[counter][0]['list-id'].append(list_id)
|
||||
break
|
||||
counter += 1
|
||||
def __parse_lines(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Break the line into tokens by splitting it on the newline.
|
||||
Call on the method according to the state.
|
||||
"""
|
||||
lines = line.split('\n')
|
||||
self.__ob_count = 0
|
||||
self.__ob_group = 0
|
||||
for line in lines:
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-4:]
|
||||
self.__ob_group += 1
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-4:]
|
||||
self.__ob_group -= 1
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
print self.__state
|
||||
action(line)
|
||||
self.__write_final_string()
|
||||
# self.__add_to_final_line()
|
||||
def __default_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Return:
|
||||
nothing
|
||||
Logic:
|
||||
Look for an open bracket and change states when found.
|
||||
"""
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__state = 'unsure_ob'
|
||||
def __after_bracket_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
The last token was an open bracket. You need to determine
|
||||
the group based on the token after.
|
||||
WARNING: this could cause problems. If no group is found, the
|
||||
state will remain unsure_ob, which means no other text will be
|
||||
parsed. I should do states by a list and simply pop this
|
||||
unsure_ob state to get the previous state.
|
||||
"""
|
||||
if self.__token_info == 'cw<ls<lis-overid':
|
||||
self.__state = 'override'
|
||||
self.__override_ob_count = self.__ob_count
|
||||
the_dict = {}
|
||||
self.__override_list.append(the_dict)
|
||||
elif self.__run_level > 3:
|
||||
msg = 'No matching token after open bracket\n'
|
||||
msg += 'token is "%s\n"' % (line)
|
||||
raise self.__bug_handler, msg
|
||||
def __write_final_string(self):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
First write out the override-table tag.
|
||||
Iterate through the dictionaries in the main override_list.
|
||||
For each dictionary, write an empty tag "override-list". Add
|
||||
the attributes and values of the tag from the dictionary.
|
||||
"""
|
||||
self.__override_table_final = 'mi<mk<over_beg_\n'
|
||||
self.__override_table_final += 'mi<tg<open______<override-table\n' + \
|
||||
'mi<mk<overbeg__\n' + self.__override_table_final
|
||||
for the_dict in self.__override_list:
|
||||
self.__override_table_final += 'mi<tg<empty-att_<override-list'
|
||||
the_keys = the_dict.keys()
|
||||
for the_key in the_keys:
|
||||
self.__override_table_final += \
|
||||
'<%s>%s' % (the_key, the_dict[the_key])
|
||||
self.__override_table_final += '\n'
|
||||
self.__override_table_final += '\n'
|
||||
self.__override_table_final += \
|
||||
'mi<mk<overri-end\n' + 'mi<tg<close_____<override-table\n'
|
||||
self.__override_table_final += 'mi<mk<overribend_\n'
|
||||
def parse_override_table(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line with border definition in it
|
||||
Returns:
|
||||
A string that will be converted to XML, and a dictionary of
|
||||
all the properties of the RTF lists.
|
||||
Logic:
|
||||
"""
|
||||
self.__parse_lines(line)
|
||||
return self.__override_table_final, self.__list_of_lists
|
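# Illustrative usage sketch; the raw group text and list_of_lists come from
# earlier stages of the pipeline and are assumed here.
def _example_parse_override(raw_group, list_of_lists):
    override_obj = OverrideTable(list_of_lists=list_of_lists, run_level=1)
    final_string, updated_lists = override_obj.parse_override_table(raw_group)
    return final_string, updated_lists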
739
src/libprs500/ebooks/rtf2xml/paragraph_def.py
Executable file
@ -0,0 +1,739 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy, border_parse
|
||||
class ParagraphDef:
|
||||
"""
|
||||
=================
|
||||
Purpose
|
||||
=================
|
||||
Write paragraph definition tags.
|
||||
States:
|
||||
1. before_1st_para_def.
|
||||
Before any para_def token is found. This means all the text in the preamble.
|
||||
Look for the token 'cw<pf<par-def___'. This will change the state to collect_tokens.
|
||||
2. collect_tokens.
|
||||
Found a paragraph_def. Need to get all tokens.
|
||||
Change with the start of a paragraph ('mi<mk<para-start'). State then becomes
|
||||
in_paragraphs
|
||||
If another paragraph definition is found, the state does not change.
|
||||
But the dictionary is reset.
|
||||
3. in_paragraphs
|
||||
State changes when 'mi<mk<para-end__', or end of paragraph is found.
|
||||
State then becomes 'self.__state = 'after_para_end'
|
||||
4. after_para_end
|
||||
If 'mi<mk<para-start' (the start of a paragraph) or 'mi<mk<para-end__' (the end of a paragraph--must be empty paragraph?) are found:
|
||||
state changes to 'in_paragraphs'
|
||||
If 'cw<pf<par-def___' (paragraph_definition) is found:
|
||||
state changes to collect_tokens
|
||||
if 'mi<mk<body-close', 'mi<mk<par-in-fld', 'cw<tb<cell______','cw<tb<row-def___','cw<tb<row_______', 'mi<mk<sect-close', 'mi<mk<header-beg', 'mi<mk<header-end'
|
||||
are found. (All these tokens mark the start of a bigger element. para_def must
|
||||
be closed:
|
||||
state changes to 'after_para_def'
|
||||
5. after_para_def
|
||||
'mi<mk<para-start' changes state to in_paragraphs
|
||||
if another paragraph_def is found, the state changes to collect_tokens.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
default_font,
|
||||
copy = None,
|
||||
run_level = 1,):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
'default_font' --document default font
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__default_font = default_font
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
"""
|
||||
# Dictionary needed to convert shortened style names to readable names
|
||||
self.__token_dict={
|
||||
# paragraph formatting => pf
|
||||
'par-end___' : 'para',
|
||||
'par-def___' : 'paragraph-definition',
|
||||
'keep-w-nex' : 'keep-with-next',
|
||||
'widow-cntl' : 'widow-control',
|
||||
'adjust-rgt' : 'adjust-right',
|
||||
'language__' : 'language',
|
||||
'right-inde' : 'right-indent',
|
||||
'fir-ln-ind' : 'first-line-indent',
|
||||
'left-inden' : 'left-indent',
|
||||
'space-befo' : 'space-before',
|
||||
'space-afte' : 'space-after',
|
||||
'line-space' : 'line-spacing',
|
||||
'default-ta' : 'default-tab',
|
||||
'align_____' : 'align',
|
||||
'widow-cntr' : 'widow-control',
|
||||
# stylesheet = > ss
|
||||
'style-shet' : 'stylesheet',
|
||||
'based-on__' : 'based-on-style',
|
||||
'next-style' : 'next-style',
|
||||
'char-style' : 'character-style',
|
||||
# this is changed to get a nice attribute
|
||||
'para-style' : 'name',
|
||||
# graphics => gr
|
||||
'picture___' : 'pict',
|
||||
'obj-class_' : 'obj_class',
|
||||
'mac-pic___' : 'mac-pict',
|
||||
# section => sc
|
||||
'section___' : 'section-new',
|
||||
'sect-defin' : 'section-reset',
|
||||
'sect-note_' : 'endnotes-in-section',
|
||||
# list=> ls
|
||||
'list-text_' : 'list-text',
|
||||
# this line must be wrong because it duplicates an earlier one
|
||||
'list-text_' : 'list-text',
|
||||
'list______' : 'list',
|
||||
'list-lev-d' : 'list-level-definition',
|
||||
'list-cardi' : 'list-cardinal-numbering',
|
||||
'list-decim' : 'list-decimal-numbering',
|
||||
'list-up-al' : 'list-uppercase-alphabetic-numbering',
|
||||
'list-up-ro' : 'list-uppercae-roman-numbering',
|
||||
'list-ord__' : 'list-ordinal-numbering',
|
||||
'list-ordte' : 'list-ordinal-text-numbering',
|
||||
'list-bulli' : 'list-bullet',
|
||||
'list-simpi' : 'list-simple',
|
||||
'list-conti' : 'list-continue',
|
||||
'list-hang_' : 'list-hang',
|
||||
# 'list-tebef' : 'list-text-before',
|
||||
'list-level' : 'level',
|
||||
'list-id___' : 'list-id',
|
||||
'list-start' : 'list-start',
|
||||
'nest-level' : 'nest-level',
|
||||
# duplicate
|
||||
'list-level' : 'list-level',
|
||||
# notes => nt
|
||||
'footnote__' : 'footnote',
|
||||
'type______' : 'type',
|
||||
# anchor => an
|
||||
'toc_______' : 'anchor-toc',
|
||||
'book-mk-st' : 'bookmark-start',
|
||||
'book-mk-en' : 'bookmark-end',
|
||||
'index-mark' : 'anchor-index',
|
||||
'place_____' : 'place',
|
||||
# field => fd
|
||||
'field_____' : 'field',
|
||||
'field-inst' : 'field-instruction',
|
||||
'field-rslt' : 'field-result',
|
||||
'datafield_' : 'data-field',
|
||||
# info-tables => it
|
||||
'font-table' : 'font-table',
|
||||
'colr-table' : 'color-table',
|
||||
'lovr-table' : 'list-override-table',
|
||||
'listtable_' : 'list-table',
|
||||
'revi-table' : 'revision-table',
|
||||
# character info => ci
|
||||
'hidden____' : 'hidden',
|
||||
'italics___' : 'italics',
|
||||
'bold______' : 'bold',
|
||||
'strike-thr' : 'strike-through',
|
||||
'shadow____' : 'shadow',
|
||||
'outline___' : 'outline',
|
||||
'small-caps' : 'small-caps',
|
||||
'caps______' : 'caps',
|
||||
'dbl-strike' : 'double-strike-through',
|
||||
'emboss____' : 'emboss',
|
||||
'engrave___' : 'engrave',
|
||||
'subscript_' : 'subscript',
|
||||
'superscrip' : 'superscipt',
|
||||
'font-style' : 'font-style',
|
||||
'font-color' : 'font-color',
|
||||
'font-size_' : 'font-size',
|
||||
'font-up___' : 'superscript',
|
||||
'font-down_' : 'subscript',
|
||||
'red_______' : 'red',
|
||||
'blue______' : 'blue',
|
||||
'green_____' : 'green',
|
||||
# table => tb
|
||||
'row-def___' : 'row-definition',
|
||||
'cell______' : 'cell',
|
||||
'row_______' : 'row',
|
||||
'in-table__' : 'in-table',
|
||||
'columns___' : 'columns',
|
||||
'row-pos-le' : 'row-position-left',
|
||||
'cell-posit' : 'cell-position',
|
||||
# preamble => pr
|
||||
# underline
|
||||
'underlined' : 'underlined',
|
||||
# border => bd
|
||||
'bor-t-r-hi' : 'border-table-row-horizontal-inside',
|
||||
'bor-t-r-vi' : 'border-table-row-vertical-inside',
|
||||
'bor-t-r-to' : 'border-table-row-top',
|
||||
'bor-t-r-le' : 'border-table-row-left',
|
||||
'bor-t-r-bo' : 'border-table-row-bottom',
|
||||
'bor-t-r-ri' : 'border-table-row-right',
|
||||
'bor-cel-bo' : 'border-cell-bottom',
|
||||
'bor-cel-to' : 'border-cell-top',
|
||||
'bor-cel-le' : 'border-cell-left',
|
||||
'bor-cel-ri' : 'border-cell-right',
|
||||
'bor-par-bo' : 'border-paragraph-bottom',
|
||||
'bor-par-to' : 'border-paragraph-top',
|
||||
'bor-par-le' : 'border-paragraph-left',
|
||||
'bor-par-ri' : 'border-paragraph-right',
|
||||
'bor-par-bo' : 'border-paragraph-box',
|
||||
'bor-for-ev' : 'border-for-every-paragraph',
|
||||
'bor-outsid' : 'border-outisde',
|
||||
'bor-none__' : 'border',
|
||||
# border type => bt
|
||||
'bdr-single' : 'single',
|
||||
'bdr-doubtb' : 'double-thickness-border',
|
||||
'bdr-shadow' : 'shadowed-border',
|
||||
'bdr-double' : 'double-border',
|
||||
'bdr-dotted' : 'dotted-border',
|
||||
'bdr-dashed' : 'dashed',
|
||||
'bdr-hair__' : 'hairline',
|
||||
'bdr-inset_' : 'inset',
|
||||
'bdr-das-sm' : 'dash-small',
|
||||
'bdr-dot-sm' : 'dot-dash',
|
||||
'bdr-dot-do' : 'dot-dot-dash',
|
||||
'bdr-outset' : 'outset',
|
||||
'bdr-trippl' : 'tripple',
|
||||
'bdr-thsm__' : 'thick-thin-small',
|
||||
'bdr-htsm__' : 'thin-thick-small',
|
||||
'bdr-hthsm_' : 'thin-thick-thin-small',
|
||||
'bdr-thm__' : 'thick-thin-medium',
|
||||
'bdr-htm__' : 'thin-thick-medium',
|
||||
'bdr-hthm_' : 'thin-thick-thin-medium',
|
||||
'bdr-thl__' : 'thick-thin-large',
|
||||
'bdr-hthl_' : 'think-thick-think-large',
|
||||
'bdr-wavy_' : 'wavy',
|
||||
'bdr-d-wav' : 'double-wavy',
|
||||
'bdr-strip' : 'striped',
|
||||
'bdr-embos' : 'emboss',
|
||||
'bdr-engra' : 'engrave',
|
||||
'bdr-frame' : 'frame',
|
||||
'bdr-li-wid' : 'line-width',
|
||||
}
|
||||
self.__tabs_dict = {
|
||||
'cw<pf<tab-stop__' : self.__tab_stop_func,
|
||||
'cw<pf<tab-center' : self.__tab_type_func,
|
||||
'cw<pf<tab-right_' : self.__tab_type_func,
|
||||
'cw<pf<tab-dec___' : self.__tab_type_func,
|
||||
'cw<pf<leader-dot' : self.__tab_leader_func,
|
||||
'cw<pf<leader-hyp' : self.__tab_leader_func,
|
||||
'cw<pf<leader-und' : self.__tab_leader_func,
|
||||
'cw<pf<tab-bar-st' : self.__tab_bar_func,
|
||||
}
|
||||
self.__tab_type_dict = {
|
||||
'cw<pf<tab-center' : 'center',
|
||||
'cw<pf<tab-right_' : 'right',
|
||||
'cw<pf<tab-dec___' : 'decimal',
|
||||
'cw<pf<leader-dot' : 'leader-dot',
|
||||
'cw<pf<leader-hyp' : 'leader-hyphen',
|
||||
'cw<pf<leader-und' : 'leader-underline',
|
||||
}
|
||||
self.__border_obj = border_parse.BorderParse()
|
||||
self.__style_num_strings = []
|
||||
self.__body_style_strings = []
|
||||
self.__state = 'before_1st_para_def'
|
||||
self.__att_val_dict = {}
|
||||
self.__start_marker = 'mi<mk<pard-start\n' # outside para tags
|
||||
self.__start2_marker = 'mi<mk<pardstart_\n' # inside para tags
|
||||
self.__end2_marker = 'mi<mk<pardend___\n' # inside para tags
|
||||
self.__end_marker = 'mi<mk<pard-end__\n' # outside para tags
|
||||
self.__text_string = ''
|
||||
self.__state_dict = {
|
||||
'before_1st_para_def' : self.__before_1st_para_def_func,
|
||||
'collect_tokens' : self.__collect_tokens_func,
|
||||
'after_para_def' : self.__after_para_def_func,
|
||||
'in_paragraphs' : self.__in_paragraphs_func,
|
||||
'after_para_end' : self.__after_para_end_func,
|
||||
}
|
||||
self.__collect_tokens_dict = {
|
||||
'mi<mk<para-start' : self.__end_para_def_func,
|
||||
'cw<pf<par-def___' : self.__para_def_in_para_def_func,
|
||||
'cw<tb<cell______' : self.__empty_table_element_func,
|
||||
'cw<tb<row_______' : self.__empty_table_element_func,
|
||||
}
|
||||
self.__after_para_def_dict = {
|
||||
'mi<mk<para-start' : self.__start_para_after_def_func,
|
||||
'cw<pf<par-def___' : self.__found_para_def_func,
|
||||
'cw<tb<cell______' : self.__empty_table_element_func,
|
||||
'cw<tb<row_______' : self.__empty_table_element_func,
|
||||
}
|
||||
self.__in_paragraphs_dict = {
|
||||
'mi<mk<para-end__' : self.__found_para_end_func,
|
||||
}
|
||||
self.__after_para_end_dict = {
|
||||
'mi<mk<para-start' : self.__continue_block_func,
|
||||
'mi<mk<para-end__' : self.__continue_block_func,
|
||||
'cw<pf<par-def___' : self.__new_para_def_func,
|
||||
'mi<mk<body-close' : self.__stop_block_func,
|
||||
'mi<mk<par-in-fld' : self.__stop_block_func,
|
||||
'cw<tb<cell______' : self.__stop_block_func,
|
||||
'cw<tb<row-def___' : self.__stop_block_func,
|
||||
'cw<tb<row_______' : self.__stop_block_func,
|
||||
'mi<mk<sect-close' : self.__stop_block_func,
|
||||
'mi<mk<sect-start' : self.__stop_block_func,
|
||||
'mi<mk<header-beg' : self.__stop_block_func,
|
||||
'mi<mk<header-end' : self.__stop_block_func,
|
||||
'mi<mk<head___clo' : self.__stop_block_func,
|
||||
'mi<mk<fldbk-end_' : self.__stop_block_func,
|
||||
'mi<mk<lst-txbeg_' : self.__stop_block_func,
|
||||
}
|
||||
def __before_1st_para_def_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Look for the beginning of a paragraph definition.
|
||||
"""
|
||||
##cw<pf<par-def___<nu<true
|
||||
if self.__token_info == 'cw<pf<par-def___':
|
||||
self.__found_para_def_func()
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __found_para_def_func(self):
|
||||
self.__state = 'collect_tokens'
|
||||
# not exactly right--have to reset the dictionary--give it default
|
||||
# values
|
||||
self.__reset_dict()
|
||||
def __collect_tokens_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Check the collect_tokens_dict for either the beginning of a
|
||||
paragraph or a new paragraph definition. Take the actions
|
||||
according to the value in the dict.
|
||||
Otherwise, check if the token is not a control word. If it is not,
|
||||
change the state to after_para_def.
|
||||
Otherwise, check if the token is a paragraph definition word; if
|
||||
so, add it to the attributes and values dictionary.
|
||||
"""
|
||||
action = self.__collect_tokens_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
elif line[0:2] != 'cw':
|
||||
self.__write_obj.write(line)
|
||||
self.__state = 'after_para_def'
|
||||
elif line[0:5] == 'cw<bd':
|
||||
self.__parse_border(line)
|
||||
else:
|
||||
action = self.__tabs_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
else:
|
||||
token = self.__token_dict.get(line[6:16])
|
||||
if token:
|
||||
self.__att_val_dict[token] = line[20:-1]
|
||||
def __tab_stop_func(self, line):
|
||||
"""
|
||||
"""
|
||||
type = 'tabs-%s' % self.__tab_type
|
||||
self.__att_val_dict['tabs'] += '%s:' % self.__tab_type
|
||||
self.__att_val_dict['tabs'] += '%s;' % line[20:-1]
|
||||
self.__tab_type = 'left'
|
||||
def __tab_type_func(self, line):
|
||||
"""
|
||||
"""
|
||||
type = self.__tab_type_dict.get(self.__token_info)
|
||||
if type != None:
|
||||
self.__tab_type = type
|
||||
else:
|
||||
if self.__run_level > 3:
|
||||
msg = 'no entry for %s\n' % self.__token_info
|
||||
raise self.__bug_handler, msg
|
||||
def __tab_leader_func(self, line):
|
||||
"""
|
||||
"""
|
||||
leader = self.__tab_type_dict.get(self.__token_info)
|
||||
if leader != None:
|
||||
type = 'tabs-%s' % self.__tab_type
|
||||
self.__att_val_dict['tabs'] += '%s^' % leader
|
||||
else:
|
||||
if self.__run_level > 3:
|
||||
msg = 'no entry for %s\n' % self.__token_info
|
||||
raise self.__bug_handler, msg
|
||||
def __tab_bar_func(self, line):
|
||||
"""
|
||||
"""
|
||||
# self.__att_val_dict['tabs-bar'] += '%s:' % line[20:-1]
|
||||
self.__att_val_dict['tabs'] += 'bar:%s;' % (line[20:-1])
|
||||
self.__tab_type = 'left'
|
||||
def __parse_border(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing (updates dictionary)
|
||||
Logic:
|
||||
Uses the border_parse module to return a dictionary of attribute
|
||||
value pairs for a border line.
|
||||
"""
|
||||
border_dict = self.__border_obj.parse_border(line)
|
||||
self.__att_val_dict.update(border_dict)
|
||||
def __para_def_in_para_def_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I have found a \pard while I am collecting tokens. I want to reset
the dictionary and do nothing else.
|
||||
"""
|
||||
# Change this
|
||||
self.__state = 'collect_tokens'
|
||||
self.__reset_dict()
|
||||
def __end_para_def_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
Nothing
|
||||
Returns:
|
||||
Nothing
|
||||
Logic:
|
||||
The previous state was collect_tokens, and I have found the start
of a paragraph. I want to output the definition tag; output the line
itself (telling me of the beginning of a paragraph); and change the
state to 'in_paragraphs'.
|
||||
"""
|
||||
self.__write_para_def_beg()
|
||||
self.__write_obj.write(line)
|
||||
self.__state = 'in_paragraphs'
|
||||
def __start_para_after_def_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
Nothing
|
||||
Returns:
|
||||
Nothing
|
||||
Logic:
|
||||
The state is after_para_def, and I have found the start of a
paragraph. I want to output the definition tag; output the line
itself (telling me of the beginning of a paragraph); and change the
state to 'in_paragraphs'.
|
||||
(I now realize that this is absolutely identical to the function above!)
|
||||
"""
|
||||
self.__write_para_def_beg()
|
||||
self.__write_obj.write(line)
|
||||
self.__state = 'in_paragraphs'
|
||||
def __after_para_def_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Check if the token info is the start of a paragraph. If so, call
|
||||
on the function found in the value of the dictionary.
|
||||
"""
|
||||
action = self.__after_para_def_dict.get(self.__token_info)
|
||||
if self.__token_info == 'cw<pf<par-def___':
|
||||
self.__found_para_def_func()
|
||||
elif action:
|
||||
action(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __in_paragraphs_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --current line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Look for the end of a paragraph, the start of a cell or row.
|
||||
"""
|
||||
action = self.__in_paragraphs_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __found_para_end_func(self,line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to print out
|
||||
Returns:
|
||||
Nothing
|
||||
Logic:
|
||||
State is in paragraphs. You have found the end of a paragraph. You
|
||||
need to print out the line and change the state to after
|
||||
paragraphs.
|
||||
"""
|
||||
self.__state = 'after_para_end'
|
||||
self.__write_obj.write(line)
|
||||
def __after_para_end_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to output
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
The state is after the end of a paragraph. You are collecting all
|
||||
the lines in a string and waiting to see if you need to write
|
||||
out the paragraph definition. If you find another paragraph
|
||||
definition, then you write out the old paragraph dictionary and
|
||||
print out the string. You change the state to collect tokens.
|
||||
If you find any larger block elements, such as cell, row,
field-block, or section, you write out the paragraph definition and
|
||||
then the text string.
|
||||
If you find the beginning of a paragraph, then you don't need to
|
||||
write out the paragraph definition. Write out the string, and
|
||||
change the state to in paragraphs.
|
||||
"""
|
||||
self.__text_string += line
|
||||
action = self.__after_para_end_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
def __continue_block_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to print out
|
||||
Returns:
|
||||
Nothing
|
||||
Logic:
|
||||
The state is after the end of a paragraph. You have found the
|
||||
start of a paragraph, so you don't need to print out the paragraph
|
||||
definition. Print out the string, the line, and change the state
|
||||
to in paragraphs.
|
||||
"""
|
||||
self.__state = 'in_paragraphs'
|
||||
self.__write_obj.write(self.__text_string)
|
||||
self.__text_string = ''
|
||||
# found a new paragraph definition after an end of a paragraph
|
||||
def __new_para_def_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to output
|
||||
Returns:
|
||||
Nothing
|
||||
Logic:
|
||||
You have found a new paragraph definition at the end of a
paragraph. Output the end of the old paragraph definition. Output
|
||||
the text string. Output the line. Change the state to collect
|
||||
tokens. (And don't forget to set the text string to ''!)
|
||||
"""
|
||||
self.__write_para_def_end_func()
|
||||
self.__found_para_def_func()
|
||||
# after a paragraph and found reason to stop this block
|
||||
def __stop_block_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --(shouldn't be here?)
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
The state is after a paragraph, and you have found a larger block
|
||||
than paragraph-definition. You want to write the end tag of the
|
||||
old definition and reset the text string (handled by other
|
||||
methods).
|
||||
"""
|
||||
self.__write_para_def_end_func()
|
||||
self.__state = 'after_para_def'
|
||||
def __write_para_def_end_func(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Print out the end of the paragraph definition tag, and the markers
|
||||
that let me know when I have reached this tag. (These markers are
|
||||
used for later parsing.)
|
||||
"""
|
||||
self.__write_obj.write(self.__end2_marker)
|
||||
self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
|
||||
self.__write_obj.write(self.__end_marker)
|
||||
self.__write_obj.write(self.__text_string)
|
||||
self.__text_string = ''
|
||||
keys = self.__att_val_dict.keys()
|
||||
if 'font-style' in keys:
|
||||
self.__write_obj.write('mi<mk<font-end__\n')
|
||||
if 'caps' in keys:
|
||||
self.__write_obj.write('mi<mk<caps-end__\n')
|
||||
def __get_num_of_style(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Get a unique value for each style.
|
||||
"""
|
||||
my_string = ''
|
||||
new_style = 0
|
||||
# when determining uniqueness for a style, ignore these values, since
|
||||
# they don't tell us if the style is unique
|
||||
ignore_values = ['style-num', 'nest-level', 'in-table']
|
||||
keys = self.__att_val_dict.keys()
|
||||
keys.sort()
|
||||
for key in keys:
|
||||
if key in ignore_values:
|
||||
continue
|
||||
my_string += '%s:%s' % (key, self.__att_val_dict[key])
|
||||
if my_string in self.__style_num_strings:
|
||||
num = self.__style_num_strings.index(my_string)
|
||||
num += 1 # since indexing starts at zero, rather than 1
|
||||
else:
|
||||
self.__style_num_strings.append(my_string)
|
||||
num = len(self.__style_num_strings)
|
||||
new_style = 1
|
||||
num = '%04d' % num
|
||||
self.__att_val_dict['style-num'] = 's' + str(num)
|
||||
if new_style:
|
||||
self.__write_body_styles()
|
||||
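__get_num_of_style effectively interns each distinct set of attributes: the first time a combination is seen it is given the next number, and any repeat reuses the number already assigned. The same idea in isolation, as a minimal sketch rather than the module's own code:

# Sketch of the style-interning idea used above: identical attribute sets
# share one style number.  Illustrative only; not part of this module.
def style_number(att_dict, seen, ignore=('style-num', 'nest-level', 'in-table')):
    key = ''.join('%s:%s' % (k, att_dict[k])
                  for k in sorted(att_dict) if k not in ignore)
    if key in seen:
        return 's%04d' % (seen.index(key) + 1), False   # existing style
    seen.append(key)
    return 's%04d' % len(seen), True                    # new style

# seen = []
# style_number({'bold': 'true'}, seen)     -> ('s0001', True)
# style_number({'bold': 'true'}, seen)     -> ('s0001', False)
# style_number({'italics': 'true'}, seen)  -> ('s0002', True)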
def __write_body_styles(self):
|
||||
style_string = ''
|
||||
style_string += 'mi<tg<empty-att_<paragraph-style-in-body'
|
||||
style_string += '<name>%s' % self.__att_val_dict['name']
|
||||
style_string += '<style-number>%s' % self.__att_val_dict['style-num']
|
||||
tabs_list = ['tabs-left', 'tabs-right', 'tabs-decimal', 'tabs-center',
|
||||
'tabs-bar', 'tabs']
|
||||
if self.__att_val_dict['tabs'] != '':
|
||||
the_value = self.__att_val_dict['tabs']
|
||||
# the_value = the_value[:-1]
|
||||
style_string += ('<%s>%s' % ('tabs', the_value))
|
||||
keys = self.__att_val_dict.keys()
|
||||
keys.sort()
|
||||
for key in keys:
|
||||
if key != 'name' and key !='style-num' and key != 'in-table'\
|
||||
and key not in tabs_list:
|
||||
style_string += ('<%s>%s' % (key, self.__att_val_dict[key]))
|
||||
style_string += '\n'
|
||||
self.__body_style_strings.append(style_string)
|
||||
def __write_para_def_beg(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Print out the beginning of the paragraph definition tag, and the markers
|
||||
that let me know when I have reached this tag. (These markers are
|
||||
used for later parsing.)
|
||||
"""
|
||||
self.__get_num_of_style()
|
||||
table = self.__att_val_dict.get('in-table')
|
||||
if table:
|
||||
# del self.__att_val_dict['in-table']
|
||||
self.__write_obj.write('mi<mk<in-table__\n')
|
||||
else:
|
||||
self.__write_obj.write('mi<mk<not-in-tbl\n')
|
||||
left_indent = self.__att_val_dict.get('left-indent')
|
||||
if left_indent:
|
||||
self.__write_obj.write('mi<mk<left_inden<%s\n' % left_indent)
|
||||
is_list = self.__att_val_dict.get('list-id')
|
||||
if is_list:
|
||||
self.__write_obj.write('mi<mk<list-id___<%s\n' % is_list)
|
||||
else:
|
||||
self.__write_obj.write('mi<mk<no-list___\n')
|
||||
self.__write_obj.write('mi<mk<style-name<%s\n' % self.__att_val_dict['name'])
|
||||
self.__write_obj.write(self.__start_marker)
|
||||
self.__write_obj.write('mi<tg<open-att__<paragraph-definition')
|
||||
self.__write_obj.write('<name>%s' % self.__att_val_dict['name'])
|
||||
self.__write_obj.write('<style-number>%s' % self.__att_val_dict['style-num'])
|
||||
tabs_list = ['tabs-left', 'tabs-right', 'tabs-decimal', 'tabs-center',
|
||||
'tabs-bar', 'tabs']
|
||||
"""
|
||||
for tab_item in tabs_list:
|
||||
if self.__att_val_dict[tab_item] != '':
|
||||
the_value = self.__att_val_dict[tab_item]
|
||||
the_value = the_value[:-1]
|
||||
self.__write_obj.write('<%s>%s' % (tab_item, the_value))
|
||||
"""
|
||||
if self.__att_val_dict['tabs'] != '':
|
||||
the_value = self.__att_val_dict['tabs']
|
||||
# the_value = the_value[:-1]
|
||||
self.__write_obj.write('<%s>%s' % ('tabs', the_value))
|
||||
keys = self.__att_val_dict.keys()
|
||||
keys.sort()
|
||||
for key in keys:
|
||||
if key != 'name' and key !='style-num' and key != 'in-table'\
|
||||
and key not in tabs_list:
|
||||
self.__write_obj.write('<%s>%s' % (key, self.__att_val_dict[key]))
|
||||
self.__write_obj.write('\n')
|
||||
self.__write_obj.write(self.__start2_marker)
|
||||
if 'font-style' in keys:
|
||||
face = self.__att_val_dict['font-style']
|
||||
self.__write_obj.write('mi<mk<font______<%s\n' % face)
|
||||
if 'caps' in keys:
|
||||
value = self.__att_val_dict['caps']
|
||||
self.__write_obj.write('mi<mk<caps______<%s\n' % value)
|
||||
def __empty_table_element_func(self, line):
|
||||
self.__write_obj.write('mi<mk<in-table__\n')
|
||||
self.__write_obj.write(line)
|
||||
self.__state = 'after_para_def'
|
||||
def __reset_dict(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
The dictionary containing values and attributes must be reset each
|
||||
time a new paragraph definition is found.
|
||||
"""
|
||||
self.__att_val_dict.clear()
|
||||
self.__att_val_dict['name'] = 'Normal'
|
||||
self.__att_val_dict['font-style'] = self.__default_font
|
||||
self.__tab_type = 'left'
|
||||
self.__att_val_dict['tabs-left'] = ''
|
||||
self.__att_val_dict['tabs-right'] = ''
|
||||
self.__att_val_dict['tabs-center'] = ''
|
||||
self.__att_val_dict['tabs-decimal'] = ''
|
||||
self.__att_val_dict['tabs-bar'] = ''
|
||||
self.__att_val_dict['tabs'] = ''
|
||||
def make_paragraph_def(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing (changes the original file)
|
||||
Logic:
|
||||
Read one line in at a time. Determine what action to take based on
|
||||
the state.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no matching state in module paragraph_def.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "paragraphs_def.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
return self.__body_style_strings
|
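make_paragraph_def, like every stage in this pipeline, walks a line-oriented intermediate file in which the first sixteen characters of each line identify the token and the rest is payload, dispatching on that prefix through a dictionary of handlers. A self-contained sketch of the idiom (the token strings and handlers below are illustrative assumptions, not this module's API):

# Minimal sketch of the rtf2xml line-dispatch idiom.  The token strings and
# handlers are assumptions chosen for illustration only.
def dispatch_tokens(lines):
    out = []

    def start_para(line):
        out.append('<para>')

    def end_para(line):
        out.append('</para>')

    def text(line):
        # Payload follows the 16-character token and a '<' separator.
        out.append(line[17:].rstrip('\n'))

    handlers = {
        'mi<mk<para-start': start_para,
        'mi<mk<para-end__': end_para,
        'tx<nu<__________': text,
    }
    for line in lines:
        token_info = line[:16]          # first 16 characters identify the token
        action = handlers.get(token_info)
        if action:
            action(line)
    return out

# dispatch_tokens(['mi<mk<para-start\n',
#                  'tx<nu<__________<Hello\n',
#                  'mi<mk<para-end__\n'])
# -> ['<para>', 'Hello', '</para>']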
253
src/libprs500/ebooks/rtf2xml/paragraphs.py
Executable file
@ -0,0 +1,253 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class Paragraphs:
|
||||
"""
|
||||
=================
|
||||
Purpose
|
||||
=================
|
||||
Write paragraph tags for a tokenized file. (This module won't be of any use
to you unless you use it as part of the other modules.)
|
||||
-------------
|
||||
Method
|
||||
-------------
|
||||
RTF does not tell you when a paragraph begins. It only tells you when the
|
||||
paragraph ends.
|
||||
In order to make paragraphs out of this limited info, the parser starts in the
|
||||
body of the documents and assumes it is not in a paragraph. It looks for clues
|
||||
to begin a paragraph. Text starts a paragraph; so does an inline field or
|
||||
list-text. If an end of paragraph marker (\par) is found, then this indicates
|
||||
a blank paragraph.
|
||||
Once a paragraph is found, the state changes to 'paragraph.' In this state,
|
||||
clues are looked for that mark the end of a paragraph. The end-of-paragraph marker
|
||||
(\par) marks the end of a paragraph. So does the end of a footnote or heading;
|
||||
a paragraph definition; the end of a field-block; and the beginning of a
|
||||
section. (How about the end of a section or the end of a field-block?)
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
write_empty_para = 1,
|
||||
run_level = 1,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__write_empty_para = write_empty_para
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
"""
|
||||
self.__state = 'before_body'
|
||||
self.__start_marker = 'mi<mk<para-start\n' # outside para tags
|
||||
self.__start2_marker = 'mi<mk<par-start_\n' # inside para tags
|
||||
self.__end2_marker = 'mi<mk<par-end___\n' # inside para tags
|
||||
self.__end_marker = 'mi<mk<para-end__\n' # outside para tags
|
||||
self.__state_dict = {
|
||||
'before_body' : self.__before_body_func,
|
||||
'not_paragraph' : self.__not_paragraph_func,
|
||||
'paragraph' : self.__paragraph_func,
|
||||
}
|
||||
self.__paragraph_dict = {
|
||||
'cw<pf<par-end___' : self.__close_para_func, # end of paragraph
|
||||
'mi<mk<headi_-end' : self.__close_para_func, # end of header or footer
|
||||
##'cw<pf<par-def___' : self.__close_para_func, # paragraph definition
|
||||
# 'mi<mk<fld-bk-end' : self.__close_para_func, # end of field-block
|
||||
'mi<mk<fldbk-end_' : self.__close_para_func, # end of field-block
|
||||
'mi<mk<body-close' : self.__close_para_func, # end of body
|
||||
'mi<mk<sect-close' : self.__close_para_func, # end of body
|
||||
'mi<mk<sect-start' : self.__close_para_func, # start of section
|
||||
'mi<mk<foot___clo' : self.__close_para_func, # end of footnote
|
||||
'cw<tb<cell______' : self.__close_para_func, # end of cell
|
||||
'mi<mk<par-in-fld' : self.__close_para_func, # start of block field
|
||||
'cw<pf<par-def___' : self.__bogus_para__def_func, # paragraph definition
|
||||
}
|
||||
self.__not_paragraph_dict = {
|
||||
'tx<nu<__________' : self.__start_para_func,
|
||||
'tx<hx<__________' : self.__start_para_func,
|
||||
'tx<ut<__________' : self.__start_para_func,
|
||||
'tx<mc<__________' : self.__start_para_func,
|
||||
'mi<mk<inline-fld' : self.__start_para_func,
|
||||
'mi<mk<para-beg__' : self.__start_para_func,
|
||||
'cw<pf<par-end___' : self.__empty_para_func,
|
||||
'mi<mk<pict-start' : self.__start_para_func,
|
||||
'cw<pf<page-break' : self.__empty_pgbk_func, # page break
|
||||
}
|
||||
def __before_body_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function handles all the lines before the start of the body.
|
||||
Once the body starts, the state is switched to 'not_paragraph'
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<body-open_':
|
||||
self.__state = 'not_paragraph'
|
||||
self.__write_obj.write(line)
|
||||
def __not_paragraph_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function handles all lines that are outside of the paragraph.
|
||||
It looks for clues that start a paragraph, and when found,
|
||||
switches states and writes the start tags.
|
||||
"""
|
||||
action = self.__not_paragraph_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
self.__write_obj.write(line)
|
||||
def __paragraph_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function handles all the lines that are in the paragraph. It
|
||||
looks for clues to the end of the paragraph. When a clue is found,
|
||||
it calls on another method to write the end of the tag and change
|
||||
the state.
|
||||
"""
|
||||
action = self.__paragraph_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __start_para_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function writes the beginning tags for a paragraph and
|
||||
changes the state to paragraph.
|
||||
"""
|
||||
self.__write_obj.write(self.__start_marker) # marker for later parsing
|
||||
self.__write_obj.write(
|
||||
'mi<tg<open______<para\n'
|
||||
)
|
||||
self.__write_obj.write(self.__start2_marker)
|
||||
self.__state = 'paragraph'
|
||||
def __empty_para_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function writes the empty tags for a paragraph.
|
||||
It does not do anything if self.__write_empty_para is 0.
|
||||
"""
|
||||
if self.__write_empty_para:
|
||||
self.__write_obj.write(self.__start_marker) # marker for later parsing
|
||||
self.__write_obj.write(
|
||||
'mi<tg<empty_____<para\n'
|
||||
)
|
||||
self.__write_obj.write(self.__end_marker) # marker for later parsing
|
||||
def __empty_pgbk_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function writes the empty tags for a page break.
|
||||
"""
|
||||
self.__write_obj.write(
|
||||
'mi<tg<empty_____<page-break\n'
|
||||
)
|
||||
def __close_para_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function writes the end tags for a paragraph and
|
||||
changes the state to not_paragraph.
|
||||
"""
|
||||
self.__write_obj.write(self.__end2_marker) # marker for later parser
|
||||
self.__write_obj.write(
|
||||
'mi<tg<close_____<para\n'
|
||||
)
|
||||
self.__write_obj.write(self.__end_marker) # marker for later parser
|
||||
self.__write_obj.write(line)
|
||||
self.__state = 'not_paragraph'
|
||||
def __bogus_para__def_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
if a \pard occurs in a paragraph, I want to ignore it. (I believe)
|
||||
"""
|
||||
self.__write_obj.write('mi<mk<bogus-pard\n')
|
||||
def make_paragraphs(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing (changes the original file)
|
||||
Logic:
|
||||
Read one line in at a time. Determine what action to take based on
|
||||
the state. If the state is before the body, look for the
|
||||
beginning of the body.
|
||||
When the body is found, change the state to 'not_paragraph'. The
|
||||
only other state is 'paragraph'.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no matching state in module paragraphs.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "paragraphs.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
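Because RTF only marks where a paragraph ends, make_paragraphs runs the small state machine described in the class docstring: outside a paragraph, text is the clue that one begins; inside, \par (and a few block-level markers) closes it. A toy model of that begin-on-text, end-on-\par logic, using simplified tokens rather than the module's real ones:

# Toy model of the paragraph state machine: text opens a paragraph, a 'par'
# token closes it, and a 'par' with nothing open becomes an empty paragraph.
# Simplified tokens for illustration; not the module's actual input format.
def mark_paragraphs(tokens):
    out = []
    in_para = False
    for kind, value in tokens:              # kind is 'text' or 'par'
        if kind == 'text':
            if not in_para:
                out.append('<para>')        # clue found: a paragraph begins
                in_para = True
            out.append(value)
        elif kind == 'par':
            if in_para:
                out.append('</para>')       # \par ends the open paragraph
                in_para = False
            else:
                out.append('<para/>')       # blank paragraph
    if in_para:
        out.append('</para>')
    return out

# mark_paragraphs([('text', 'One'), ('par', None),
#                  ('text', 'Two'), ('par', None), ('par', None)])
# -> ['<para>', 'One', '</para>', '<para>', 'Two', '</para>', '<para/>']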
186
src/libprs500/ebooks/rtf2xml/pict.py
Executable file
@ -0,0 +1,186 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class Pict:
|
||||
"""Process graphic information"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
out_file,
|
||||
copy = None,
|
||||
orig_file = None,
|
||||
run_level = 1,
|
||||
):
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__bracket_count = 0
|
||||
self.__ob_count = 0
|
||||
self.__cb_count = 0
|
||||
self.__pict_count = 0
|
||||
self.__in_pict = 0
|
||||
self.__already_found_pict = 0
|
||||
self.__orig_file = orig_file
|
||||
self.__initiate_pict_dict()
|
||||
self.__out_file = out_file
|
||||
# this is left over
|
||||
self.__no_ask = 1
|
||||
def __initiate_pict_dict(self):
|
||||
self.__pict_dict = {
|
||||
'ob<nu<open-brack' : self.__open_br_func,
|
||||
'cb<nu<clos-brack' : self.__close_br_func,
|
||||
'tx<nu<__________' : self.__text_func,
|
||||
}
|
||||
def __open_br_func(self, line):
|
||||
return "{\n"
|
||||
def __close_br_func(self, line):
|
||||
return "}\n"
|
||||
def __text_func(self, line):
|
||||
#tx<nu<__________<true text
|
||||
return line[18:]
|
||||
def __make_dir(self):
|
||||
""" Make a dirctory to put the image data in"""
|
||||
base_name = os.path.basename(self.__orig_file)
|
||||
base_name = os.path.splitext(base_name)[0]
|
||||
if self.__out_file:
|
||||
dir_name = os.path.dirname(self.__out_file)
|
||||
else:
|
||||
dir_name = os.path.dirname(self.__orig_file)
|
||||
# self.__output_to_file_func()
|
||||
self.__dir_name = base_name + "_rtf_pict_dir/"
|
||||
self.__dir_name = os.path.join(dir_name, self.__dir_name)
|
||||
if not os.path.isdir(self.__dir_name):
|
||||
try:
|
||||
os.mkdir(self.__dir_name)
|
||||
except OSError, msg:
|
||||
msg = str(msg)
|
||||
msg += "Couldn't make directory '%s':\n" % (self.__dir_name)
|
||||
raise self.__bug_handler
|
||||
else:
|
||||
if self.__no_ask:
|
||||
user_response = 'r'
|
||||
else:
|
||||
msg = 'Do you want to remove all files in %s?\n' % self.__dir_name
|
||||
msg += 'Type "r" to remove.\n'
|
||||
msg += 'Type any other key to keep files in place.\n'
|
||||
sys.stderr.write(msg)
|
||||
user_response = raw_input()
|
||||
if user_response == 'r':
|
||||
if self.__run_level > 1:
|
||||
sys.stderr.write('Removing files from old pict directory...\n')
|
||||
all_files = os.listdir(self.__dir_name)
|
||||
for the_file in all_files:
|
||||
the_file = os.path.join(self.__dir_name, the_file)
|
||||
try:
|
||||
os.remove(the_file)
|
||||
except OSError:
|
||||
pass
|
||||
if self.__run_level > 1:
|
||||
sys.stderr.write('Files removed.\n')
|
||||
def __create_pict_file(self):
|
||||
"""Create a file for all the pict data to be written to.
|
||||
"""
|
||||
self.__pict_file = os.path.join(self.__dir_name, 'picts.rtf')
|
||||
write_pic_obj = open(self.__pict_file, 'w')
|
||||
write_pic_obj.close()
|
||||
self.__write_pic_obj = open(self.__pict_file, 'a')
|
||||
def __in_pict_func(self, line):
|
||||
if self.__cb_count == self.__pict_br_count:
|
||||
self.__in_pict = 0
|
||||
self.__write_pic_obj.write("}\n")
|
||||
return 1
|
||||
else:
|
||||
action = self.__pict_dict.get(self.__token_info)
|
||||
if action:
|
||||
line = action(line)
|
||||
self.__write_pic_obj.write(line)
|
||||
return 0
|
||||
def __default(self, line, write_obj):
|
||||
"""Determine if each token marks the beginning of pict data.
|
||||
If it does, create a new file to write data to (if that file
|
||||
has not already been created.) Set the self.__in_pict flag to true.
|
||||
If the line does not contain pict data, return 1
|
||||
"""
|
||||
"""
|
||||
$pict_count++;
|
||||
$pict_count = sprintf("%03d", $pict_count);
|
||||
print OUTPUT "dv<xx<em<nu<pict<at<num>$pict_count\n";
|
||||
"""
|
||||
if self.__token_info == 'cw<gr<picture___':
|
||||
self.__pict_count += 1
|
||||
# write_obj.write("mi<tg<em<at<pict<num>%03d\n" % self.__pict_count)
|
||||
write_obj.write('mi<mk<pict-start\n')
|
||||
write_obj.write('mi<tg<empty-att_<pict<num>%03d\n' % self.__pict_count)
|
||||
write_obj.write('mi<mk<pict-end__\n')
|
||||
if not self.__already_found_pict:
|
||||
self.__create_pict_file()
|
||||
self.__already_found_pict=1;
|
||||
self.__print_rtf_header()
|
||||
self.__in_pict = 1
|
||||
self.__pict_br_count = self.__ob_count
|
||||
self.__cb_count = 0
|
||||
self.__write_pic_obj.write("{\\pict\n")
|
||||
return 0
|
||||
return 1
|
||||
def __print_rtf_header(self):
|
||||
"""Print to pict file the necessary RTF data for the file to be
|
||||
recognized as an RTF file.
|
||||
"""
|
||||
self.__write_pic_obj.write("{\\rtf1 \n")
|
||||
self.__write_pic_obj.write("{\\fonttbl\\f0\\null;} \n")
|
||||
self.__write_pic_obj.write("{\\colortbl\\red255\\green255\\blue255;} \n")
|
||||
self.__write_pic_obj.write("\\pard \n")
|
||||
def process_pict(self):
|
||||
self.__make_dir()
|
||||
read_obj = open(self.__file)
|
||||
write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 'dummy'
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
if not self.__in_pict:
|
||||
to_print = self.__default(line, write_obj)
|
||||
if to_print :
|
||||
write_obj.write(line)
|
||||
else:
|
||||
to_print = self.__in_pict_func(line)
|
||||
if to_print :
|
||||
write_obj.write(line)
|
||||
if self.__already_found_pict:
|
||||
self.__write_pic_obj.write("}\n")
|
||||
self.__write_pic_obj.close()
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "pict.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
if self.__pict_count == 0:
|
||||
try:
|
||||
os.rmdir(self.__dir_name)
|
||||
except OSError:
|
||||
pass
|
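process_pict turns on bracket bookkeeping: when a picture group opens it remembers the open-bracket count, and the picture is finished once the close-bracket count comes back to that value. The depth-matching idea on its own, as a simplified sketch (the real module compares the four-digit counts carried in the tokens):

# Simplified depth matching for pulling one brace-delimited group (such as a
# picture) out of a token stream.  Illustrative only.
def extract_group(tokens, start_index):
    depth = 0
    group = []
    for tok in tokens[start_index:]:
        if tok == '{':
            depth += 1
        group.append(tok)
        if tok == '}':
            depth -= 1
            if depth == 0:              # back at the depth where the group opened
                break
    return group

# extract_group(['{', 'pict', '{', 'data', '}', '}', 'after'], 0)
# -> ['{', 'pict', '{', 'data', '}', '}']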
554
src/libprs500/ebooks/rtf2xml/preamble_div.py
Executable file
@ -0,0 +1,554 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy, override_table, list_table
|
||||
class PreambleDiv:
|
||||
"""
|
||||
Break the preamble into divisions.
|
||||
"""
|
||||
def __init__(self, in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
no_namespace = None,
|
||||
run_level = 1,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__no_namespace = no_namespace
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__run_level = run_level
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Set values, including those for the dictionary.
|
||||
"""
|
||||
self.__all_lists = {}
|
||||
self.__page = {
|
||||
'margin-top' : 72,
|
||||
'margin-bottom' : 72,
|
||||
'margin-left' : 90,
|
||||
'margin-right' : 90,
|
||||
'gutter' : 0,
|
||||
}
|
||||
self.__cb_count = ''
|
||||
self.__ob_count = ''
|
||||
self.__state = 'preamble'
|
||||
self.__rtf_final = ''
|
||||
self.__close_group_count = ''
|
||||
self.__found_font_table = 0
|
||||
self.__list_table_final = ''
|
||||
self.__override_table_final = ''
|
||||
self.__revision_table_final = ''
|
||||
self.__doc_info_table_final = ''
|
||||
self.__state_dict = {
|
||||
'default' : self.__default_func,
|
||||
'rtf_header' : self.__rtf_head_func,
|
||||
'preamble' : self.__preamble_func,
|
||||
'font_table' : self.__font_table_func,
|
||||
'color_table' : self.__color_table_func,
|
||||
'style_sheet' : self.__style_sheet_func,
|
||||
'list_table' : self.__list_table_func,
|
||||
'override_table' : self.__override_table_func,
|
||||
'revision_table' : self.__revision_table_func,
|
||||
'doc_info' : self.__doc_info_func,
|
||||
'body' : self.__body_func,
|
||||
'ignore' : self.__ignore_func,
|
||||
'cw<ri<rtf_______' : self.__found_rtf_head_func,
|
||||
'cw<pf<par-def___' : self.__para_def_func,
|
||||
'tx<nu<__________' : self.__text_func,
|
||||
'cw<tb<row-def___' : self.__row_def_func,
|
||||
'cw<sc<section___' : self.__new_section_func,
|
||||
'cw<sc<sect-defin' : self.__new_section_func,
|
||||
'cw<it<font-table' : self.__found_font_table_func,
|
||||
'cw<it<colr-table' : self.__found_color_table_func,
|
||||
'cw<ss<style-shet' : self.__found_style_sheet_func,
|
||||
'cw<it<listtable_' : self.__found_list_table_func,
|
||||
'cw<it<lovr-table' : self.__found_override_table_func,
|
||||
'cw<it<revi-table' : self.__found_revision_table_func,
|
||||
'cw<di<doc-info__' : self.__found_doc_info_func,
|
||||
'cw<pa<margin-lef' : self.__margin_func,
|
||||
'cw<pa<margin-rig' : self.__margin_func,
|
||||
'cw<pa<margin-top' : self.__margin_func,
|
||||
'cw<pa<margin-bot' : self.__margin_func,
|
||||
'cw<pa<gutter____' : self.__margin_func,
|
||||
'cw<pa<paper-widt' : self.__margin_func,
|
||||
'cw<pa<paper-hght' : self.__margin_func,
|
||||
# 'cw<tb<columns___' : self.__section_func,
|
||||
}
|
||||
self.__margin_dict = {
|
||||
'margin-lef' : 'margin-left',
|
||||
'margin-rig' : 'margin-right',
|
||||
'margin-top' : 'margin-top',
|
||||
'margin-bot' : 'margin-bottom',
|
||||
'gutter____' : 'gutter',
|
||||
'paper-widt' : 'paper-width',
|
||||
'paper-hght' : 'paper-height',
|
||||
}
|
||||
self.__translate_sec = {
|
||||
'columns___' : 'column',
|
||||
}
|
||||
self.__section = {}
|
||||
# self.__write_obj.write(self.__color_table_final)
|
||||
self.__color_table_final = ''
|
||||
self.__style_sheet_final = ''
|
||||
self.__individual_font = 0
|
||||
self.__old_font = 0
|
||||
self.__ob_group = 0 # depth of group
|
||||
self.__font_table_final = 0
|
||||
self.__list_table_obj = list_table.ListTable(
|
||||
run_level = self.__run_level,
|
||||
bug_handler = self.__bug_handler,
|
||||
)
|
||||
def __ignore_func(self, line):
|
||||
"""
|
||||
Ignore all lines, until the bracket is found that marks the end of
|
||||
the group.
|
||||
"""
|
||||
if self.__ignore_num == self.__cb_count:
|
||||
self.__state = self.__previous_state
|
||||
def __found_rtf_head_func(self, line):
|
||||
self.__state = 'rtf_header'
|
||||
def __rtf_head_func(self, line):
|
||||
if self.__ob_count == '0002':
|
||||
self.__rtf_final = (
|
||||
'mi<mk<rtfhed-beg\n' +
|
||||
self.__rtf_final +
|
||||
'mi<mk<rtfhed-end\n'
|
||||
)
|
||||
self.__state = 'preamble'
|
||||
elif self.__token_info == 'tx<nu<__________' or \
|
||||
self.__token_info == 'cw<pf<par-def___':
|
||||
self.__state = 'body'
|
||||
self.__rtf_final = (
|
||||
'mi<mk<rtfhed-beg\n' +
|
||||
self.__rtf_final +
|
||||
'mi<mk<rtfhed-end\n'
|
||||
)
|
||||
self.__make_default_font_table()
|
||||
self.__write_preamble()
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
self.__rtf_final = self.__rtf_final + line
|
||||
def __make_default_font_table(self):
|
||||
"""
|
||||
If no font table is found, write a default one.
|
||||
"""
|
||||
self.__font_table_final = 'mi<tg<open______<font-table\n'
|
||||
self.__font_table_final += 'mi<mk<fonttb-beg\n'
|
||||
self.__font_table_final += 'mi<mk<fontit-beg\n'
|
||||
self.__font_table_final += 'cw<ci<font-style<nu<0\n'
|
||||
self.__font_table_final += 'tx<nu<__________<Times;\n'
|
||||
self.__font_table_final += 'mi<mk<fontit-end\n'
|
||||
self.__font_table_final += 'mi<mk<fonttb-end\n'
|
||||
self.__font_table_final += 'mi<tg<close_____<font-table\n'
|
||||
def __make_default_color_table(self):
|
||||
"""
|
||||
If no color table is found, write a string for a default one
|
||||
"""
|
||||
self.__color_table_final = 'mi<tg<open______<color-table\n'
|
||||
self.__color_table_final += 'mi<mk<clrtbl-beg\n'
|
||||
self.__color_table_final += 'cw<ci<red_______<nu<00\n'
|
||||
self.__color_table_final += 'cw<ci<green_____<nu<00\n'
|
||||
self.__color_table_final += 'cw<ci<blue______<en<00\n'
|
||||
self.__color_table_final += 'mi<mk<clrtbl-end\n'
|
||||
self.__color_table_final += 'mi<tg<close_____<color-table\n'
|
||||
def __make_default_style_table(self):
|
||||
"""
|
||||
If no style sheet is found, make a string for a default one
|
||||
"""
|
||||
"""
|
||||
self.__style_sheet_final = 'mi<tg<open______<style-table\n'
|
||||
self.__style_sheet_final +=
|
||||
self.__style_sheet_final +=
|
||||
self.__style_sheet_final +=
|
||||
self.__style_sheet_final +=
|
||||
self.__style_sheet_final +=
|
||||
self.__style_sheet_final += 'mi<tg<close_____<style-table\n'
|
||||
"""
|
||||
self.__style_sheet_final = """mi<tg<open______<style-table
|
||||
mi<mk<styles-beg
|
||||
mi<mk<stylei-beg
|
||||
cw<ci<font-style<nu<0
|
||||
tx<nu<__________<Normal;
|
||||
mi<mk<stylei-end
|
||||
mi<mk<stylei-beg
|
||||
cw<ss<char-style<nu<0
|
||||
tx<nu<__________<Default Paragraph Font;
|
||||
mi<mk<stylei-end
|
||||
mi<mk<styles-end
|
||||
mi<tg<close_____<style-table
|
||||
"""
|
||||
def __found_font_table_func(self, line):
|
||||
if self.__found_font_table:
|
||||
self.__state = 'ignore'
|
||||
else:
|
||||
self.__state = 'font_table'
|
||||
self.__font_table_final = ''
|
||||
self.__close_group_count = self.__ob_count
|
||||
self.__cb_count = 0
|
||||
self.__found_font_table = 1
|
||||
def __font_table_func(self, line):
|
||||
"""
|
||||
Keep adding to the self.__individual_font string until end of group
|
||||
found. If a bracket is found, check that it is only one bracket deep.
|
||||
If it is, then set the marker for an individual font. If it is not,
|
||||
then ignore all data in this group.
|
||||
cw<ci<font-style<nu<0
|
||||
"""
|
||||
if self.__cb_count == self.__close_group_count:
|
||||
self.__state = 'preamble'
|
||||
self.__font_table_final = 'mi<tg<open______<font-table\n' + \
|
||||
'mi<mk<fonttb-beg\n' + self.__font_table_final
|
||||
self.__font_table_final += \
|
||||
'mi<mk<fonttb-end\n' + 'mi<tg<close_____<font-table\n'
|
||||
elif self.__token_info == 'ob<nu<open-brack':
|
||||
if int(self.__ob_count) == int(self.__close_group_count) + 1:
|
||||
self.__font_table_final += \
|
||||
'mi<mk<fontit-beg\n'
|
||||
self.__individual_font = 1
|
||||
else:
|
||||
# ignore
|
||||
self.__previous_state = 'font_table'
|
||||
self.__state = 'ignore'
|
||||
self.__ignore_num = self.__ob_count
|
||||
elif self.__token_info == 'cb<nu<clos-brack':
|
||||
if int(self.__cb_count) == int(self.__close_group_count) + 1:
|
||||
self.__individual_font = 0
|
||||
self.__font_table_final += \
|
||||
'mi<mk<fontit-end\n'
|
||||
elif self.__individual_font:
|
||||
if self.__old_font and self.__token_info == 'tx<nu<__________':
|
||||
if ';' in line:
|
||||
self.__font_table_final += line
|
||||
self.__font_table_final += 'mi<mk<fontit-end\n'
|
||||
self.__individual_font = 0
|
||||
else:
|
||||
self.__font_table_final += line
|
||||
elif self.__token_info == 'cw<ci<font-style':
|
||||
self.__old_font = 1
|
||||
self.__individual_font = 1
|
||||
self.__font_table_final += 'mi<mk<fontit-beg\n'
|
||||
self.__font_table_final += line
|
||||
def __old_font_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
used for older forms of RTF:
|
||||
\f3\fswiss\fcharset77 Helvetica-Oblique;\f4\fnil\fcharset77 Geneva;}
|
||||
Note how each font is not divided by a bracket
|
||||
"""
|
||||
def __found_color_table_func(self, line):
|
||||
"""
|
||||
all functions that start with __found operate the same. They set the
|
||||
state, initiate a string, determine the self.__close_group_count, and
|
||||
set self.__cb_count to zero.
|
||||
"""
|
||||
self.__state = 'color_table'
|
||||
self.__color_table_final = ''
|
||||
self.__close_group_count = self.__ob_count
|
||||
self.__cb_count = 0
|
||||
def __color_table_func(self, line):
|
||||
if int(self.__cb_count) == int(self.__close_group_count):
|
||||
self.__state = 'preamble'
|
||||
self.__color_table_final = 'mi<tg<open______<color-table\n' + \
|
||||
'mi<mk<clrtbl-beg\n' + self.__color_table_final
|
||||
self.__color_table_final += \
|
||||
'mi<mk<clrtbl-end\n' + 'mi<tg<close_____<color-table\n'
|
||||
else:
|
||||
self.__color_table_final += line
|
||||
def __found_style_sheet_func(self, line):
|
||||
self.__state = 'style_sheet'
|
||||
self.__style_sheet_final = ''
|
||||
self.__close_group_count = self.__ob_count
|
||||
self.__cb_count = 0
|
||||
def __style_sheet_func(self, line):
|
||||
"""
|
||||
Same logic as the font_table_func.
|
||||
"""
|
||||
if self.__cb_count == self.__close_group_count:
|
||||
self.__state = 'preamble'
|
||||
self.__style_sheet_final = 'mi<tg<open______<style-table\n' + \
|
||||
'mi<mk<styles-beg\n' + self.__style_sheet_final
|
||||
self.__style_sheet_final += \
|
||||
'mi<mk<styles-end\n' + 'mi<tg<close_____<style-table\n'
|
||||
elif self.__token_info == 'ob<nu<open-brack':
|
||||
if int(self.__ob_count) == int(self.__close_group_count) + 1:
|
||||
self.__style_sheet_final += \
|
||||
'mi<mk<stylei-beg\n'
|
||||
elif self.__token_info == 'cb<nu<clos-brack':
|
||||
if int(self.__cb_count) == int(self.__close_group_count) + 1:
|
||||
self.__style_sheet_final += \
|
||||
'mi<mk<stylei-end\n'
|
||||
else:
|
||||
self.__style_sheet_final += line
|
||||
def __found_list_table_func(self, line):
|
||||
self.__state = 'list_table'
|
||||
self.__list_table_final = ''
|
||||
self.__close_group_count = self.__ob_count
|
||||
self.__cb_count = 0
|
||||
def __list_table_func(self, line):
|
||||
if self.__cb_count == self.__close_group_count:
|
||||
self.__state = 'preamble'
|
||||
self.__list_table_final, self.__all_lists =\
|
||||
self.__list_table_obj.parse_list_table(
|
||||
self.__list_table_final)
|
||||
# sys.stderr.write(repr(all_lists))
|
||||
elif self.__token_info == '':
|
||||
pass
|
||||
else:
|
||||
self.__list_table_final += line
|
||||
pass
|
||||
def __found_override_table_func(self, line):
|
||||
self.__override_table_obj = override_table.OverrideTable(
|
||||
run_level = self.__run_level,
|
||||
list_of_lists = self.__all_lists,
|
||||
)
|
||||
self.__state = 'override_table'
|
||||
self.__override_table_final = ''
|
||||
self.__close_group_count = self.__ob_count
|
||||
self.__cb_count = 0
|
||||
# cw<it<lovr-table
|
||||
def __override_table_func(self, line):
|
||||
if self.__cb_count == self.__close_group_count:
|
||||
self.__state = 'preamble'
|
||||
self.__override_table_final, self.__all_lists =\
|
||||
self.__override_table_obj.parse_override_table(self.__override_table_final)
|
||||
elif self.__token_info == '':
|
||||
pass
|
||||
else:
|
||||
self.__override_table_final += line
|
||||
def __found_revision_table_func(self, line):
|
||||
self.__state = 'revision_table'
|
||||
self.__revision_table_final = ''
|
||||
self.__close_group_count = self.__ob_count
|
||||
self.__cb_count = 0
|
||||
def __revision_table_func(self, line):
|
||||
if int(self.__cb_count) == int(self.__close_group_count):
|
||||
self.__state = 'preamble'
|
||||
self.__revision_table_final = 'mi<tg<open______<revision-table\n' + \
|
||||
'mi<mk<revtbl-beg\n' + self.__revision_table_final
|
||||
self.__revision_table_final += \
|
||||
'mi<mk<revtbl-end\n' + 'mi<tg<close_____<revision-table\n'
|
||||
else:
|
||||
self.__revision_table_final += line
|
||||
def __found_doc_info_func(self, line):
|
||||
self.__state = 'doc_info'
|
||||
self.__doc_info_table_final = ''
|
||||
self.__close_group_count = self.__ob_count
|
||||
self.__cb_count = 0
|
||||
def __doc_info_func(self, line):
|
||||
if self.__cb_count == self.__close_group_count:
|
||||
self.__state = 'preamble'
|
||||
self.__doc_info_table_final = 'mi<tg<open______<doc-information\n' + \
|
||||
'mi<mk<doc-in-beg\n' + self.__doc_info_table_final
|
||||
self.__doc_info_table_final += \
|
||||
'mi<mk<doc-in-end\n' + 'mi<tg<close_____<doc-information\n'
|
||||
elif self.__token_info == 'ob<nu<open-brack':
|
||||
if int(self.__ob_count) == int(self.__close_group_count) + 1:
|
||||
self.__doc_info_table_final += \
|
||||
'mi<mk<docinf-beg\n'
|
||||
elif self.__token_info == 'cb<nu<clos-brack':
|
||||
if int(self.__cb_count) == int(self.__close_group_count) + 1:
|
||||
self.__doc_info_table_final += \
|
||||
'mi<mk<docinf-end\n'
|
||||
else:
|
||||
self.__doc_info_table_final += line
|
||||
def __margin_func(self, line):
|
||||
"""
|
||||
Handles lines that describe page info. Add the appropriate info from the
token to the self.__page dictionary.
|
||||
"""
|
||||
info = line[6:16]
|
||||
changed = self.__margin_dict.get(info)
|
||||
if changed == None:
|
||||
print 'woops!'
|
||||
else:
|
||||
self.__page[changed] = line[20:-1]
|
||||
#cw<pa<margin-lef<nu<1728
|
||||
def __print_page_info(self):
|
||||
self.__write_obj.write('mi<tg<empty-att_<page-definition')
|
||||
for key in self.__page.keys():
|
||||
self.__write_obj.write(
|
||||
'<%s>%s' % (key, self.__page[key])
|
||||
)
|
||||
self.__write_obj.write('\n')
|
||||
#mi<tg<open-att__<footn
|
||||
def __print_sec_info(self):
|
||||
"""
|
||||
Check if there is any section info. If so, print it out.
|
||||
If not, print out an empty tag to satisfy the dtd.
|
||||
"""
|
||||
if len(self.__section.keys()) == 0:
|
||||
self.__write_obj.write(
|
||||
'mi<tg<open______<section-definition\n'
|
||||
)
|
||||
else:
|
||||
self.__write_obj.write(
|
||||
'mi<tg<open-att__<section-definition')
|
||||
keys = self.__section.keys()
|
||||
for key in keys:
|
||||
self.__write_obj.write(
|
||||
'<%s>%s' % (key, self.__section[key])
|
||||
)
|
||||
self.__write_obj.write('\n')
|
||||
def __section_func(self, line):
|
||||
"""
|
||||
Add info pertaining to section to the self.__section dictionary, to be
|
||||
printed out later.
|
||||
"""
|
||||
info = self.__translate_sec.get(line[6:16])
|
||||
if info == None:
|
||||
sys.stderr.write ('woops!\n')
|
||||
else:
|
||||
self.__section[info] = 'true'
|
||||
def __body_func(self, line):
|
||||
self.__write_obj.write(line)
|
||||
def __default_func(self, line):
|
||||
# either in preamble or in body
|
||||
pass
|
||||
def __para_def_func(self, line):
|
||||
# if self.__ob_group == 1
|
||||
# this tells dept of group
|
||||
if self.__cb_count == '0002':
|
||||
self.__state = 'body'
|
||||
self.__write_preamble()
|
||||
self.__write_obj.write(line)
|
||||
def __text_func(self, line):
|
||||
"""
|
||||
If the cb_count is less than 1, you have hit the body
|
||||
For older RTF
|
||||
Newer RTF should never have to use this function
|
||||
"""
|
||||
if self.__cb_count == '':
|
||||
cb_count = '0002'
|
||||
else:
|
||||
cb_count = self.__cb_count
|
||||
# ignore previous lines
|
||||
# should be
|
||||
# if self.__ob_group == 1
|
||||
# this tells dept of group
|
||||
if cb_count == '0002':
|
||||
self.__state = 'body'
|
||||
self.__write_preamble()
|
||||
self.__write_obj.write(line)
|
||||
def __row_def_func(self, line):
|
||||
# if self.__ob_group == 1
|
||||
# this tells dept of group
|
||||
if self.__cb_count == '0002':
|
||||
self.__state = 'body'
|
||||
self.__write_preamble()
|
||||
self.__write_obj.write(line)
|
||||
def __new_section_func(self, line):
|
||||
"""
|
||||
This is new. The start of a section marks the end of the preamble
|
||||
"""
|
||||
if self.__cb_count == '0002':
|
||||
self.__state = 'body'
|
||||
self.__write_preamble()
|
||||
else:
|
||||
sys.stderr.write('module is preamble_div\n')
|
||||
sys.stderr.write('method is __new_section_func\n')
|
||||
sys.stderr.write('bracket count should be 2?\n')
|
||||
self.__write_obj.write(line)
|
||||
def __write_preamble(self):
|
||||
"""
|
||||
Write all the strings, which represent all the data in the preamble.
|
||||
Write a body and section beginning.
|
||||
"""
|
||||
if self.__no_namespace:
|
||||
self.__write_obj.write(
|
||||
'mi<tg<open______<doc\n'
|
||||
)
|
||||
else:
|
||||
self.__write_obj.write(
|
||||
'mi<tg<open-att__<doc<xmlns>http://rtf2xml.sourceforge.net/\n')
|
||||
self.__write_obj.write('mi<tg<open______<preamble\n')
|
||||
self.__write_obj.write(self.__rtf_final)
|
||||
if not self.__color_table_final:
|
||||
self.__make_default_color_table()
|
||||
if not self.__font_table_final:
|
||||
self.__make_default_font_table()
|
||||
self.__write_obj.write(self.__font_table_final)
|
||||
self.__write_obj.write(self.__color_table_final)
|
||||
if not self.__style_sheet_final:
|
||||
self.__make_default_style_table()
|
||||
self.__write_obj.write(self.__style_sheet_final)
|
||||
self.__write_obj.write(self.__list_table_final)
|
||||
self.__write_obj.write(self.__override_table_final)
|
||||
self.__write_obj.write(self.__revision_table_final)
|
||||
self.__write_obj.write(self.__doc_info_table_final)
|
||||
self.__print_page_info()
|
||||
self.__write_obj.write('ob<nu<open-brack<0001\n')
|
||||
self.__write_obj.write('ob<nu<open-brack<0002\n')
|
||||
self.__write_obj.write('cb<nu<clos-brack<0002\n')
|
||||
self.__write_obj.write('mi<tg<close_____<preamble\n')
|
||||
self.__write_obj.write('mi<tg<open______<body\n')
|
||||
# self.__write_obj.write('mi<tg<open-att__<section<num>1\n')
|
||||
# self.__print_sec_info()
|
||||
# self.__write_obj.write('mi<tg<open______<headers-and-footers\n')
|
||||
# self.__write_obj.write('mi<mk<head_foot_<\n')
|
||||
# self.__write_obj.write('mi<tg<close_____<headers-and-footers\n')
|
||||
self.__write_obj.write('mi<mk<body-open_\n')
|
||||
def __preamble_func(self, line):
|
||||
"""
|
||||
Check if the token info belongs to the dictionary. If so, take the
|
||||
appropriate action.
|
||||
"""
|
||||
action = self.__state_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
def make_preamble_divisions(self):
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
self.__ob_group += 1
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
self.__ob_group -= 1
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
print self.__state
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "preamble_div.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
return self.__all_lists
|
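__write_preamble falls back to generated defaults whenever the RTF supplied no font, color or style table: use the string collected while scanning the group, otherwise write a stock one. The pattern in miniature (the table names and default bodies below are placeholders, not the strings this module writes):

# Sketch of the fall-back-to-default pattern used when writing the preamble.
# Table names and default bodies are placeholders for illustration.
DEFAULT_TABLES = {
    'font-table':  '<font-table><font>Times</font></font-table>\n',
    'color-table': '<color-table><color r="0" g="0" b="0"/></color-table>\n',
    'style-table': '<style-table><style name="Normal"/></style-table>\n',
}

def write_preamble(write, collected):
    # Write each preamble table, substituting a default for any table that
    # was never found while scanning the RTF.
    for name in ('font-table', 'color-table', 'style-table'):
        write(collected.get(name) or DEFAULT_TABLES[name])

# parts = []
# write_preamble(parts.append, {'color-table': '<color-table>...</color-table>\n'})
# parts now holds the supplied color table plus default font and style tables.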
145
src/libprs500/ebooks/rtf2xml/preamble_rest.py
Executable file
@ -0,0 +1,145 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys,os
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class Preamble:
|
||||
"""
|
||||
Fix the remaining parts of the preamble. This module does very little. It
|
||||
makes sure that no text gets put in the revision or list table. In the
|
||||
future, when I understand how to interpret the revision table and list
|
||||
table, I will make these methods more functional.
|
||||
"""
|
||||
def __init__(self, file, bug_handler, platform, default_font, code_page,
|
||||
copy=None, temp_dir=None):
|
||||
"""
|
||||
Required:
|
||||
file--file to parse
|
||||
platform --Windows or Macintosh
|
||||
default_font -- the default font
|
||||
code_page --the code page (ansi1252, for example)
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file=file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__default_font = default_font
|
||||
self.__code_page = code_page
|
||||
self.__platform = platform
|
||||
if temp_dir:
|
||||
self.__write_to = os.path.join(temp_dir,"info_table_info.data")
|
||||
else:
|
||||
self.__write_to = "info_table_info.data"
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
"""
|
||||
self.__state = 'default'
|
||||
self.__text_string = ''
|
||||
self.__state_dict = {
|
||||
'default' : self.__default_func,
|
||||
'revision' : self.__revision_table_func,
|
||||
'list_table' : self.__list_table_func,
|
||||
'body' : self.__body_func,
|
||||
}
|
||||
self.__default_dict = {
|
||||
'mi<mk<rtfhed-beg' : self.__found_rtf_head_func,
|
||||
'mi<mk<listabbeg_' : self.__found_list_table_func,
|
||||
'mi<mk<revtbl-beg' : self.__found_revision_table_func,
|
||||
'mi<mk<body-open_' : self.__found_body_func,
|
||||
}
|
||||
def __default_func(self, line):
|
||||
action = self.__default_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __found_rtf_head_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- the line to parse
|
||||
Returns:
|
||||
nothing.
|
||||
Logic:
|
||||
Write to the output file the default font info, the code page
|
||||
info, and the platform info.
|
||||
"""
|
||||
self.__write_obj.write(
|
||||
'mi<tg<empty-att_<rtf-definition'
|
||||
'<default-font>%s<code-page>%s'
|
||||
'<platform>%s\n' % (self.__default_font, self.__code_page,
|
||||
self.__platform)
|
||||
)
|
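# For example, with default_font='Times', code_page='ansi1252' and
# platform='Windows' (hypothetical values), the line written above would be:
#   mi<tg<empty-att_<rtf-definition<default-font>Times<code-page>ansi1252<platform>Windows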
||||
def __found_list_table_func(self, line):
|
||||
self.__state = 'list_table'
|
||||
def __list_table_func(self, line):
|
||||
if self.__token_info == 'mi<mk<listabend_':
|
||||
self.__state = 'default'
|
||||
elif line[0:2] == 'tx':
|
||||
pass
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __found_revision_table_func(self, line):
|
||||
self.__state = 'revision'
|
||||
def __revision_table_func(self, line):
|
||||
if self.__token_info == 'mi<mk<revtbl-end':
|
||||
self.__state = 'default'
|
||||
elif line[0:2] == 'tx':
|
||||
pass
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __found_body_func(self, line):
|
||||
self.__state = 'body'
|
||||
self.__write_obj.write(line)
|
||||
def __body_func(self, line):
|
||||
self.__write_obj.write(line)
|
||||
def fix_preamble(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing (changes the original file)
|
||||
Logic:
|
||||
Read one line in at a time. Determine what action to take based on
|
||||
the state. The state can either be default, the revision table, or
|
||||
the list table.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no matching state in module preamble_rest.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "preamble_div.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
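# A minimal usage sketch (file name, handler class and option values are
# assumptions, not taken from the rest of the pipeline):
#
#   fixer = Preamble('tokens.data', RuntimeError, 'Windows', 'Times',
#                    'ansi1252', copy=1, temp_dir='/tmp')
#   fixer.fix_preamble()   # rewrites 'tokens.data' in place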
826
src/libprs500/ebooks/rtf2xml/process_tokens.py
Executable file
@@ -0,0 +1,826 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, re, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy, check_brackets
|
||||
class ProcessTokens:
|
||||
"""
|
||||
Process each token on a line and add information that will be useful for
|
||||
later processing. Information will be put on one line, delimited by "<"
|
||||
for main fields, and ">" for sub fields
|
||||
"""
|
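# Illustrative examples of the one-token-per-line format described above
# (the input tokens are hypothetical): a control word such as \pard becomes
#   cw<pf<par-def___<nu<true
# plain text becomes
#   tx<nu<__________<Hello world
# and a Microsoft special character such as \emdash becomes
#   tx<mc<__________<emdash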
||||
def __init__(self,
|
||||
in_file,
|
||||
exception_handler,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.initiate_token_dict()
|
||||
##self.initiate_token_actions()
|
||||
self.compile_expressions()
|
||||
self.__bracket_count=0
|
||||
self.__exception_handler = exception_handler
|
||||
self.__bug_handler = bug_handler
|
||||
def compile_expressions(self):
|
||||
self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
|
||||
self.__utf_exp = re.compile(r'(&.*?;)')
|
||||
def initiate_token_dict(self):
|
||||
self.__return_code = 0
|
||||
self.dict_token={
|
||||
# unicode
|
||||
'mshex' : ('nu', '__________', self.__ms_hex_func),
|
||||
# brackets
|
||||
'{' : ('nu', '{', self.ob_func),
|
||||
'}' : ('nu', '}', self.cb_func),
|
||||
# microsoft characters
|
||||
'ldblquote' : ('mc', 'ldblquote', self.ms_sub_func),
|
||||
'rdblquote' : ('mc', 'rdblquote', self.ms_sub_func),
|
||||
'rquote' : ('mc', 'rquote', self.ms_sub_func),
|
||||
'lquote' : ('mc', 'lquote', self.ms_sub_func),
|
||||
'emdash' : ('mc', 'emdash', self.ms_sub_func),
|
||||
'endash' : ('mc', 'endash', self.ms_sub_func),
|
||||
'bullet' : ('mc', 'bullet', self.ms_sub_func),
|
||||
'~' : ('mc', '~', self.ms_sub_func),
|
||||
'tab' : ('mc', 'tab', self.ms_sub_func),
|
||||
'_' : ('mc', '_', self.ms_sub_func),
|
||||
';' : ('mc', ';', self.ms_sub_func),
|
||||
# this must be wrong
|
||||
'-' : ('mc', '-', self.ms_sub_func),
|
||||
# misc => ml
|
||||
'*' : ('ml', 'asterisk__', self.default_func),
|
||||
':' : ('ml', 'colon_____', self.default_func),
|
||||
# text
|
||||
'backslash' : ('nu', '\\', self.text_func),
|
||||
'ob' : ('nu', '{', self.text_func),
|
||||
'cb' : ('nu', '}', self.text_func),
|
||||
# paragraph formatting => pf
|
||||
'page' : ('pf', 'page-break', self.default_func),
|
||||
'par' : ('pf', 'par-end___', self.default_func),
|
||||
'pard' : ('pf', 'par-def___', self.default_func),
|
||||
'keepn' : ('pf', 'keep-w-nex', self.bool_st_func),
|
||||
'widctlpar' : ('pf', 'widow-cntl', self.bool_st_func),
|
||||
'adjustright' : ('pf', 'adjust-rgt', self.bool_st_func),
|
||||
'lang' : ('pf', 'language__', self.__language_func),
|
||||
'ri' : ('pf', 'right-inde', self.divide_by_20),
|
||||
'fi' : ('pf', 'fir-ln-ind', self.divide_by_20),
|
||||
'li' : ('pf', 'left-inden', self.divide_by_20),
|
||||
'sb' : ('pf', 'space-befo', self.divide_by_20),
|
||||
'sa' : ('pf', 'space-afte', self.divide_by_20),
|
||||
'sl' : ('pf', 'line-space', self.divide_by_20),
|
||||
'deftab' : ('pf', 'default-ta', self.divide_by_20),
|
||||
'ql' : ('pf', 'align_____<left', self.two_part_func),
|
||||
'qc' : ('pf', 'align_____<cent', self.two_part_func),
|
||||
'qj' : ('pf', 'align_____<just', self.two_part_func),
|
||||
'qr' : ('pf', 'align_____<right', self.two_part_func),
|
||||
'nowidctlpar' : ('pf', 'widow-cntr<false', self.two_part_func),
|
||||
'tx' : ('pf', 'tab-stop__', self.divide_by_20),
|
||||
'tb' : ('pf', 'tab-bar-st', self.divide_by_20),
|
||||
'tqr' : ('pf', 'tab-right_', self.default_func),
|
||||
'tqdec' : ('pf', 'tab-dec___', self.default_func),
|
||||
'tqc' : ('pf', 'tab-center', self.default_func),
|
||||
'tlul' : ('pf', 'leader-und', self.default_func),
|
||||
'tlhyph' : ('pf', 'leader-hyp', self.default_func),
|
||||
'tldot' : ('pf', 'leader-dot', self.default_func),
|
||||
# stylesheet = > ss
|
||||
'stylesheet' : ('ss', 'style-shet', self.default_func),
|
||||
'sbasedon' : ('ss', 'based-on__', self.default_func),
|
||||
'snext' : ('ss', 'next-style', self.default_func),
|
||||
'cs' : ('ss', 'char-style', self.default_func),
|
||||
's' : ('ss', 'para-style', self.default_func),
|
||||
# graphics => gr
|
||||
'pict' : ('gr', 'picture___', self.default_func),
|
||||
'objclass' : ('gr', 'obj-class_', self.default_func),
|
||||
'macpict' : ('gr', 'mac-pic___', self.default_func),
|
||||
# section => sc
|
||||
'sect' : ('sc', 'section___', self.default_func),
|
||||
'sectd' : ('sc', 'sect-defin', self.default_func),
|
||||
'endhere' : ('sc', 'sect-note_', self.default_func),
|
||||
# list=> ls
|
||||
'pntext' : ('ls', 'list-text_', self.default_func),
|
||||
# this line must be wrong because it duplicates an earlier one
|
||||
'listtext' : ('ls', 'list-text_', self.default_func),
|
||||
'pn' : ('ls', 'list______', self.default_func),
|
||||
'pnseclvl' : ('ls', 'list-level', self.default_func),
|
||||
'pncard' : ('ls', 'list-cardi', self.bool_st_func),
|
||||
'pndec' : ('ls', 'list-decim', self.bool_st_func),
|
||||
'pnucltr' : ('ls', 'list-up-al', self.bool_st_func),
|
||||
'pnucrm' : ('ls', 'list-up-ro', self.bool_st_func),
|
||||
'pnord' : ('ls', 'list-ord__', self.bool_st_func),
|
||||
'pnordt' : ('ls', 'list-ordte', self.bool_st_func),
|
||||
'pnlvlblt' : ('ls', 'list-bulli', self.bool_st_func),
|
||||
'pnlvlbody' : ('ls', 'list-simpi', self.bool_st_func),
|
||||
'pnlvlcont' : ('ls', 'list-conti', self.bool_st_func),
|
||||
'pnhang' : ('ls', 'list-hang_', self.bool_st_func),
|
||||
'pntxtb' : ('ls', 'list-tebef', self.bool_st_func),
|
||||
'ilvl' : ('ls', 'list-level', self.default_func),
|
||||
'ls' : ('ls', 'list-id___', self.default_func),
|
||||
'pnstart' : ('ls', 'list-start', self.default_func),
|
||||
'itap' : ('ls', 'nest-level', self.default_func),
|
||||
'leveltext' : ('ls', 'level-text', self.default_func),
|
||||
'levelnumbers' : ('ls', 'level-numb', self.default_func),
|
||||
'list' : ('ls', 'list-in-tb', self.default_func),
|
||||
'listlevel' : ('ls', 'list-tb-le', self.default_func),
|
||||
'listname' : ('ls', 'list-name_', self.default_func),
|
||||
'listtemplateid' : ('ls', 'ls-tem-id_', self.default_func),
|
||||
'leveltemplateid' : ('ls', 'lv-tem-id_', self.default_func),
|
||||
'listhybrid' : ('ls', 'list-hybri', self.default_func),
|
||||
'levelstartat' : ('ls', 'level-star', self.default_func),
|
||||
'levelspace' : ('ls', 'level-spac', self.divide_by_20),
|
||||
'levelindent' : ('ls', 'level-inde', self.default_func),
|
||||
'levelnfc' : ('ls', 'level-type', self.__list_type_func),
|
||||
'levelnfcn' : ('ls', 'level-type', self.__list_type_func),
|
||||
'listid' : ('ls', 'lis-tbl-id', self.default_func),
|
||||
'listoverride' : ('ls', 'lis-overid', self.default_func),
|
||||
# duplicate
|
||||
'pnlvl' : ('ls', 'list-level', self.default_func),
|
||||
# root info => ri
|
||||
'rtf' : ('ri', 'rtf_______', self.default_func),
|
||||
'deff' : ('ri', 'deflt-font', self.default_func),
|
||||
'mac' : ('ri', 'macintosh_', self.default_func),
|
||||
'ansi' : ('ri', 'ansi______', self.default_func),
|
||||
'ansicpg' : ('ri', 'ansi-codpg', self.default_func),
|
||||
# notes => nt
|
||||
'footnote' : ('nt', 'footnote__', self.default_func),
|
||||
'ftnalt' : ('nt', 'type______<endnote', self.two_part_func),
|
||||
# anchor => an
|
||||
'tc' : ('an', 'toc_______', self.default_func),
|
||||
'bkmkstt' : ('an', 'book-mk-st', self.default_func),
|
||||
'bkmkstart' : ('an', 'book-mk-st', self.default_func),
|
||||
'bkmkend' : ('an', 'book-mk-en', self.default_func),
|
||||
'xe' : ('an', 'index-mark', self.default_func),
|
||||
'rxe' : ('an', 'place_____', self.default_func),
|
||||
# index => in
|
||||
'bxe' : ('in', 'index-bold', self.default_func),
|
||||
'ixe' : ('in', 'index-ital', self.default_func),
|
||||
'txe' : ('in', 'index-see_', self.default_func),
|
||||
# table of contents => tc
|
||||
'tcl' : ('tc', 'toc-level_', self.default_func),
|
||||
'tcn' : ('tc', 'toc-sup-nu', self.default_func),
|
||||
# field => fd
|
||||
'field' : ('fd', 'field_____', self.default_func),
|
||||
'fldinst' : ('fd', 'field-inst', self.default_func),
|
||||
'fldrslt' : ('fd', 'field-rslt', self.default_func),
|
||||
'datafield' : ('fd', 'datafield_', self.default_func),
|
||||
# info-tables => it
|
||||
'fonttbl' : ('it', 'font-table', self.default_func),
|
||||
'colortbl' : ('it', 'colr-table', self.default_func),
|
||||
'listoverridetable' : ('it', 'lovr-table', self.default_func),
|
||||
'listtable' : ('it', 'listtable_', self.default_func),
|
||||
'revtbl' : ('it', 'revi-table', self.default_func),
|
||||
# character info => ci
|
||||
'b' : ('ci', 'bold______', self.bool_st_func),
|
||||
'blue' : ('ci', 'blue______', self.color_func),
|
||||
'caps' : ('ci', 'caps______', self.bool_st_func),
|
||||
'cf' : ('ci', 'font-color', self.default_func),
|
||||
'chftn' : ('ci', 'footnot-mk', self.bool_st_func),
|
||||
'dn' : ('ci', 'font-down_', self.divide_by_2),
|
||||
'embo' : ('ci', 'emboss____', self.bool_st_func),
|
||||
'f' : ('ci', 'font-style', self.default_func),
|
||||
'fs' : ('ci', 'font-size_', self.divide_by_2),
|
||||
'green' : ('ci', 'green_____', self.color_func),
|
||||
'i' : ('ci', 'italics___', self.bool_st_func),
|
||||
'impr' : ('ci', 'engrave___', self.bool_st_func),
|
||||
'outl' : ('ci', 'outline___', self.bool_st_func),
|
||||
'plain' : ('ci', 'plain_____', self.bool_st_func),
|
||||
'red' : ('ci', 'red_______', self.color_func),
|
||||
'scaps' : ('ci', 'small-caps', self.bool_st_func),
|
||||
'shad' : ('ci', 'shadow____', self.bool_st_func),
|
||||
'strike' : ('ci', 'strike-thr', self.bool_st_func),
|
||||
'striked' : ('ci', 'dbl-strike', self.bool_st_func),
|
||||
'sub' : ('ci', 'subscript_', self.bool_st_func),
|
||||
'super' : ('ci', 'superscrip', self.bool_st_func),
|
||||
'nosupersub' : ('ci', 'no-su-supe', self.__no_sup_sub_func),
|
||||
'up' : ('ci', 'font-up___', self.divide_by_2),
|
||||
'v' : ('ci', 'hidden____', self.default_func),
|
||||
# table => tb
|
||||
'trowd' : ('tb', 'row-def___', self.default_func),
|
||||
'cell' : ('tb', 'cell______', self.default_func),
|
||||
'row' : ('tb', 'row_______', self.default_func),
|
||||
'intbl' : ('tb', 'in-table__', self.default_func),
|
||||
'cols' : ('tb', 'columns___', self.default_func),
|
||||
'trleft' : ('tb', 'row-pos-le', self.divide_by_20),
|
||||
'cellx' : ('tb', 'cell-posit', self.divide_by_20),
|
||||
'trhdr' : ('tb', 'row-header', self.default_func),
|
||||
# preamble => pr
|
||||
# document information => di
|
||||
'info' : ('di', 'doc-info__', self.default_func),
|
||||
'author' : ('di', 'author____', self.default_func),
|
||||
'operator' : ('di', 'operator__', self.default_func),
|
||||
'title' : ('di', 'title_____', self.default_func),
|
||||
'keywords' : ('di', 'keywords__', self.default_func),
|
||||
'doccomm' : ('di', 'doc-notes_', self.default_func),
|
||||
'comment' : ('di', 'doc-notes_', self.default_func),
|
||||
'subject' : ('di', 'subject___', self.default_func),
|
||||
'creatim' : ('di', 'create-tim', self.default_func),
|
||||
'yr' : ('di', 'year______', self.default_func),
|
||||
'mo' : ('di', 'month_____', self.default_func),
|
||||
'dy' : ('di', 'day_______', self.default_func),
|
||||
'min' : ('di', 'minute____', self.default_func),
|
||||
'revtim' : ('di', 'revis-time', self.default_func),
|
||||
'nofwords' : ('di', 'num-of-wor', self.default_func),
|
||||
'nofchars' : ('di', 'num-of-chr', self.default_func),
|
||||
'nofpages' : ('di', 'num-of-pag', self.default_func),
|
||||
'edmins' : ('di', 'edit-time_', self.default_func),
|
||||
# headers and footers => hf
|
||||
'headerf' : ('hf', 'head-first', self.default_func),
|
||||
'headerl' : ('hf', 'head-left_', self.default_func),
|
||||
'headerr' : ('hf', 'head-right', self.default_func),
|
||||
'footerf' : ('hf', 'foot-first', self.default_func),
|
||||
'footerl' : ('hf', 'foot-left_', self.default_func),
|
||||
'footerr' : ('hf', 'foot-right', self.default_func),
|
||||
'header' : ('hf', 'header____', self.default_func),
|
||||
'footer' : ('hf', 'footer____', self.default_func),
|
||||
# page => pa
|
||||
'margl' : ('pa', 'margin-lef', self.divide_by_20),
|
||||
'margr' : ('pa', 'margin-rig', self.divide_by_20),
|
||||
'margb' : ('pa', 'margin-bot', self.divide_by_20),
|
||||
'margt' : ('pa', 'margin-top', self.divide_by_20),
|
||||
'gutter' : ('pa', 'gutter____', self.divide_by_20),
|
||||
'paperw' : ('pa', 'paper-widt', self.divide_by_20),
|
||||
'paperh' : ('pa', 'paper-hght', self.divide_by_20),
|
||||
# annotation => an
|
||||
'annotation' : ('an', 'annotation', self.default_func),
|
||||
# underline
|
||||
'ul' : ('ul', 'underlined<continous', self.two_part_func),
|
||||
'uld' : ('ul', 'underlined<dotted', self.two_part_func),
|
||||
'uldash' : ('ul', 'underlined<dash', self.two_part_func),
|
||||
'uldashd' : ('ul', 'underlined<dash-dot', self.two_part_func),
|
||||
'uldashdd' : ('ul', 'underlined<dash-dot-dot', self.two_part_func),
|
||||
'uldb' : ('ul', 'underlined<double', self.two_part_func),
|
||||
'ulhwave' : ('ul', 'underlined<heavy-wave', self.two_part_func),
|
||||
'ulldash' : ('ul', 'underlined<long-dash', self.two_part_func),
|
||||
'ulth' : ('ul', 'underlined<thich', self.two_part_func),
|
||||
'ulthd' : ('ul', 'underlined<thick-dotted', self.two_part_func),
|
||||
'ulthdash' : ('ul', 'underlined<thick-dash', self.two_part_func),
|
||||
'ulthdashd' : ('ul', 'underlined<thick-dash-dot', self.two_part_func),
|
||||
'ulthdashdd' : ('ul', 'underlined<thick-dash-dot-dot', self.two_part_func),
|
||||
'ulthldash' : ('ul', 'underlined<thick-long-dash', self.two_part_func),
|
||||
'ululdbwave' : ('ul', 'underlined<double-wave', self.two_part_func),
|
||||
'ulw' : ('ul', 'underlined<word', self.two_part_func),
|
||||
'ulwave' : ('ul', 'underlined<wave', self.two_part_func),
|
||||
'ulnone' : ('ul', 'underlined<false', self.two_part_func),
|
||||
# border => bd
|
||||
'trbrdrh' : ('bd', 'bor-t-r-hi', self.default_func),
|
||||
'trbrdrv' : ('bd', 'bor-t-r-vi', self.default_func),
|
||||
'trbrdrt' : ('bd', 'bor-t-r-to', self.default_func),
|
||||
'trbrdrl' : ('bd', 'bor-t-r-le', self.default_func),
|
||||
'trbrdrb' : ('bd', 'bor-t-r-bo', self.default_func),
|
||||
'trbrdrr' : ('bd', 'bor-t-r-ri', self.default_func),
|
||||
'clbrdrb' : ('bd', 'bor-cel-bo', self.default_func),
|
||||
'clbrdrt' : ('bd', 'bor-cel-to', self.default_func),
|
||||
'clbrdrl' : ('bd', 'bor-cel-le', self.default_func),
|
||||
'clbrdrr' : ('bd', 'bor-cel-ri', self.default_func),
|
||||
'brdrb' : ('bd', 'bor-par-bo', self.default_func),
|
||||
'brdrt' : ('bd', 'bor-par-to', self.default_func),
|
||||
'brdrl' : ('bd', 'bor-par-le', self.default_func),
|
||||
'brdrr' : ('bd', 'bor-par-ri', self.default_func),
|
||||
'box' : ('bd', 'bor-par-bx', self.default_func),
|
||||
'chbrdr' : ('bd', 'bor-par-bo', self.default_func),
|
||||
'brdrbtw' : ('bd', 'bor-for-ev', self.default_func),
|
||||
'brdrbar' : ('bd', 'bor-outsid', self.default_func),
|
||||
'brdrnone' : ('bd', 'bor-none__<false', self.two_part_func),
|
||||
# border type => bt
|
||||
'brdrs' : ('bt', 'bdr-single', self.default_func),
|
||||
'brdrth' : ('bt', 'bdr-doubtb', self.default_func),
|
||||
'brdrsh' : ('bt', 'bdr-shadow', self.default_func),
|
||||
'brdrdb' : ('bt', 'bdr-double', self.default_func),
|
||||
'brdrdot' : ('bt', 'bdr-dotted', self.default_func),
|
||||
'brdrdash' : ('bt', 'bdr-dashed', self.default_func),
|
||||
'brdrhair' : ('bt', 'bdr-hair__', self.default_func),
|
||||
'brdrinset' : ('bt', 'bdr-inset_', self.default_func),
|
||||
'brdrdashsm' : ('bt', 'bdr-das-sm', self.default_func),
|
||||
'brdrdashd' : ('bt', 'bdr-dot-sm', self.default_func),
|
||||
'brdrdashdd' : ('bt', 'bdr-dot-do', self.default_func),
|
||||
'brdroutset' : ('bt', 'bdr-outset', self.default_func),
|
||||
'brdrtriple' : ('bt', 'bdr-trippl', self.default_func),
|
||||
'brdrtnthsg' : ('bt', 'bdr-thsm__', self.default_func),
|
||||
'brdrthtnsg' : ('bt', 'bdr-htsm__', self.default_func),
|
||||
'brdrtnthtnsg' : ('bt', 'bdr-hthsm_', self.default_func),
|
||||
'brdrtnthmg' : ('bt', 'bdr-thm___', self.default_func),
|
||||
'brdrthtnmg' : ('bt', 'bdr-htm___', self.default_func),
|
||||
'brdrtnthtnmg' : ('bt', 'bdr-hthm__', self.default_func),
|
||||
'brdrtnthlg' : ('bt', 'bdr-thl___', self.default_func),
|
||||
'brdrtnthtnlg' : ('bt', 'bdr-hthl__', self.default_func),
|
||||
'brdrwavy' : ('bt', 'bdr-wavy__', self.default_func),
|
||||
'brdrwavydb' : ('bt', 'bdr-d-wav_', self.default_func),
|
||||
'brdrdashdotstr' : ('bt', 'bdr-strip_', self.default_func),
|
||||
'brdremboss' : ('bt', 'bdr-embos_', self.default_func),
|
||||
'brdrengrave' : ('bt', 'bdr-engra_', self.default_func),
|
||||
'brdrframe' : ('bt', 'bdr-frame_', self.default_func),
|
||||
'brdrw' : ('bt', 'bdr-li-wid', self.divide_by_20),
|
||||
'brsp' : ('bt', 'bdr-sp-wid', self.divide_by_20),
|
||||
'brdrcf' : ('bt', 'bdr-color_', self.default_func),
|
||||
# comments
|
||||
# 'comment' : ('cm', 'comment___', self.default_func),
|
||||
}
|
||||
self.__number_type_dict = {
|
||||
0: 'Arabic',
|
||||
1: 'uppercase Roman numeral',
|
||||
2: 'lowercase Roman numeral',
|
||||
3: 'uppercase letter',
|
||||
4: 'lowercase letter',
|
||||
5: 'ordinal number',
|
||||
6: 'cardinal text number',
|
||||
7: 'ordinal text number',
|
||||
10: 'Kanji numbering without the digit character',
|
||||
11: 'Kanji numbering with the digit character',
|
||||
1246: 'phonetic Katakana characters in aiueo order',
|
||||
1346: 'phonetic katakana characters in iroha order',
|
||||
14: 'double byte character',
|
||||
15: 'single byte character',
|
||||
16: 'Kanji numbering 3',
|
||||
17: 'Kanji numbering 4',
|
||||
18: 'Circle numbering' ,
|
||||
19: 'double-byte Arabic numbering',
|
||||
2046: 'phonetic double-byte Katakana characters',
|
||||
2146: 'phonetic double-byte katakana characters',
|
||||
22: 'Arabic with leading zero',
|
||||
23: 'bullet',
|
||||
24: 'Korean numbering 2',
|
||||
25: 'Korean numbering 1',
|
||||
26: 'Chinese numbering 1',
|
||||
27: 'Chinese numbering 2',
|
||||
28: 'Chinese numbering 3',
|
||||
29: 'Chinese numbering 4',
|
||||
30: 'Chinese Zodiac numbering 1',
|
||||
31: 'Chinese Zodiac numbering 2',
|
||||
32: 'Chinese Zodiac numbering 3',
|
||||
33: 'Taiwanese double-byte numbering 1',
|
||||
34: 'Taiwanese double-byte numbering 2',
|
||||
35: 'Taiwanese double-byte numbering 3',
|
||||
36: 'Taiwanese double-byte numbering 4',
|
||||
37: 'Chinese double-byte numbering 1',
|
||||
38: 'Chinese double-byte numbering 2',
|
||||
39: 'Chinese double-byte numbering 3',
|
||||
40: 'Chinese double-byte numbering 4',
|
||||
41: 'Korean double-byte numbering 1',
|
||||
42: 'Korean double-byte numbering 2',
|
||||
43: 'Korean double-byte numbering 3',
|
||||
44: 'Korean double-byte numbering 4',
|
||||
45: 'Hebrew non-standard decimal',
|
||||
46: 'Arabic Alif Ba Tah',
|
||||
47: 'Hebrew Biblical standard',
|
||||
48: 'Arabic Abjad style',
|
||||
255: 'No number',
|
||||
}
|
||||
self.__language_dict = {
|
||||
1078 : 'Afrikaans',
|
||||
1052 : 'Albanian',
|
||||
1025 : 'Arabic',
|
||||
5121 : 'Arabic Algeria',
|
||||
15361 : 'Arabic Bahrain',
|
||||
3073 : 'Arabic Egypt',
|
||||
1 : 'Arabic General',
|
||||
2049 : 'Arabic Iraq',
|
||||
11265 : 'Arabic Jordan',
|
||||
13313 : 'Arabic Kuwait',
|
||||
12289 : 'Arabic Lebanon',
|
||||
4097 : 'Arabic Libya',
|
||||
6145 : 'Arabic Morocco',
|
||||
8193 : 'Arabic Oman',
|
||||
16385 : 'Arabic Qatar',
|
||||
10241 : 'Arabic Syria',
|
||||
7169 : 'Arabic Tunisia',
|
||||
14337 : 'Arabic U.A.E.',
|
||||
9217 : 'Arabic Yemen',
|
||||
1067 : 'Armenian',
|
||||
1101 : 'Assamese',
|
||||
2092 : 'Azeri Cyrillic',
|
||||
1068 : 'Azeri Latin',
|
||||
1069 : 'Basque',
|
||||
1093 : 'Bengali',
|
||||
4122 : 'Bosnia Herzegovina',
|
||||
1026 : 'Bulgarian',
|
||||
1109 : 'Burmese',
|
||||
1059 : 'Byelorussian',
|
||||
1027 : 'Catalan',
|
||||
2052 : 'Chinese China',
|
||||
4 : 'Chinese General',
|
||||
3076 : 'Chinese Hong Kong',
|
||||
4100 : 'Chinese Singapore',
|
||||
1028 : 'Chinese Taiwan',
|
||||
1050 : 'Croatian',
|
||||
1029 : 'Czech',
|
||||
1030 : 'Danish',
|
||||
2067 : 'Dutch Belgium',
|
||||
1043 : 'Dutch Standard',
|
||||
3081 : 'English Australia',
|
||||
10249 : 'English Belize',
|
||||
2057 : 'English British',
|
||||
4105 : 'English Canada',
|
||||
9225 : 'English Caribbean',
|
||||
9 : 'English General',
|
||||
6153 : 'English Ireland',
|
||||
8201 : 'English Jamaica',
|
||||
5129 : 'English New Zealand',
|
||||
13321 : 'English Philippines',
|
||||
7177 : 'English South Africa',
|
||||
11273 : 'English Trinidad',
|
||||
1033 : 'English United States',
|
||||
1061 : 'Estonian',
|
||||
1080 : 'Faroese',
|
||||
1065 : 'Farsi',
|
||||
1035 : 'Finnish',
|
||||
1036 : 'French',
|
||||
2060 : 'French Belgium',
|
||||
11276 : 'French Cameroon',
|
||||
3084 : 'French Canada',
|
||||
12300 : 'French Cote d\'Ivoire',
|
||||
5132 : 'French Luxembourg',
|
||||
13324 : 'French Mali',
|
||||
6156 : 'French Monaco',
|
||||
8204 : 'French Reunion',
|
||||
10252 : 'French Senegal',
|
||||
4108 : 'French Swiss',
|
||||
7180 : 'French West Indies',
|
||||
9228 : 'French Democratic Republic of the Congo',
|
||||
1122 : 'Frisian',
|
||||
1084 : 'Gaelic',
|
||||
2108 : 'Gaelic Ireland',
|
||||
1110 : 'Galician',
|
||||
1079 : 'Georgian',
|
||||
1031 : 'German',
|
||||
3079 : 'German Austrian',
|
||||
5127 : 'German Liechtenstein',
|
||||
4103 : 'German Luxembourg',
|
||||
2055 : 'German Switzerland',
|
||||
1032 : 'Greek',
|
||||
1095 : 'Gujarati',
|
||||
1037 : 'Hebrew',
|
||||
1081 : 'Hindi',
|
||||
1038 : 'Hungarian',
|
||||
1039 : 'Icelandic',
|
||||
1057 : 'Indonesian',
|
||||
1040 : 'Italian',
|
||||
2064 : 'Italian Switzerland',
|
||||
1041 : 'Japanese',
|
||||
1099 : 'Kannada',
|
||||
1120 : 'Kashmiri',
|
||||
2144 : 'Kashmiri India',
|
||||
1087 : 'Kazakh',
|
||||
1107 : 'Khmer',
|
||||
1088 : 'Kirghiz',
|
||||
1111 : 'Konkani',
|
||||
1042 : 'Korean',
|
||||
2066 : 'Korean Johab',
|
||||
1108 : 'Lao',
|
||||
1062 : 'Latvian',
|
||||
1063 : 'Lithuanian',
|
||||
2087 : 'Lithuanian Classic',
|
||||
1086 : 'Malay',
|
||||
2110 : 'Malay Brunei Darussalam',
|
||||
1100 : 'Malayalam',
|
||||
1082 : 'Maltese',
|
||||
1112 : 'Manipuri',
|
||||
1102 : 'Marathi',
|
||||
1104 : 'Mongolian',
|
||||
1121 : 'Nepali',
|
||||
2145 : 'Nepali India',
|
||||
1044 : 'Norwegian Bokmal',
|
||||
2068 : 'Norwegian Nynorsk',
|
||||
1096 : 'Oriya',
|
||||
1045 : 'Polish',
|
||||
1046 : 'Portuguese (Brazil)',
|
||||
2070 : 'Portuguese (Portugal)',
|
||||
1094 : 'Punjabi',
|
||||
1047 : 'Rhaeto-Romanic',
|
||||
1048 : 'Romanian',
|
||||
2072 : 'Romanian Moldova',
|
||||
1049 : 'Russian',
|
||||
2073 : 'Russian Moldova',
|
||||
1083 : 'Sami Lappish',
|
||||
1103 : 'Sanskrit',
|
||||
3098 : 'Serbian Cyrillic',
|
||||
2074 : 'Serbian Latin',
|
||||
1113 : 'Sindhi',
|
||||
1051 : 'Slovak',
|
||||
1060 : 'Slovenian',
|
||||
1070 : 'Sorbian',
|
||||
11274 : 'Spanish Argentina',
|
||||
16394 : 'Spanish Bolivia',
|
||||
13322 : 'Spanish Chile',
|
||||
9226 : 'Spanish Colombia',
|
||||
5130 : 'Spanish Costa Rica',
|
||||
7178 : 'Spanish Dominican Republic',
|
||||
12298 : 'Spanish Ecuador',
|
||||
17418 : 'Spanish El Salvador',
|
||||
4106 : 'Spanish Guatemala',
|
||||
18442 : 'Spanish Honduras',
|
||||
2058 : 'Spanish Mexico',
|
||||
3082 : 'Spanish Modern',
|
||||
19466 : 'Spanish Nicaragua',
|
||||
6154 : 'Spanish Panama',
|
||||
15370 : 'Spanish Paraguay',
|
||||
10250 : 'Spanish Peru',
|
||||
20490 : 'Spanish Puerto Rico',
|
||||
1034 : 'Spanish Traditional',
|
||||
14346 : 'Spanish Uruguay',
|
||||
8202 : 'Spanish Venezuela',
|
||||
1072 : 'Sutu',
|
||||
1089 : 'Swahili',
|
||||
1053 : 'Swedish',
|
||||
2077 : 'Swedish Finland',
|
||||
1064 : 'Tajik',
|
||||
1097 : 'Tamil',
|
||||
1092 : 'Tatar',
|
||||
1098 : 'Telugu',
|
||||
1054 : 'Thai',
|
||||
1105 : 'Tibetan',
|
||||
1073 : 'Tsonga',
|
||||
1074 : 'Tswana',
|
||||
1055 : 'Turkish',
|
||||
1090 : 'Turkmen',
|
||||
1058 : 'Ukrainian',
|
||||
1056 : 'Urdu',
|
||||
2080 : 'Urdu India',
|
||||
2115 : 'Uzbek Cyrillic',
|
||||
1091 : 'Uzbek Latin',
|
||||
1075 : 'Venda',
|
||||
1066 : 'Vietnamese',
|
||||
1106 : 'Welsh',
|
||||
1076 : 'Xhosa',
|
||||
1085 : 'Yiddish',
|
||||
1077 : 'Zulu',
|
||||
1024 : 'Unknown',
|
||||
255 : 'Unknown',
|
||||
}
|
||||
"""
|
||||
# unknown
|
||||
# These must get passed on because they occur after \*
|
||||
'do' : ('un', 'unknown___', self.default_func),
|
||||
'company' : ('un', 'company___', self.default_func),
|
||||
'shpinst' : ('un', 'unknown___', self.default_func),
|
||||
'panose' : ('un', 'unknown___', self.default_func),
|
||||
'falt' : ('un', 'unknown___', self.default_func),
|
||||
'listoverridetable' : ('un', 'unknown___', self.default_func),
|
||||
'category' : ('un', 'unknown___', self.default_func),
|
||||
'template' : ('un', 'unknown___', self.default_func),
|
||||
'ud' : ('un', 'unknown___', self.default_func),
|
||||
'formfield' : ('un', 'unknown___', self.default_func),
|
||||
'ts' : ('un', 'unknown___', self.default_func),
|
||||
'rsidtbl' : ('un', 'unknown___', self.default_func),
|
||||
'generator' : ('un', 'unknown___', self.default_func),
|
||||
'ftnsep' : ('un', 'unknown___', self.default_func),
|
||||
'aftnsep' : ('un', 'unknown___', self.default_func),
|
||||
'aftnsepc' : ('un', 'unknown___', self.default_func),
|
||||
'aftncn' : ('un', 'unknown___', self.default_func),
|
||||
'objclass' : ('un', 'unknown___', self.default_func),
|
||||
'objdata' : ('un', 'unknown___', self.default_func),
|
||||
'picprop' : ('un', 'unknown___', self.default_func),
|
||||
'blipuid' : ('un', 'unknown___', self.default_func),
|
||||
"""
|
||||
def __ms_hex_func(self, pre, token, num):
|
||||
num = num[1:] # chop off leading 0, which I added
|
||||
num = num.upper() # the mappings store hex in caps
|
||||
return 'tx<hx<__________<\'%s\n' % num # add an ' for the mappings
|
||||
def ms_sub_func(self, pre, token, num):
|
||||
return 'tx<mc<__________<%s\n' % token
|
||||
def default_func(self, pre, token, num):
|
||||
if num == None:
|
||||
num = 'true'
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
def __list_type_func(self, pre, token, num):
|
||||
type = 'arabic'
|
||||
if num == None:
|
||||
type = 'Arabic'
|
||||
else:
|
||||
try:
|
||||
num = int(num)
|
||||
except ValueError:
|
||||
if self.__run_level > 3:
|
||||
msg = 'number "%s" cannot be converted to integer\n' % num
|
||||
raise self.__bug_handler, msg
|
||||
type = self.__number_type_dict.get(num)
|
||||
if type == None:
|
||||
if self.__run_level > 3:
|
||||
msg = 'No type for "%s" in self.__number_type_dict\n' % num
|
||||
raise self.__bug_handler, msg
|
||||
type = 'Arabic'
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, type)
|
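# Example (hypothetical input): \levelnfc4 reaches this handler with pre='ls',
# token='level-type' and num='4', and is written out as
#   cw<ls<level-type<nu<lowercase letter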
||||
def __language_func(self, pre, token, num):
|
||||
lang_name = self.__language_dict.get(int(num))
|
||||
if not lang_name:
|
||||
lang_name = "not defined"
|
||||
if self.__run_level > 3:
|
||||
msg = 'No entry for number "%s"' % num
|
||||
raise self.__bug_handler, msg
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name)
|
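# Example (hypothetical input): \lang1036 is looked up in self.__language_dict
# and written out as
#   cw<pf<language__<nu<French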
||||
def two_part_func(self, pre, token, num):
|
||||
list = token.split("<")
|
||||
token = list[0]
|
||||
num = list[1]
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
##return 'cw<nu<nu<nu<%s>num<%s\n' % (token, num)
|
||||
def divide_by_2(self, pre, token, num):
|
||||
num = self.divide_num(num, 2)
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
|
||||
def divide_by_20(self, pre, token, num):
|
||||
num = self.divide_num(num, 20)
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
|
||||
def text_func(self, pre, token, num=None):
|
||||
return 'tx<nu<__________<%s\n' % token
|
||||
def ob_func(self, pre, token, num=None):
|
||||
self.__bracket_count += 1
|
||||
##return 'ob<%04d\n' % self.__bracket_count
|
||||
return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
|
||||
def cb_func(self, pre, token, num=None):
|
||||
##line = 'cb<%04d\n' % self.__bracket_count
|
||||
line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
|
||||
self.__bracket_count -= 1
|
||||
return line
|
||||
def color_func(self, pre, token, num):
|
||||
third_field = 'nu'
|
||||
if num[-1] == ';':
|
||||
num = num[:-1]
|
||||
third_field = 'en'
|
||||
num = str('%X' % int(num))
|
||||
if len(num) != 2:
|
||||
num = "0" + num
|
||||
return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, num)
|
||||
##return 'cw<cl<%s<nu<nu<%s>%s<%s\n' % (third_field, token, num, token)
|
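# Examples (hypothetical input): \red255 becomes 'cw<ci<red_______<nu<FF', while
# a color-table entry terminated by a semicolon, such as \blue0;, becomes
# 'cw<ci<blue______<en<00' (the 'en' third field records the trailing semicolon).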
||||
def bool_st_func(self, pre, token, num):
|
||||
if num is None or num == '' or num == '1':
|
||||
return 'cw<%s<%s<nu<true\n' % (pre, token)
|
||||
##return 'cw<nu<nu<nu<%s>true<%s\n' % (token, token)
|
||||
elif num == '0':
|
||||
return 'cw<%s<%s<nu<false\n' % (pre, token)
|
||||
##return 'cw<nu<nu<nu<%s>false<%s\n' % (token, token)
|
||||
else:
|
||||
msg = 'boolean should have some value module process tokens\n'
|
||||
msg += 'token is ' + token + "\n"
|
||||
msg += "'" + num + "'" + "\n"
|
||||
raise self.__bug_handler, msg
|
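# Examples (hypothetical input): \b and \b1 both become
#   cw<ci<bold______<nu<true
# while \b0 becomes
#   cw<ci<bold______<nu<false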
||||
def __no_sup_sub_func(self, pre, token, num):
|
||||
the_string = 'cw<ci<subscript_<nu<false\n'
|
||||
the_string += 'cw<ci<superscrip<nu<false\n'
|
||||
return the_string
|
||||
def divide_num(self, numerator, denominator):
|
||||
try:
|
||||
numerator = float(numerator)
|
||||
except TypeError, msg:
|
||||
if self.__run_level > 3:
|
||||
msg = 'no number to process?\n'
|
||||
msg += 'this indicates that the token '
|
||||
msg += ' \(\\li\) should have a number and does not\n'
|
||||
msg += 'numerator is "%s"\n' % numerator
|
||||
msg += 'denominator is "%s"\n' % denominator
|
||||
raise self.__bug_handler, msg
|
||||
if 5 > self.__return_code:
|
||||
self.__return_code = 5
|
||||
return 0
|
||||
num = '%0.2f' % round(numerator/denominator, 2)
|
||||
return num
|
||||
string_num = str(num)
|
||||
if string_num[-2:] == ".0":
|
||||
string_num = string_num[:-2]
|
||||
return string_num
|
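# Example: RTF gives most measurements in twentieths of a point, so
# divide_num('720', 20) returns '36.00', which is why \li720 shows up later
# as a 36 point left indent.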
||||
def split_let_num(self, token):
|
||||
match_obj = re.search(self.__num_exp,token)
|
||||
if match_obj != None:
|
||||
first = match_obj.group(1)
|
||||
second = match_obj.group(2)
|
||||
if not second:
|
||||
if self.__run_level > 3:
|
||||
msg = "token is '%s' \n" % token
|
||||
raise self.__bug_handler, msg
|
||||
return first, 0
|
||||
else:
|
||||
if self.__run_level > 3:
|
||||
msg = "token is '%s' \n" % token
|
||||
raise self.__bug_handler, msg
|
||||
return token, 0
|
||||
return first, second
|
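# Examples (hypothetical input): split_let_num('li720') returns ('li', '720');
# a token with no numeric part, such as 'pard', returns ('pard', 0) at the
# default run level.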
||||
def convert_to_hex(self,number):
|
||||
"""Convert a string to uppercase hexidecimal"""
|
||||
num = int(number)
|
||||
try:
|
||||
hex_num = "%X" % num
|
||||
return hex_num
|
||||
except:
|
||||
raise self.__bug_handler
|
||||
def process_cw(self, token):
|
||||
"""Change the value of the control word by determining what dictionary
|
||||
it belongs to"""
|
||||
special = [ '*', ':', '}', '{', '~', '_', '-', ';' ]
|
||||
##if token != "{" or token != "}":
|
||||
token = token[1:] # strip off leading \
|
||||
token = token.replace(" ", "")
|
||||
##if not token: return
|
||||
only_alpha = token.isalpha()
|
||||
num = None
|
||||
if not only_alpha and token not in special:
|
||||
token, num = self.split_let_num(token)
|
||||
pre, token, action = self.dict_token.get(token, (None, None, None))
|
||||
if action:
|
||||
return action(pre, token, num)
|
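# A worked example of the lookup above (the input token is hypothetical):
# the raw token '\li720' is stripped to 'li720', split into ('li', '720'),
# matched to ('pf', 'left-inden', self.divide_by_20) in self.dict_token,
# and written out as
#   cw<pf<left-inden<nu<36.00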
||||
# unused function
|
||||
def initiate_token_actions(self):
|
||||
self.action_for_token={
|
||||
'{' : self.ob_func,
|
||||
'}' : self.cb_func,
|
||||
'\\' : self.process_cw,
|
||||
}
|
||||
# unused function
|
||||
def evaluate_token(self,token):
|
||||
"""Evaluate tokens. Return a value if the token is not a
|
||||
control word. Otherwise, pass token onto another method
|
||||
for further evaluation."""
|
||||
token, action = self.dict_token.get(token[0:1])
|
||||
if action:
|
||||
line = action(token)
|
||||
return line
|
||||
else :
|
||||
return 'tx<nu<nu<nu<nu<%s\n' % token
|
||||
def __check_brackets(self, in_file):
|
||||
self.__check_brack_obj = check_brackets.CheckBrackets\
|
||||
(file = in_file)
|
||||
good_br = self.__check_brack_obj.check_brackets()[0]
|
||||
if not good_br:
|
||||
return 1
|
||||
def process_tokens(self):
|
||||
"""Main method for handling other methods. """
|
||||
first_token = 0
|
||||
second_token = 0
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = "dummy"
|
||||
line_count = 0
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
token = line_to_read
|
||||
token = token.replace("\n","")
|
||||
if not token:
|
||||
continue
|
||||
line_count += 1
|
||||
try:
|
||||
token.decode('us-ascii')
|
||||
except UnicodeError, msg:
|
||||
msg = str(msg)
|
||||
msg += 'Invalid RTF: File not ascii encoded.\n'
|
||||
raise self.__exception_handler, msg
|
||||
if not first_token:
|
||||
if token != '\\{':
|
||||
msg = 'Invalid RTF: document doesn\'t start with {\n'
|
||||
raise self.__exception_handler, msg
|
||||
first_token = 1
|
||||
elif first_token and not second_token:
|
||||
if token[0:4] != '\\rtf':
|
||||
msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
|
||||
raise self.__exception_handler, msg
|
||||
second_token = 1
|
||||
##token = self.evaluate_token(token)
|
||||
the_index = token.find('\\ ')
|
||||
if token != None and the_index > -1:
|
||||
msg ='Invalid RTF: token "\\ " not valid. \n'
|
||||
raise self.__exception_handler, msg
|
||||
elif token[0:1] == "\\":
|
||||
line = self.process_cw(token)
|
||||
if line != None:
|
||||
write_obj.write(line)
|
||||
else:
|
||||
fields = re.split(self.__utf_exp, token)
|
||||
for field in fields:
|
||||
if not field:
|
||||
continue
|
||||
if field[0:1] == '&':
|
||||
write_obj.write('tx<ut<__________<%s\n' % field)
|
||||
else:
|
||||
write_obj.write('tx<nu<__________<%s\n' % field)
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
if not line_count:
|
||||
msg ='Invalid RTF: file appears to be empty. \n'
|
||||
raise self.__exception_handler, msg
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "processed_tokens.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
bad_brackets = self.__check_brackets(self.__file)
|
||||
if bad_brackets:
|
||||
msg = 'Invalid RTF: document does not have matching brackets.\n'
|
||||
raise self.__exception_handler, msg
|
||||
else:
|
||||
return self.__return_code
|
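# A minimal usage sketch (file name and handler classes are placeholders; the
# calling module supplies the real ones):
#
#   proc = ProcessTokens('tokens.data', exception_handler=ValueError,
#                        bug_handler=RuntimeError, copy=1, run_level=1)
#   return_code = proc.process_tokens()   # rewrites 'tokens.data' in place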
52
src/libprs500/ebooks/rtf2xml/replace_illegals.py
Executable file
@@ -0,0 +1,52 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class ReplaceIllegals:
|
||||
"""
|
||||
Replace illegal lower-ASCII characters.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
self.__file = in_file
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
def replace_illegals(self):
|
||||
"""
|
||||
"""
|
||||
nums = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 18, 19]
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
for num in nums:
|
||||
line = line.replace(chr(num), '')
|
||||
write_obj.write(line)
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
copy_obj = copy.Copy()
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "replace_illegals.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
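# Example (input is hypothetical): a line such as 'some\x07text\n' is written
# back out as 'sometext\n'; only the byte values listed in nums are removed.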
513
src/libprs500/ebooks/rtf2xml/sections.py
Executable file
@@ -0,0 +1,513 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class Sections:
|
||||
"""
|
||||
=================
|
||||
Purpose
|
||||
=================
|
||||
Write section tags for a tokenized file. (This module won't be of any use
|
||||
to you unless you use it as part of the other modules.)
|
||||
---------------
|
||||
logic
|
||||
---------------
|
||||
The tags for the first section breaks have already been written.
|
||||
RTF stores section breaks with the \sect tag. Each time this tag is
|
||||
encountered, add one to the counter.
|
||||
When I encounter the \sectd tag, I want to collect all the appropriate tokens
|
||||
that describe the section. When I reach a \pard, I know I can stop collecting
|
||||
tokens and write the section tags.
|
||||
The exception to this method occurs when sections occur in field blocks, such
|
||||
as the index. Normally, two section breaks occur within the index and other
|
||||
field-blocks. (If fewer or more section breaks occur, this code may not work.)
|
||||
I want the sections to occur outside of the index. That is, the index
|
||||
should be nested inside one section tag. After the index is complete, a new
|
||||
section should begin.
|
||||
In order to write the sections outside of the field blocks, I have to store
|
||||
all of the field block as a string. When I encounter the \sect tag, add one to
|
||||
the section counter, but store this number in a list. Likewise, store the
|
||||
information describing the section in another list.
|
||||
When I reach the end of the field block, choose the first item from the
|
||||
numbered list as the section number. Choose the first item in the description
|
||||
list as the values and attributes of the section. Enclose the field string
|
||||
between the section tags.
|
||||
Start a new section outside the field-block strings. Use the second number in
|
||||
the list; use the second item in the description list.
|
||||
CHANGE (2004-04-26) No longer write sections that occur in field-blocks.
|
||||
Instead, ignore all section information in a field-block.
|
||||
"""
|
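# Rough sketch of the state flow implemented below (state names come from
# self.__state_dict): 'before_body' until the body-open marker is seen, then
# 'before_first_sec'; 'section' and 'section_def' while \sect and \sectd tokens
# are collected; back to 'body' once a \pard (or an equivalent premature ending)
# closes the section definition; and 'sec_in_field' while inside a field block.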
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
"""
|
||||
self.__mark_start = 'mi<mk<sect-start\n'
|
||||
self.__mark_end = 'mi<mk<sect-end__\n'
|
||||
self.__in_field = 0
|
||||
self.__section_values = {}
|
||||
self.__list_of_sec_values = []
|
||||
self.__field_num = []
|
||||
self.__section_num = 0
|
||||
self.__state = 'before_body'
|
||||
self.__found_first_sec = 0
|
||||
self.__text_string = ''
|
||||
self.__field_instruction_string = ''
|
||||
self.__state_dict = {
|
||||
'before_body' : self.__before_body_func,
|
||||
'body' : self.__body_func,
|
||||
'before_first_sec' : self.__before_first_sec_func,
|
||||
'section' : self.__section_func,
|
||||
'section_def' : self.__section_def_func,
|
||||
'sec_in_field' : self.__sec_in_field_func,
|
||||
}
|
||||
# cw<sc<sect-defin<nu<true
|
||||
self.__body_dict = {
|
||||
'cw<sc<section___' : self.__found_section_func,
|
||||
'mi<mk<sec-fd-beg' : self.__found_sec_in_field_func,
|
||||
'cw<sc<sect-defin' : self.__found_section_def_bef_sec_func,
|
||||
}
|
||||
self.__section_def_dict = {
|
||||
'cw<pf<par-def___' : (self.__end_sec_def_func, None),
|
||||
'mi<mk<body-open_' : (self.__end_sec_def_func, None),
|
||||
'cw<tb<columns___' : (self.__attribute_func, 'columns'),
|
||||
'cw<pa<margin-lef' : (self.__attribute_func, 'margin-left'),
|
||||
'cw<pa<margin-rig' : (self.__attribute_func, 'margin-right'),
|
||||
'mi<mk<header-ind' : (self.__end_sec_def_func, None),
|
||||
# premature endings
|
||||
#__end_sec_premature_func
|
||||
'tx<nu<__________' : (self.__end_sec_premature_func, None),
|
||||
'cw<ci<font-style' : (self.__end_sec_premature_func, None),
|
||||
'cw<ci<font-size_' : (self.__end_sec_premature_func, None),
|
||||
}
|
||||
self.__sec_in_field_dict = {
|
||||
'mi<mk<sec-fd-end' : self.__end_sec_in_field_func,
|
||||
# changed this 2004-04-26
|
||||
# two lines
|
||||
# 'cw<sc<section___' : self.__found_section_in_field_func,
|
||||
# 'cw<sc<sect-defin' : self.__found_section_def_in_field_func,
|
||||
}
|
||||
def __found_section_def_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I have found a section definition. Change the state to
|
||||
section_def (so subsequent lines will be processed as part of
|
||||
the section definition), and clear the section_values dictionary.
|
||||
"""
|
||||
self.__state = 'section_def'
|
||||
self.__section_values.clear()
|
||||
def __attribute_func(self, line, name):
|
||||
"""
|
||||
Required:
|
||||
line -- the line to be parsed
|
||||
name -- the changed, readable name (as opposed to the
|
||||
abbreviated one)
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I need to add the right data to the section values dictionary so I
|
||||
can retrieve it later. The attribute (or key) is the name; the
|
||||
value is the last part of the text string.
|
||||
ex: cw<tb<columns___<nu<2
|
||||
"""
|
||||
attribute = name
|
||||
value = line[20:-1]
|
||||
self.__section_values[attribute] = value
|
||||
def __found_section_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I have found the beginning of a section, so change the state
|
||||
accordingly. Also add one to the section counter.
|
||||
"""
|
||||
self.__state = 'section'
|
||||
self.__write_obj.write(line)
|
||||
self.__section_num += 1
|
||||
def __found_section_def_bef_sec_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I have found the beginning of a section, so change the state
|
||||
accordingly. Also add one to the section counter.
|
||||
"""
|
||||
self.__section_num += 1
|
||||
self.__found_section_def_func(line)
|
||||
self.__write_obj.write(line)
|
||||
def __section_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
"""
|
||||
if self.__token_info == 'cw<sc<sect-defin':
|
||||
self.__found_section_def_func(line)
|
||||
self.__write_obj.write(line)
|
||||
def __section_def_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I have found a section definition. Check if the line is the end of
|
||||
the definition (a paragraph definition), or if it contains info that
|
||||
should be added to the values dictionary. If neither of these
|
||||
cases is true, output the line to a file.
|
||||
"""
|
||||
action, name = self.__section_def_dict.get(self.__token_info, (None, None))
|
||||
if action:
|
||||
action(line, name)
|
||||
if self.__in_field:
|
||||
self.__sec_in_field_string += line
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __end_sec_def_func(self, line, name):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
name --changed, readable name
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
The end of the section definition has been found. Reset the state.
|
||||
Call on the write_section method.
|
||||
"""
|
||||
if not self.__in_field:
|
||||
self.__state = 'body'
|
||||
else:
|
||||
self.__state = 'sec_in_field'
|
||||
self.__write_section(line)
|
||||
def __end_sec_premature_func(self, line, name):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
name --changed, readable name
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Text or control words indicating text have been found
|
||||
before \pard. This should indicate older RTF. Reset the state.
|
||||
Write the section definition. Insert a paragraph definition.
|
||||
Insert {} to mark the end of a paragraph definition.
|
||||
"""
|
||||
if not self.__in_field:
|
||||
self.__state = 'body'
|
||||
else:
|
||||
self.__state = 'sec_in_field'
|
||||
self.__write_section(line)
|
||||
self.__write_obj.write('cw<pf<par-def___<nu<true\n')
|
||||
self.__write_obj.write('ob<nu<open-brack<0000\n')
|
||||
self.__write_obj.write('cb<nu<clos-brack<0000\n')
|
||||
def __write_section(self, line):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Form a string of attributes and values. If you are not in a field
|
||||
block, write this string to the output file. Otherwise, call on
|
||||
the handle_sec_def method to handle this string.
|
||||
"""
|
||||
my_string = self.__mark_start
|
||||
if self.__found_first_sec:
|
||||
my_string += 'mi<tg<close_____<section\n'
|
||||
else:
|
||||
self.__found_first_sec = 1
|
||||
my_string += 'mi<tg<open-att__<section<num>%s' % str(self.__section_num)
|
||||
my_string += '<num-in-level>%s' % str(self.__section_num)
|
||||
my_string += '<type>rtf-native'
|
||||
my_string += '<level>0'
|
||||
keys = self.__section_values.keys()
|
||||
if len(keys) > 0:
|
||||
for key in keys:
|
||||
my_string += '<%s>%s' % (key, self.__section_values[key])
|
||||
my_string += '\n'
|
||||
my_string += self.__mark_end
|
||||
# # my_string += line
|
||||
if self.__state == 'body':
|
||||
self.__write_obj.write(my_string)
|
||||
elif self.__state == 'sec_in_field':
|
||||
self.__handle_sec_def(my_string)
|
||||
elif self.__run_level > 3:
|
||||
msg = 'missed a flag\n'
|
||||
raise self.__bug_handler, msg
|
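# Example of the string assembled above (attribute values are hypothetical):
# for the second section of a two-column document it would read
#   mi<mk<sect-start
#   mi<tg<close_____<section
#   mi<tg<open-att__<section<num>2<num-in-level>2<type>rtf-native<level>0<columns>2
#   mi<mk<sect-end__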
||||
def __handle_sec_def(self, my_string):
|
||||
"""
|
||||
Requires:
|
||||
my_string -- the string of attributes and values. (Do I need this?)
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I need to append the dictionary of attributes and values to list
|
||||
so I can use it later when I reach the end of the field-block.
|
||||
"""
|
||||
values_dict = self.__section_values
|
||||
self.__list_of_sec_values.append(values_dict)
|
||||
def __body_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Look for the beginning of a section. Otherwise, print the line to
|
||||
the output file.
|
||||
"""
|
||||
action = self.__body_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __before_body_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Look for the beginning of the body. Always print out the line.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<body-open_':
|
||||
self.__state = 'before_first_sec'
|
||||
self.__write_obj.write(line)
|
||||
def __before_first_sec_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Look for the beginning of the first section. This can be \\sectd,
|
||||
but in older RTF it could be any paragraph or row definition.
|
||||
"""
|
||||
if self.__token_info == 'cw<sc<sect-defin':
|
||||
self.__state = 'section_def'
|
||||
self.__section_num += 1
|
||||
self.__section_values.clear()
|
||||
elif self.__token_info == 'cw<pf<par-def___':
|
||||
self.__state = 'body'
|
||||
self.__section_num += 1
|
||||
self.__write_obj.write (
|
||||
'mi<tg<open-att__<section<num>%s'
|
||||
'<num-in-level>%s'
|
||||
'<type>rtf-native'
|
||||
'<level>0\n'
|
||||
% (str(self.__section_num), str(self.__section_num))
|
||||
)
|
||||
self.__found_first_sec = 1
|
||||
elif self.__token_info == 'tx<nu<__________':
|
||||
self.__state = 'body'
|
||||
self.__section_num += 1
|
||||
self.__write_obj.write (
|
||||
'mi<tg<open-att__<section<num>%s'
|
||||
'<num-in-level>%s'
|
||||
'<type>rtf-native'
|
||||
'<level>0\n'
|
||||
% (str(self.__section_num), str(self.__section_num))
|
||||
)
|
||||
self.__write_obj.write(
|
||||
'cw<pf<par-def___<true\n'
|
||||
)
|
||||
self.__found_first_sec = 1
|
||||
self.__write_obj.write(line)
|
||||
def __found_sec_in_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I have found the beginning of a field that has a section (or
|
||||
really, two) inside of it. Change the state, and start adding to
|
||||
one long string.
|
||||
"""
|
||||
self.__state = 'sec_in_field'
|
||||
self.__sec_in_field_string = line
|
||||
self.__in_field = 1
|
||||
def __sec_in_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Check for the end of the field, or the beginning of a section
|
||||
definition.
|
||||
CHANGED! Just print out each line. Ignore any sections or
|
||||
section definition info.
|
||||
"""
|
||||
action = self.__sec_in_field_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
else:
|
||||
# change this 2004-04-26
|
||||
# self.__sec_in_field_string += line
|
||||
self.__write_obj.write(line)
|
||||
def __end_sec_in_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Add the last line to the field string. Call on the method
|
||||
print_field_sec_attributes to write the close and beginning of a
|
||||
section tag. Print out the field string. Call on the same method
|
||||
to again write the close and beginning of a section tag.
|
||||
Change the state.
|
||||
"""
|
||||
# change this 2004-04-26
|
||||
# Don't do anything
|
||||
"""
|
||||
self.__sec_in_field_string += line
|
||||
self.__print_field_sec_attributes()
|
||||
self.__write_obj.write(self.__sec_in_field_string)
|
||||
self.__print_field_sec_attributes()
|
||||
"""
|
||||
self.__state = 'body'
|
||||
self.__in_field = 0
|
||||
# this is changed too
|
||||
self.__write_obj.write(line)
|
||||
def __print_field_sec_attributes(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Get the number and dictionary of values from the lists. The number
|
||||
and dictionary will be the first item of each list. Write the
|
||||
close tag. Write the start tag. Write the attributes and values in
|
||||
the dictionary. Get rid of the first item in each list.
|
||||
keys = self.__section_values.keys()
|
||||
if len(keys) > 0:
|
||||
my_string += 'mi<tg<open-att__<section-definition'
|
||||
for key in keys:
|
||||
my_string += '<%s>%s' % (key, self.__section_values[key])
|
||||
my_string += '\n'
|
||||
else:
|
||||
my_string += 'mi<tg<open______<section-definition\n'
|
||||
"""
|
||||
num = self.__field_num[0]
|
||||
self.__field_num = self.__field_num[1:]
|
||||
self.__write_obj.write(
|
||||
'mi<tg<close_____<section\n'
|
||||
'mi<tg<open-att__<section<num>%s' % str(num)
|
||||
)
|
||||
if self.__list_of_sec_values:
|
||||
keys = self.__list_of_sec_values[0].keys()
|
||||
for key in keys:
|
||||
self.__write_obj.write(
|
||||
'<%s>%s\n' % (key, self.__list_of_sec_values[0][key]))
|
||||
self.__list_of_sec_values = self.__list_of_sec_values[1:]
|
||||
self.__write_obj.write('<level>0')
|
||||
self.__write_obj.write('<type>rtf-native')
|
||||
self.__write_obj.write('<num-in-level>%s' % str(self.__section_num))
|
||||
self.__write_obj.write('\n')
|
||||
# Look here
|
||||
def __found_section_in_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I have found a section in a field block. Add one to section
|
||||
counter, and append this number to a list.
|
||||
"""
|
||||
self.__section_num += 1
|
||||
self.__field_num.append(self.__section_num)
|
||||
self.__sec_in_field_string += line
|
||||
def __found_section_def_in_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I have found a section definition in a field block. Change the
|
||||
state and clear the values dictionary.
|
||||
"""
|
||||
self.__state = 'section_def'
|
||||
self.__section_values.clear()
|
||||
def make_sections(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing (changes the original file)
|
||||
Logic:
|
||||
Read one line in at a time. Determine what action to take based on
|
||||
the state. If the state is before the body, look for the
|
||||
beginning of the body.
|
||||
If the state is body, send the line to the body method.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no matching state in module sections.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "sections.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
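The passes in sections.py (and the modules that follow) all share the same dispatch idiom: every intermediate line begins with a fixed-width sixteen-character token, and that prefix is used as the key into a state or handler dictionary. A minimal sketch of the idiom, assuming a made-up handler table rather than the real sections.py handlers:

# Minimal sketch of the 16-character token dispatch used by these passes.
# The handler table below is illustrative, not the actual sections.py table.
def dispatch(lines):
    state = {'sections': 0}
    def handle_sect(line):
        state['sections'] += 1
    handlers = {
        'cw<sc<sect-defin': handle_sect,   # hypothetical: a section-definition token
    }
    for line in lines:
        token_info = line[:16]             # the fixed-width prefix identifies the token
        action = handlers.get(token_info)
        if action:
            action(line)
    return state['sections']

print(dispatch(['cw<sc<sect-defin<nu<true\n', 'tx<nu<__________<some text\n']))  # prints 1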
|
705
src/libprs500/ebooks/rtf2xml/styles.py
Executable file
@ -0,0 +1,705 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy, border_parse
|
||||
class Styles:
|
||||
"""
|
||||
Change lines with style numbers to actual style names.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__run_level = run_level
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
"""
|
||||
self.__border_obj = border_parse.BorderParse()
|
||||
self.__styles_dict = {'par':{}, 'char':{}}
|
||||
self.__styles_num = '0'
|
||||
self.__type_of_style = 'par'
|
||||
self.__text_string = ''
|
||||
self.__state = 'before_styles_table'
|
||||
self.__state_dict = {
|
||||
'before_styles_table': self.__before_styles_func,
|
||||
'in_styles_table' : self.__in_styles_func,
|
||||
'in_individual_style' : self.__in_individual_style_func,
|
||||
'after_styles_table' : self.__after_styles_func,
|
||||
'mi<mk<styles-beg' : self.__found_styles_table_func,
|
||||
'mi<mk<styles-end' : self.__found_end_styles_table_func,
|
||||
'mi<mk<stylei-beg' : self.__found_beg_ind_style_func,
|
||||
'mi<mk<stylei-end' : self.__found_end_ind_style_func,
|
||||
'cw<ss<para-style' : self.__para_style_func,
|
||||
'cw<ss<char-style' : self.__char_style_func,
|
||||
}
|
||||
# A separate dictionary for parsing the body text
|
||||
self.__body_dict = {
|
||||
'cw<ss<para-style' : (self.__para_style_in_body_func, 'par'),
|
||||
'cw<ss<char-style' : (self.__para_style_in_body_func, 'char'),
|
||||
}
|
||||
# Dictionary needed to convert shortened style names to readable names
|
||||
self.__token_dict={
|
||||
# paragraph formatting => pf
|
||||
'par-end___' : 'para',
|
||||
'par-def___' : 'paragraph-definition',
|
||||
'keep-w-nex' : 'keep-with-next',
|
||||
'widow-cntl' : 'widow-control',
|
||||
'adjust-rgt' : 'adjust-right',
|
||||
'language__' : 'language',
|
||||
'right-inde' : 'right-indent',
|
||||
'fir-ln-ind' : 'first-line-indent',
|
||||
'left-inden' : 'left-indent',
|
||||
'space-befo' : 'space-before',
|
||||
'space-afte' : 'space-after',
|
||||
'line-space' : 'line-spacing',
|
||||
'default-ta' : 'default-tab',
|
||||
'align_____' : 'align',
|
||||
'widow-cntr' : 'widow-control',
|
||||
# page formatting mixed in! (Just in older RTF?)
|
||||
'margin-lef' : 'left-indent',
|
||||
'margin-rig' : 'right-indent',
|
||||
'margin-bot' : 'space-after',
|
||||
'margin-top' : 'space-before',
|
||||
# stylesheet = > ss
|
||||
'style-shet' : 'stylesheet',
|
||||
'based-on__' : 'based-on-style',
|
||||
'next-style' : 'next-style',
|
||||
'char-style' : 'character-style',
|
||||
'para-style' : 'paragraph-style',
|
||||
# graphics => gr
|
||||
'picture___' : 'pict',
|
||||
'obj-class_' : 'obj_class',
|
||||
'mac-pic___' : 'mac-pict',
|
||||
# section => sc
|
||||
'section___' : 'section-new',
|
||||
'sect-defin' : 'section-reset',
|
||||
'sect-note_' : 'endnotes-in-section',
|
||||
# list=> ls
|
||||
'list-text_' : 'list-text',
|
||||
# this line must be wrong because it duplicates an earlier one
|
||||
'list-text_' : 'list-text',
|
||||
'list______' : 'list',
|
||||
'list-lev-d' : 'list-level-definition',
|
||||
'list-cardi' : 'list-cardinal-numbering',
|
||||
'list-decim' : 'list-decimal-numbering',
|
||||
'list-up-al' : 'list-uppercase-alphabetic-numbering',
|
||||
'list-up-ro' : 'list-uppercae-roman-numbering',
|
||||
'list-ord__' : 'list-ordinal-numbering',
|
||||
'list-ordte' : 'list-ordinal-text-numbering',
|
||||
'list-bulli' : 'list-bullet',
|
||||
'list-simpi' : 'list-simple',
|
||||
'list-conti' : 'list-continue',
|
||||
'list-hang_' : 'list-hang',
|
||||
# 'list-tebef' : 'list-text-before',
|
||||
'list-level' : 'level',
|
||||
'list-id___' : 'list-id',
|
||||
'list-start' : 'list-start',
|
||||
'nest-level' : 'nest-level',
|
||||
# duplicate
|
||||
'list-level' : 'list-level',
|
||||
# notes => nt
|
||||
'footnote__' : 'footnote',
|
||||
'type______' : 'type',
|
||||
# anchor => an
|
||||
'toc_______' : 'anchor-toc',
|
||||
'book-mk-st' : 'bookmark-start',
|
||||
'book-mk-en' : 'bookmark-end',
|
||||
'index-mark' : 'anchor-index',
|
||||
'place_____' : 'place',
|
||||
# field => fd
|
||||
'field_____' : 'field',
|
||||
'field-inst' : 'field-instruction',
|
||||
'field-rslt' : 'field-result',
|
||||
'datafield_' : 'data-field',
|
||||
# info-tables => it
|
||||
'font-table' : 'font-table',
|
||||
'colr-table' : 'color-table',
|
||||
'lovr-table' : 'list-override-table',
|
||||
'listtable_' : 'list-table',
|
||||
'revi-table' : 'revision-table',
|
||||
# character info => ci
|
||||
'hidden____' : 'hidden',
|
||||
'italics___' : 'italics',
|
||||
'bold______' : 'bold',
|
||||
'strike-thr' : 'strike-through',
|
||||
'shadow____' : 'shadow',
|
||||
'outline___' : 'outline',
|
||||
'small-caps' : 'small-caps',
|
||||
'dbl-strike' : 'double-strike-through',
|
||||
'emboss____' : 'emboss',
|
||||
'engrave___' : 'engrave',
|
||||
'subscript_' : 'subscript',
|
||||
'superscrip' : 'superscript',
|
||||
'plain_____' : 'plain',
|
||||
'font-style' : 'font-style',
|
||||
'font-color' : 'font-color',
|
||||
'font-size_' : 'font-size',
|
||||
'font-up___' : 'superscript',
|
||||
'font-down_' : 'subscript',
|
||||
'red_______' : 'red',
|
||||
'blue______' : 'blue',
|
||||
'green_____' : 'green',
|
||||
'caps______' : 'caps',
|
||||
# table => tb
|
||||
'row-def___' : 'row-definition',
|
||||
'cell______' : 'cell',
|
||||
'row_______' : 'row',
|
||||
'in-table__' : 'in-table',
|
||||
'columns___' : 'columns',
|
||||
'row-pos-le' : 'row-position-left',
|
||||
'cell-posit' : 'cell-position',
|
||||
# preamble => pr
|
||||
# underline
|
||||
'underlined' : 'underlined',
|
||||
# border => bd
|
||||
'bor-t-r-hi' : 'border-table-row-horizontal-inside',
|
||||
'bor-t-r-vi' : 'border-table-row-vertical-inside',
|
||||
'bor-t-r-to' : 'border-table-row-top',
|
||||
'bor-t-r-le' : 'border-table-row-left',
|
||||
'bor-t-r-bo' : 'border-table-row-bottom',
|
||||
'bor-t-r-ri' : 'border-table-row-right',
|
||||
'bor-cel-bo' : 'border-cell-bottom',
|
||||
'bor-cel-to' : 'border-cell-top',
|
||||
'bor-cel-le' : 'border-cell-left',
|
||||
'bor-cel-ri' : 'border-cell-right',
|
||||
'bor-par-bo' : 'border-paragraph-bottom',
|
||||
'bor-par-to' : 'border-paragraph-top',
|
||||
'bor-par-le' : 'border-paragraph-left',
|
||||
'bor-par-ri' : 'border-paragraph-right',
|
||||
'bor-par-bo' : 'border-paragraph-box',
|
||||
'bor-for-ev' : 'border-for-every-paragraph',
|
||||
'bor-outsid' : 'border-outisde',
|
||||
'bor-none__' : 'border',
|
||||
# border type => bt
|
||||
'bdr-single' : 'single',
|
||||
'bdr-doubtb' : 'double-thickness-border',
|
||||
'bdr-shadow' : 'shadowed-border',
|
||||
'bdr-double' : 'double-border',
|
||||
'bdr-dotted' : 'dotted-border',
|
||||
'bdr-dashed' : 'dashed',
|
||||
'bdr-hair__' : 'hairline',
|
||||
'bdr-inset_' : 'inset',
|
||||
'bdr-das-sm' : 'dash-small',
|
||||
'bdr-dot-sm' : 'dot-dash',
|
||||
'bdr-dot-do' : 'dot-dot-dash',
|
||||
'bdr-outset' : 'outset',
|
||||
'bdr-trippl' : 'tripple',
|
||||
'bdr-thsm__' : 'thick-thin-small',
|
||||
'bdr-htsm__' : 'thin-thick-small',
|
||||
'bdr-hthsm_' : 'thin-thick-thin-small',
|
||||
'bdr-thm__' : 'thick-thin-medium',
|
||||
'bdr-htm__' : 'thin-thick-medium',
|
||||
'bdr-hthm_' : 'thin-thick-thin-medium',
|
||||
'bdr-thl__' : 'thick-thin-large',
|
||||
'bdr-hthl_' : 'think-thick-think-large',
|
||||
'bdr-wavy_' : 'wavy',
|
||||
'bdr-d-wav' : 'double-wavy',
|
||||
'bdr-strip' : 'striped',
|
||||
'bdr-embos' : 'emboss',
|
||||
'bdr-engra' : 'engrave',
|
||||
'bdr-frame' : 'frame',
|
||||
'bdr-li-wid' : 'line-width',
|
||||
# tabs
|
||||
'tab-center' : 'center',
|
||||
'tab-right_' : 'right',
|
||||
'tab-dec___' : 'decimal',
|
||||
'leader-dot' : 'leader-dot',
|
||||
'leader-hyp' : 'leader-hyphen',
|
||||
'leader-und' : 'leader-underline',
|
||||
}
|
||||
self.__tabs_dict = {
|
||||
'cw<pf<tab-stop__' : self.__tab_stop_func,
|
||||
'cw<pf<tab-center' : self.__tab_type_func,
|
||||
'cw<pf<tab-right_' : self.__tab_type_func,
|
||||
'cw<pf<tab-dec___' : self.__tab_type_func,
|
||||
'cw<pf<leader-dot' : self.__tab_leader_func,
|
||||
'cw<pf<leader-hyp' : self.__tab_leader_func,
|
||||
'cw<pf<leader-und' : self.__tab_leader_func,
|
||||
'cw<pf<tab-bar-st' : self.__tab_bar_func,
|
||||
}
|
||||
self.__tab_type_dict = {
|
||||
'cw<pf<tab-center' : 'center',
|
||||
'cw<pf<tab-right_' : 'right',
|
||||
'cw<pf<tab-dec___' : 'decimal',
|
||||
'cw<pf<leader-dot' : 'leader-dot',
|
||||
'cw<pf<leader-hyp' : 'leader-hyphen',
|
||||
'cw<pf<leader-und' : 'leader-underline',
|
||||
}
|
||||
self.__ignore_list = [
|
||||
'list-tebef',
|
||||
]
|
||||
self.__tabs_list = self.__tabs_dict.keys()
|
||||
self.__tab_type = 'left'
|
||||
self.__leader_found = 0
|
||||
def __in_individual_style_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Check if the token marks the end of the individual style. (Action
|
||||
is the value of the state dictionary, and the only key that will
|
||||
match in this function is the end of the individual style.)
|
||||
If the end of the individual style is not found, check if the line
|
||||
is a control word. If it is, extract the relevant info and look
|
||||
up this info in the tokens dictionary. I want to change
|
||||
abbreviated names for longer, more readable ones.
|
||||
Write an error message if no key is found for the info.
|
||||
If the line is text, add the text to a text string. The text
|
||||
string will be the name of the style.
|
||||
"""
|
||||
action = self.__state_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
# have to parse border lines with external module
|
||||
elif line[0:5] == 'cw<bd':
|
||||
border_dict = self.__border_obj.parse_border(line)
|
||||
keys = border_dict.keys()
|
||||
for key in keys:
|
||||
self.__enter_dict_entry(key, border_dict[key])
|
||||
elif self.__token_info in self.__tabs_list:
|
||||
action = self.__tabs_dict.get(self.__token_info)
|
||||
if action != None:
|
||||
action(line)
|
||||
elif line[0:2] == 'cw':
|
||||
#cw<pf<widow-cntl<nu<true
|
||||
info = line[6:16]
|
||||
att = self.__token_dict.get(info)
|
||||
if att == None :
|
||||
if info not in self.__ignore_list:
|
||||
if self.__run_level > 3:
|
||||
msg = 'no value for key %s\n' % info
|
||||
raise self.__bug_handler, msg
|
||||
else:
|
||||
value = line[20:-1]
|
||||
self.__enter_dict_entry(att, value)
|
||||
elif line[0:2] == 'tx':
|
||||
self.__text_string += line[17:-1]
|
||||
def __tab_stop_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Try to add the number to dictionary entry tabs-left, or tabs-right, etc.
|
||||
If the dictionary entry doesn't exist, create one.
|
||||
"""
|
||||
type = 'tabs-%s' % self.__tab_type
|
||||
try:
|
||||
if self.__leader_found:
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs']\
|
||||
+= '%s:' % self.__tab_type
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs']\
|
||||
+= '%s;' % line[20:-1]
|
||||
else:
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs']\
|
||||
+= '%s:' % self.__tab_type
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs']\
|
||||
+= '%s;' % line[20:-1]
|
||||
except KeyError:
|
||||
self.__enter_dict_entry('tabs', '')
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs']\
|
||||
+= '%s:' % self.__tab_type
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs'] += '%s;' % line[20:-1]
|
||||
self.__tab_type = 'left'
|
||||
self.__leader_found = 0
|
||||
def __tab_type_func(self, line):
|
||||
"""
|
||||
"""
|
||||
type = self.__tab_type_dict.get(self.__token_info)
|
||||
if type != None:
|
||||
self.__tab_type = type
|
||||
else:
|
||||
if self.__run_level > 3:
|
||||
msg = 'no entry for %s\n' % self.__token_info
|
||||
raise self.__bug_handler, msg
|
||||
def __tab_leader_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Try to add the string of the tab leader to dictionary entry
|
||||
tabs-left, or tabs-right, etc. If the dictionary entry doesn't
|
||||
exist, create one.
|
||||
"""
|
||||
self.__leader_found = 1
|
||||
leader = self.__tab_type_dict.get(self.__token_info)
|
||||
if leader != None:
|
||||
leader += '^'
|
||||
type = 'tabs-%s' % self.__tab_type
|
||||
try:
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs'] += ':%s;' % leader
|
||||
except KeyError:
|
||||
self.__enter_dict_entry('tabs', '')
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs'] += '%s;' % leader
|
||||
else:
|
||||
if self.__run_level > 3:
|
||||
msg = 'no entry for %s\n' % self.__token_info
|
||||
raise self.__bug_handler, msg
|
||||
def __tab_bar_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Try to add the string of the tab bar to dictionary entry tabs-bar.
|
||||
If the dictionary entry doesn't exist, create one.
|
||||
"""
|
||||
# self.__add_dict_entry('tabs-bar', line[20:-1])
|
||||
try:
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs']\
|
||||
+= '%s:' % 'bar'
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs']\
|
||||
+= '%s;' % line[20:-1]
|
||||
except KeyError:
|
||||
self.__enter_dict_entry('tabs', '')
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs']\
|
||||
+= '%s:' % 'bar'
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs']\
|
||||
+= '%s;' % line[20:-1]
|
||||
self.__tab_type = 'left'
|
||||
def __enter_dict_entry(self, att, value):
|
||||
"""
|
||||
Required:
|
||||
att -- the attribute
|
||||
value -- the value
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Try to add the attribute value directly to the styles dictionary.
|
||||
If a keyerror is found, that means I have to build the "branches"
|
||||
of the dictionary before I can add the key value pair.
|
||||
"""
|
||||
try:
|
||||
self.__styles_dict[self.__type_of_style][self.__styles_num][att] = value
|
||||
except KeyError:
|
||||
self.__add_dict_entry(att, value)
|
||||
def __add_dict_entry(self, att, value):
|
||||
"""
|
||||
Required:
|
||||
att --the attribute
|
||||
value --the value
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I have to build the branches of the dictionary before I can add
|
||||
the leaves. (I am comparing a dictionary to a tree.) To achieve
|
||||
this, I first make a temporary dictionary by extracting either the
|
||||
inside dictionary of the keyword par or char. This temporary
|
||||
dictionary is called type_dict.
|
||||
Next, create a second, smaller dictionary with just the attribute and value.
|
||||
Add the small dictionary to the type dictionary.
|
||||
Add this type dictionary to the main styles dictionary.
|
||||
"""
|
||||
if self.__type_of_style == 'par':
|
||||
type_dict =self.__styles_dict['par']
|
||||
elif self.__type_of_style == 'char':
|
||||
type_dict = self.__styles_dict['char']
|
||||
else:
|
||||
if self.__run_level > 3:
|
||||
msg = self.__type_of_style + 'error\n'
|
||||
raise self.__bug_handler, msg
|
||||
smallest_dict = {}
|
||||
smallest_dict[att] = value
|
||||
type_dict[self.__styles_num] = smallest_dict
|
||||
self.__styles_dict[self.__type_of_style] = type_dict
|
||||
def __para_style_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Set the type of style to paragraph.
|
||||
Extract the number for a line such as "cw<ss<para-style<nu<15".
|
||||
"""
|
||||
self.__type_of_style = 'par'
|
||||
self.__styles_num = line[20:-1]
|
||||
"""
|
||||
self.__enter_dict_entry('tabs-left', '')
|
||||
self.__enter_dict_entry('tabs-right', '')
|
||||
self.__enter_dict_entry('tabs-center', '')
|
||||
self.__enter_dict_entry('tabs-decimal', '')
|
||||
self.__enter_dict_entry('tabs-bar', '')
|
||||
"""
|
||||
def __char_style_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Set the type of style to character.
|
||||
Extract the number for a line such as "cw<ss<char-style<nu<15".
|
||||
"""
|
||||
self.__type_of_style = 'char'
|
||||
self.__styles_num = line[20:-1]
|
||||
def __found_beg_ind_style_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Change the state to 'in_individual_style'. (The name of the individual
style is recorded later, when the end marker for the style is found.)
|
||||
"""
|
||||
self.__state = 'in_individual_style'
|
||||
def __found_end_ind_style_func(self, line):
|
||||
name = self.__text_string[:-1] # get rid of semicolon
|
||||
# add 2005-04-29
|
||||
# get rid of space before or after
|
||||
name = name.strip()
|
||||
self.__enter_dict_entry('name', name)
|
||||
self.__text_string = ''
|
||||
def __found_end_styles_table_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Set the state to after the styles table.
|
||||
Fix the styles. (I explain this below.)
|
||||
Print out the style table.
|
||||
"""
|
||||
self.__state = 'after_styles_table'
|
||||
self.__fix_based_on()
|
||||
self.__print_style_table()
|
||||
def __fix_based_on(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
The styles dictionary may contain a pair of key values such as
|
||||
'next-style' => '15'. I want to change the 15 to the name of the
|
||||
style. I accomplish this by simply looking up the value of 15 in
|
||||
the styles table.
|
||||
Use two loops. First, check all the paragraph styles. Then check
|
||||
all the character styles.
|
||||
The inner loop: first check 'next-style', then check 'based-on-style'.
|
||||
Make sure values exist for the keys to avoid the nasty keyerror message.
|
||||
"""
|
||||
types = ['par', 'char']
|
||||
for type in types:
|
||||
keys = self.__styles_dict[type].keys()
|
||||
for key in keys:
|
||||
styles = ['next-style', 'based-on-style']
|
||||
for style in styles:
|
||||
value = self.__styles_dict[type][key].get(style)
|
||||
if value != None:
|
||||
temp_dict = self.__styles_dict[type].get(value)
|
||||
if temp_dict:
|
||||
changed_value = self.__styles_dict[type][value].get('name')
|
||||
if changed_value:
|
||||
self.__styles_dict[type][key][style] = \
|
||||
changed_value
|
||||
else:
|
||||
if value == 0 or value == '0':
|
||||
pass
|
||||
else:
|
||||
if self.__run_level > 4:
|
||||
msg = '%s %s is based on %s\n' % (type, key, value)
msg += 'There is no style with %s\n' % value
|
||||
raise self.__bug_handler, msg
|
||||
del self.__styles_dict[type][key][style]
|
||||
def __print_style_table(self):
|
||||
"""
|
||||
Required:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function prints out the style table.
|
||||
I use three nested for loops. The outer loop prints out the
|
||||
paragraphs styles, then the character styles.
|
||||
The next loop iterates through the style numbers.
|
||||
The most inside loop iterates over the pairs of attributes and
|
||||
values, and prints them out.
|
||||
"""
|
||||
types = ['par', 'char']
|
||||
for type in types:
|
||||
if type == 'par':
|
||||
prefix = 'paragraph'
|
||||
else:
|
||||
prefix = 'character'
|
||||
self.__write_obj.write(
|
||||
'mi<tg<open______<%s-styles\n' % prefix
|
||||
)
|
||||
style_numbers = self.__styles_dict[type].keys()
|
||||
for num in style_numbers:
|
||||
self.__write_obj.write(
|
||||
'mi<tg<empty-att_<%s-style-in-table<num>%s' % (prefix, num)
|
||||
)
|
||||
attributes = self.__styles_dict[type][num].keys()
|
||||
for att in attributes:
|
||||
this_value = self.__styles_dict[type][num][att]
|
||||
self.__write_obj.write(
|
||||
'<%s>%s' % (att, this_value)
|
||||
)
|
||||
self.__write_obj.write('\n')
|
||||
self.__write_obj.write(
|
||||
'mi<tg<close_____<%s-styles\n' % prefix
|
||||
)
|
||||
def __found_styles_table_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Change the state to in the style table when the marker has been found.
|
||||
"""
|
||||
self.__state = 'in_styles_table'
|
||||
def __before_styles_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line
|
||||
Returns:
|
||||
nothing.
|
||||
Logic:
|
||||
Check the line info in the state dictionary. When the beginning of
|
||||
the styles table is found, change the state to in the styles
|
||||
table.
|
||||
"""
|
||||
action = self.__state_dict.get(self.__token_info)
|
||||
if not action:
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
action(line)
|
||||
def __in_styles_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Check the line for the beginning of an individual style. If it is
|
||||
not found, simply print out the line.
|
||||
"""
|
||||
action = self.__state_dict.get(self.__token_info)
|
||||
if action == None:
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
action(line)
|
||||
def __para_style_in_body_func(self, line, type):
|
||||
"""
|
||||
Required:
|
||||
line-- the line
|
||||
type -- whether a character or paragraph
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Determine the prefix by whether the type is "par" or "char".
|
||||
Extract the number from a line such as "cw<ss<para-style<nu<15".
|
||||
Look up that number in the styles dictionary and substitute the style name for the number.
|
||||
"""
|
||||
if type == 'par':
|
||||
prefix = 'para'
|
||||
else:
|
||||
prefix = 'char'
|
||||
num = line[20:-1]
|
||||
# may be invalid RTF--a style down below not defined above!
|
||||
try:
|
||||
value = self.__styles_dict[type][num]['name']
|
||||
except KeyError:
|
||||
value = None
|
||||
if value:
|
||||
self.__write_obj.write(
|
||||
'cw<ss<%s-style<nu<%s\n' % (prefix, value)
|
||||
)
|
||||
else:
|
||||
self.__write_obj.write(
|
||||
'cw<ss<%s_style<nu<not-defined\n' % prefix
|
||||
)
|
||||
def __after_styles_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Determine if a line with either character or paragraph style info
|
||||
has been found. If so, then use the appropriate method to parse
|
||||
the line. Otherwise, write the line to a file.
|
||||
"""
|
||||
action, type = self.__body_dict.get(self.__token_info, (None, None))
|
||||
if action:
|
||||
action(line, type)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def convert_styles(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing (changes the original file)
|
||||
Logic:
|
||||
Read one line in at a time. Determine what action to take based on
|
||||
the state. If the state is before the style table, look for the
|
||||
beginning of the style table.
|
||||
If the state is in the style table, create the style dictionary
|
||||
and print out the tags.
|
||||
If the state is after the style table, look for lines with style
|
||||
info, and substitute the number with the name of the style.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no matching state in module styles.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "styles.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
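Styles keeps its data in a nested dictionary of the shape {'par': {style_num: {att: value}}, 'char': {...}}, and __fix_based_on rewrites numeric 'next-style' and 'based-on-style' references into style names before the table is printed. A toy version of that resolution step, using invented style numbers and names rather than data from a real RTF file:

# Toy version of the based-on/next-style resolution done by Styles.__fix_based_on.
# The style numbers and names here are invented for illustration.
styles = {
    'par': {
        '0': {'name': 'Normal'},
        '15': {'name': 'Heading 1', 'based-on-style': '0', 'next-style': '0'},
    },
    'char': {},
}
for style_type in ('par', 'char'):
    for num, atts in styles[style_type].items():
        for ref in ('next-style', 'based-on-style'):
            value = atts.get(ref)
            if value is None:
                continue
            target = styles[style_type].get(value)
            if target and target.get('name'):
                atts[ref] = target['name']   # replace the number with the style name
            else:
                del atts[ref]                # dangling reference: drop it

print(styles['par']['15'])
# {'name': 'Heading 1', 'based-on-style': 'Normal', 'next-style': 'Normal'}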
|
543
src/libprs500/ebooks/rtf2xml/table.py
Executable file
@ -0,0 +1,543 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy, border_parse
|
||||
"""
|
||||
States.
|
||||
1. 'not_in_table'
|
||||
1. 'cw<tb<row-def___' start a row definition
|
||||
2. 'mi<mk<in-table__' start table
|
||||
2. 'in_table'
|
||||
1. 'mi<mk<pard-start', start of a row, cell
|
||||
2. 'mi<mk<not-in-tbl', end the table.
|
||||
3. 'cw<tb<row-def___' start a row definition
|
||||
3. in_row_definition
|
||||
1. 'mi<mk<not-in-tbl' : end the row definition. If in table, end the table.
2. 'mi<mk<pard-start' : end the row definition
|
||||
if already in the table, start a row and cell.
|
||||
3. 'cw<tb<row_______' : end the row definition, end the row
|
||||
4. 'cw...' use another method to handle the control word
|
||||
control word might be added to dictionary.
|
||||
5. 'mi<mk<in-table__' If already in table, do nothing. Otherwise
|
||||
start the table.
|
||||
4. 'in_row'
|
||||
1. 'mi<mk<pard-start', start cell
|
||||
2. 'mi<mk<not-in-tbl' end table,
|
||||
3. 'cw<tb<row_______' close row,
|
||||
5. 'in_cell'
|
||||
1. 'mi<mk<not-in-tbl', end table
|
||||
2. 'cw<tb<cell______', end cell
|
||||
"""
|
||||
class Table:
|
||||
"""
|
||||
Make tables.
|
||||
Logic:
|
||||
Read one line at a time. The default state (self.__state) is
|
||||
'not_in_table'. Look for either a 'cw<tb<in-table__', or a row definition.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
"""
|
||||
self.__state_dict = {
|
||||
'in_table': self.__in_table_func,
|
||||
'in_row_def': self.__in_row_def_func,
|
||||
'not_in_table': self.__not_in_table_func,
|
||||
'in_cell': self.__in_cell_func,
|
||||
'in_row': self.__in_row_func,
|
||||
}
|
||||
self.__not_in_table_dict = {
|
||||
'cw<tb<row-def___': self.__found_row_def_func,
|
||||
'cw<tb<in-table__': self.__start_table_func,
|
||||
'mi<mk<in-table__' : self.__start_table_func,
|
||||
}
|
||||
# can't use this dictionary. When in row_definition, many tokens
|
||||
# require multiple definitions
|
||||
self.__in_row_definition_dict = {
|
||||
'mi<mk<not-in-tbl' : self.__end_row_table_func,
|
||||
'mi<mk<pard-start' : self.__end_row_def_func,
|
||||
}
|
||||
self.__in_row_dict = {
|
||||
'mi<mk<not-in-tbl' : self.__close_table,
|
||||
'mi<mk<pard-start' : self.__start_cell_func,
|
||||
'cw<tb<row_______' : self.__end_row_func,
|
||||
'cw<tb<cell______' : self.__empty_cell,
|
||||
}
|
||||
# set the default state
|
||||
self.__state = ['not_in_table']
|
||||
# set empty data for all tables
|
||||
self.__table_data = []
|
||||
# just in case there is no table data
|
||||
self.__row_dict = {}
|
||||
self.__cell_list = []
|
||||
self.__cell_widths = []
|
||||
def __in_table_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Logic:
|
||||
Look for the end of the table. If found, close out the table.
|
||||
Look for 'mi<mk<pard-start', which marks the beginning of a row. Start
|
||||
a row and start a cell.
|
||||
"""
|
||||
# 'cell' : ('tb', 'cell______', self.default_func),
|
||||
if self.__token_info == 'mi<mk<not-in-tbl' or\
|
||||
self.__token_info == 'mi<mk<sect-start' or\
|
||||
self.__token_info == 'mi<mk<sect-close' or\
|
||||
self.__token_info == 'mi<mk<body-close':
|
||||
self.__close_table(line)
|
||||
elif self.__token_info == 'mi<mk<pard-start':
|
||||
self.__start_row_func(line)
|
||||
self.__start_cell_func(line)
|
||||
elif self.__token_info == 'cw<tb<row-def___':
|
||||
self.__found_row_def_func(line)
|
||||
elif self.__token_info == 'cw<tb<cell______':
|
||||
self.__start_row_func(line)
|
||||
self.__empty_cell( line)
|
||||
self.__write_obj.write(line)
|
||||
def __not_in_table_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- the line of text read in from document
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
The state is not in a table, so look for the two tokens that
|
||||
mark the start of a table: 'cw<tb<row-def', or 'cw<tb<in-table__'.
|
||||
If these tokens are found, use another method to start a table
|
||||
and change states. Otherwise, just output the line.
|
||||
"""
|
||||
action = self.__not_in_table_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
self.__write_obj.write(line)
|
||||
def __close_table(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
?
|
||||
Logic:
|
||||
Write the end marker for the table.
|
||||
Write the end tag for the table.
|
||||
Set the state to ['not_in_table']
|
||||
"""
|
||||
self.__write_obj.write('mi<mk<table-end_\n')
|
||||
self.__state = ['not_in_table']
|
||||
self.__table_data[-1]['number-of-columns'] = self.__max_number_cells_in_row
|
||||
self.__table_data[-1]['number-of-rows'] = self.__rows_in_table
|
||||
average_cells_in_row = self.__mode(self.__list_of_cells_in_row)
|
||||
self.__table_data[-1]['average-cells-per-row'] = average_cells_in_row
|
||||
average_cell_width = self.__mode(self.__cell_widths)
|
||||
self.__table_data[-1]['average-cell-width'] = average_cell_width
|
||||
def __found_row_def_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- not needed except for consistency with other methods.
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
A row definition has been found. Collect all the data from this
|
||||
to use later in writing attributes for the table.
|
||||
"""
|
||||
self.__state.append('in_row_def')
|
||||
self.__last_cell_position = 0
|
||||
self.__row_dict = {}
|
||||
self.__cell_list = []
|
||||
self.__cell_list.append({})
|
||||
self.__cell_widths = []
|
||||
def __start_table_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
?
|
||||
Logic:
|
||||
Add the 'in_table' to the state list.
|
||||
Write out the table marker.
|
||||
Initialize table values (not sure about these yet)
|
||||
"""
|
||||
self.__rows_in_table = 0;
|
||||
self.__cells_in_table = 0;
|
||||
self.__cells_in_row = 0;
|
||||
self.__max_number_cells_in_row = 0
|
||||
self.__table_data.append({})
|
||||
self.__list_of_cells_in_row = []
|
||||
self.__write_obj.write('mi<mk<tabl-start\n')
|
||||
self.__state.append('in_table')
|
||||
def __end_row_table_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --just for consistency
|
||||
Returns:
|
||||
?
|
||||
Logic:
|
||||
?
|
||||
"""
|
||||
self.__close_table(line)
|
||||
def __end_row_def_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --just for consistency
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
change the state.
|
||||
get rid of the last {} in the cell list
|
||||
figure out the number of cells based on the self.__row_dict[widths]
|
||||
('122, 122')
|
||||
"""
|
||||
if len(self.__state) > 0:
|
||||
if self.__state[-1] == 'in_row_def':
|
||||
self.__state.pop()
|
||||
# an empty {} is appended after each cell position; get rid of the extra one
|
||||
self.__cell_list.pop()
|
||||
widths = self.__row_dict.get('widths')
|
||||
if widths:
|
||||
width_list = widths.split(',')
|
||||
num_cells = len (width_list)
|
||||
self.__row_dict['number-of-cells'] = num_cells
|
||||
def __in_row_def_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
In the text that defines a row. If a control word is found, handle the
|
||||
control word with another method.
|
||||
Check for states that will end this state.
|
||||
While in the row definition, certain tokens can end a row or end a table.
|
||||
If a paragraph definition (pard-start) is found and you are already in
|
||||
a table, start of a row.
|
||||
"""
|
||||
if self.__token_info == 'cw<tb<row_______':
|
||||
# write tags
|
||||
self.__end_row_func(line)
|
||||
# change the state
|
||||
self.__end_row_def_func(line)
|
||||
self.__write_obj.write(line)
|
||||
elif line[0:2] == 'cw':
|
||||
self.__handle_row_token(line)
|
||||
self.__write_obj.write(line)
|
||||
elif self.__token_info == 'mi<mk<not-in-tbl' and 'in_table' in self.__state:
|
||||
self.__end_row_def_func(line)
|
||||
self.__close_table(line)
|
||||
self.__write_obj.write(line)
|
||||
elif self.__token_info == 'mi<mk<pard-start':
|
||||
self.__end_row_def_func(line)
|
||||
# if already in the table, start a row, then cell.
|
||||
if len(self.__state) > 0 and self.__state[-1] == 'in_table':
|
||||
self.__start_row_func(line)
|
||||
self.__start_cell_func(line)
|
||||
self.__write_obj.write(line)
|
||||
elif self.__token_info == 'mi<mk<in-table__':
|
||||
self.__end_row_def_func(line)
|
||||
# if not in table, start a new table
|
||||
if len(self.__state) > 0 and self.__state[-1] != 'in_table':
|
||||
self.__start_table_func(line)
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __handle_row_token(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
?
|
||||
Logic:
|
||||
the tokens in the row definition contain the following information:
|
||||
1. row borders.
|
||||
2. cell borders for all cells in the row.
|
||||
3. cell positions for all cells in the row.
|
||||
Put all information about row borders into a row dictionary.
|
||||
Put all information about cell borders into the dictionary in
|
||||
the last item in the cell list. ([{border:something, width:something},
|
||||
{border:something, width:something}])
|
||||
cw<bd<bor-t-r-to<nu<bdr-hair__|bdr-li-wid:0.50
|
||||
"""
|
||||
if line[3:5] == 'bd':
|
||||
border_obj = border_parse.BorderParse()
|
||||
the_dict = border_obj.parse_border(line)
|
||||
keys = the_dict.keys()
|
||||
# border-cell-top-hairline
|
||||
in_cell = 0
|
||||
for key in keys:
|
||||
if key[0:11] == 'border-cell':
|
||||
in_cell = 1
|
||||
for key in keys:
|
||||
if in_cell:
|
||||
self.__cell_list[-1][key] = the_dict[key]
|
||||
else:
|
||||
self.__row_dict[key] = the_dict[key]
|
||||
# cw<tb<cell-posit<nu<216.00
|
||||
elif self.__token_info == 'cw<tb<cell-posit':
|
||||
self.__found_cell_position(line)
|
||||
# cw<tb<row-pos-le<nu<-5.40
|
||||
elif self.__token_info == 'cw<tb<row-pos-le':
|
||||
position = line[20:-1]
|
||||
self.__row_dict['left-row-position'] = position
|
||||
elif self.__token_info == 'cw<tb<row-header':
|
||||
self.__row_dict['header'] = 'true'
|
||||
def __start_cell_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- the line of text
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Append 'in_cell' to the state list.
If self.__cell_list contains dictionaries, take the first dictionary,
write its key => value pairs as attributes, and pop it from the list.
Otherwise, print out a plain cell tag.
|
||||
"""
|
||||
self.__state.append('in_cell')
|
||||
# self.__cell_list = []
|
||||
if len(self.__cell_list) > 0:
|
||||
self.__write_obj.write('mi<tg<open-att__<cell')
|
||||
# cell_dict = self.__cell_list[-1]
|
||||
cell_dict = self.__cell_list[0]
|
||||
keys = cell_dict.keys()
|
||||
for key in keys:
|
||||
self.__write_obj.write('<%s>%s' % (key, cell_dict[key]))
|
||||
self.__write_obj.write('\n')
|
||||
# self.__cell_list.pop()
|
||||
self.__cell_list.pop(0)
|
||||
# self.__cell_list = self.__cell_list[1:]
|
||||
else:
|
||||
self.__write_obj.write('mi<tg<open______<cell\n')
|
||||
self.__cells_in_table += 1
|
||||
self.__cells_in_row += 1
|
||||
def __start_row_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- the line of text
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Append 'in_row' to the state list.
Write the row dictionary's key => value pairs as attributes.
|
||||
"""
|
||||
self.__state.append('in_row')
|
||||
self.__write_obj.write('mi<tg<open-att__<row')
|
||||
keys = self.__row_dict.keys()
|
||||
for key in keys:
|
||||
self.__write_obj.write('<%s>%s' % (key, self.__row_dict[key]))
|
||||
self.__write_obj.write('\n')
|
||||
self.__cells_in_row = 0
|
||||
self.__rows_in_table += 1
|
||||
def __found_cell_position(self, line):
|
||||
"""
|
||||
needs:
|
||||
line: current line
|
||||
returns:
|
||||
nothing
|
||||
logic:
|
||||
Calculate the cell width.
|
||||
If the cell is the first cell, you should add the left cell position to it.
|
||||
(This value is often negative.)
|
||||
Next, set the new last_cell_position to the current cell position.
|
||||
"""
|
||||
# cw<tb<cell-posit<nu<216.00
|
||||
new_cell_position = round(float(line[20:-1]), 2)
|
||||
left_position = 0
|
||||
if self.__last_cell_position == 0:
|
||||
left_position = self.__row_dict.get('left-row-position', 0)
|
||||
left_position = float(left_position)
|
||||
width = new_cell_position - self.__last_cell_position - left_position
|
||||
# width = round(width, 2)
|
||||
width = str('%.2f' % width)
|
||||
self.__last_cell_position = new_cell_position
|
||||
widths_exists = self.__row_dict.get('widths')
|
||||
if widths_exists:
|
||||
self.__row_dict['widths'] += ', %s' % str(width)
|
||||
else:
|
||||
self.__row_dict['widths'] = str(width)
|
||||
self.__cell_list[-1]['width'] = width
|
||||
self.__cell_list.append({})
|
||||
self.__cell_widths.append(width)
|
||||
def __in_cell_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
In the middle of a cell.
|
||||
Look for the close of the table. If found, use the close table function to close
|
||||
the table.
|
||||
Look for the close of the cell. If found, use the close cell function to close out
|
||||
the cell.
|
||||
Otherwise, print out the line.
|
||||
"""
|
||||
# cw<tb<cell______<nu<true
|
||||
# mi<mk<sect-start
|
||||
if self.__token_info == 'mi<mk<not-in-tbl' or\
|
||||
self.__token_info == 'mi<mk<sect-start' or\
|
||||
self.__token_info == 'mi<mk<sect-close' or\
|
||||
self.__token_info == 'mi<mk<body-close':
|
||||
self.__end_cell_func(line)
|
||||
self.__end_row_func(line)
|
||||
self.__close_table(line)
|
||||
self.__write_obj.write(line)
|
||||
elif self.__token_info == 'cw<tb<cell______':
|
||||
self.__end_cell_func(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
def __end_cell_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
End the cell. Print out the closing marks. Pop the self.__state.
|
||||
"""
|
||||
if len(self.__state) > 1:
|
||||
if self.__state[-1] == 'in_cell':
|
||||
self.__state.pop()
|
||||
self.__write_obj.write('mi<mk<close_cell\n')
|
||||
self.__write_obj.write('mi<tg<close_____<cell\n')
|
||||
self.__write_obj.write('mi<mk<closecell_\n')
|
||||
def __in_row_func(self, line):
|
||||
if self.__token_info == 'mi<mk<not-in-tbl' or\
|
||||
self.__token_info == 'mi<mk<sect-start' or\
|
||||
self.__token_info == 'mi<mk<sect-close' or\
|
||||
self.__token_info == 'mi<mk<body-close':
|
||||
self.__end_row_func(line)
|
||||
self.__close_table(line)
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
action = self.__in_row_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
self.__write_obj.write(line)
|
||||
"""
|
||||
elif self.__token_info == 'mi<mk<pard-start':
|
||||
self.__start_cell_func(line)
|
||||
self.__write_obj.write(line)
|
||||
elif self.__token_info == 'cw<tb<row_______':
|
||||
self.__end_row_func(line)
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
"""
|
||||
def __end_row_func(self, line):
|
||||
"""
|
||||
"""
|
||||
if len(self.__state) > 1 and self.__state[-1] == 'in_row':
|
||||
self.__state.pop()
|
||||
self.__write_obj.write('mi<tg<close_____<row\n')
|
||||
else:
|
||||
self.__write_obj.write('mi<tg<empty_____<row\n')
|
||||
self.__rows_in_table += 1
|
||||
if self.__cells_in_row > self.__max_number_cells_in_row:
|
||||
self.__max_number_cells_in_row = self.__cells_in_row
|
||||
self.__list_of_cells_in_row.append(self.__cells_in_row)
|
||||
def __empty_cell(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line of text
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Write an empty tag with attributes if there are attributes.
|
||||
Otherwise, write an empty tag with cell as the element.
|
||||
"""
|
||||
if len(self.__cell_list) > 0:
|
||||
self.__write_obj.write('mi<tg<empty-att_<cell')
|
||||
cell_dict = self.__cell_list[-1]
|
||||
keys = cell_dict.keys()
|
||||
for key in keys:
|
||||
self.__write_obj.write('<%s>%s' % (key, cell_dict[key]))
|
||||
self.__write_obj.write('\n')
|
||||
else:
|
||||
self.__write_obj.write('mi<tg<empty_____<cell\n')
|
||||
self.__cells_in_table += 1
|
||||
self.__cells_in_row += 1
|
||||
def __mode(self, the_list):
|
||||
"""
|
||||
Required:
|
||||
the_list -- a list of something
|
||||
Returns:
|
||||
the number that occurs the most
|
||||
Logic:
|
||||
Get the count of each item in the list. The item with the greatest
count is the mode.
|
||||
"""
|
||||
max = 0
|
||||
mode = 'not-defined'
|
||||
for item in the_list:
|
||||
num_of_values = the_list.count(item)
|
||||
if num_of_values > max:
|
||||
mode = item
|
||||
max = num_of_values
|
||||
return mode
|
||||
def make_table(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
A dictionary of values for the beginning of the table.
|
||||
Logic:
|
||||
Read one line in at a time. Determine what action to take based on
|
||||
the state.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state[-1])
|
||||
# print self.__state[-1]
|
||||
if action == None:
|
||||
sys.stderr.write('No matching state in module table.py\n')
|
||||
sys.stderr.write(self.__state[-1] + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "table.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
return self.__table_data
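Cell widths in Table.__found_cell_position are derived from the cumulative cell positions: each width is the difference between the current position and the previous one, with the (often negative) left row position folded into the first cell. A small self-contained illustration, using made-up positions:

# Deriving cell widths from cumulative cell positions, mirroring
# Table.__found_cell_position. The positions below are invented.
def cell_widths(positions, left_row_position=0.0):
    widths = []
    last = 0.0
    for pos in positions:
        left = left_row_position if last == 0.0 else 0.0
        widths.append(round(pos - last - left, 2))
        last = pos
    return widths

print(cell_widths([108.0, 216.0, 324.0], left_row_position=-5.4))
# [113.4, 108.0, 108.0]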
|
85
src/libprs500/ebooks/rtf2xml/table_info.py
Executable file
@ -0,0 +1,85 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
# note to self. This is the first module in which I use tempfile. A good idea?
|
||||
"""
|
||||
"""
|
||||
class TableInfo:
|
||||
"""
|
||||
Insert table data for tables.
|
||||
Logic:
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
table_data,
|
||||
copy=None,
|
||||
run_level = 1,):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
'table_data' -- a dictionary for each table.
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__table_data = table_data
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
# self.__write_to = 'table_info.data'
|
||||
def insert_info(self):
|
||||
"""
|
||||
"""
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
if line == 'mi<mk<tabl-start\n':
|
||||
if len(self.__table_data) > 0:
|
||||
table_dict = self.__table_data[0]
|
||||
self.__write_obj.write('mi<tg<open-att__<table')
|
||||
keys = table_dict.keys()
|
||||
for key in keys:
|
||||
self.__write_obj.write('<%s>%s' % (key, table_dict[key]))
|
||||
self.__write_obj.write('\n')
|
||||
self.__table_data = self.__table_data[1:]
|
||||
else:
|
||||
# this shouldn't happen!
|
||||
if self.__run_level > 3:
|
||||
msg = 'Not enough data for each table\n'
|
||||
raise self.__bug_handler, msg
|
||||
self.__write_obj.write('mi<tg<open______<table\n')
|
||||
elif line == 'mi<mk<table-end_\n':
|
||||
self.__write_obj.write('mi<tg<close_____<table\n')
|
||||
self.__write_obj.write(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "table_info.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
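TableInfo consumes the per-table dictionaries returned by Table.make_table in order: each time a table-start marker is read, the first dictionary is written out as attributes and removed from the list. A toy rendering of that pairing, with the attribute values invented:

# Toy pairing of table-start markers with queued table data, as in
# TableInfo.insert_info. The attribute values are illustrative only.
table_data = [{'number-of-rows': 3, 'number-of-columns': 2}]
out = []
for line in ['mi<mk<tabl-start\n', 'mi<tg<open______<row\n', 'mi<mk<table-end_\n']:
    if line == 'mi<mk<tabl-start\n' and table_data:
        atts = table_data.pop(0)
        out.append('mi<tg<open-att__<table' +
                   ''.join('<%s>%s' % (k, v) for k, v in sorted(atts.items())) + '\n')
    elif line == 'mi<mk<table-end_\n':
        out.append('mi<tg<close_____<table\n')
    out.append(line)

print(''.join(out))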
|
116
src/libprs500/ebooks/rtf2xml/tokenize.py
Executable file
@ -0,0 +1,116 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, re, tempfile
|
||||
from libprs500.ebooks.rtf2xml import copy
|
||||
class Tokenize:
|
||||
"""Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
):
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__special_tokens = [ '_', '~', "'", '{', '}' ]
|
||||
self.__write_to = tempfile.mktemp()
|
||||
def __from_ms_to_utf8(self,match_obj):
|
||||
uni_char = int(match_obj.group(1))
|
||||
if uni_char < 0:
|
||||
uni_char += 65536
|
||||
return '&#x' + str('%X' % uni_char) + ';'
|
||||
def __neg_unicode_func(self, match_obj):
|
||||
neg_uni_char = int(match_obj.group(1)) * -1
|
||||
# sys.stderr.write(str( neg_uni_char))
|
||||
uni_char = neg_uni_char + 65536
|
||||
return '&#x' + str('%X' % uni_char) + ';'
|
||||
def __sub_line_reg(self,line):
|
||||
line = line.replace("\\\\", "\\backslash ")
|
||||
line = line.replace("\\~", "\\~ ")
|
||||
line = line.replace("\\;", "\\; ")
|
||||
line = line.replace("&", "&")
|
||||
line = line.replace("<", "<")
|
||||
line = line.replace(">", ">")
|
||||
line = line.replace("\\~", "\\~ ")
|
||||
line = line.replace("\\_", "\\_ ")
|
||||
line = line.replace("\\:", "\\: ")
|
||||
line = line.replace("\\-", "\\- ")
|
||||
# turn into a generic token to eliminate special
|
||||
# cases and make processing easier
|
||||
line = line.replace("\\{", "\\ob ")
|
||||
# turn into a generic token to eliminate special
|
||||
# cases and make processing easier
|
||||
line = line.replace("\\}", "\\cb ")
|
||||
# put a backslash in front of to eliminate special cases and
|
||||
# make processing easier
|
||||
line = line.replace("{", "\\{")
|
||||
# put a backslash in front of to eliminate special cases and
|
||||
# make processing easier
|
||||
line = line.replace("}", "\\}")
|
||||
line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
|
||||
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
|
||||
line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line)
|
||||
##line = line.replace("\\backslash", "\\\\")
|
||||
# this is for older RTF
|
||||
line = re.sub(self.__par_exp, '\\par ', line)
|
||||
return line
|
||||
def __compile_expressions(self):
|
||||
self.__ms_hex_exp = re.compile(r"\\\'(..)")
|
||||
self.__utf_exp = re.compile(r"\\u(-?\d{3,6})")
|
||||
self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
|
||||
self.__par_exp = re.compile(r'\\$')
|
||||
self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
|
||||
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
|
||||
def __create_tokens(self):
|
||||
self.__compile_expressions()
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = "dummy"
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
line = line.replace("\n", "")
|
||||
line = self.__sub_line_reg(line)
|
||||
tokens = re.split(self.__splitexp, line)
|
||||
##print tokens
|
||||
for token in tokens:
|
||||
if token != "":
|
||||
write_obj.write(token + "\n")
|
||||
"""
|
||||
match_obj = re.search(self.__mixed_exp, token)
|
||||
if match_obj != None:
|
||||
first = match_obj.group(1)
|
||||
second = match_obj.group(2)
|
||||
write_obj.write(first + "\n")
|
||||
write_obj.write(second + "\n")
|
||||
else:
|
||||
write_obj.write(token + "\n")
|
||||
"""
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
def tokenize(self):
|
||||
"""Main class for handling other methods. Reads in one line \
|
||||
at a time, usues method self.sub_line to make basic substitutions,\
|
||||
uses ? to process tokens"""
|
||||
self.__create_tokens()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "tokenize.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
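The tokenizer's split expression breaks each substituted line into one token per output line: control words with their optional trailing space, group delimiters, and runs of plain text. A quick demonstration of that expression on a tiny hand-written RTF fragment (not taken from a real document):

# Demonstration of the token-splitting regular expression used by Tokenize.
# The RTF fragment is hand-written for illustration.
import re

splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
fragment = r"{\b bold text} plain"
tokens = [t for t in re.split(splitexp, fragment) if t]
print(tokens)   # ['{', '\\b ', 'bold text', '}', ' plain']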
|
@ -93,7 +93,7 @@ class Distribution(object):
|
||||
self.command = cmd.strip()
|
||||
if os == 'debian':
|
||||
self.command += '\n'+prefix + 'cp -R /usr/share/pycentral/fonttools/site-packages/FontTools* /usr/lib/python2.5/site-packages/'
|
||||
self.command += '\n'+prefix+'easy_install -U TTFQuery libprs500 \n'+prefix+'easy_install -f http://sourceforge.net/project/showfiles.php?group_id=68617 rtf2xml\n'+prefix+'libprs500_postinstall'
|
||||
self.command += '\n'+prefix+'easy_install -U TTFQuery libprs500 \n'+prefix+'libprs500_postinstall'
|
||||
try:
|
||||
self.manual = Markup(self.MANUAL_MAP[os])
|
||||
except KeyError:
|
||||
|