RTFInput: Cleanup & small improvements

This commit is contained in:
Sengian 2012-05-05 15:34:00 +02:00
parent bd5e6585ff
commit 60b53045e4
7 changed files with 298 additions and 264 deletions

View File

@ -372,8 +372,8 @@ class ParseRtf:
old_rtf = old_rtf_obj.check_if_old_rtf()
if old_rtf:
if self.__run_level > 5:
msg = 'Older RTF\n'
msg += 'self.__run_level is "%s"\n' % self.__run_level
msg = 'Older RTF\n' \
'self.__run_level is "%s"\n' % self.__run_level
raise RtfInvalidCodeException, msg
if self.__run_level > 1:
sys.stderr.write('File could be older RTF...\n')
@ -381,7 +381,7 @@ class ParseRtf:
if self.__run_level > 1:
sys.stderr.write(
'File also has newer RTF.\n'
'Will do the best to convert.\n'
'Will do the best to convert...\n'
)
add_brackets_obj = add_brackets.AddBrackets(
in_file = self.__temp_file,

View File

@ -20,6 +20,9 @@ class AddBrackets:
"""
Add brackets for old RTF.
Logic:
When control words that are in the list of allowed words are
encountered without their own brackets, this adds brackets around
them to make later processing of the file easier
"""
def __init__(self, in_file,
bug_handler,
@ -41,26 +44,17 @@ class AddBrackets:
self.__copy = copy
self.__write_to = better_mktemp()
self.__run_level = run_level
def __initiate_values(self):
"""
"""
self.__state_dict = {
'before_body' : self.__before_body_func,
'in_body' : self.__in_body_func,
'after_control_word' : self.__after_control_word_func,
'in_ignore' : self.__ignore_func,
}
self.__state = 'before_body'
self.__inline = {}
self.__temp_group = []
self.__open_bracket = False
self.__found_brackets = False
self.__accept = [
'cw<ci<bold______' ,
'cw<ci<annotation' ,
'cw<ci<blue______' ,
'cw<ci<bold______' ,
# 'cw<ci<bold______' ,
'cw<ci<caps______' ,
'cw<ci<char-style' ,
'cw<ci<dbl-strike' ,
@ -86,6 +80,16 @@ class AddBrackets:
# 'cw<ul<underlined' ,
]
def __initiate_values(self):
"""
"""
self.__state = 'before_body'
self.__inline = {}
self.__temp_group = []
self.__open_bracket = False
self.__found_brackets = False
def __before_body_func(self, line):
"""
"""

View File

@ -11,6 +11,7 @@
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
@ -31,29 +32,29 @@ class Header:
self.__bug_handler = bug_handler
self.__copy = copy
self.__write_to = better_mktemp()
self.__found_a_header = 0
self.__found_a_header = False
def __in_header_func(self, line):
"""
Handle all tokens that are part of header
"""
if self.__cb_count == self.__header_bracket_count:
self.__in_header = 0
self.__in_header = False
self.__write_obj.write(line)
self.__write_to_head_obj.write(
'mi<mk<head___clo\n')
self.__write_to_head_obj.write(
'mi<tg<close_____<header-or-footer\n')
self.__write_to_head_obj.write(
'mi<mk<head___clo\n' \
'mi<tg<close_____<header-or-footer\n' \
'mi<mk<header-clo\n')
else:
self.__write_to_head_obj.write(line)
def __found_header(self, line):
"""
Found a header
"""
# but this could be header or footer
self.__found_a_header = 1
self.__in_header = 1
self.__found_a_header = True
self.__in_header = True
self.__header_count += 1
# temporarily set this to zero so I can enter loop
self.__cb_count = 0
@ -69,18 +70,23 @@ class Header:
'mi<tg<open-att__<header-or-footer<type>%s\n' % (type)
)
else:
sys.stderr.write('module is header\n')
sys.stderr.write('method is __found_header\n')
sys.stderr.write('no dict entry\n')
sys.stderr.write('line is %s' % line)
sys.stderr.write(
'module is header\n' \
'method is __found_header\n' \
'no dict entry\n' \
'line is %s' % line)
self.__write_to_head_obj.write(
'mi<tg<open-att__<header-or-footer<type>none\n'
)
def __default_sep(self, line):
"""Handle all tokens that are not header tokens"""
"""
Handle all tokens that are not header tokens
"""
if self.__token_info[3:5] == 'hf':
self.__found_header(line)
self.__write_obj.write(line)
def __initiate_sep_values(self):
"""
initiate counters for separate_footnotes method.
@ -89,7 +95,7 @@ class Header:
self.__ob_count = 0
self.__cb_count = 0
self.__header_bracket_count = 0
self.__in_header = 0
self.__in_header = False
self.__header_count = 0
self.__head_dict = {
'head-left_' : ('header-left'),
@ -101,6 +107,7 @@ class Header:
'header____' : ('header' ),
'footer____' : ('footer' ),
}
def separate_headers(self):
"""
Separate all the footnotes in an RTF file and put them at the bottom,
@ -110,14 +117,11 @@ class Header:
bottom of the main file.
"""
self.__initiate_sep_values()
read_obj = open(self.__file)
self.__write_obj = open(self.__write_to, 'w')
self.__header_holder = better_mktemp()
self.__write_to_head_obj = open(self.__header_holder, 'w')
line_to_read = 1
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
with open(self.__file) as read_obj:
with open(self.__write_to, 'w') as self.__write_obj:
with open(self.__header_holder, 'w') as self.__write_to_head_obj:
for line in read_obj:
self.__token_info = line[:16]
# keep track of opening and closing brackets
if self.__token_info == 'ob<nu<open-brack':
@ -130,33 +134,30 @@ class Header:
# not in the middle of footnote text
else:
self.__default_sep(line)
self.__write_obj.close()
read_obj.close()
self.__write_to_head_obj.close()
read_obj = open(self.__header_holder, 'r')
write_obj = open(self.__write_to, 'a')
with open(self.__header_holder, 'r') as read_obj:
with open(self.__write_to, 'a') as write_obj:
write_obj.write(
'mi<mk<header-beg\n')
line = 1
while line:
line = read_obj.readline()
for line in read_obj:
write_obj.write(line)
write_obj.write(
'mi<mk<header-end\n')
read_obj.close()
write_obj.close()
os.remove(self.__header_holder)
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "header_separate.info")
copy_obj.copy_file(self.__write_to, "header_separate.data")
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)
def update_info(self, file, copy):
"""
Unused method
"""
self.__file = file
self.__copy = copy
def __get_head_body_func(self, line):
"""
Process lines in main body and look for beginning of headers.
@ -166,6 +167,7 @@ class Header:
self.__state = 'head'
else:
self.__write_obj.write(line)
def __get_head_head_func(self, line):
"""
Copy headers and footers from bottom of file to a separate, temporary file.
@ -174,6 +176,7 @@ class Header:
self.__state = 'body'
else:
self.__write_to_head_obj.write(line)
def __get_headers(self):
"""
Private method to remove footnotes from main file. Read one line from
@ -182,21 +185,16 @@ class Header:
These two functions do the work of separating the footnotes from the
body.
"""
read_obj = open(self.__file)
self.__write_obj = open(self.__write_to, 'w')
# self.__write_to = "footnote_info.data"
self.__write_to_head_obj = open(self.__header_holder, 'w')
line = 1
while line:
line = read_obj.readline()
with open(self.__file) as read_obj:
with open(self.__write_to, 'w') as self.__write_obj:
with open(self.__header_holder, 'w') as self.__write_to_head_obj:
for line in read_obj:
self.__token_info = line[:16]
if self.__state == 'body':
self.__get_head_body_func(line)
elif self.__state == 'head':
self.__get_head_head_func(line)
read_obj.close()
self.__write_obj.close()
self.__write_to_head_obj.close()
def __get_head_from_temp(self, num):
"""
Private method for joining headers and footers to body. This method
@ -205,18 +203,17 @@ class Header:
returns them as a string.
"""
look_for = 'mi<mk<header-ope<' + num + '\n'
found_head = 0
found_head = False
string_to_return = ''
line = 1
while line:
line = self.__read_from_head_obj.readline()
for line in self.__read_from_head_obj:
if found_head:
if line == 'mi<mk<header-clo\n':
return string_to_return
string_to_return = string_to_return + line
string_to_return += line
else:
if line == look_for:
found_head = 1
found_head = True
def __join_from_temp(self):
"""
Private method for rejoining footnotes to body. Read from the
@ -227,15 +224,13 @@ class Header:
If no footnote marker is found, simply print out the token (line).
"""
self.__read_from_head_obj = open(self.__header_holder, 'r')
read_obj = open(self.__write_to, 'r')
self.__write_obj = open(self.__write_to2, 'w')
line = 1
while line:
line = read_obj.readline()
with open(self.__write_to, 'r') as read_obj:
for line in read_obj:
if line[:16] == 'mi<mk<header-ind':
line = self.__get_head_from_temp(line[17:-1])
self.__write_obj.write(line)
read_obj.close()
def join_headers(self):
"""
Join the footnotes from the bottom of the file and put them in their

View File

@ -11,14 +11,18 @@
# #
#########################################################################
import sys
"""
"""
class OldRtf:
"""
Check to see if the RTF is an older version
Logic:
If allowable control word/properties happen in text without being enclosed
in brackets the file will be considered old rtf
"""
def __init__(self, in_file, bug_handler, run_level ):
def __init__(self, in_file,
bug_handler,
run_level,
):
"""
Required:
'file'--file to parse
@ -32,11 +36,7 @@ class OldRtf:
"""
self.__file = in_file
self.__bug_handler = bug_handler
self.__initiate_values()
self.__ob_group = 0
def __initiate_values(self):
self.__previous_token = ''
self.__new_found = 0
self.__run_level = run_level
self.__allowable = [
'annotation' ,
'blue______' ,
@ -64,14 +64,18 @@ class OldRtf:
'superscrip' ,
'underlined' ,
]
self.__state = 'before_body'
self.__action_dict = {
'before_body' : self.__before_body_func,
'in_body' : self.__check_tokens_func,
'after_pard' : self.__after_pard_func,
}
self.__is_old = 0
def __initiate_values(self):
self.__previous_token = ''
self.__state = 'before_body'
self.__found_new = 0
self.__ob_group = 0
def __check_tokens_func(self, line):
if self.__inline_info in self.__allowable:
if self.__ob_group == self.__base_ob_count:
@ -80,32 +84,32 @@ class OldRtf:
self.__found_new += 1
elif self.__token_info == 'cw<pf<par-def___':
self.__state = 'after_pard'
def __before_body_func(self, line):
if self.__token_info == 'mi<mk<body-open_':
self.__state = 'in_body'
self.__base_ob_count = self.__ob_group
def __after_pard_func(self, line):
if line[0:2] != 'cw':
self.__state = 'in_body'
def check_if_old_rtf(self):
"""
Requires:
nothing
Returns:
1 if file is older RTf
0 if file is newer RTF
True if file is older RTF
False if file is newer RTF
"""
read_obj = open(self.__file, 'r')
line = 1
self.__initiate_values()
line_num = 0
while line:
line = read_obj.readline()
with open(self.__file, 'r') as read_obj:
for line in read_obj:
line_num += 1
self.__token_info = line[:16]
if self.__token_info == 'mi<mk<body-close':
return 0
self.__ob_group = 0
return False
if self.__token_info == 'ob<nu<open-brack':
self.__ob_group += 1
self.__ob_count = line[-5:-1]
@ -114,14 +118,22 @@ class OldRtf:
self.__cb_count = line[-5:-1]
self.__inline_info = line[6:16]
if self.__state == 'after_body':
return 0
return False
action = self.__action_dict.get(self.__state)
if not action:
sys.stderr.write('No action for state!\n')
if action is None:
try:
sys.stderr.write('No action for this state!\n')
except:
pass
result = action(line)
if result == 'new_rtf':
return 0
return False
elif result == 'old_rtf':
return 1
if self.__run_level > 3:
sys.stderr.write(
'Old rtf construction %s (bracket %s, line %s)\n'
% (self.__inline_info, str(self.__ob_group), line_num)
)
return True
self.__previous_token = line[6:16]
return 0
return False

View File

@ -11,6 +11,7 @@
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
@ -34,7 +35,7 @@ a blank paragraph.
Once a paragraph is found, the state changes to 'paragraph.' In this state,
clues are looked to for the end of a paragraph. The end of a paragraph marker
(\par) marks the end of a paragraph. So does the end of a footnote or heading;
a paragraph definintion; the end of a field-block; and the beginning of a
a paragraph definition; the end of a field-block; and the beginning of a
section. (How about the end of a section or the end of a field-block?)
"""
def __init__(self,
@ -60,6 +61,7 @@ section. (How about the end of a section or the end of a field-block?)
self.__write_empty_para = write_empty_para
self.__run_level = run_level
self.__write_to = better_mktemp()
def __initiate_values(self):
"""
Initiate all values.
@ -99,6 +101,7 @@ section. (How about the end of a section or the end of a field-block?)
'mi<mk<pict-start' : self.__start_para_func,
'cw<pf<page-break' : self.__empty_pgbk_func, # page break
}
def __before_body_func(self, line):
"""
Required:
@ -112,6 +115,7 @@ section. (How about the end of a section or the end of a field-block?)
if self.__token_info == 'mi<mk<body-open_':
self.__state = 'not_paragraph'
self.__write_obj.write(line)
def __not_paragraph_func(self, line):
"""
Required:
@ -127,6 +131,7 @@ section. (How about the end of a section or the end of a field-block?)
if action:
action(line)
self.__write_obj.write(line)
def __paragraph_func(self, line):
"""
Required:
@ -144,6 +149,7 @@ section. (How about the end of a section or the end of a field-block?)
action(line)
else:
self.__write_obj.write(line)
def __start_para_func(self, line):
"""
Requires:
@ -160,6 +166,7 @@ section. (How about the end of a section or the end of a field-block?)
)
self.__write_obj.write(self.__start2_marker)
self.__state = 'paragraph'
def __empty_para_func(self, line):
"""
Requires:
@ -176,6 +183,7 @@ section. (How about the end of a section or the end of a field-block?)
'mi<tg<empty_____<para\n'
)
self.__write_obj.write(self.__end_marker) # marker for later parsing
def __empty_pgbk_func(self, line):
"""
Requires:
@ -188,6 +196,7 @@ section. (How about the end of a section or the end of a field-block?)
self.__write_obj.write(
'mi<tg<empty_____<page-break\n'
)
def __close_para_func(self, line):
"""
Requires:
@ -205,6 +214,7 @@ section. (How about the end of a section or the end of a field-block?)
self.__write_obj.write(self.__end_marker) # marker for later parser
self.__write_obj.write(line)
self.__state = 'not_paragraph'
def __bogus_para__def_func(self, line):
"""
Requires:
@ -215,6 +225,7 @@ section. (How about the end of a section or the end of a field-block?)
if a \pard occurs in a paragraph, I want to ignore it. (I believe)
"""
self.__write_obj.write('mi<mk<bogus-pard\n')
def make_paragraphs(self):
"""
Requires:
@ -229,20 +240,18 @@ section. (How about the end of a section or the end of a field-block?)
only other state is 'paragraph'.
"""
self.__initiate_values()
read_obj = open(self.__file, 'r')
self.__write_obj = open(self.__write_to, 'w')
line_to_read = 1
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as self.__write_obj:
for line in read_obj:
self.__token_info = line[:16]
action = self.__state_dict.get(self.__state)
if action == None:
sys.stderr.write('no no matching state in module sections.py\n')
if action is None:
try:
sys.stderr.write('no matching state in module paragraphs.py\n')
sys.stderr.write(self.__state + '\n')
except:
pass
action(line)
read_obj.close()
self.__write_obj.close()
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "paragraphs.data")

View File

@ -11,7 +11,9 @@
# #
#########################################################################
import sys,os
from calibre.ebooks.rtf2xml import copy
class Preamble:
"""
Fix the remaining parts of the preamble. This module does very little. It
@ -19,8 +21,14 @@ class Preamble:
future, when I understand how to interpret the revision table and list
table, I will make these methods more functional.
"""
def __init__(self, file, bug_handler, platform, default_font, code_page,
copy=None, temp_dir=None):
def __init__(self, file,
bug_handler,
platform,
default_font,
code_page,
copy=None,
temp_dir=None,
):
"""
Required:
file--file to parse
@ -44,6 +52,7 @@ class Preamble:
self.__write_to = os.path.join(temp_dir,"info_table_info.data")
else:
self.__write_to = "info_table_info.data"
def __initiate_values(self):
"""
Initiate all values.
@ -62,12 +71,14 @@ class Preamble:
'mi<mk<revtbl-beg' : self.__found_revision_table_func,
'mi<mk<body-open_' : self.__found_body_func,
}
def __default_func(self, line):
action = self.__default_dict.get(self.__token_info)
if action:
action(line)
else:
self.__write_obj.write(line)
def __found_rtf_head_func(self, line):
"""
Requires:
@ -84,8 +95,10 @@ class Preamble:
'<platform>%s\n' % (self.__default_font, self.__code_page,
self.__platform)
)
def __found_list_table_func(self, line):
self.__state = 'list_table'
def __list_table_func(self, line):
if self.__token_info == 'mi<mk<listabend_':
self.__state = 'default'
@ -93,8 +106,10 @@ class Preamble:
pass
else:
self.__write_obj.write(line)
def __found_revision_table_func(self, line):
self.__state = 'revision'
def __revision_table_func(self, line):
if self.__token_info == 'mi<mk<revtbl-end':
self.__state = 'default'
@ -102,11 +117,14 @@ class Preamble:
pass
else:
self.__write_obj.write(line)
def __found_body_func(self, line):
self.__state = 'body'
self.__write_obj.write(line)
def __body_func(self, line):
self.__write_obj.write(line)
def fix_preamble(self):
"""
Requires:
@ -119,20 +137,15 @@ class Preamble:
the list table.
"""
self.__initiate_values()
read_obj = open(self.__file, 'r')
self.__write_obj = open(self.__write_to, 'w')
line_to_read = 1
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as self.__write_obj:
for line in read_obj:
self.__token_info = line[:16]
action = self.__state_dict.get(self.__state)
if action == None:
sys.stderr.write('no no matching state in module preamble_rest.py\n')
sys.stderr.write(self.__state + '\n')
if action is None:
sys.stderr.write(
'no matching state in module preamble_rest.py\n' + self.__state + '\n')
action(line)
read_obj.close()
self.__write_obj.close()
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "preamble_div.data")

View File

@ -11,6 +11,7 @@
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
@ -33,7 +34,7 @@ tokens and write the section tags.
The exception to this method occurs when sections occur in field blocks, such
as the index. Normally, two section break occur within the index and other
field-blocks. (If fewer or more section breaks occur, this code may not work.)
I want the sections to occurr outside of the index. That is, the index
I want the sections to occur outside of the index. That is, the index
should be nested inside one section tag. After the index is complete, a new
section should begin.
In order to write the sections outside of the field blocks, I have to store