RTF Input: Handle old RTF files that have commands without braces. Fixes #994133 (Private bug)

This commit is contained in:
Kovid Goyal 2012-05-13 07:55:09 +05:30
commit fb94b02be3
11 changed files with 14057 additions and 381 deletions

View File

@ -372,8 +372,8 @@ class ParseRtf:
old_rtf = old_rtf_obj.check_if_old_rtf()
if old_rtf:
if self.__run_level > 5:
msg = 'Older RTF\n'
msg += 'self.__run_level is "%s"\n' % self.__run_level
msg = 'Older RTF\n' \
'self.__run_level is "%s"\n' % self.__run_level
raise RtfInvalidCodeException, msg
if self.__run_level > 1:
sys.stderr.write('File could be older RTF...\n')
@ -381,7 +381,7 @@ class ParseRtf:
if self.__run_level > 1:
sys.stderr.write(
'File also has newer RTF.\n'
'Will do the best to convert.\n'
'Will do the best to convert...\n'
)
add_brackets_obj = add_brackets.AddBrackets(
in_file = self.__temp_file,

View File

@ -20,6 +20,9 @@ class AddBrackets:
"""
Add brackets for old RTF.
Logic:
When control words without their own brackets are encountered
and in the list of allowed words, this will add brackets
to facilitate the treatment of the file
"""
def __init__(self, in_file,
bug_handler,
@ -41,53 +44,56 @@ class AddBrackets:
self.__copy = copy
self.__write_to = better_mktemp()
self.__run_level = run_level
def __initiate_values(self):
"""
"""
self.__state_dict = {
'before_body' : self.__before_body_func,
'in_body' : self.__in_body_func,
'after_control_word' : self.__after_control_word_func,
'in_ignore' : self.__ignore_func,
}
self.__accept = [
'cw<ci<bold______' ,
'cw<ci<annotation' ,
'cw<ci<blue______' ,
# 'cw<ci<bold______' ,
'cw<ci<caps______' ,
'cw<ci<char-style' ,
'cw<ci<dbl-strike' ,
'cw<ci<emboss____' ,
'cw<ci<engrave___' ,
'cw<ci<font-color' ,
'cw<ci<font-down_' ,
'cw<ci<font-size_' ,
'cw<ci<font-style' ,
'cw<ci<font-up___' ,
'cw<ci<footnot-mk' ,
'cw<ci<green_____' ,
'cw<ci<hidden____' ,
'cw<ci<italics___' ,
'cw<ci<outline___' ,
'cw<ci<red_______' ,
'cw<ci<shadow____' ,
'cw<ci<small-caps' ,
'cw<ci<strike-thr' ,
'cw<ci<subscript_' ,
'cw<ci<superscrip' ,
'cw<ci<underlined' ,
# 'cw<ul<underlined' ,
]
def __initiate_values(self):
    """
    Init temp values

    Resets the per-run state: the state machine starts in 'before_body',
    no inline property is active, the temporary group is empty, and no
    bracket has been opened or found yet.
    """
    self.__state = 'before_body'
    self.__inline = {}
    self.__temp_group = []
    # Booleans (previously 0/1): only ever tested for truth in this module.
    self.__open_bracket = False
    self.__found_brackets = False
def __before_body_func(self, line):
"""
If we are before the body, we are not interested in changing anything
"""
if self.__token_info == 'mi<mk<body-open_':
self.__state = 'in_body'
@ -95,6 +101,14 @@ class AddBrackets:
def __in_body_func(self, line):
"""
Select what action to take in body:
1-At the end of the file close the bracket if a bracket was opened
This happens if there is a change
2-If an open bracket is found the code inside is ignored
(written without modifications)
3-If an accepted control word is found put the line
in a buffer then change state to after cw
4-Else simply write the line
"""
if line == 'cb<nu<clos-brack<0001\n' and self.__open_bracket:
self.__write_obj.write(
@ -102,7 +116,7 @@ class AddBrackets:
)
self.__write_obj.write(line)
elif self.__token_info == 'ob<nu<open-brack':
self.__found_brackets = 1
self.__found_brackets = True
self.__state = 'in_ignore'
self.__ignore_count = self.__ob_count
self.__write_obj.write(line)
@ -114,6 +128,10 @@ class AddBrackets:
def __after_control_word_func(self, line):
"""
After a cw either add next allowed cw to temporary list or
change group and write it.
If the token leading to an exit is an open bracket go to
ignore otherwise go to in body
"""
if self.__token_info in self.__accept:
self.__temp_group.append(line)
@ -129,82 +147,84 @@ class AddBrackets:
def __write_group(self):
    """
    Close the bracket opened by this module (if any) and open a new one
    holding the currently active inline properties.

    Logic:
        Every entry of self.__inline whose value is not the string
        'false' is written as a "<token><nu<<value>" line inside a newly
        opened bracket. If no property is active no bracket is opened.
        The temporary group is emptied in every case.
    """
    if self.__open_bracket:
        self.__write_obj.write('cb<nu<clos-brack<0003\n')
        self.__open_bracket = False
    # items() gives the same pairs as iteritems() and works on py2 and py3.
    active = ['%s<nu<%s\n' % (token, value)
              for token, value in self.__inline.items()
              if value != 'false']
    inline_string = ''.join(active)
    if inline_string:
        self.__write_obj.write('ob<nu<open-brack<0003\n'
                               '%s' % inline_string)
        self.__open_bracket = True
    self.__temp_group = []
def __change_permanent_group(self):
    """
    Use temp group to change permanent group

    Logic:
        Each buffered line contributes one entry: the first 16 characters
        are the token, the text after the following '<nu<' (without the
        trailing newline) is its value. Lines whose token is not in
        self.__accept are dropped.
    """
    # NOTE(review): this rebinds self.__inline, so properties set by an
    # earlier group are forgotten; the pre-commit loop updated the existing
    # dictionary instead. Confirm which behaviour is intended.
    accepted = self.__accept
    self.__inline = {line[:16]: line[20:-1]
                     for line in self.__temp_group
                     if line[:16] in accepted}
def __ignore_func(self, line):
    """
    Don't add any brackets while inside of brackets RTF has already
    added.
    Just copy data inside of RTF brackets already here.

    The state returns to 'in_body' once the closing bracket whose number
    equals the one recorded in self.__ignore_count has been copied.
    """
    self.__write_obj.write(line)
    closes_ignored_group = (
        self.__token_info == 'cb<nu<clos-brack'
        and self.__cb_count == self.__ignore_count)
    if closes_ignored_group:
        self.__state = 'in_body'
def __check_brackets(self, in_file):
    """
    Return True if brackets match

    Requires:
        in_file -- path of the file to check
    Returns:
        the first element of the result of CheckBrackets.check_brackets()
    """
    checker = check_brackets.CheckBrackets(file = in_file)
    return checker.check_brackets()[0]
def add_brackets(self):
    """
    Run the state machine over the tokenized file and install the result.

    Logic:
        Each line of the input is handed to the function registered for
        the current state; these functions write the output to a
        temporary file. If the brackets of that output match, it replaces
        the input file (and is copied to "add_brackets.data" when the copy
        option is set). Otherwise the input is left untouched and a
        warning is printed when run_level > 0.
    """
    self.__initiate_values()
    with open(self.__file, 'r') as read_obj:
        with open(self.__write_to, 'w') as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                # the four characters before the newline are the bracket number
                if self.__token_info == 'ob<nu<open-brack':
                    self.__ob_count = line[-5:-1]
                if self.__token_info == 'cb<nu<clos-brack':
                    self.__cb_count = line[-5:-1]
                action = self.__state_dict.get(self.__state)
                if action is None:
                    # Report the problem and skip the line; calling None
                    # here would only hide this message behind a TypeError.
                    sys.stderr.write(
                        'No matching state in module add_brackets.py\n'
                        '%s\n' % self.__state)
                else:
                    action(line)
    #Check bad brackets
    if self.__check_brackets(self.__write_to):
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "add_brackets.data")
        copy_obj.rename(self.__write_to, self.__file)
    else:
        if self.__run_level > 0:
            sys.stderr.write(
                'Sorry, but this files has a mix of old and new RTF.\n'
                'Some characteristics cannot be converted.\n')
    # NOTE(review): assumes Copy.rename leaves the source file in place (the
    # header module removes its temporary file right after rename) - confirm.
    os.remove(self.__write_to)

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,5 @@
import os, sys
from codecs import EncodedFile
from calibre.ebooks.rtf2xml import copy, check_encoding
from calibre.ptempfile import better_mktemp
@ -41,6 +42,7 @@ class ConvertToTags:
self.__run_level = run_level
self.__write_to = better_mktemp()
self.__convert_utf = False
self.__bad_encoding = False
def __initiate_values(self):
"""
@ -213,13 +215,14 @@ class ConvertToTags:
if not check_encoding_obj.check_encoding(self.__file, verbose=False):
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
elif not check_encoding_obj.check_encoding(self.__file, self.__encoding, verbose=False):
self.__write_obj.write('<?xml version="1.0" encoding="UTF-8" ?>')
self.__convert_utf = True
else:
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
' hope for the best')
self.__bad_encoding = True
self.__new_line = 0
self.__write_new_line()
if self.__no_dtd:
@ -247,7 +250,7 @@ class ConvertToTags:
the appropriate function.
The functions that are called:
a text function for text
an open funciton for open tags
an open function for open tags
an open with attribute function for tags with attributes
an empty with attribute function for tags that are empty but have
attributes.
@ -263,20 +266,19 @@ class ConvertToTags:
action = self.__state_dict.get(self.__token_info)
if action is not None:
action(line)
self.__write_obj.close()
#convert all encodings to UTF8 to avoid unsupported encodings in lxml
if self.__convert_utf:
#convert all encodings to UTF8 or ASCII to avoid unsupported encodings in lxml
if self.__convert_utf or self.__bad_encoding:
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
copy_obj.rename(self.__write_to, self.__file)
file_encoding = "utf-8"
if self.__bad_encoding:
file_encoding = "us-ascii"
with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as write_obj:
file = read_obj.read()
try:
file = file.decode(self.__encoding)
write_obj.write(file.encode('utf-8'))
except:
sys.stderr.write('Conversion to UTF-8 is not possible,'
' encoding should be very carefully checked')
write_objenc = EncodedFile(write_obj, self.__encoding,
file_encoding, 'replace')
for line in read_obj:
write_objenc.write(line)
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "convert_to_tags.data")

View File

@ -11,6 +11,7 @@
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
@ -31,29 +32,29 @@ class Header:
self.__bug_handler = bug_handler
self.__copy = copy
self.__write_to = better_mktemp()
self.__found_a_header = 0
self.__found_a_header = False
def __in_header_func(self, line):
    """
    Handle all tokens that are part of header

    Logic:
        While the current closing-bracket number differs from the one
        recorded for the header, the line goes to the header file. When
        they are equal the header is finished: the line is written to the
        main output and the three closing markers go to the header file.
    """
    if self.__cb_count != self.__header_bracket_count:
        self.__write_to_head_obj.write(line)
        return
    self.__in_header = False
    self.__write_obj.write(line)
    self.__write_to_head_obj.write(
        'mi<mk<head___clo\n'
        'mi<tg<close_____<header-or-footer\n'
        'mi<mk<header-clo\n')
def __found_header(self, line):
"""
Found a header
"""
# but this could be header or footer
self.__found_a_header = 1
self.__in_header = 1
self.__found_a_header = True
self.__in_header = True
self.__header_count += 1
# temporarily set this to zero so I can enter loop
self.__cb_count = 0
@ -69,18 +70,23 @@ class Header:
'mi<tg<open-att__<header-or-footer<type>%s\n' % (type)
)
else:
sys.stderr.write('module is header\n')
sys.stderr.write('method is __found_header\n')
sys.stderr.write('no dict entry\n')
sys.stderr.write('line is %s' % line)
sys.stderr.write(
'module is header\n' \
'method is __found_header\n' \
'no dict entry\n' \
'line is %s' % line)
self.__write_to_head_obj.write(
'mi<tg<open-att__<header-or-footer<type>none\n'
)
def __default_sep(self, line):
    """
    Handle all tokens that are not header tokens

    A token whose characters 3-4 are 'hf' starts a header or footer and is
    first passed to __found_header; every line is then written to the main
    output.
    """
    starts_header = self.__token_info[3:5] == 'hf'
    if starts_header:
        self.__found_header(line)
    self.__write_obj.write(line)
def __initiate_sep_values(self):
"""
initiate counters for separate_footnotes method.
@ -89,7 +95,7 @@ class Header:
self.__ob_count = 0
self.__cb_count = 0
self.__header_bracket_count = 0
self.__in_header = 0
self.__in_header = False
self.__header_count = 0
self.__head_dict = {
'head-left_' : ('header-left'),
@ -101,6 +107,7 @@ class Header:
'header____' : ('header' ),
'footer____' : ('footer' ),
}
def separate_headers(self):
    """
    Separate all the headers and footers in an RTF file and put them at
    the bottom of the main file.

    Logic:
        First pass: every line is routed either to the main output or,
        while inside a header or footer, to a temporary holder file.
        Second pass: the holder is appended to the main output between
        'mi<mk<header-beg' and 'mi<mk<header-end' markers.
        The result then replaces the input file; it is also copied to
        "header_separate.data" when the copy option is set.
    """
    self.__initiate_sep_values()
    self.__header_holder = better_mktemp()
    with open(self.__file) as read_obj:
        with open(self.__write_to, 'w') as self.__write_obj:
            with open(self.__header_holder, 'w') as self.__write_to_head_obj:
                for line in read_obj:
                    self.__token_info = line[:16]
                    # keep track of opening and closing brackets
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
                    if self.__in_header:
                        # in the middle of header or footer text
                        self.__in_header_func(line)
                    else:
                        self.__default_sep(line)
    with open(self.__header_holder, 'r') as read_obj:
        with open(self.__write_to, 'a') as write_obj:
            write_obj.write('mi<mk<header-beg\n')
            for line in read_obj:
                write_obj.write(line)
            write_obj.write('mi<mk<header-end\n')
    os.remove(self.__header_holder)
    copy_obj = copy.Copy(bug_handler = self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "header_separate.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def update_info(self, file, copy):
    """
    Unused method

    Replaces the file to process and the copy option stored on the
    instance.
    """
    self.__file = file
    self.__copy = copy
def __get_head_body_func(self, line):
"""
Process lines in main body and look for beginning of headers.
@ -166,6 +167,7 @@ class Header:
self.__state = 'head'
else:
self.__write_obj.write(line)
def __get_head_head_func(self, line):
"""
Copy headers and footers from bottom of file to a separate, temporary file.
@ -174,6 +176,7 @@ class Header:
self.__state = 'body'
else:
self.__write_to_head_obj.write(line)
def __get_headers(self):
    """
    Private method to remove the headers and footers from the main file.

    Logic:
        Read one line at a time and hand it to __get_head_body_func while
        the state is 'body' or to __get_head_head_func while it is 'head'.
        These two functions do the work of separating the headers from
        the body: the first writes to the main output, the second to the
        header holder file.
    """
    with open(self.__file) as read_obj:
        with open(self.__write_to, 'w') as self.__write_obj:
            with open(self.__header_holder, 'w') as self.__write_to_head_obj:
                for line in read_obj:
                    self.__token_info = line[:16]
                    if self.__state == 'body':
                        self.__get_head_body_func(line)
                    elif self.__state == 'head':
                        self.__get_head_head_func(line)
def __get_head_from_temp(self, num):
    """
    Private method for joining headers and footers to body.

    Requires:
        num -- the number of the header to fetch, as a string
    Returns:
        the lines found between 'mi<mk<header-ope<num' and the next
        'mi<mk<header-clo' in the header file, as one string. An empty
        string is returned when the opening marker is never found, and
        the lines collected so far when the closing marker is missing
        (the caller writes the result, so None must not be returned).
    Logic:
        Reading continues from the current position of
        self.__read_from_head_obj.
    """
    look_for = 'mi<mk<header-ope<' + num + '\n'
    found_head = False
    collected = []
    for line in self.__read_from_head_obj:
        if not found_head:
            found_head = (line == look_for)
        elif line == 'mi<mk<header-clo\n':
            break
        else:
            collected.append(line)
    return ''.join(collected)
def __join_from_temp(self):
    """
    Private method for rejoining headers and footers to body.

    Logic:
        Read the main file one line at a time. A line whose token is
        'mi<mk<header-ind' is replaced by the text that
        __get_head_from_temp returns for the number following the token.
        If no header marker is found, simply print out the token (line).
    """
    # NOTE(review): these two files are left open here; presumably the
    # caller closes them - confirm. self.__write_to2 is set elsewhere.
    self.__read_from_head_obj = open(self.__header_holder, 'r')
    self.__write_obj = open(self.__write_to2, 'w')
    with open(self.__write_to, 'r') as read_obj:
        for line in read_obj:
            if line[:16] == 'mi<mk<header-ind':
                # a failed lookup must not end up as write(None)
                line = self.__get_head_from_temp(line[17:-1]) or ''
            self.__write_obj.write(line)
def join_headers(self):
"""
Join the footnotes from the bottom of the file and put them in their

View File

@ -181,7 +181,7 @@ class Hex2Utf8:
self.__dingbats_dict.update(dingbats_base_dict)
self.__dingbats_dict.update(ms_dingbats_dict)
# load dictionary for caps, and make a string for the replacement
self.__caps_uni_dict = char_map_obj.get_char_map(map='caps_uni')
self.__caps_uni_dict = char_map_obj.get_char_map(map = 'caps_uni')
# # print self.__caps_uni_dict
# don't think I'll need this
##keys = self.__caps_uni_dict.keys()

View File

@ -11,14 +11,18 @@
# #
#########################################################################
import sys
"""
"""
class OldRtf:
"""
Check to see if the RTF is an older version
Logic:
If allowable control word/properties happen in text without being enclosed
in brackets the file will be considered old rtf
"""
def __init__(self, in_file,
        bug_handler,
        run_level,
        ):
    """
    Required:
        in_file -- file to parse
        bug_handler -- stored on the instance
        run_level -- verbosity; above 3 the construction that marks the
            file as old RTF is reported on stderr
    Logic:
        Only values that never change are set here; the values that
        check_if_old_rtf resets on every run are set in
        __initiate_values.
    """
    self.__file = in_file
    self.__bug_handler = bug_handler
    self.__run_level = run_level
    # Inline properties (characters 6-15 of a token) that are looked for
    # in the body.
    self.__allowable = [
        'annotation',
        'blue______',
        'bold______',
        'caps______',
        'char-style',
        'dbl-strike',
        'emboss____',
        'engrave___',
        'font-color',
        'font-down_',
        'font-size_',
        'font-style',
        'font-up___',
        'footnot-mk',
        'green_____',
        'hidden____',
        'italics___',
        'outline___',
        'red_______',
        'shadow____',
        'small-caps',
        'strike-thr',
        'subscript_',
        'superscrip',
        'underlined',
    ]
    self.__action_dict = {
        'before_body'   : self.__before_body_func,
        'in_body'       : self.__check_tokens_func,
        'after_pard'    : self.__after_pard_func,
    }
def __initiate_values(self):
    """
    Reset the values that change while a file is checked: the previous
    token, the state, the count of allowable control words found and the
    count of open brackets.
    """
    self.__previous_token = ''
    self.__state = 'before_body'
    self.__found_new = 0
    self.__ob_group = 0
def __check_tokens_func(self, line):
if self.__inline_info in self.__allowable:
if self.__ob_group == self.__base_ob_count:
@ -80,48 +84,56 @@ class OldRtf:
self.__found_new += 1
elif self.__token_info == 'cw<pf<par-def___':
self.__state = 'after_pard'
def __before_body_func(self, line):
    """
    Wait for the start of the body. When 'mi<mk<body-open_' is seen the
    state becomes 'in_body' and the current bracket count is stored as
    the base count of the body.
    """
    if self.__token_info != 'mi<mk<body-open_':
        return
    self.__state = 'in_body'
    self.__base_ob_count = self.__ob_group
def __after_pard_func(self, line):
    """
    Stay in 'after_pard' while lines start with 'cw'; the first other
    line switches the state back to 'in_body'.
    """
    if not line.startswith('cw'):
        self.__state = 'in_body'
def check_if_old_rtf(self):
    """
    Requires:
        nothing
    Returns:
        True if file is older RTf
        False if file is newer RTF
    Logic:
        Every line is handed to the function registered for the current
        state. The first 'old_rtf' result ends the check with True, a
        'new_rtf' result or the end of the body ends it with False.
    """
    self.__initiate_values()
    line_num = 0
    with open(self.__file, 'r') as read_obj:
        for line in read_obj:
            line_num += 1
            self.__token_info = line[:16]
            if self.__token_info == 'mi<mk<body-close':
                return False
            if self.__token_info == 'ob<nu<open-brack':
                self.__ob_group += 1
                self.__ob_count = line[-5:-1]
            if self.__token_info == 'cb<nu<clos-brack':
                self.__ob_group -= 1
                self.__cb_count = line[-5:-1]
            self.__inline_info = line[6:16]
            if self.__state == 'after_body':
                return False
            action = self.__action_dict.get(self.__state)
            if action is None:
                # Diagnostics are best effort, but only ordinary errors
                # are ignored (no bare except).
                try:
                    sys.stderr.write('No action for this state!\n')
                except Exception:
                    pass
                # Calling None would raise a TypeError; treat the line
                # as giving no verdict instead.
                result = None
            else:
                result = action(line)
            if result == 'new_rtf':
                return False
            elif result == 'old_rtf':
                if self.__run_level > 3:
                    sys.stderr.write(
                        'Old rtf construction %s (bracket %s, line %s)\n'
                        % (self.__inline_info, str(self.__ob_group), line_num)
                    )
                return True
            self.__previous_token = line[6:16]
    return False

View File

@ -10,7 +10,9 @@
# #
# #
#########################################################################
import sys, os, codecs
import sys, os
# , codecs
class Output:
"""
Output file
@ -19,7 +21,8 @@ class Output:
file,
orig_file,
output_dir = None,
out_file = None
out_file = None,
no_ask = True
):
"""
Required:
@ -33,8 +36,9 @@ class Output:
self.__file = file
self.__orig_file = orig_file
self.__output_dir = output_dir
self.__no_ask = 1
self.__no_ask = no_ask
self.__out_file = out_file
def output(self):
    """
    Write the converted file to its destination.

    Logic:
        If an output directory was given, write into that directory.
        Otherwise, if an output file was given, write to that file.
        Otherwise output the lines to the screen (standard output).
    """
    if self.__output_dir:
        self.__output_to_dir_func()
    elif self.__out_file:
        self.__output_to_file_func()
    else:
        self.__output_to_standard_func()
def __output_to_dir_func(self):
    """
    Requires:
        nothing
    Returns:
        nothing
    Logic:
        The target is "<base name of the original file>.xml" inside the
        output directory, or the given output file name inside that
        directory. If the target already exists and asking is enabled,
        the user decides whether to overwrite it; any answer other than
        "o" sends the output to standard output instead.
    """
    base_name = os.path.splitext(os.path.basename(self.__orig_file))[0]
    output_file = os.path.join(self.__output_dir, '%s.xml' % base_name)
    # change if user wants to output to a specific file
    if self.__out_file:
        output_file = os.path.join(self.__output_dir, self.__out_file)
    user_response = 'o'
    if os.path.isfile(output_file) and not self.__no_ask:
        msg = 'Do you want to overwrite %s?\n' % output_file
        msg += ('Type "o" to overwrite.\n'
                'Type any other key to print to standard output.\n')
        sys.stderr.write(msg)
        user_response = raw_input()
    if user_response == 'o':
        with open(self.__file, 'r') as read_obj:
            # output_file is a local variable; the instance has no
            # attribute 'output_file' (that spelling raised AttributeError).
            with open(output_file, 'w') as write_obj:
                for line in read_obj:
                    write_obj.write(line)
    else:
        self.__output_to_standard_func()
def __output_to_file_func(self):
    """
    Required:
        nothing
    Returns:
        nothing
    Logic:
        read one line at a time. Output to the given output file.
    """
    with open(self.__file, 'r') as read_obj:
        with open(self.__out_file, 'w') as write_obj:
            for line in read_obj:
                write_obj.write(line)
def __output_to_standard_func(self):
    """
    Required:
        nothing
    Returns:
        nothing
    Logic:
        read one line at a time. Output to standard
    """
    with open(self.__file, 'r') as read_obj:
        for line in read_obj:
            sys.stdout.write(line)
# def __output_xml(self, in_file, out_file):
# """
# output the ill-formed xml file
# """
# (utf8_encode, utf8_decode, utf8_reader, utf8_writer) = codecs.lookup("utf-8")
# write_obj = utf8_writer(open(out_file, 'w'))
# write_obj = open(out_file, 'w')
# read_obj = utf8_writer(open(in_file, 'r'))
# read_obj = open(in_file, 'r')
# line = 1
# while line:
# line = read_obj.readline()
# if isinstance(line, type(u"")):
# line = line.encode("utf-8")
# write_obj.write(line)
# read_obj.close()
# write_obj.close()

View File

@ -11,31 +11,32 @@
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
class Paragraphs:
"""
=================
Purpose
=================
Write paragraph tags for a tokenized file. (This module won't be any use to use
to you unless you use it as part of the other modules.)
-------------
Method
-------------
RTF does not tell you when a paragraph begins. It only tells you when the
paragraph ends.
In order to make paragraphs out of this limited info, the parser starts in the
body of the documents and assumes it is not in a paragraph. It looks for clues
to begin a paragraph. Text starts a paragraph; so does an inline field or
list-text. If an end of paragraph marker (\par) is found, then this indicates
a blank paragraph.
Once a paragraph is found, the state changes to 'paragraph.' In this state,
clues are looked to for the end of a paragraph. The end of a paragraph marker
(\par) marks the end of a paragraph. So does the end of a footnote or heading;
a paragraph definintion; the end of a field-block; and the beginning of a
section. (How about the end of a section or the end of a field-block?)
=================
Purpose
=================
Write paragraph tags for a tokenized file. (This module won't be of any use
to you unless you use it as part of the other modules.)
-------------
Method
-------------
RTF does not tell you when a paragraph begins. It only tells you when the
paragraph ends.
In order to make paragraphs out of this limited info, the parser starts in the
body of the documents and assumes it is not in a paragraph. It looks for clues
to begin a paragraph. Text starts a paragraph; so does an inline field or
list-text. If an end of paragraph marker (\par) is found, then this indicates
a blank paragraph.
Once a paragraph is found, the state changes to 'paragraph.' In this state,
clues are looked to for the end of a paragraph. The end of a paragraph marker
(\par) marks the end of a paragraph. So does the end of a footnote or heading;
a paragraph definition; the end of a field-block; and the beginning of a
section. (How about the end of a section or the end of a field-block?)
"""
def __init__(self,
in_file,
@ -60,6 +61,7 @@ section. (How about the end of a section or the end of a field-block?)
self.__write_empty_para = write_empty_para
self.__run_level = run_level
self.__write_to = better_mktemp()
def __initiate_values(self):
"""
Initiate all values.
@ -77,7 +79,7 @@ section. (How about the end of a section or the end of a field-block?)
self.__paragraph_dict = {
'cw<pf<par-end___' : self.__close_para_func, # end of paragraph
'mi<mk<headi_-end' : self.__close_para_func, # end of header or footer
##'cw<pf<par-def___' : self.__close_para_func, # paragraph definition
## 'cw<pf<par-def___' : self.__close_para_func, # paragraph definition
# 'mi<mk<fld-bk-end' : self.__close_para_func, # end of field-block
'mi<mk<fldbk-end_' : self.__close_para_func, # end of field-block
'mi<mk<body-close' : self.__close_para_func, # end of body
@ -99,6 +101,7 @@ section. (How about the end of a section or the end of a field-block?)
'mi<mk<pict-start' : self.__start_para_func,
'cw<pf<page-break' : self.__empty_pgbk_func, # page break
}
def __before_body_func(self, line):
"""
Required:
@ -112,6 +115,7 @@ section. (How about the end of a section or the end of a field-block?)
if self.__token_info == 'mi<mk<body-open_':
self.__state = 'not_paragraph'
self.__write_obj.write(line)
def __not_paragraph_func(self, line):
"""
Required:
@ -127,6 +131,7 @@ section. (How about the end of a section or the end of a field-block?)
if action:
action(line)
self.__write_obj.write(line)
def __paragraph_func(self, line):
"""
Required:
@ -144,6 +149,7 @@ section. (How about the end of a section or the end of a field-block?)
action(line)
else:
self.__write_obj.write(line)
def __start_para_func(self, line):
"""
Requires:
@ -160,6 +166,7 @@ section. (How about the end of a section or the end of a field-block?)
)
self.__write_obj.write(self.__start2_marker)
self.__state = 'paragraph'
def __empty_para_func(self, line):
"""
Requires:
@ -176,6 +183,7 @@ section. (How about the end of a section or the end of a field-block?)
'mi<tg<empty_____<para\n'
)
self.__write_obj.write(self.__end_marker) # marker for later parsing
def __empty_pgbk_func(self, line):
"""
Requires:
@ -188,6 +196,7 @@ section. (How about the end of a section or the end of a field-block?)
self.__write_obj.write(
'mi<tg<empty_____<page-break\n'
)
def __close_para_func(self, line):
"""
Requires:
@ -205,6 +214,7 @@ section. (How about the end of a section or the end of a field-block?)
self.__write_obj.write(self.__end_marker) # marker for later parser
self.__write_obj.write(line)
self.__state = 'not_paragraph'
def __bogus_para__def_func(self, line):
"""
Requires:
@ -215,6 +225,7 @@ section. (How about the end of a section or the end of a field-block?)
if a \pard occurs in a paragraph, I want to ignore it. (I believe)
"""
self.__write_obj.write('mi<mk<bogus-pard\n')
def make_paragraphs(self):
"""
Requires:
@ -229,20 +240,18 @@ section. (How about the end of a section or the end of a field-block?)
only other state is 'paragraph'.
"""
self.__initiate_values()
read_obj = open(self.__file, 'r')
self.__write_obj = open(self.__write_to, 'w')
line_to_read = 1
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
self.__token_info = line[:16]
action = self.__state_dict.get(self.__state)
if action == None:
sys.stderr.write('no no matching state in module sections.py\n')
sys.stderr.write(self.__state + '\n')
action(line)
read_obj.close()
self.__write_obj.close()
with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as self.__write_obj:
for line in read_obj:
self.__token_info = line[:16]
action = self.__state_dict.get(self.__state)
if action is None:
try:
sys.stderr.write('no matching state in module paragraphs.py\n')
sys.stderr.write(self.__state + '\n')
except:
pass
action(line)
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "paragraphs.data")

View File

@ -11,16 +11,24 @@
# #
#########################################################################
import sys,os
from calibre.ebooks.rtf2xml import copy
class Preamble:
"""
Fix the remaining parts of the preamble. This module does very little. It
makes sure that no text gets put in the revision of list table. In the
future, when I understand how to interprett he revision table and list
future, when I understand how to interpret the revision table and list
table, I will make these methods more functional.
"""
def __init__(self, file, bug_handler, platform, default_font, code_page,
copy=None, temp_dir=None):
def __init__(self, file,
bug_handler,
platform,
default_font,
code_page,
copy=None,
temp_dir=None,
):
"""
Required:
file--file to parse
@ -44,6 +52,7 @@ class Preamble:
self.__write_to = os.path.join(temp_dir,"info_table_info.data")
else:
self.__write_to = "info_table_info.data"
def __initiate_values(self):
"""
Initiate all values.
@ -62,12 +71,14 @@ class Preamble:
'mi<mk<revtbl-beg' : self.__found_revision_table_func,
'mi<mk<body-open_' : self.__found_body_func,
}
def __default_func(self, line):
action = self.__default_dict.get(self.__token_info)
if action:
action(line)
else:
self.__write_obj.write(line)
def __found_rtf_head_func(self, line):
"""
Requires:
@ -84,8 +95,10 @@ class Preamble:
'<platform>%s\n' % (self.__default_font, self.__code_page,
self.__platform)
)
def __found_list_table_func(self, line):
self.__state = 'list_table'
def __list_table_func(self, line):
if self.__token_info == 'mi<mk<listabend_':
self.__state = 'default'
@ -93,8 +106,10 @@ class Preamble:
pass
else:
self.__write_obj.write(line)
def __found_revision_table_func(self, line):
self.__state = 'revision'
def __revision_table_func(self, line):
if self.__token_info == 'mi<mk<revtbl-end':
self.__state = 'default'
@ -102,11 +117,14 @@ class Preamble:
pass
else:
self.__write_obj.write(line)
def __found_body_func(self, line):
self.__state = 'body'
self.__write_obj.write(line)
def __body_func(self, line):
self.__write_obj.write(line)
def fix_preamble(self):
"""
Requires:
@ -119,20 +137,15 @@ class Preamble:
the list table.
"""
self.__initiate_values()
read_obj = open(self.__file, 'r')
self.__write_obj = open(self.__write_to, 'w')
line_to_read = 1
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
self.__token_info = line[:16]
action = self.__state_dict.get(self.__state)
if action == None:
sys.stderr.write('no no matching state in module preamble_rest.py\n')
sys.stderr.write(self.__state + '\n')
action(line)
read_obj.close()
self.__write_obj.close()
with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as self.__write_obj:
for line in read_obj:
self.__token_info = line[:16]
action = self.__state_dict.get(self.__state)
if action is None:
sys.stderr.write(
'no matching state in module preamble_rest.py\n' + self.__state + '\n')
action(line)
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "preamble_div.data")

View File

@ -11,43 +11,44 @@
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
class Sections:
"""
=================
Purpose
=================
Write section tags for a tokenized file. (This module won't be any use to use
to you unless you use it as part of the other modules.)
---------------
logic
---------------
The tags for the first section breaks have already been written.
RTF stores section breaks with the \sect tag. Each time this tag is
encountered, add one to the counter.
When I encounter the \sectd tag, I want to collect all the appropriate tokens
that describe the section. When I reach a \pard, I know I an stop collecting
tokens and write the section tags.
The exception to this method occurs when sections occur in field blocks, such
as the index. Normally, two section break occur within the index and other
field-blocks. (If less or more section breaks occurr, this code may not work.)
I want the sections to occurr outside of the index. That is, the index
should be nested inside one section tag. After the index is complete, a new
section should begin.
In order to write the sections outside of the field blocks, I have to store
all of the field block as a string. When I ecounter the \sect tag, add one to
the section counter, but store this number in a list. Likewise, store the
information describing the section in another list.
When I reach the end of the field block, choose the first item from the
numbered list as the section number. Choose the first item in the description
list as the values and attributes of the section. Enclose the field string
between the section tags.
Start a new section outside the field-block strings. Use the second number in
the list; use the second item in the description list.
CHANGE (2004-04-26) No longer write sections that occurr in field-blocks.
Instead, ingore all section information in a field-block.
=================
Purpose
=================
Write section tags for a tokenized file. (This module won't be of any use
to you unless you use it as part of the other modules.)
---------------
logic
---------------
The tags for the first section breaks have already been written.
RTF stores section breaks with the \sect tag. Each time this tag is
encountered, add one to the counter.
When I encounter the \sectd tag, I want to collect all the appropriate tokens
that describe the section. When I reach a \pard, I know I can stop collecting
tokens and write the section tags.
The exception to this method occurs when sections occur in field blocks, such
as the index. Normally, two section breaks occur within the index and other
field-blocks. (If fewer or more section breaks occur, this code may not work.)
I want the sections to occur outside of the index. That is, the index
should be nested inside one section tag. After the index is complete, a new
section should begin.
In order to write the sections outside of the field blocks, I have to store
all of the field block as a string. When I encounter the \sect tag, add one to
the section counter, but store this number in a list. Likewise, store the
information describing the section in another list.
When I reach the end of the field block, choose the first item from the
numbered list as the section number. Choose the first item in the description
list as the values and attributes of the section. Enclose the field string
between the section tags.
Start a new section outside the field-block strings. Use the second number in
the list; use the second item in the description list.
CHANGE (2004-04-26) No longer write sections that occur in field-blocks.
Instead, ignore all section information in a field-block.
"""
def __init__(self,
in_file,