mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
RTFInput: Cleanup & small improvments
This commit is contained in:
parent
bd5e6585ff
commit
60b53045e4
@ -372,8 +372,8 @@ class ParseRtf:
|
||||
old_rtf = old_rtf_obj.check_if_old_rtf()
|
||||
if old_rtf:
|
||||
if self.__run_level > 5:
|
||||
msg = 'Older RTF\n'
|
||||
msg += 'self.__run_level is "%s"\n' % self.__run_level
|
||||
msg = 'Older RTF\n' \
|
||||
'self.__run_level is "%s"\n' % self.__run_level
|
||||
raise RtfInvalidCodeException, msg
|
||||
if self.__run_level > 1:
|
||||
sys.stderr.write('File could be older RTF...\n')
|
||||
@ -381,7 +381,7 @@ class ParseRtf:
|
||||
if self.__run_level > 1:
|
||||
sys.stderr.write(
|
||||
'File also has newer RTF.\n'
|
||||
'Will do the best to convert.\n'
|
||||
'Will do the best to convert...\n'
|
||||
)
|
||||
add_brackets_obj = add_brackets.AddBrackets(
|
||||
in_file = self.__temp_file,
|
||||
|
@ -20,6 +20,9 @@ class AddBrackets:
|
||||
"""
|
||||
Add brackets for old RTF.
|
||||
Logic:
|
||||
When control words without their own brackets are encountered
|
||||
and in the list of allowed words, this will add brackets
|
||||
to facilitate the treatment of the file
|
||||
"""
|
||||
def __init__(self, in_file,
|
||||
bug_handler,
|
||||
@ -41,50 +44,51 @@ class AddBrackets:
|
||||
self.__copy = copy
|
||||
self.__write_to = better_mktemp()
|
||||
self.__run_level = run_level
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
"""
|
||||
self.__state_dict = {
|
||||
'before_body' : self.__before_body_func,
|
||||
'in_body' : self.__in_body_func,
|
||||
'after_control_word' : self.__after_control_word_func,
|
||||
'in_ignore' : self.__ignore_func,
|
||||
}
|
||||
self.__accept = [
|
||||
'cw<ci<bold______' ,
|
||||
'cw<ci<annotation' ,
|
||||
'cw<ci<blue______' ,
|
||||
# 'cw<ci<bold______' ,
|
||||
'cw<ci<caps______' ,
|
||||
'cw<ci<char-style' ,
|
||||
'cw<ci<dbl-strike' ,
|
||||
'cw<ci<emboss____' ,
|
||||
'cw<ci<engrave___' ,
|
||||
'cw<ci<font-color' ,
|
||||
'cw<ci<font-down_' ,
|
||||
'cw<ci<font-size_' ,
|
||||
'cw<ci<font-style' ,
|
||||
'cw<ci<font-up___' ,
|
||||
'cw<ci<footnot-mk' ,
|
||||
'cw<ci<green_____' ,
|
||||
'cw<ci<hidden____' ,
|
||||
'cw<ci<italics___' ,
|
||||
'cw<ci<outline___' ,
|
||||
'cw<ci<red_______' ,
|
||||
'cw<ci<shadow____' ,
|
||||
'cw<ci<small-caps' ,
|
||||
'cw<ci<strike-thr' ,
|
||||
'cw<ci<subscript_' ,
|
||||
'cw<ci<superscrip' ,
|
||||
'cw<ci<underlined' ,
|
||||
# 'cw<ul<underlined' ,
|
||||
]
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
"""
|
||||
self.__state = 'before_body'
|
||||
self.__inline = {}
|
||||
self.__temp_group = []
|
||||
self.__open_bracket = False
|
||||
self.__found_brackets = False
|
||||
self.__accept = [
|
||||
'cw<ci<bold______',
|
||||
'cw<ci<annotation' ,
|
||||
'cw<ci<blue______' ,
|
||||
'cw<ci<bold______' ,
|
||||
'cw<ci<caps______' ,
|
||||
'cw<ci<char-style' ,
|
||||
'cw<ci<dbl-strike' ,
|
||||
'cw<ci<emboss____' ,
|
||||
'cw<ci<engrave___' ,
|
||||
'cw<ci<font-color' ,
|
||||
'cw<ci<font-down_' ,
|
||||
'cw<ci<font-size_' ,
|
||||
'cw<ci<font-style' ,
|
||||
'cw<ci<font-up___',
|
||||
'cw<ci<footnot-mk',
|
||||
'cw<ci<green_____' ,
|
||||
'cw<ci<hidden____',
|
||||
'cw<ci<italics___' ,
|
||||
'cw<ci<outline___',
|
||||
'cw<ci<red_______' ,
|
||||
'cw<ci<shadow____',
|
||||
'cw<ci<small-caps' ,
|
||||
'cw<ci<strike-thr',
|
||||
'cw<ci<subscript_' ,
|
||||
'cw<ci<superscrip',
|
||||
'cw<ci<underlined' ,
|
||||
# 'cw<ul<underlined' ,
|
||||
]
|
||||
|
||||
|
||||
def __before_body_func(self, line):
|
||||
"""
|
||||
|
@ -11,6 +11,7 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
|
||||
@ -31,29 +32,29 @@ class Header:
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__write_to = better_mktemp()
|
||||
self.__found_a_header = 0
|
||||
self.__found_a_header = False
|
||||
|
||||
def __in_header_func(self, line):
|
||||
"""
|
||||
Handle all tokens that are part of header
|
||||
"""
|
||||
if self.__cb_count == self.__header_bracket_count:
|
||||
self.__in_header = 0
|
||||
self.__in_header = False
|
||||
self.__write_obj.write(line)
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<mk<head___clo\n')
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<tg<close_____<header-or-footer\n')
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<mk<head___clo\n' \
|
||||
'mi<tg<close_____<header-or-footer\n' \
|
||||
'mi<mk<header-clo\n')
|
||||
else:
|
||||
self.__write_to_head_obj.write(line)
|
||||
|
||||
def __found_header(self, line):
|
||||
"""
|
||||
Found a header
|
||||
"""
|
||||
# but this could be header or footer
|
||||
self.__found_a_header = 1
|
||||
self.__in_header = 1
|
||||
self.__found_a_header = True
|
||||
self.__in_header = True
|
||||
self.__header_count += 1
|
||||
# temporarily set this to zero so I can enter loop
|
||||
self.__cb_count = 0
|
||||
@ -69,18 +70,23 @@ class Header:
|
||||
'mi<tg<open-att__<header-or-footer<type>%s\n' % (type)
|
||||
)
|
||||
else:
|
||||
sys.stderr.write('module is header\n')
|
||||
sys.stderr.write('method is __found_header\n')
|
||||
sys.stderr.write('no dict entry\n')
|
||||
sys.stderr.write('line is %s' % line)
|
||||
sys.stderr.write(
|
||||
'module is header\n' \
|
||||
'method is __found_header\n' \
|
||||
'no dict entry\n' \
|
||||
'line is %s' % line)
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<tg<open-att__<header-or-footer<type>none\n'
|
||||
)
|
||||
|
||||
def __default_sep(self, line):
|
||||
"""Handle all tokens that are not header tokens"""
|
||||
"""
|
||||
Handle all tokens that are not header tokens
|
||||
"""
|
||||
if self.__token_info[3:5] == 'hf':
|
||||
self.__found_header(line)
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __initiate_sep_values(self):
|
||||
"""
|
||||
initiate counters for separate_footnotes method.
|
||||
@ -89,7 +95,7 @@ class Header:
|
||||
self.__ob_count = 0
|
||||
self.__cb_count = 0
|
||||
self.__header_bracket_count = 0
|
||||
self.__in_header = 0
|
||||
self.__in_header = False
|
||||
self.__header_count = 0
|
||||
self.__head_dict = {
|
||||
'head-left_' : ('header-left'),
|
||||
@ -101,6 +107,7 @@ class Header:
|
||||
'header____' : ('header' ),
|
||||
'footer____' : ('footer' ),
|
||||
}
|
||||
|
||||
def separate_headers(self):
|
||||
"""
|
||||
Separate all the footnotes in an RTF file and put them at the bottom,
|
||||
@ -110,53 +117,47 @@ class Header:
|
||||
bottom of the main file.
|
||||
"""
|
||||
self.__initiate_sep_values()
|
||||
read_obj = open(self.__file)
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
self.__header_holder = better_mktemp()
|
||||
self.__write_to_head_obj = open(self.__header_holder, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
# keep track of opening and closing brackets
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
# In the middle of footnote text
|
||||
if self.__in_header:
|
||||
self.__in_header_func(line)
|
||||
# not in the middle of footnote text
|
||||
else:
|
||||
self.__default_sep(line)
|
||||
self.__write_obj.close()
|
||||
read_obj.close()
|
||||
self.__write_to_head_obj.close()
|
||||
read_obj = open(self.__header_holder, 'r')
|
||||
write_obj = open(self.__write_to, 'a')
|
||||
write_obj.write(
|
||||
'mi<mk<header-beg\n')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
write_obj.write(line)
|
||||
write_obj.write(
|
||||
'mi<mk<header-end\n')
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
with open(self.__file) as read_obj:
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
with open(self.__header_holder, 'w') as self.__write_to_head_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
# keep track of opening and closing brackets
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
# In the middle of footnote text
|
||||
if self.__in_header:
|
||||
self.__in_header_func(line)
|
||||
# not in the middle of footnote text
|
||||
else:
|
||||
self.__default_sep(line)
|
||||
|
||||
with open(self.__header_holder, 'r') as read_obj:
|
||||
with open(self.__write_to, 'a') as write_obj:
|
||||
write_obj.write(
|
||||
'mi<mk<header-beg\n')
|
||||
for line in read_obj:
|
||||
write_obj.write(line)
|
||||
write_obj.write(
|
||||
'mi<mk<header-end\n')
|
||||
os.remove(self.__header_holder)
|
||||
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "header_separate.info")
|
||||
copy_obj.copy_file(self.__write_to, "header_separate.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
|
||||
def update_info(self, file, copy):
|
||||
"""
|
||||
Unused method
|
||||
"""
|
||||
self.__file = file
|
||||
self.__copy = copy
|
||||
|
||||
def __get_head_body_func(self, line):
|
||||
"""
|
||||
Process lines in main body and look for beginning of headers.
|
||||
@ -166,6 +167,7 @@ class Header:
|
||||
self.__state = 'head'
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __get_head_head_func(self, line):
|
||||
"""
|
||||
Copy headers and footers from bottom of file to a separate, temporary file.
|
||||
@ -174,6 +176,7 @@ class Header:
|
||||
self.__state = 'body'
|
||||
else:
|
||||
self.__write_to_head_obj.write(line)
|
||||
|
||||
def __get_headers(self):
|
||||
"""
|
||||
Private method to remove footnotes from main file. Read one line from
|
||||
@ -182,21 +185,16 @@ class Header:
|
||||
These two functions do the work of separating the footnotes form the
|
||||
body.
|
||||
"""
|
||||
read_obj = open(self.__file)
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
# self.__write_to = "footnote_info.data"
|
||||
self.__write_to_head_obj = open(self.__header_holder, 'w')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
self.__token_info = line[:16]
|
||||
if self.__state == 'body':
|
||||
self.__get_head_body_func(line)
|
||||
elif self.__state == 'head':
|
||||
self.__get_head_head_func(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
self.__write_to_head_obj.close()
|
||||
with open(self.__file) as read_obj:
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
with open(self.__header_holder, 'w') as self.__write_to_head_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
if self.__state == 'body':
|
||||
self.__get_head_body_func(line)
|
||||
elif self.__state == 'head':
|
||||
self.__get_head_head_func(line)
|
||||
|
||||
def __get_head_from_temp(self, num):
|
||||
"""
|
||||
Private method for joining headers and footers to body. This method
|
||||
@ -205,18 +203,17 @@ class Header:
|
||||
returns them as a string.
|
||||
"""
|
||||
look_for = 'mi<mk<header-ope<' + num + '\n'
|
||||
found_head = 0
|
||||
found_head = False
|
||||
string_to_return = ''
|
||||
line = 1
|
||||
while line:
|
||||
line = self.__read_from_head_obj.readline()
|
||||
for line in self.__read_from_head_obj:
|
||||
if found_head:
|
||||
if line == 'mi<mk<header-clo\n':
|
||||
return string_to_return
|
||||
string_to_return = string_to_return + line
|
||||
string_to_return += line
|
||||
else:
|
||||
if line == look_for:
|
||||
found_head = 1
|
||||
found_head = True
|
||||
|
||||
def __join_from_temp(self):
|
||||
"""
|
||||
Private method for rejoining footnotes to body. Read from the
|
||||
@ -227,15 +224,13 @@ class Header:
|
||||
If no footnote marker is found, simply print out the token (line).
|
||||
"""
|
||||
self.__read_from_head_obj = open(self.__header_holder, 'r')
|
||||
read_obj = open(self.__write_to, 'r')
|
||||
self.__write_obj = open(self.__write_to2, 'w')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
if line[:16] == 'mi<mk<header-ind':
|
||||
line = self.__get_head_from_temp(line[17:-1])
|
||||
self.__write_obj.write(line)
|
||||
read_obj.close()
|
||||
with open(self.__write_to, 'r') as read_obj:
|
||||
for line in read_obj:
|
||||
if line[:16] == 'mi<mk<header-ind':
|
||||
line = self.__get_head_from_temp(line[17:-1])
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def join_headers(self):
|
||||
"""
|
||||
Join the footnotes from the bottom of the file and put them in their
|
||||
|
@ -11,14 +11,18 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import sys
|
||||
"""
|
||||
"""
|
||||
|
||||
class OldRtf:
|
||||
"""
|
||||
Check to see if the RTF is an older version
|
||||
Logic:
|
||||
If allowable control word/properties happen in text without being enclosed
|
||||
in brackets the file will be considered old rtf
|
||||
"""
|
||||
def __init__(self, in_file, bug_handler, run_level ):
|
||||
def __init__(self, in_file,
|
||||
bug_handler,
|
||||
run_level,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
@ -32,46 +36,46 @@ class OldRtf:
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__initiate_values()
|
||||
self.__ob_group = 0
|
||||
def __initiate_values(self):
|
||||
self.__previous_token = ''
|
||||
self.__new_found = 0
|
||||
self.__run_level = run_level
|
||||
self.__allowable = [
|
||||
'annotation' ,
|
||||
'blue______' ,
|
||||
'bold______',
|
||||
'caps______',
|
||||
'char-style' ,
|
||||
'dbl-strike' ,
|
||||
'emboss____',
|
||||
'engrave___' ,
|
||||
'font-color',
|
||||
'font-down_' ,
|
||||
'font-size_',
|
||||
'font-style',
|
||||
'font-up___',
|
||||
'footnot-mk' ,
|
||||
'green_____' ,
|
||||
'hidden____',
|
||||
'italics___',
|
||||
'outline___',
|
||||
'red_______',
|
||||
'shadow____' ,
|
||||
'small-caps',
|
||||
'strike-thr',
|
||||
'subscript_',
|
||||
'superscrip' ,
|
||||
'underlined' ,
|
||||
'annotation' ,
|
||||
'blue______' ,
|
||||
'bold______',
|
||||
'caps______',
|
||||
'char-style' ,
|
||||
'dbl-strike' ,
|
||||
'emboss____',
|
||||
'engrave___' ,
|
||||
'font-color',
|
||||
'font-down_' ,
|
||||
'font-size_',
|
||||
'font-style',
|
||||
'font-up___',
|
||||
'footnot-mk' ,
|
||||
'green_____' ,
|
||||
'hidden____',
|
||||
'italics___',
|
||||
'outline___',
|
||||
'red_______',
|
||||
'shadow____' ,
|
||||
'small-caps',
|
||||
'strike-thr',
|
||||
'subscript_',
|
||||
'superscrip' ,
|
||||
'underlined' ,
|
||||
]
|
||||
self.__state = 'before_body'
|
||||
self.__action_dict = {
|
||||
'before_body' : self.__before_body_func,
|
||||
'in_body' : self.__check_tokens_func,
|
||||
'after_pard' : self.__after_pard_func,
|
||||
}
|
||||
self.__is_old = 0
|
||||
|
||||
def __initiate_values(self):
|
||||
self.__previous_token = ''
|
||||
self.__state = 'before_body'
|
||||
self.__found_new = 0
|
||||
self.__ob_group = 0
|
||||
|
||||
def __check_tokens_func(self, line):
|
||||
if self.__inline_info in self.__allowable:
|
||||
if self.__ob_group == self.__base_ob_count:
|
||||
@ -80,48 +84,56 @@ class OldRtf:
|
||||
self.__found_new += 1
|
||||
elif self.__token_info == 'cw<pf<par-def___':
|
||||
self.__state = 'after_pard'
|
||||
|
||||
def __before_body_func(self, line):
|
||||
if self.__token_info == 'mi<mk<body-open_':
|
||||
self.__state = 'in_body'
|
||||
self.__base_ob_count = self.__ob_group
|
||||
|
||||
def __after_pard_func(self, line):
|
||||
if line[0:2] != 'cw':
|
||||
self.__state = 'in_body'
|
||||
|
||||
def check_if_old_rtf(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
1 if file is older RTf
|
||||
0 if file is newer RTF
|
||||
True if file is older RTf
|
||||
False if file is newer RTF
|
||||
"""
|
||||
|
||||
read_obj = open(self.__file, 'r')
|
||||
line = 1
|
||||
self.__initiate_values()
|
||||
line_num = 0
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
line_num += 1
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'mi<mk<body-close':
|
||||
return 0
|
||||
self.__ob_group = 0
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_group += 1
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__ob_group -= 1
|
||||
self.__cb_count = line[-5:-1]
|
||||
self.__inline_info = line[6:16]
|
||||
if self.__state == 'after_body':
|
||||
return 0
|
||||
action = self.__action_dict.get(self.__state)
|
||||
if not action:
|
||||
sys.stderr.write('No action for state!\n')
|
||||
result = action(line)
|
||||
if result == 'new_rtf':
|
||||
return 0
|
||||
elif result == 'old_rtf':
|
||||
return 1
|
||||
self.__previous_token = line[6:16]
|
||||
return 0
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
for line in read_obj:
|
||||
line_num += 1
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'mi<mk<body-close':
|
||||
return False
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_group += 1
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__ob_group -= 1
|
||||
self.__cb_count = line[-5:-1]
|
||||
self.__inline_info = line[6:16]
|
||||
if self.__state == 'after_body':
|
||||
return False
|
||||
action = self.__action_dict.get(self.__state)
|
||||
if action is None:
|
||||
try:
|
||||
sys.stderr.write('No action for this state!\n')
|
||||
except:
|
||||
pass
|
||||
result = action(line)
|
||||
if result == 'new_rtf':
|
||||
return False
|
||||
elif result == 'old_rtf':
|
||||
if self.__run_level > 3:
|
||||
sys.stderr.write(
|
||||
'Old rtf construction %s (bracket %s, line %s)\n'
|
||||
% (self.__inline_info, str(self.__ob_group), line_num)
|
||||
)
|
||||
return True
|
||||
self.__previous_token = line[6:16]
|
||||
return False
|
||||
|
@ -11,31 +11,32 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
|
||||
class Paragraphs:
|
||||
"""
|
||||
=================
|
||||
Purpose
|
||||
=================
|
||||
Write paragraph tags for a tokenized file. (This module won't be any use to use
|
||||
to you unless you use it as part of the other modules.)
|
||||
-------------
|
||||
Method
|
||||
-------------
|
||||
RTF does not tell you when a paragraph begins. It only tells you when the
|
||||
paragraph ends.
|
||||
In order to make paragraphs out of this limited info, the parser starts in the
|
||||
body of the documents and assumes it is not in a paragraph. It looks for clues
|
||||
to begin a paragraph. Text starts a paragraph; so does an inline field or
|
||||
list-text. If an end of paragraph marker (\par) is found, then this indicates
|
||||
a blank paragraph.
|
||||
Once a paragraph is found, the state changes to 'paragraph.' In this state,
|
||||
clues are looked to for the end of a paragraph. The end of a paragraph marker
|
||||
(\par) marks the end of a paragraph. So does the end of a footnote or heading;
|
||||
a paragraph definintion; the end of a field-block; and the beginning of a
|
||||
section. (How about the end of a section or the end of a field-block?)
|
||||
=================
|
||||
Purpose
|
||||
=================
|
||||
Write paragraph tags for a tokenized file. (This module won't be any use to use
|
||||
to you unless you use it as part of the other modules.)
|
||||
-------------
|
||||
Method
|
||||
-------------
|
||||
RTF does not tell you when a paragraph begins. It only tells you when the
|
||||
paragraph ends.
|
||||
In order to make paragraphs out of this limited info, the parser starts in the
|
||||
body of the documents and assumes it is not in a paragraph. It looks for clues
|
||||
to begin a paragraph. Text starts a paragraph; so does an inline field or
|
||||
list-text. If an end of paragraph marker (\par) is found, then this indicates
|
||||
a blank paragraph.
|
||||
Once a paragraph is found, the state changes to 'paragraph.' In this state,
|
||||
clues are looked to for the end of a paragraph. The end of a paragraph marker
|
||||
(\par) marks the end of a paragraph. So does the end of a footnote or heading;
|
||||
a paragraph definition; the end of a field-block; and the beginning of a
|
||||
section. (How about the end of a section or the end of a field-block?)
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
@ -60,6 +61,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
self.__write_empty_para = write_empty_para
|
||||
self.__run_level = run_level
|
||||
self.__write_to = better_mktemp()
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
@ -77,7 +79,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
self.__paragraph_dict = {
|
||||
'cw<pf<par-end___' : self.__close_para_func, # end of paragraph
|
||||
'mi<mk<headi_-end' : self.__close_para_func, # end of header or footer
|
||||
##'cw<pf<par-def___' : self.__close_para_func, # paragraph definition
|
||||
## 'cw<pf<par-def___' : self.__close_para_func, # paragraph definition
|
||||
# 'mi<mk<fld-bk-end' : self.__close_para_func, # end of field-block
|
||||
'mi<mk<fldbk-end_' : self.__close_para_func, # end of field-block
|
||||
'mi<mk<body-close' : self.__close_para_func, # end of body
|
||||
@ -99,6 +101,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
'mi<mk<pict-start' : self.__start_para_func,
|
||||
'cw<pf<page-break' : self.__empty_pgbk_func, # page break
|
||||
}
|
||||
|
||||
def __before_body_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -112,6 +115,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
if self.__token_info == 'mi<mk<body-open_':
|
||||
self.__state = 'not_paragraph'
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __not_paragraph_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -127,6 +131,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
if action:
|
||||
action(line)
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __paragraph_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -144,6 +149,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
action(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __start_para_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -160,6 +166,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
)
|
||||
self.__write_obj.write(self.__start2_marker)
|
||||
self.__state = 'paragraph'
|
||||
|
||||
def __empty_para_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -176,6 +183,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
'mi<tg<empty_____<para\n'
|
||||
)
|
||||
self.__write_obj.write(self.__end_marker) # marker for later parsing
|
||||
|
||||
def __empty_pgbk_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -188,6 +196,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
self.__write_obj.write(
|
||||
'mi<tg<empty_____<page-break\n'
|
||||
)
|
||||
|
||||
def __close_para_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -205,6 +214,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
self.__write_obj.write(self.__end_marker) # marker for later parser
|
||||
self.__write_obj.write(line)
|
||||
self.__state = 'not_paragraph'
|
||||
|
||||
def __bogus_para__def_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -215,6 +225,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
if a \pard occurs in a paragraph, I want to ignore it. (I believe)
|
||||
"""
|
||||
self.__write_obj.write('mi<mk<bogus-pard\n')
|
||||
|
||||
def make_paragraphs(self):
|
||||
"""
|
||||
Requires:
|
||||
@ -229,20 +240,18 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
only other state is 'paragraph'.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no no matching state in module sections.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action is None:
|
||||
try:
|
||||
sys.stderr.write('no matching state in module paragraphs.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
except:
|
||||
pass
|
||||
action(line)
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "paragraphs.data")
|
||||
|
@ -11,16 +11,24 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import sys,os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
|
||||
class Preamble:
|
||||
"""
|
||||
Fix the reamaing parts of the preamble. This module does very little. It
|
||||
makes sure that no text gets put in the revision of list table. In the
|
||||
future, when I understand how to interprett he revision table and list
|
||||
future, when I understand how to interpret the revision table and list
|
||||
table, I will make these methods more functional.
|
||||
"""
|
||||
def __init__(self, file, bug_handler, platform, default_font, code_page,
|
||||
copy=None, temp_dir=None):
|
||||
def __init__(self, file,
|
||||
bug_handler,
|
||||
platform,
|
||||
default_font,
|
||||
code_page,
|
||||
copy=None,
|
||||
temp_dir=None,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
file--file to parse
|
||||
@ -44,6 +52,7 @@ class Preamble:
|
||||
self.__write_to = os.path.join(temp_dir,"info_table_info.data")
|
||||
else:
|
||||
self.__write_to = "info_table_info.data"
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
@ -62,12 +71,14 @@ class Preamble:
|
||||
'mi<mk<revtbl-beg' : self.__found_revision_table_func,
|
||||
'mi<mk<body-open_' : self.__found_body_func,
|
||||
}
|
||||
|
||||
def __default_func(self, line):
|
||||
action = self.__default_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __found_rtf_head_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -84,8 +95,10 @@ class Preamble:
|
||||
'<platform>%s\n' % (self.__default_font, self.__code_page,
|
||||
self.__platform)
|
||||
)
|
||||
|
||||
def __found_list_table_func(self, line):
|
||||
self.__state = 'list_table'
|
||||
|
||||
def __list_table_func(self, line):
|
||||
if self.__token_info == 'mi<mk<listabend_':
|
||||
self.__state = 'default'
|
||||
@ -93,8 +106,10 @@ class Preamble:
|
||||
pass
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __found_revision_table_func(self, line):
|
||||
self.__state = 'revision'
|
||||
|
||||
def __revision_table_func(self, line):
|
||||
if self.__token_info == 'mi<mk<revtbl-end':
|
||||
self.__state = 'default'
|
||||
@ -102,11 +117,14 @@ class Preamble:
|
||||
pass
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __found_body_func(self, line):
|
||||
self.__state = 'body'
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __body_func(self, line):
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def fix_preamble(self):
|
||||
"""
|
||||
Requires:
|
||||
@ -119,20 +137,15 @@ class Preamble:
|
||||
the list table.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no no matching state in module preamble_rest.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action is None:
|
||||
sys.stderr.write(
|
||||
'no matching state in module preamble_rest.py\n' + self.__state + '\n')
|
||||
action(line)
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "preamble_div.data")
|
||||
|
@ -11,43 +11,44 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
|
||||
class Sections:
|
||||
"""
|
||||
=================
|
||||
Purpose
|
||||
=================
|
||||
Write section tags for a tokenized file. (This module won't be any use to use
|
||||
to you unless you use it as part of the other modules.)
|
||||
---------------
|
||||
logic
|
||||
---------------
|
||||
The tags for the first section breaks have already been written.
|
||||
RTF stores section breaks with the \sect tag. Each time this tag is
|
||||
encountered, add one to the counter.
|
||||
When I encounter the \sectd tag, I want to collect all the appropriate tokens
|
||||
that describe the section. When I reach a \pard, I know I an stop collecting
|
||||
tokens and write the section tags.
|
||||
The exception to this method occurs when sections occur in field blocks, such
|
||||
as the index. Normally, two section break occur within the index and other
|
||||
field-blocks. (If less or more section breaks occurr, this code may not work.)
|
||||
I want the sections to occurr outside of the index. That is, the index
|
||||
should be nested inside one section tag. After the index is complete, a new
|
||||
section should begin.
|
||||
In order to write the sections outside of the field blocks, I have to store
|
||||
all of the field block as a string. When I ecounter the \sect tag, add one to
|
||||
the section counter, but store this number in a list. Likewise, store the
|
||||
information describing the section in another list.
|
||||
When I reach the end of the field block, choose the first item from the
|
||||
numbered list as the section number. Choose the first item in the description
|
||||
list as the values and attributes of the section. Enclose the field string
|
||||
between the section tags.
|
||||
Start a new section outside the field-block strings. Use the second number in
|
||||
the list; use the second item in the description list.
|
||||
CHANGE (2004-04-26) No longer write sections that occurr in field-blocks.
|
||||
Instead, ingore all section information in a field-block.
|
||||
=================
|
||||
Purpose
|
||||
=================
|
||||
Write section tags for a tokenized file. (This module won't be any use to use
|
||||
to you unless you use it as part of the other modules.)
|
||||
---------------
|
||||
logic
|
||||
---------------
|
||||
The tags for the first section breaks have already been written.
|
||||
RTF stores section breaks with the \sect tag. Each time this tag is
|
||||
encountered, add one to the counter.
|
||||
When I encounter the \sectd tag, I want to collect all the appropriate tokens
|
||||
that describe the section. When I reach a \pard, I know I an stop collecting
|
||||
tokens and write the section tags.
|
||||
The exception to this method occurs when sections occur in field blocks, such
|
||||
as the index. Normally, two section break occur within the index and other
|
||||
field-blocks. (If less or more section breaks occurr, this code may not work.)
|
||||
I want the sections to occur outside of the index. That is, the index
|
||||
should be nested inside one section tag. After the index is complete, a new
|
||||
section should begin.
|
||||
In order to write the sections outside of the field blocks, I have to store
|
||||
all of the field block as a string. When I ecounter the \sect tag, add one to
|
||||
the section counter, but store this number in a list. Likewise, store the
|
||||
information describing the section in another list.
|
||||
When I reach the end of the field block, choose the first item from the
|
||||
numbered list as the section number. Choose the first item in the description
|
||||
list as the values and attributes of the section. Enclose the field string
|
||||
between the section tags.
|
||||
Start a new section outside the field-block strings. Use the second number in
|
||||
the list; use the second item in the description list.
|
||||
CHANGE (2004-04-26) No longer write sections that occurr in field-blocks.
|
||||
Instead, ingore all section information in a field-block.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
|
Loading…
x
Reference in New Issue
Block a user