RTF Input: Handle old RTF files that have commands without braces. Fixes #994133 (Private bug)

This commit is contained in:
Kovid Goyal 2012-05-13 07:55:09 +05:30
commit fb94b02be3
11 changed files with 14057 additions and 381 deletions

View File

@ -372,8 +372,8 @@ class ParseRtf:
old_rtf = old_rtf_obj.check_if_old_rtf() old_rtf = old_rtf_obj.check_if_old_rtf()
if old_rtf: if old_rtf:
if self.__run_level > 5: if self.__run_level > 5:
msg = 'Older RTF\n' msg = 'Older RTF\n' \
msg += 'self.__run_level is "%s"\n' % self.__run_level 'self.__run_level is "%s"\n' % self.__run_level
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
if self.__run_level > 1: if self.__run_level > 1:
sys.stderr.write('File could be older RTF...\n') sys.stderr.write('File could be older RTF...\n')
@ -381,7 +381,7 @@ class ParseRtf:
if self.__run_level > 1: if self.__run_level > 1:
sys.stderr.write( sys.stderr.write(
'File also has newer RTF.\n' 'File also has newer RTF.\n'
'Will do the best to convert.\n' 'Will do the best to convert...\n'
) )
add_brackets_obj = add_brackets.AddBrackets( add_brackets_obj = add_brackets.AddBrackets(
in_file = self.__temp_file, in_file = self.__temp_file,

View File

@ -20,6 +20,9 @@ class AddBrackets:
""" """
Add brackets for old RTF. Add brackets for old RTF.
Logic: Logic:
When control words without their own brackets are encountered
and in the list of allowed words, this will add brackets
to facilitate the treatment of the file
""" """
def __init__(self, in_file, def __init__(self, in_file,
bug_handler, bug_handler,
@ -41,53 +44,56 @@ class AddBrackets:
self.__copy = copy self.__copy = copy
self.__write_to = better_mktemp() self.__write_to = better_mktemp()
self.__run_level = run_level self.__run_level = run_level
def __initiate_values(self):
"""
"""
self.__state_dict = { self.__state_dict = {
'before_body' : self.__before_body_func, 'before_body' : self.__before_body_func,
'in_body' : self.__in_body_func, 'in_body' : self.__in_body_func,
'after_control_word' : self.__after_control_word_func, 'after_control_word' : self.__after_control_word_func,
'in_ignore' : self.__ignore_func, 'in_ignore' : self.__ignore_func,
} }
self.__accept = [
'cw<ci<bold______' ,
'cw<ci<annotation' ,
'cw<ci<blue______' ,
# 'cw<ci<bold______' ,
'cw<ci<caps______' ,
'cw<ci<char-style' ,
'cw<ci<dbl-strike' ,
'cw<ci<emboss____' ,
'cw<ci<engrave___' ,
'cw<ci<font-color' ,
'cw<ci<font-down_' ,
'cw<ci<font-size_' ,
'cw<ci<font-style' ,
'cw<ci<font-up___' ,
'cw<ci<footnot-mk' ,
'cw<ci<green_____' ,
'cw<ci<hidden____' ,
'cw<ci<italics___' ,
'cw<ci<outline___' ,
'cw<ci<red_______' ,
'cw<ci<shadow____' ,
'cw<ci<small-caps' ,
'cw<ci<strike-thr' ,
'cw<ci<subscript_' ,
'cw<ci<superscrip' ,
'cw<ci<underlined' ,
# 'cw<ul<underlined' ,
]
def __initiate_values(self):
"""
Init temp values
"""
self.__state = 'before_body' self.__state = 'before_body'
self.__inline = {} self.__inline = {}
self.__temp_group = [] self.__temp_group = []
self.__open_bracket = 0 self.__open_bracket = False
self.__found_brackets = 0 self.__found_brackets = False
self.__accept = [
'cw<ci<bold______',
'cw<ci<annotation' ,
'cw<ci<blue______' ,
'cw<ci<bold______' ,
'cw<ci<caps______' ,
'cw<ci<char-style' ,
'cw<ci<dbl-strike' ,
'cw<ci<emboss____' ,
'cw<ci<engrave___' ,
'cw<ci<font-color' ,
'cw<ci<font-down_' ,
'cw<ci<font-size_' ,
'cw<ci<font-style' ,
'cw<ci<font-up___',
'cw<ci<footnot-mk',
'cw<ci<green_____' ,
'cw<ci<hidden____',
'cw<ci<italics___' ,
'cw<ci<outline___',
'cw<ci<red_______' ,
'cw<ci<shadow____',
'cw<ci<small-caps' ,
'cw<ci<strike-thr',
'cw<ci<subscript_' ,
'cw<ci<superscrip',
'cw<ci<underlined' ,
# 'cw<ul<underlined' ,
]
def __before_body_func(self, line): def __before_body_func(self, line):
""" """
If we are before the body, we are not interested in changing anything
""" """
if self.__token_info == 'mi<mk<body-open_': if self.__token_info == 'mi<mk<body-open_':
self.__state = 'in_body' self.__state = 'in_body'
@ -95,6 +101,14 @@ class AddBrackets:
def __in_body_func(self, line): def __in_body_func(self, line):
""" """
Select what action to take in body:
1-At the end of the file close the bracket if a bracket was opened
This happens if there is a change
2-If an open bracket is found the code inside is ignored
(written without modifications)
3-If an accepted control word is found put the line
in a buffer then change state to after cw
4-Else simply write the line
""" """
if line == 'cb<nu<clos-brack<0001\n' and self.__open_bracket: if line == 'cb<nu<clos-brack<0001\n' and self.__open_bracket:
self.__write_obj.write( self.__write_obj.write(
@ -102,7 +116,7 @@ class AddBrackets:
) )
self.__write_obj.write(line) self.__write_obj.write(line)
elif self.__token_info == 'ob<nu<open-brack': elif self.__token_info == 'ob<nu<open-brack':
self.__found_brackets = 1 self.__found_brackets = True
self.__state = 'in_ignore' self.__state = 'in_ignore'
self.__ignore_count = self.__ob_count self.__ignore_count = self.__ob_count
self.__write_obj.write(line) self.__write_obj.write(line)
@ -114,6 +128,10 @@ class AddBrackets:
def __after_control_word_func(self, line): def __after_control_word_func(self, line):
""" """
After a cw either add next allowed cw to temporary list or
change group and write it.
If the token leading to an exit is an open bracket go to
ignore otherwise goto in body
""" """
if self.__token_info in self.__accept: if self.__token_info in self.__accept:
self.__temp_group.append(line) self.__temp_group.append(line)
@ -129,75 +147,77 @@ class AddBrackets:
def __write_group(self): def __write_group(self):
""" """
Write a temporary group after accepted control words end
But this is mostly useless in my opinion as there is no list of rejected cw
This may be a way to implement future old rtf processing for cw
Utility: open a group to just put brackets but why be so complicated?
Scheme: open brackets, write cw then go to body and back with cw after
""" """
if self.__open_bracket: if self.__open_bracket:
self.__write_obj.write( self.__write_obj.write(
'cb<nu<clos-brack<0003\n' 'cb<nu<clos-brack<0003\n'
) )
self.__open_bracket = 0 self.__open_bracket = False
inline_string = ''
the_keys = self.__inline.keys() inline_string = ''.join(['%s<nu<%s\n' % (k, v) \
for the_key in the_keys: for k, v in self.__inline.iteritems() \
value = self.__inline[the_key] if v != 'false'])
if value != 'false':
inline_string += '%s<nu<%s\n' % (the_key, value)
if inline_string: if inline_string:
self.__write_obj.write('ob<nu<open-brack<0003\n') self.__write_obj.write('ob<nu<open-brack<0003\n'
self.__write_obj.write(inline_string) '%s' % inline_string)
self.__open_bracket = 1 self.__open_bracket = True
self.__temp_group = [] self.__temp_group = []
def __change_permanent_group(self): def __change_permanent_group(self):
""" """
use temp group to change permanent group Use temp group to change permanent group
If the control word is not accepted remove it
What is the interest as it is built to accept only accepted cw
in __after_control_word_func?
""" """
for line in self.__temp_group: self.__inline = {line[:16] : line[20:-1]\
token_info = line[:16] for line in self.__temp_group\
if token_info in self.__accept: # Is this really necessary?
att = line[20:-1] if line[:16] in self.__accept}
self.__inline[token_info] = att
def __ignore_func(self, line): def __ignore_func(self, line):
""" """
Don't add any brackets while inside of brackets RTF has already Just copy data inside of RTF brackets already here.
added.
""" """
self.__write_obj.write(line) self.__write_obj.write(line)
if self.__token_info == 'cb<nu<clos-brack'and\ if self.__token_info == 'cb<nu<clos-brack'\
self.__cb_count == self.__ignore_count: and self.__cb_count == self.__ignore_count:
self.__state = 'in_body' self.__state = 'in_body'
def __check_brackets(self, in_file): def __check_brackets(self, in_file):
self.__check_brack_obj = check_brackets.CheckBrackets\ """
Return True if brackets match
"""
check_brack_obj = check_brackets.CheckBrackets\
(file = in_file) (file = in_file)
good_br = self.__check_brack_obj.check_brackets()[0] return check_brack_obj.check_brackets()[0]
if not good_br:
return 1
def add_brackets(self): def add_brackets(self):
""" """
""" """
self.__initiate_values() self.__initiate_values()
read_obj = open(self.__file, 'r') with open(self.__file, 'r') as read_obj:
self.__write_obj = open(self.__write_to, 'w') with open(self.__write_to, 'w') as self.__write_obj:
line_to_read = 1 for line in read_obj:
while line_to_read: self.__token_info = line[:16]
line_to_read = read_obj.readline() if self.__token_info == 'ob<nu<open-brack':
line = line_to_read self.__ob_count = line[-5:-1]
self.__token_info = line[:16] if self.__token_info == 'cb<nu<clos-brack':
if self.__token_info == 'ob<nu<open-brack': self.__cb_count = line[-5:-1]
self.__ob_count = line[-5:-1] action = self.__state_dict.get(self.__state)
if self.__token_info == 'cb<nu<clos-brack': if action is None:
self.__cb_count = line[-5:-1] sys.stderr.write(
action = self.__state_dict.get(self.__state) 'No matching state in module add_brackets.py\n'
if action == None: '%s\n' % self.__state)
sys.stderr.write('No matching state in module add_brackets.py\n') action(line)
sys.stderr.write(self.__state + '\n') #Check bad brackets
action(line) if self.__check_brackets(self.__write_to):
read_obj.close()
self.__write_obj.close()
bad_brackets = self.__check_brackets(self.__write_to)
if not bad_brackets:
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "add_brackets.data") copy_obj.copy_file(self.__write_to, "add_brackets.data")

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,5 @@
import os, sys import os, sys
from codecs import EncodedFile
from calibre.ebooks.rtf2xml import copy, check_encoding from calibre.ebooks.rtf2xml import copy, check_encoding
from calibre.ptempfile import better_mktemp from calibre.ptempfile import better_mktemp
@ -41,6 +42,7 @@ class ConvertToTags:
self.__run_level = run_level self.__run_level = run_level
self.__write_to = better_mktemp() self.__write_to = better_mktemp()
self.__convert_utf = False self.__convert_utf = False
self.__bad_encoding = False
def __initiate_values(self): def __initiate_values(self):
""" """
@ -213,13 +215,14 @@ class ConvertToTags:
if not check_encoding_obj.check_encoding(self.__file, verbose=False): if not check_encoding_obj.check_encoding(self.__file, verbose=False):
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>') self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
elif not check_encoding_obj.check_encoding(self.__file, self.__encoding): elif not check_encoding_obj.check_encoding(self.__file, self.__encoding, verbose=False):
self.__write_obj.write('<?xml version="1.0" encoding="UTF-8" ?>') self.__write_obj.write('<?xml version="1.0" encoding="UTF-8" ?>')
self.__convert_utf = True self.__convert_utf = True
else: else:
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>') self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and' sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
' hope for the best') ' hope for the best')
self.__bad_encoding = True
self.__new_line = 0 self.__new_line = 0
self.__write_new_line() self.__write_new_line()
if self.__no_dtd: if self.__no_dtd:
@ -247,7 +250,7 @@ class ConvertToTags:
the appropriate function. the appropriate function.
The functions that are called: The functions that are called:
a text function for text a text function for text
an open funciton for open tags an open function for open tags
an open with attribute function for tags with attributes an open with attribute function for tags with attributes
an empty with attribute function for tags that are empty but have an empty with attribute function for tags that are empty but have
attribtes. attribtes.
@ -263,20 +266,19 @@ class ConvertToTags:
action = self.__state_dict.get(self.__token_info) action = self.__state_dict.get(self.__token_info)
if action is not None: if action is not None:
action(line) action(line)
self.__write_obj.close() #convert all encodings to UTF8 or ASCII to avoid unsupported encodings in lxml
#convert all encodings to UTF8 to avoid unsupported encodings in lxml if self.__convert_utf or self.__bad_encoding:
if self.__convert_utf:
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
copy_obj.rename(self.__write_to, self.__file) copy_obj.rename(self.__write_to, self.__file)
file_encoding = "utf-8"
if self.__bad_encoding:
file_encoding = "us-ascii"
with open(self.__file, 'r') as read_obj: with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as write_obj: with open(self.__write_to, 'w') as write_obj:
file = read_obj.read() write_objenc = EncodedFile(write_obj, self.__encoding,
try: file_encoding, 'replace')
file = file.decode(self.__encoding) for line in read_obj:
write_obj.write(file.encode('utf-8')) write_objenc.write(line)
except:
sys.stderr.write('Conversion to UTF-8 is not possible,'
' encoding should be very carefully checked')
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "convert_to_tags.data") copy_obj.copy_file(self.__write_to, "convert_to_tags.data")

View File

@ -11,6 +11,7 @@
# # # #
######################################################################### #########################################################################
import sys, os import sys, os
from calibre.ebooks.rtf2xml import copy from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp from calibre.ptempfile import better_mktemp
@ -31,29 +32,29 @@ class Header:
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
self.__copy = copy self.__copy = copy
self.__write_to = better_mktemp() self.__write_to = better_mktemp()
self.__found_a_header = 0 self.__found_a_header = False
def __in_header_func(self, line): def __in_header_func(self, line):
""" """
Handle all tokens that are part of header Handle all tokens that are part of header
""" """
if self.__cb_count == self.__header_bracket_count: if self.__cb_count == self.__header_bracket_count:
self.__in_header = 0 self.__in_header = False
self.__write_obj.write(line) self.__write_obj.write(line)
self.__write_to_head_obj.write( self.__write_to_head_obj.write(
'mi<mk<head___clo\n') 'mi<mk<head___clo\n' \
self.__write_to_head_obj.write( 'mi<tg<close_____<header-or-footer\n' \
'mi<tg<close_____<header-or-footer\n')
self.__write_to_head_obj.write(
'mi<mk<header-clo\n') 'mi<mk<header-clo\n')
else: else:
self.__write_to_head_obj.write(line) self.__write_to_head_obj.write(line)
def __found_header(self, line): def __found_header(self, line):
""" """
Found a header Found a header
""" """
# but this could be header or footer # but this could be header or footer
self.__found_a_header = 1 self.__found_a_header = True
self.__in_header = 1 self.__in_header = True
self.__header_count += 1 self.__header_count += 1
# temporarily set this to zero so I can enter loop # temporarily set this to zero so I can enter loop
self.__cb_count = 0 self.__cb_count = 0
@ -69,18 +70,23 @@ class Header:
'mi<tg<open-att__<header-or-footer<type>%s\n' % (type) 'mi<tg<open-att__<header-or-footer<type>%s\n' % (type)
) )
else: else:
sys.stderr.write('module is header\n') sys.stderr.write(
sys.stderr.write('method is __found_header\n') 'module is header\n' \
sys.stderr.write('no dict entry\n') 'method is __found_header\n' \
sys.stderr.write('line is %s' % line) 'no dict entry\n' \
'line is %s' % line)
self.__write_to_head_obj.write( self.__write_to_head_obj.write(
'mi<tg<open-att__<header-or-footer<type>none\n' 'mi<tg<open-att__<header-or-footer<type>none\n'
) )
def __default_sep(self, line): def __default_sep(self, line):
"""Handle all tokens that are not header tokens""" """
Handle all tokens that are not header tokens
"""
if self.__token_info[3:5] == 'hf': if self.__token_info[3:5] == 'hf':
self.__found_header(line) self.__found_header(line)
self.__write_obj.write(line) self.__write_obj.write(line)
def __initiate_sep_values(self): def __initiate_sep_values(self):
""" """
initiate counters for separate_footnotes method. initiate counters for separate_footnotes method.
@ -89,7 +95,7 @@ class Header:
self.__ob_count = 0 self.__ob_count = 0
self.__cb_count = 0 self.__cb_count = 0
self.__header_bracket_count = 0 self.__header_bracket_count = 0
self.__in_header = 0 self.__in_header = False
self.__header_count = 0 self.__header_count = 0
self.__head_dict = { self.__head_dict = {
'head-left_' : ('header-left'), 'head-left_' : ('header-left'),
@ -101,6 +107,7 @@ class Header:
'header____' : ('header' ), 'header____' : ('header' ),
'footer____' : ('footer' ), 'footer____' : ('footer' ),
} }
def separate_headers(self): def separate_headers(self):
""" """
Separate all the footnotes in an RTF file and put them at the bottom, Separate all the footnotes in an RTF file and put them at the bottom,
@ -110,53 +117,47 @@ class Header:
bottom of the main file. bottom of the main file.
""" """
self.__initiate_sep_values() self.__initiate_sep_values()
read_obj = open(self.__file)
self.__write_obj = open(self.__write_to, 'w')
self.__header_holder = better_mktemp() self.__header_holder = better_mktemp()
self.__write_to_head_obj = open(self.__header_holder, 'w') with open(self.__file) as read_obj:
line_to_read = 1 with open(self.__write_to, 'w') as self.__write_obj:
while line_to_read: with open(self.__header_holder, 'w') as self.__write_to_head_obj:
line_to_read = read_obj.readline() for line in read_obj:
line = line_to_read self.__token_info = line[:16]
self.__token_info = line[:16] # keep track of opening and closing brackets
# keep track of opening and closing brackets if self.__token_info == 'ob<nu<open-brack':
if self.__token_info == 'ob<nu<open-brack': self.__ob_count = line[-5:-1]
self.__ob_count = line[-5:-1] if self.__token_info == 'cb<nu<clos-brack':
if self.__token_info == 'cb<nu<clos-brack': self.__cb_count = line[-5:-1]
self.__cb_count = line[-5:-1] # In the middle of footnote text
# In the middle of footnote text if self.__in_header:
if self.__in_header: self.__in_header_func(line)
self.__in_header_func(line) # not in the middle of footnote text
# not in the middle of footnote text else:
else: self.__default_sep(line)
self.__default_sep(line)
self.__write_obj.close() with open(self.__header_holder, 'r') as read_obj:
read_obj.close() with open(self.__write_to, 'a') as write_obj:
self.__write_to_head_obj.close() write_obj.write(
read_obj = open(self.__header_holder, 'r') 'mi<mk<header-beg\n')
write_obj = open(self.__write_to, 'a') for line in read_obj:
write_obj.write( write_obj.write(line)
'mi<mk<header-beg\n') write_obj.write(
line = 1 'mi<mk<header-end\n')
while line:
line = read_obj.readline()
write_obj.write(line)
write_obj.write(
'mi<mk<header-end\n')
read_obj.close()
write_obj.close()
os.remove(self.__header_holder) os.remove(self.__header_holder)
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "header_separate.info") copy_obj.copy_file(self.__write_to, "header_separate.data")
copy_obj.rename(self.__write_to, self.__file) copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to) os.remove(self.__write_to)
def update_info(self, file, copy): def update_info(self, file, copy):
""" """
Unused method Unused method
""" """
self.__file = file self.__file = file
self.__copy = copy self.__copy = copy
def __get_head_body_func(self, line): def __get_head_body_func(self, line):
""" """
Process lines in main body and look for beginning of headers. Process lines in main body and look for beginning of headers.
@ -166,6 +167,7 @@ class Header:
self.__state = 'head' self.__state = 'head'
else: else:
self.__write_obj.write(line) self.__write_obj.write(line)
def __get_head_head_func(self, line): def __get_head_head_func(self, line):
""" """
Copy headers and footers from bottom of file to a separate, temporary file. Copy headers and footers from bottom of file to a separate, temporary file.
@ -174,6 +176,7 @@ class Header:
self.__state = 'body' self.__state = 'body'
else: else:
self.__write_to_head_obj.write(line) self.__write_to_head_obj.write(line)
def __get_headers(self): def __get_headers(self):
""" """
Private method to remove footnotes from main file. Read one line from Private method to remove footnotes from main file. Read one line from
@ -182,21 +185,16 @@ class Header:
These two functions do the work of separating the footnotes form the These two functions do the work of separating the footnotes form the
body. body.
""" """
read_obj = open(self.__file) with open(self.__file) as read_obj:
self.__write_obj = open(self.__write_to, 'w') with open(self.__write_to, 'w') as self.__write_obj:
# self.__write_to = "footnote_info.data" with open(self.__header_holder, 'w') as self.__write_to_head_obj:
self.__write_to_head_obj = open(self.__header_holder, 'w') for line in read_obj:
line = 1 self.__token_info = line[:16]
while line: if self.__state == 'body':
line = read_obj.readline() self.__get_head_body_func(line)
self.__token_info = line[:16] elif self.__state == 'head':
if self.__state == 'body': self.__get_head_head_func(line)
self.__get_head_body_func(line)
elif self.__state == 'head':
self.__get_head_head_func(line)
read_obj.close()
self.__write_obj.close()
self.__write_to_head_obj.close()
def __get_head_from_temp(self, num): def __get_head_from_temp(self, num):
""" """
Private method for joining headers and footers to body. This method Private method for joining headers and footers to body. This method
@ -205,18 +203,17 @@ class Header:
returns them as a string. returns them as a string.
""" """
look_for = 'mi<mk<header-ope<' + num + '\n' look_for = 'mi<mk<header-ope<' + num + '\n'
found_head = 0 found_head = False
string_to_return = '' string_to_return = ''
line = 1 for line in self.__read_from_head_obj:
while line:
line = self.__read_from_head_obj.readline()
if found_head: if found_head:
if line == 'mi<mk<header-clo\n': if line == 'mi<mk<header-clo\n':
return string_to_return return string_to_return
string_to_return = string_to_return + line string_to_return += line
else: else:
if line == look_for: if line == look_for:
found_head = 1 found_head = True
def __join_from_temp(self): def __join_from_temp(self):
""" """
Private method for rejoining footnotes to body. Read from the Private method for rejoining footnotes to body. Read from the
@ -227,15 +224,13 @@ class Header:
If no footnote marker is found, simply print out the token (line). If no footnote marker is found, simply print out the token (line).
""" """
self.__read_from_head_obj = open(self.__header_holder, 'r') self.__read_from_head_obj = open(self.__header_holder, 'r')
read_obj = open(self.__write_to, 'r')
self.__write_obj = open(self.__write_to2, 'w') self.__write_obj = open(self.__write_to2, 'w')
line = 1 with open(self.__write_to, 'r') as read_obj:
while line: for line in read_obj:
line = read_obj.readline() if line[:16] == 'mi<mk<header-ind':
if line[:16] == 'mi<mk<header-ind': line = self.__get_head_from_temp(line[17:-1])
line = self.__get_head_from_temp(line[17:-1]) self.__write_obj.write(line)
self.__write_obj.write(line)
read_obj.close()
def join_headers(self): def join_headers(self):
""" """
Join the footnotes from the bottom of the file and put them in their Join the footnotes from the bottom of the file and put them in their

View File

@ -181,7 +181,7 @@ class Hex2Utf8:
self.__dingbats_dict.update(dingbats_base_dict) self.__dingbats_dict.update(dingbats_base_dict)
self.__dingbats_dict.update(ms_dingbats_dict) self.__dingbats_dict.update(ms_dingbats_dict)
# load dictionary for caps, and make a string for the replacement # load dictionary for caps, and make a string for the replacement
self.__caps_uni_dict = char_map_obj.get_char_map(map='caps_uni') self.__caps_uni_dict = char_map_obj.get_char_map(map = 'caps_uni')
# # print self.__caps_uni_dict # # print self.__caps_uni_dict
# don't think I'll need this # don't think I'll need this
##keys = self.__caps_uni_dict.keys() ##keys = self.__caps_uni_dict.keys()

View File

@ -11,14 +11,18 @@
# # # #
######################################################################### #########################################################################
import sys import sys
"""
"""
class OldRtf: class OldRtf:
""" """
Check to see if the RTF is an older version Check to see if the RTF is an older version
Logic: Logic:
If allowable control word/properties happen in text without being enclosed
in brackets the file will be considered old rtf
""" """
def __init__(self, in_file, bug_handler, run_level ): def __init__(self, in_file,
bug_handler,
run_level,
):
""" """
Required: Required:
'file'--file to parse 'file'--file to parse
@ -32,46 +36,46 @@ class OldRtf:
""" """
self.__file = in_file self.__file = in_file
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
self.__initiate_values() self.__run_level = run_level
self.__ob_group = 0
def __initiate_values(self):
self.__previous_token = ''
self.__new_found = 0
self.__allowable = [ self.__allowable = [
'annotation' , 'annotation' ,
'blue______' , 'blue______' ,
'bold______', 'bold______',
'caps______', 'caps______',
'char-style' , 'char-style' ,
'dbl-strike' , 'dbl-strike' ,
'emboss____', 'emboss____',
'engrave___' , 'engrave___' ,
'font-color', 'font-color',
'font-down_' , 'font-down_' ,
'font-size_', 'font-size_',
'font-style', 'font-style',
'font-up___', 'font-up___',
'footnot-mk' , 'footnot-mk' ,
'green_____' , 'green_____' ,
'hidden____', 'hidden____',
'italics___', 'italics___',
'outline___', 'outline___',
'red_______', 'red_______',
'shadow____' , 'shadow____' ,
'small-caps', 'small-caps',
'strike-thr', 'strike-thr',
'subscript_', 'subscript_',
'superscrip' , 'superscrip' ,
'underlined' , 'underlined' ,
] ]
self.__state = 'before_body'
self.__action_dict = { self.__action_dict = {
'before_body' : self.__before_body_func, 'before_body' : self.__before_body_func,
'in_body' : self.__check_tokens_func, 'in_body' : self.__check_tokens_func,
'after_pard' : self.__after_pard_func, 'after_pard' : self.__after_pard_func,
} }
self.__is_old = 0
def __initiate_values(self):
self.__previous_token = ''
self.__state = 'before_body'
self.__found_new = 0 self.__found_new = 0
self.__ob_group = 0
def __check_tokens_func(self, line): def __check_tokens_func(self, line):
if self.__inline_info in self.__allowable: if self.__inline_info in self.__allowable:
if self.__ob_group == self.__base_ob_count: if self.__ob_group == self.__base_ob_count:
@ -80,48 +84,56 @@ class OldRtf:
self.__found_new += 1 self.__found_new += 1
elif self.__token_info == 'cw<pf<par-def___': elif self.__token_info == 'cw<pf<par-def___':
self.__state = 'after_pard' self.__state = 'after_pard'
def __before_body_func(self, line): def __before_body_func(self, line):
if self.__token_info == 'mi<mk<body-open_': if self.__token_info == 'mi<mk<body-open_':
self.__state = 'in_body' self.__state = 'in_body'
self.__base_ob_count = self.__ob_group self.__base_ob_count = self.__ob_group
def __after_pard_func(self, line): def __after_pard_func(self, line):
if line[0:2] != 'cw': if line[0:2] != 'cw':
self.__state = 'in_body' self.__state = 'in_body'
def check_if_old_rtf(self): def check_if_old_rtf(self):
""" """
Requires: Requires:
nothing nothing
Returns: Returns:
1 if file is older RTf True if file is older RTf
0 if file is newer RTF False if file is newer RTF
""" """
self.__initiate_values()
read_obj = open(self.__file, 'r')
line = 1
line_num = 0 line_num = 0
while line: with open(self.__file, 'r') as read_obj:
line = read_obj.readline() for line in read_obj:
line_num += 1 line_num += 1
self.__token_info = line[:16] self.__token_info = line[:16]
if self.__token_info == 'mi<mk<body-close': if self.__token_info == 'mi<mk<body-close':
return 0 return False
self.__ob_group = 0 if self.__token_info == 'ob<nu<open-brack':
if self.__token_info == 'ob<nu<open-brack': self.__ob_group += 1
self.__ob_group += 1 self.__ob_count = line[-5:-1]
self.__ob_count = line[-5:-1] if self.__token_info == 'cb<nu<clos-brack':
if self.__token_info == 'cb<nu<clos-brack': self.__ob_group -= 1
self.__ob_group -= 1 self.__cb_count = line[-5:-1]
self.__cb_count = line[-5:-1] self.__inline_info = line[6:16]
self.__inline_info = line[6:16] if self.__state == 'after_body':
if self.__state == 'after_body': return False
return 0 action = self.__action_dict.get(self.__state)
action = self.__action_dict.get(self.__state) if action is None:
if not action: try:
sys.stderr.write('No action for state!\n') sys.stderr.write('No action for this state!\n')
result = action(line) except:
if result == 'new_rtf': pass
return 0 result = action(line)
elif result == 'old_rtf': if result == 'new_rtf':
return 1 return False
self.__previous_token = line[6:16] elif result == 'old_rtf':
return 0 if self.__run_level > 3:
sys.stderr.write(
'Old rtf construction %s (bracket %s, line %s)\n'
% (self.__inline_info, str(self.__ob_group), line_num)
)
return True
self.__previous_token = line[6:16]
return False

View File

@ -10,7 +10,9 @@
# # # #
# # # #
######################################################################### #########################################################################
import sys, os, codecs import sys, os
# , codecs
class Output: class Output:
""" """
Output file Output file
@ -19,7 +21,8 @@ class Output:
file, file,
orig_file, orig_file,
output_dir = None, output_dir = None,
out_file = None out_file = None,
no_ask = True
): ):
""" """
Required: Required:
@ -33,8 +36,9 @@ class Output:
self.__file = file self.__file = file
self.__orig_file = orig_file self.__orig_file = orig_file
self.__output_dir = output_dir self.__output_dir = output_dir
self.__no_ask = 1 self.__no_ask = no_ask
self.__out_file = out_file self.__out_file = out_file
def output(self): def output(self):
""" """
Required: Required:
@ -45,13 +49,14 @@ class Output:
output the line to the screen if no output file given. Otherwise, output to output the line to the screen if no output file given. Otherwise, output to
the file. the file.
""" """
# self.__output_xml(self.__file, self.__out_file)
if self.__output_dir: if self.__output_dir:
self.__output_to_dir_func() self.__output_to_dir_func()
elif self.__out_file: elif self.__out_file:
self.__output_xml(self.__file, self.__out_file) self.__output_to_file_func()
# self.__output_xml(self.__file, self.__out_file)
else: else:
self.__output_to_standard_func() self.__output_to_standard_func()
def __output_to_dir_func(self): def __output_to_dir_func(self):
""" """
Requires: Requires:
@ -64,32 +69,25 @@ class Output:
""" """
base_name = os.path.basename(self.__orig_file) base_name = os.path.basename(self.__orig_file)
base_name, ext = os.path.splitext(base_name) base_name, ext = os.path.splitext(base_name)
output_file = '%s.xml' % base_name output_file = os.path.join(self.__output_dir, '%s.xml' % base_name)
output_file = os.path.join(self.__output_dir, output_file)
# change if user wants to output to a specific file # change if user wants to output to a specific file
if self.__out_file: if self.__out_file:
output_file = os.path.join(self.__output_dir, self.__out_file) output_file = os.path.join(self.__output_dir, self.__out_file)
user_response = 'o' user_response = 'o'
if os.path.isfile(output_file): if os.path.isfile(output_file) and not self.__no_ask:
if self.__no_ask: msg = 'Do you want to overwrite %s?\n' % output_file
user_response = 'o' msg += ('Type "o" to overwrite.\n'
else: 'Type any other key to print to standard output.\n')
msg = 'Do you want to over-write %s?\n' % output_file sys.stderr.write(msg)
msg += 'Type "o" to over-write.\n' user_response = raw_input()
msg += 'Type any other key to print to standard output.\n'
sys.stderr.write(msg)
user_response = raw_input()
if user_response == 'o': if user_response == 'o':
read_obj = open(self.__file, 'r') with open(self.__file, 'r') as read_obj:
write_obj = open(output_file, 'w') with open(self.output_file, 'w') as write_obj:
line = 1 for line in read_obj:
while line: write_obj.write(line)
line = read_obj.readline()
write_obj.write(line)
read_obj.close()
write_obj.close()
else: else:
self.__output_to_standard_func() self.__output_to_standard_func()
def __output_to_file_func(self): def __output_to_file_func(self):
""" """
Required: Required:
@ -99,14 +97,11 @@ class Output:
Logic: Logic:
read one line at a time. Output to standard read one line at a time. Output to standard
""" """
read_obj = open(self.__file, 'r') with open(self.__file, 'r') as read_obj:
write_obj = open(self.__out_file, 'w') with open(self.__out_file, 'w') as write_obj:
line = 1 for line in read_obj:
while line: write_obj.write(line)
line = read_obj.readline()
write_obj.write(line)
read_obj.close()
write_obj.close()
def __output_to_standard_func(self): def __output_to_standard_func(self):
""" """
Required: Required:
@ -116,26 +111,24 @@ class Output:
Logic: Logic:
read one line at a time. Output to standard read one line at a time. Output to standard
""" """
read_obj = open(self.__file, 'r') with open(self.__file, 'r') as read_obj:
line = 1 for line in read_obj:
while line: sys.stdout.write(line)
line = read_obj.readline()
sys.stdout.write(line) # def __output_xml(self, in_file, out_file):
read_obj.close() # """
def __output_xml(self, in_file, out_file): # output the ill-formed xml file
""" # """
output the ill-formed xml file # (utf8_encode, utf8_decode, utf8_reader, utf8_writer) = codecs.lookup("utf-8")
""" # write_obj = utf8_writer(open(out_file, 'w'))
(utf8_encode, utf8_decode, utf8_reader, utf8_writer) = codecs.lookup("utf-8") # write_obj = open(out_file, 'w')
write_obj = utf8_writer(open(out_file, 'w')) # read_obj = utf8_writer(open(in_file, 'r'))
write_obj = open(out_file, 'w') # read_obj = open(in_file, 'r')
read_obj = utf8_writer(open(in_file, 'r')) # line = 1
read_obj = open(in_file, 'r') # while line:
line = 1 # line = read_obj.readline()
while line: # if isinstance(line, type(u"")):
line = read_obj.readline() # line = line.encode("utf-8")
if isinstance(line, type(u"")): # write_obj.write(line)
line = line.encode("utf-8") # read_obj.close()
write_obj.write(line) # write_obj.close()
read_obj.close()
write_obj.close()

View File

@ -11,31 +11,32 @@
# # # #
######################################################################### #########################################################################
import sys, os import sys, os
from calibre.ebooks.rtf2xml import copy from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp from calibre.ptempfile import better_mktemp
class Paragraphs: class Paragraphs:
""" """
================= =================
Purpose Purpose
================= =================
Write paragraph tags for a tokenized file. (This module won't be any use to use Write paragraph tags for a tokenized file. (This module won't be any use to use
to you unless you use it as part of the other modules.) to you unless you use it as part of the other modules.)
------------- -------------
Method Method
------------- -------------
RTF does not tell you when a paragraph begins. It only tells you when the RTF does not tell you when a paragraph begins. It only tells you when the
paragraph ends. paragraph ends.
In order to make paragraphs out of this limited info, the parser starts in the In order to make paragraphs out of this limited info, the parser starts in the
body of the documents and assumes it is not in a paragraph. It looks for clues body of the documents and assumes it is not in a paragraph. It looks for clues
to begin a paragraph. Text starts a paragraph; so does an inline field or to begin a paragraph. Text starts a paragraph; so does an inline field or
list-text. If an end of paragraph marker (\par) is found, then this indicates list-text. If an end of paragraph marker (\par) is found, then this indicates
a blank paragraph. a blank paragraph.
Once a paragraph is found, the state changes to 'paragraph.' In this state, Once a paragraph is found, the state changes to 'paragraph.' In this state,
clues are looked to for the end of a paragraph. The end of a paragraph marker clues are looked to for the end of a paragraph. The end of a paragraph marker
(\par) marks the end of a paragraph. So does the end of a footnote or heading; (\par) marks the end of a paragraph. So does the end of a footnote or heading;
a paragraph definintion; the end of a field-block; and the beginning of a a paragraph definition; the end of a field-block; and the beginning of a
section. (How about the end of a section or the end of a field-block?) section. (How about the end of a section or the end of a field-block?)
""" """
def __init__(self, def __init__(self,
in_file, in_file,
@ -60,6 +61,7 @@ section. (How about the end of a section or the end of a field-block?)
self.__write_empty_para = write_empty_para self.__write_empty_para = write_empty_para
self.__run_level = run_level self.__run_level = run_level
self.__write_to = better_mktemp() self.__write_to = better_mktemp()
def __initiate_values(self): def __initiate_values(self):
""" """
Initiate all values. Initiate all values.
@ -77,7 +79,7 @@ section. (How about the end of a section or the end of a field-block?)
self.__paragraph_dict = { self.__paragraph_dict = {
'cw<pf<par-end___' : self.__close_para_func, # end of paragraph 'cw<pf<par-end___' : self.__close_para_func, # end of paragraph
'mi<mk<headi_-end' : self.__close_para_func, # end of header or footer 'mi<mk<headi_-end' : self.__close_para_func, # end of header or footer
##'cw<pf<par-def___' : self.__close_para_func, # paragraph definition ## 'cw<pf<par-def___' : self.__close_para_func, # paragraph definition
# 'mi<mk<fld-bk-end' : self.__close_para_func, # end of field-block # 'mi<mk<fld-bk-end' : self.__close_para_func, # end of field-block
'mi<mk<fldbk-end_' : self.__close_para_func, # end of field-block 'mi<mk<fldbk-end_' : self.__close_para_func, # end of field-block
'mi<mk<body-close' : self.__close_para_func, # end of body 'mi<mk<body-close' : self.__close_para_func, # end of body
@ -99,6 +101,7 @@ section. (How about the end of a section or the end of a field-block?)
'mi<mk<pict-start' : self.__start_para_func, 'mi<mk<pict-start' : self.__start_para_func,
'cw<pf<page-break' : self.__empty_pgbk_func, # page break 'cw<pf<page-break' : self.__empty_pgbk_func, # page break
} }
def __before_body_func(self, line): def __before_body_func(self, line):
""" """
Required: Required:
@ -112,6 +115,7 @@ section. (How about the end of a section or the end of a field-block?)
if self.__token_info == 'mi<mk<body-open_': if self.__token_info == 'mi<mk<body-open_':
self.__state = 'not_paragraph' self.__state = 'not_paragraph'
self.__write_obj.write(line) self.__write_obj.write(line)
def __not_paragraph_func(self, line): def __not_paragraph_func(self, line):
""" """
Required: Required:
@ -127,6 +131,7 @@ section. (How about the end of a section or the end of a field-block?)
if action: if action:
action(line) action(line)
self.__write_obj.write(line) self.__write_obj.write(line)
def __paragraph_func(self, line): def __paragraph_func(self, line):
""" """
Required: Required:
@ -144,6 +149,7 @@ section. (How about the end of a section or the end of a field-block?)
action(line) action(line)
else: else:
self.__write_obj.write(line) self.__write_obj.write(line)
def __start_para_func(self, line): def __start_para_func(self, line):
""" """
Requires: Requires:
@ -160,6 +166,7 @@ section. (How about the end of a section or the end of a field-block?)
) )
self.__write_obj.write(self.__start2_marker) self.__write_obj.write(self.__start2_marker)
self.__state = 'paragraph' self.__state = 'paragraph'
def __empty_para_func(self, line): def __empty_para_func(self, line):
""" """
Requires: Requires:
@ -176,6 +183,7 @@ section. (How about the end of a section or the end of a field-block?)
'mi<tg<empty_____<para\n' 'mi<tg<empty_____<para\n'
) )
self.__write_obj.write(self.__end_marker) # marker for later parsing self.__write_obj.write(self.__end_marker) # marker for later parsing
def __empty_pgbk_func(self, line): def __empty_pgbk_func(self, line):
""" """
Requires: Requires:
@ -188,6 +196,7 @@ section. (How about the end of a section or the end of a field-block?)
self.__write_obj.write( self.__write_obj.write(
'mi<tg<empty_____<page-break\n' 'mi<tg<empty_____<page-break\n'
) )
def __close_para_func(self, line): def __close_para_func(self, line):
""" """
Requires: Requires:
@ -205,6 +214,7 @@ section. (How about the end of a section or the end of a field-block?)
self.__write_obj.write(self.__end_marker) # marker for later parser self.__write_obj.write(self.__end_marker) # marker for later parser
self.__write_obj.write(line) self.__write_obj.write(line)
self.__state = 'not_paragraph' self.__state = 'not_paragraph'
def __bogus_para__def_func(self, line): def __bogus_para__def_func(self, line):
""" """
Requires: Requires:
@ -215,6 +225,7 @@ section. (How about the end of a section or the end of a field-block?)
if a \pard occurs in a paragraph, I want to ignore it. (I believe) if a \pard occurs in a paragraph, I want to ignore it. (I believe)
""" """
self.__write_obj.write('mi<mk<bogus-pard\n') self.__write_obj.write('mi<mk<bogus-pard\n')
def make_paragraphs(self): def make_paragraphs(self):
""" """
Requires: Requires:
@ -229,20 +240,18 @@ section. (How about the end of a section or the end of a field-block?)
only other state is 'paragraph'. only other state is 'paragraph'.
""" """
self.__initiate_values() self.__initiate_values()
read_obj = open(self.__file, 'r') with open(self.__file, 'r') as read_obj:
self.__write_obj = open(self.__write_to, 'w') with open(self.__write_to, 'w') as self.__write_obj:
line_to_read = 1 for line in read_obj:
while line_to_read: self.__token_info = line[:16]
line_to_read = read_obj.readline() action = self.__state_dict.get(self.__state)
line = line_to_read if action is None:
self.__token_info = line[:16] try:
action = self.__state_dict.get(self.__state) sys.stderr.write('no matching state in module paragraphs.py\n')
if action == None: sys.stderr.write(self.__state + '\n')
sys.stderr.write('no no matching state in module sections.py\n') except:
sys.stderr.write(self.__state + '\n') pass
action(line) action(line)
read_obj.close()
self.__write_obj.close()
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "paragraphs.data") copy_obj.copy_file(self.__write_to, "paragraphs.data")

View File

@ -11,16 +11,24 @@
# # # #
######################################################################### #########################################################################
import sys,os import sys,os
from calibre.ebooks.rtf2xml import copy from calibre.ebooks.rtf2xml import copy
class Preamble: class Preamble:
""" """
Fix the reamaing parts of the preamble. This module does very little. It Fix the reamaing parts of the preamble. This module does very little. It
makes sure that no text gets put in the revision of list table. In the makes sure that no text gets put in the revision of list table. In the
future, when I understand how to interprett he revision table and list future, when I understand how to interpret the revision table and list
table, I will make these methods more functional. table, I will make these methods more functional.
""" """
def __init__(self, file, bug_handler, platform, default_font, code_page, def __init__(self, file,
copy=None, temp_dir=None): bug_handler,
platform,
default_font,
code_page,
copy=None,
temp_dir=None,
):
""" """
Required: Required:
file--file to parse file--file to parse
@ -44,6 +52,7 @@ class Preamble:
self.__write_to = os.path.join(temp_dir,"info_table_info.data") self.__write_to = os.path.join(temp_dir,"info_table_info.data")
else: else:
self.__write_to = "info_table_info.data" self.__write_to = "info_table_info.data"
def __initiate_values(self): def __initiate_values(self):
""" """
Initiate all values. Initiate all values.
@ -62,12 +71,14 @@ class Preamble:
'mi<mk<revtbl-beg' : self.__found_revision_table_func, 'mi<mk<revtbl-beg' : self.__found_revision_table_func,
'mi<mk<body-open_' : self.__found_body_func, 'mi<mk<body-open_' : self.__found_body_func,
} }
def __default_func(self, line): def __default_func(self, line):
action = self.__default_dict.get(self.__token_info) action = self.__default_dict.get(self.__token_info)
if action: if action:
action(line) action(line)
else: else:
self.__write_obj.write(line) self.__write_obj.write(line)
def __found_rtf_head_func(self, line): def __found_rtf_head_func(self, line):
""" """
Requires: Requires:
@ -84,8 +95,10 @@ class Preamble:
'<platform>%s\n' % (self.__default_font, self.__code_page, '<platform>%s\n' % (self.__default_font, self.__code_page,
self.__platform) self.__platform)
) )
def __found_list_table_func(self, line): def __found_list_table_func(self, line):
self.__state = 'list_table' self.__state = 'list_table'
def __list_table_func(self, line): def __list_table_func(self, line):
if self.__token_info == 'mi<mk<listabend_': if self.__token_info == 'mi<mk<listabend_':
self.__state = 'default' self.__state = 'default'
@ -93,8 +106,10 @@ class Preamble:
pass pass
else: else:
self.__write_obj.write(line) self.__write_obj.write(line)
def __found_revision_table_func(self, line): def __found_revision_table_func(self, line):
self.__state = 'revision' self.__state = 'revision'
def __revision_table_func(self, line): def __revision_table_func(self, line):
if self.__token_info == 'mi<mk<revtbl-end': if self.__token_info == 'mi<mk<revtbl-end':
self.__state = 'default' self.__state = 'default'
@ -102,11 +117,14 @@ class Preamble:
pass pass
else: else:
self.__write_obj.write(line) self.__write_obj.write(line)
def __found_body_func(self, line): def __found_body_func(self, line):
self.__state = 'body' self.__state = 'body'
self.__write_obj.write(line) self.__write_obj.write(line)
def __body_func(self, line): def __body_func(self, line):
self.__write_obj.write(line) self.__write_obj.write(line)
def fix_preamble(self): def fix_preamble(self):
""" """
Requires: Requires:
@ -119,20 +137,15 @@ class Preamble:
the list table. the list table.
""" """
self.__initiate_values() self.__initiate_values()
read_obj = open(self.__file, 'r') with open(self.__file, 'r') as read_obj:
self.__write_obj = open(self.__write_to, 'w') with open(self.__write_to, 'w') as self.__write_obj:
line_to_read = 1 for line in read_obj:
while line_to_read: self.__token_info = line[:16]
line_to_read = read_obj.readline() action = self.__state_dict.get(self.__state)
line = line_to_read if action is None:
self.__token_info = line[:16] sys.stderr.write(
action = self.__state_dict.get(self.__state) 'no matching state in module preamble_rest.py\n' + self.__state + '\n')
if action == None: action(line)
sys.stderr.write('no no matching state in module preamble_rest.py\n')
sys.stderr.write(self.__state + '\n')
action(line)
read_obj.close()
self.__write_obj.close()
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "preamble_div.data") copy_obj.copy_file(self.__write_to, "preamble_div.data")

View File

@ -11,43 +11,44 @@
# # # #
######################################################################### #########################################################################
import sys, os import sys, os
from calibre.ebooks.rtf2xml import copy from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp from calibre.ptempfile import better_mktemp
class Sections: class Sections:
""" """
================= =================
Purpose Purpose
================= =================
Write section tags for a tokenized file. (This module won't be any use to use Write section tags for a tokenized file. (This module won't be any use to use
to you unless you use it as part of the other modules.) to you unless you use it as part of the other modules.)
--------------- ---------------
logic logic
--------------- ---------------
The tags for the first section breaks have already been written. The tags for the first section breaks have already been written.
RTF stores section breaks with the \sect tag. Each time this tag is RTF stores section breaks with the \sect tag. Each time this tag is
encountered, add one to the counter. encountered, add one to the counter.
When I encounter the \sectd tag, I want to collect all the appropriate tokens When I encounter the \sectd tag, I want to collect all the appropriate tokens
that describe the section. When I reach a \pard, I know I an stop collecting that describe the section. When I reach a \pard, I know I an stop collecting
tokens and write the section tags. tokens and write the section tags.
The exception to this method occurs when sections occur in field blocks, such The exception to this method occurs when sections occur in field blocks, such
as the index. Normally, two section break occur within the index and other as the index. Normally, two section break occur within the index and other
field-blocks. (If less or more section breaks occurr, this code may not work.) field-blocks. (If less or more section breaks occurr, this code may not work.)
I want the sections to occurr outside of the index. That is, the index I want the sections to occur outside of the index. That is, the index
should be nested inside one section tag. After the index is complete, a new should be nested inside one section tag. After the index is complete, a new
section should begin. section should begin.
In order to write the sections outside of the field blocks, I have to store In order to write the sections outside of the field blocks, I have to store
all of the field block as a string. When I ecounter the \sect tag, add one to all of the field block as a string. When I ecounter the \sect tag, add one to
the section counter, but store this number in a list. Likewise, store the the section counter, but store this number in a list. Likewise, store the
information describing the section in another list. information describing the section in another list.
When I reach the end of the field block, choose the first item from the When I reach the end of the field block, choose the first item from the
numbered list as the section number. Choose the first item in the description numbered list as the section number. Choose the first item in the description
list as the values and attributes of the section. Enclose the field string list as the values and attributes of the section. Enclose the field string
between the section tags. between the section tags.
Start a new section outside the field-block strings. Use the second number in Start a new section outside the field-block strings. Use the second number in
the list; use the second item in the description list. the list; use the second item in the description list.
CHANGE (2004-04-26) No longer write sections that occurr in field-blocks. CHANGE (2004-04-26) No longer write sections that occurr in field-blocks.
Instead, ingore all section information in a field-block. Instead, ingore all section information in a field-block.
""" """
def __init__(self, def __init__(self,
in_file, in_file,