mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
RTF Input: Handle old RTF files that have commands without braces. Fixes #994133 (Private bug)
This commit is contained in:
commit
fb94b02be3
@ -372,8 +372,8 @@ class ParseRtf:
|
||||
old_rtf = old_rtf_obj.check_if_old_rtf()
|
||||
if old_rtf:
|
||||
if self.__run_level > 5:
|
||||
msg = 'Older RTF\n'
|
||||
msg += 'self.__run_level is "%s"\n' % self.__run_level
|
||||
msg = 'Older RTF\n' \
|
||||
'self.__run_level is "%s"\n' % self.__run_level
|
||||
raise RtfInvalidCodeException, msg
|
||||
if self.__run_level > 1:
|
||||
sys.stderr.write('File could be older RTF...\n')
|
||||
@ -381,7 +381,7 @@ class ParseRtf:
|
||||
if self.__run_level > 1:
|
||||
sys.stderr.write(
|
||||
'File also has newer RTF.\n'
|
||||
'Will do the best to convert.\n'
|
||||
'Will do the best to convert...\n'
|
||||
)
|
||||
add_brackets_obj = add_brackets.AddBrackets(
|
||||
in_file = self.__temp_file,
|
||||
|
@ -20,6 +20,9 @@ class AddBrackets:
|
||||
"""
|
||||
Add brackets for old RTF.
|
||||
Logic:
|
||||
When control words without their own brackets are encountered
|
||||
and in the list of allowed words, this will add brackets
|
||||
to facilitate the treatment of the file
|
||||
"""
|
||||
def __init__(self, in_file,
|
||||
bug_handler,
|
||||
@ -41,53 +44,56 @@ class AddBrackets:
|
||||
self.__copy = copy
|
||||
self.__write_to = better_mktemp()
|
||||
self.__run_level = run_level
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
"""
|
||||
self.__state_dict = {
|
||||
'before_body' : self.__before_body_func,
|
||||
'in_body' : self.__in_body_func,
|
||||
'after_control_word' : self.__after_control_word_func,
|
||||
'in_ignore' : self.__ignore_func,
|
||||
}
|
||||
self.__accept = [
|
||||
'cw<ci<bold______' ,
|
||||
'cw<ci<annotation' ,
|
||||
'cw<ci<blue______' ,
|
||||
# 'cw<ci<bold______' ,
|
||||
'cw<ci<caps______' ,
|
||||
'cw<ci<char-style' ,
|
||||
'cw<ci<dbl-strike' ,
|
||||
'cw<ci<emboss____' ,
|
||||
'cw<ci<engrave___' ,
|
||||
'cw<ci<font-color' ,
|
||||
'cw<ci<font-down_' ,
|
||||
'cw<ci<font-size_' ,
|
||||
'cw<ci<font-style' ,
|
||||
'cw<ci<font-up___' ,
|
||||
'cw<ci<footnot-mk' ,
|
||||
'cw<ci<green_____' ,
|
||||
'cw<ci<hidden____' ,
|
||||
'cw<ci<italics___' ,
|
||||
'cw<ci<outline___' ,
|
||||
'cw<ci<red_______' ,
|
||||
'cw<ci<shadow____' ,
|
||||
'cw<ci<small-caps' ,
|
||||
'cw<ci<strike-thr' ,
|
||||
'cw<ci<subscript_' ,
|
||||
'cw<ci<superscrip' ,
|
||||
'cw<ci<underlined' ,
|
||||
# 'cw<ul<underlined' ,
|
||||
]
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Init temp values
|
||||
"""
|
||||
self.__state = 'before_body'
|
||||
self.__inline = {}
|
||||
self.__temp_group = []
|
||||
self.__open_bracket = 0
|
||||
self.__found_brackets = 0
|
||||
self.__accept = [
|
||||
'cw<ci<bold______',
|
||||
'cw<ci<annotation' ,
|
||||
'cw<ci<blue______' ,
|
||||
'cw<ci<bold______' ,
|
||||
'cw<ci<caps______' ,
|
||||
'cw<ci<char-style' ,
|
||||
'cw<ci<dbl-strike' ,
|
||||
'cw<ci<emboss____' ,
|
||||
'cw<ci<engrave___' ,
|
||||
'cw<ci<font-color' ,
|
||||
'cw<ci<font-down_' ,
|
||||
'cw<ci<font-size_' ,
|
||||
'cw<ci<font-style' ,
|
||||
'cw<ci<font-up___',
|
||||
'cw<ci<footnot-mk',
|
||||
'cw<ci<green_____' ,
|
||||
'cw<ci<hidden____',
|
||||
'cw<ci<italics___' ,
|
||||
'cw<ci<outline___',
|
||||
'cw<ci<red_______' ,
|
||||
'cw<ci<shadow____',
|
||||
'cw<ci<small-caps' ,
|
||||
'cw<ci<strike-thr',
|
||||
'cw<ci<subscript_' ,
|
||||
'cw<ci<superscrip',
|
||||
'cw<ci<underlined' ,
|
||||
# 'cw<ul<underlined' ,
|
||||
]
|
||||
self.__open_bracket = False
|
||||
self.__found_brackets = False
|
||||
|
||||
|
||||
def __before_body_func(self, line):
|
||||
"""
|
||||
If we are before the body, we are not interested in changing anything
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<body-open_':
|
||||
self.__state = 'in_body'
|
||||
@ -95,6 +101,14 @@ class AddBrackets:
|
||||
|
||||
def __in_body_func(self, line):
|
||||
"""
|
||||
Select what action to take in body:
|
||||
1-At the end of the file close the bracket if a bracket was opened
|
||||
This happens if there is a change
|
||||
2-If an open bracket is found the code inside is ignored
|
||||
(written without modifications)
|
||||
3-If an accepted control word is found put the line
|
||||
in a buffer then change state to after cw
|
||||
4-Else simply write the line
|
||||
"""
|
||||
if line == 'cb<nu<clos-brack<0001\n' and self.__open_bracket:
|
||||
self.__write_obj.write(
|
||||
@ -102,7 +116,7 @@ class AddBrackets:
|
||||
)
|
||||
self.__write_obj.write(line)
|
||||
elif self.__token_info == 'ob<nu<open-brack':
|
||||
self.__found_brackets = 1
|
||||
self.__found_brackets = True
|
||||
self.__state = 'in_ignore'
|
||||
self.__ignore_count = self.__ob_count
|
||||
self.__write_obj.write(line)
|
||||
@ -114,6 +128,10 @@ class AddBrackets:
|
||||
|
||||
def __after_control_word_func(self, line):
|
||||
"""
|
||||
After a cw either add next allowed cw to temporary list or
|
||||
change group and write it.
|
||||
If the token leading to an exit is an open bracket go to
|
||||
ignore otherwise goto in body
|
||||
"""
|
||||
if self.__token_info in self.__accept:
|
||||
self.__temp_group.append(line)
|
||||
@ -129,82 +147,84 @@ class AddBrackets:
|
||||
|
||||
def __write_group(self):
|
||||
"""
|
||||
Write a temporary group after accepted control words end
|
||||
But this is mostly useless in my opinion as there is no list of rejected cw
|
||||
This may be a way to implement future old rtf processing for cw
|
||||
Utility: open a group to just put brackets but why be so complicated?
|
||||
Scheme: open brackets, write cw then go to body and back with cw after
|
||||
"""
|
||||
if self.__open_bracket:
|
||||
self.__write_obj.write(
|
||||
'cb<nu<clos-brack<0003\n'
|
||||
)
|
||||
self.__open_bracket = 0
|
||||
inline_string = ''
|
||||
the_keys = self.__inline.keys()
|
||||
for the_key in the_keys:
|
||||
value = self.__inline[the_key]
|
||||
if value != 'false':
|
||||
inline_string += '%s<nu<%s\n' % (the_key, value)
|
||||
self.__open_bracket = False
|
||||
|
||||
inline_string = ''.join(['%s<nu<%s\n' % (k, v) \
|
||||
for k, v in self.__inline.iteritems() \
|
||||
if v != 'false'])
|
||||
if inline_string:
|
||||
self.__write_obj.write('ob<nu<open-brack<0003\n')
|
||||
self.__write_obj.write(inline_string)
|
||||
self.__open_bracket = 1
|
||||
self.__write_obj.write('ob<nu<open-brack<0003\n'
|
||||
'%s' % inline_string)
|
||||
self.__open_bracket = True
|
||||
self.__temp_group = []
|
||||
|
||||
def __change_permanent_group(self):
|
||||
"""
|
||||
use temp group to change permanent group
|
||||
Use temp group to change permanent group
|
||||
If the control word is not accepted remove it
|
||||
What is the interest as it is built to accept only accepted cw
|
||||
in __after_control_word_func?
|
||||
"""
|
||||
for line in self.__temp_group:
|
||||
token_info = line[:16]
|
||||
if token_info in self.__accept:
|
||||
att = line[20:-1]
|
||||
self.__inline[token_info] = att
|
||||
self.__inline = {line[:16] : line[20:-1]\
|
||||
for line in self.__temp_group\
|
||||
# Is this really necessary?
|
||||
if line[:16] in self.__accept}
|
||||
|
||||
|
||||
def __ignore_func(self, line):
|
||||
"""
|
||||
Don't add any brackets while inside of brackets RTF has already
|
||||
added.
|
||||
Just copy data inside of RTF brackets already here.
|
||||
"""
|
||||
self.__write_obj.write(line)
|
||||
if self.__token_info == 'cb<nu<clos-brack'and\
|
||||
self.__cb_count == self.__ignore_count:
|
||||
if self.__token_info == 'cb<nu<clos-brack'\
|
||||
and self.__cb_count == self.__ignore_count:
|
||||
self.__state = 'in_body'
|
||||
|
||||
def __check_brackets(self, in_file):
|
||||
self.__check_brack_obj = check_brackets.CheckBrackets\
|
||||
"""
|
||||
Return True if brackets match
|
||||
"""
|
||||
check_brack_obj = check_brackets.CheckBrackets\
|
||||
(file = in_file)
|
||||
good_br = self.__check_brack_obj.check_brackets()[0]
|
||||
if not good_br:
|
||||
return 1
|
||||
return check_brack_obj.check_brackets()[0]
|
||||
|
||||
def add_brackets(self):
|
||||
"""
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('No matching state in module add_brackets.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
bad_brackets = self.__check_brackets(self.__write_to)
|
||||
if not bad_brackets:
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action is None:
|
||||
sys.stderr.write(
|
||||
'No matching state in module add_brackets.py\n'
|
||||
'%s\n' % self.__state)
|
||||
action(line)
|
||||
#Check bad brackets
|
||||
if self.__check_brackets(self.__write_to):
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "add_brackets.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
else:
|
||||
if self.__run_level > 0:
|
||||
sys.stderr.write(
|
||||
'Sorry, but this files has a mix of old and new RTF.\n'
|
||||
'Some characteristics cannot be converted.\n')
|
||||
os.remove(self.__write_to)
|
||||
os.remove(self.__write_to)
|
File diff suppressed because it is too large
Load Diff
@ -1,4 +1,5 @@
|
||||
import os, sys
|
||||
from codecs import EncodedFile
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy, check_encoding
|
||||
from calibre.ptempfile import better_mktemp
|
||||
@ -41,6 +42,7 @@ class ConvertToTags:
|
||||
self.__run_level = run_level
|
||||
self.__write_to = better_mktemp()
|
||||
self.__convert_utf = False
|
||||
self.__bad_encoding = False
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
@ -213,13 +215,14 @@ class ConvertToTags:
|
||||
|
||||
if not check_encoding_obj.check_encoding(self.__file, verbose=False):
|
||||
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
|
||||
elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
|
||||
elif not check_encoding_obj.check_encoding(self.__file, self.__encoding, verbose=False):
|
||||
self.__write_obj.write('<?xml version="1.0" encoding="UTF-8" ?>')
|
||||
self.__convert_utf = True
|
||||
else:
|
||||
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
|
||||
sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
|
||||
' hope for the best')
|
||||
self.__bad_encoding = True
|
||||
self.__new_line = 0
|
||||
self.__write_new_line()
|
||||
if self.__no_dtd:
|
||||
@ -247,7 +250,7 @@ class ConvertToTags:
|
||||
the appropriate function.
|
||||
The functions that are called:
|
||||
a text function for text
|
||||
an open funciton for open tags
|
||||
an open function for open tags
|
||||
an open with attribute function for tags with attributes
|
||||
an empty with attribute function for tags that are empty but have
|
||||
attributes.
|
||||
@ -263,20 +266,19 @@ class ConvertToTags:
|
||||
action = self.__state_dict.get(self.__token_info)
|
||||
if action is not None:
|
||||
action(line)
|
||||
self.__write_obj.close()
|
||||
#convert all encodings to UTF8 to avoid unsupported encodings in lxml
|
||||
if self.__convert_utf:
|
||||
#convert all encodings to UTF8 or ASCII to avoid unsupported encodings in lxml
|
||||
if self.__convert_utf or self.__bad_encoding:
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
file_encoding = "utf-8"
|
||||
if self.__bad_encoding:
|
||||
file_encoding = "us-ascii"
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__write_to, 'w') as write_obj:
|
||||
file = read_obj.read()
|
||||
try:
|
||||
file = file.decode(self.__encoding)
|
||||
write_obj.write(file.encode('utf-8'))
|
||||
except:
|
||||
sys.stderr.write('Conversion to UTF-8 is not possible,'
|
||||
' encoding should be very carefully checked')
|
||||
write_objenc = EncodedFile(write_obj, self.__encoding,
|
||||
file_encoding, 'replace')
|
||||
for line in read_obj:
|
||||
write_objenc.write(line)
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "convert_to_tags.data")
|
||||
|
@ -11,6 +11,7 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
|
||||
@ -31,29 +32,29 @@ class Header:
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__write_to = better_mktemp()
|
||||
self.__found_a_header = 0
|
||||
self.__found_a_header = False
|
||||
|
||||
def __in_header_func(self, line):
|
||||
"""
|
||||
Handle all tokens that are part of header
|
||||
"""
|
||||
if self.__cb_count == self.__header_bracket_count:
|
||||
self.__in_header = 0
|
||||
self.__in_header = False
|
||||
self.__write_obj.write(line)
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<mk<head___clo\n')
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<tg<close_____<header-or-footer\n')
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<mk<head___clo\n' \
|
||||
'mi<tg<close_____<header-or-footer\n' \
|
||||
'mi<mk<header-clo\n')
|
||||
else:
|
||||
self.__write_to_head_obj.write(line)
|
||||
|
||||
def __found_header(self, line):
|
||||
"""
|
||||
Found a header
|
||||
"""
|
||||
# but this could be header or footer
|
||||
self.__found_a_header = 1
|
||||
self.__in_header = 1
|
||||
self.__found_a_header = True
|
||||
self.__in_header = True
|
||||
self.__header_count += 1
|
||||
# temporarily set this to zero so I can enter loop
|
||||
self.__cb_count = 0
|
||||
@ -69,18 +70,23 @@ class Header:
|
||||
'mi<tg<open-att__<header-or-footer<type>%s\n' % (type)
|
||||
)
|
||||
else:
|
||||
sys.stderr.write('module is header\n')
|
||||
sys.stderr.write('method is __found_header\n')
|
||||
sys.stderr.write('no dict entry\n')
|
||||
sys.stderr.write('line is %s' % line)
|
||||
sys.stderr.write(
|
||||
'module is header\n' \
|
||||
'method is __found_header\n' \
|
||||
'no dict entry\n' \
|
||||
'line is %s' % line)
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<tg<open-att__<header-or-footer<type>none\n'
|
||||
)
|
||||
|
||||
def __default_sep(self, line):
|
||||
"""Handle all tokens that are not header tokens"""
|
||||
"""
|
||||
Handle all tokens that are not header tokens
|
||||
"""
|
||||
if self.__token_info[3:5] == 'hf':
|
||||
self.__found_header(line)
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __initiate_sep_values(self):
|
||||
"""
|
||||
initiate counters for separate_footnotes method.
|
||||
@ -89,7 +95,7 @@ class Header:
|
||||
self.__ob_count = 0
|
||||
self.__cb_count = 0
|
||||
self.__header_bracket_count = 0
|
||||
self.__in_header = 0
|
||||
self.__in_header = False
|
||||
self.__header_count = 0
|
||||
self.__head_dict = {
|
||||
'head-left_' : ('header-left'),
|
||||
@ -101,6 +107,7 @@ class Header:
|
||||
'header____' : ('header' ),
|
||||
'footer____' : ('footer' ),
|
||||
}
|
||||
|
||||
def separate_headers(self):
|
||||
"""
|
||||
Separate all the footnotes in an RTF file and put them at the bottom,
|
||||
@ -110,53 +117,47 @@ class Header:
|
||||
bottom of the main file.
|
||||
"""
|
||||
self.__initiate_sep_values()
|
||||
read_obj = open(self.__file)
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
self.__header_holder = better_mktemp()
|
||||
self.__write_to_head_obj = open(self.__header_holder, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
# keep track of opening and closing brackets
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
# In the middle of footnote text
|
||||
if self.__in_header:
|
||||
self.__in_header_func(line)
|
||||
# not in the middle of footnote text
|
||||
else:
|
||||
self.__default_sep(line)
|
||||
self.__write_obj.close()
|
||||
read_obj.close()
|
||||
self.__write_to_head_obj.close()
|
||||
read_obj = open(self.__header_holder, 'r')
|
||||
write_obj = open(self.__write_to, 'a')
|
||||
write_obj.write(
|
||||
'mi<mk<header-beg\n')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
write_obj.write(line)
|
||||
write_obj.write(
|
||||
'mi<mk<header-end\n')
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
with open(self.__file) as read_obj:
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
with open(self.__header_holder, 'w') as self.__write_to_head_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
# keep track of opening and closing brackets
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
# In the middle of footnote text
|
||||
if self.__in_header:
|
||||
self.__in_header_func(line)
|
||||
# not in the middle of footnote text
|
||||
else:
|
||||
self.__default_sep(line)
|
||||
|
||||
with open(self.__header_holder, 'r') as read_obj:
|
||||
with open(self.__write_to, 'a') as write_obj:
|
||||
write_obj.write(
|
||||
'mi<mk<header-beg\n')
|
||||
for line in read_obj:
|
||||
write_obj.write(line)
|
||||
write_obj.write(
|
||||
'mi<mk<header-end\n')
|
||||
os.remove(self.__header_holder)
|
||||
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "header_separate.info")
|
||||
copy_obj.copy_file(self.__write_to, "header_separate.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
|
||||
def update_info(self, file, copy):
|
||||
"""
|
||||
Unused method
|
||||
"""
|
||||
self.__file = file
|
||||
self.__copy = copy
|
||||
|
||||
def __get_head_body_func(self, line):
|
||||
"""
|
||||
Process lines in main body and look for beginning of headers.
|
||||
@ -166,6 +167,7 @@ class Header:
|
||||
self.__state = 'head'
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __get_head_head_func(self, line):
|
||||
"""
|
||||
Copy headers and footers from bottom of file to a separate, temporary file.
|
||||
@ -174,6 +176,7 @@ class Header:
|
||||
self.__state = 'body'
|
||||
else:
|
||||
self.__write_to_head_obj.write(line)
|
||||
|
||||
def __get_headers(self):
|
||||
"""
|
||||
Private method to remove footnotes from main file. Read one line from
|
||||
@ -182,21 +185,16 @@ class Header:
|
||||
These two functions do the work of separating the footnotes from the
|
||||
body.
|
||||
"""
|
||||
read_obj = open(self.__file)
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
# self.__write_to = "footnote_info.data"
|
||||
self.__write_to_head_obj = open(self.__header_holder, 'w')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
self.__token_info = line[:16]
|
||||
if self.__state == 'body':
|
||||
self.__get_head_body_func(line)
|
||||
elif self.__state == 'head':
|
||||
self.__get_head_head_func(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
self.__write_to_head_obj.close()
|
||||
with open(self.__file) as read_obj:
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
with open(self.__header_holder, 'w') as self.__write_to_head_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
if self.__state == 'body':
|
||||
self.__get_head_body_func(line)
|
||||
elif self.__state == 'head':
|
||||
self.__get_head_head_func(line)
|
||||
|
||||
def __get_head_from_temp(self, num):
|
||||
"""
|
||||
Private method for joining headers and footers to body. This method
|
||||
@ -205,18 +203,17 @@ class Header:
|
||||
returns them as a string.
|
||||
"""
|
||||
look_for = 'mi<mk<header-ope<' + num + '\n'
|
||||
found_head = 0
|
||||
found_head = False
|
||||
string_to_return = ''
|
||||
line = 1
|
||||
while line:
|
||||
line = self.__read_from_head_obj.readline()
|
||||
for line in self.__read_from_head_obj:
|
||||
if found_head:
|
||||
if line == 'mi<mk<header-clo\n':
|
||||
return string_to_return
|
||||
string_to_return = string_to_return + line
|
||||
string_to_return += line
|
||||
else:
|
||||
if line == look_for:
|
||||
found_head = 1
|
||||
found_head = True
|
||||
|
||||
def __join_from_temp(self):
|
||||
"""
|
||||
Private method for rejoining footnotes to body. Read from the
|
||||
@ -227,15 +224,13 @@ class Header:
|
||||
If no footnote marker is found, simply print out the token (line).
|
||||
"""
|
||||
self.__read_from_head_obj = open(self.__header_holder, 'r')
|
||||
read_obj = open(self.__write_to, 'r')
|
||||
self.__write_obj = open(self.__write_to2, 'w')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
if line[:16] == 'mi<mk<header-ind':
|
||||
line = self.__get_head_from_temp(line[17:-1])
|
||||
self.__write_obj.write(line)
|
||||
read_obj.close()
|
||||
with open(self.__write_to, 'r') as read_obj:
|
||||
for line in read_obj:
|
||||
if line[:16] == 'mi<mk<header-ind':
|
||||
line = self.__get_head_from_temp(line[17:-1])
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def join_headers(self):
|
||||
"""
|
||||
Join the footnotes from the bottom of the file and put them in their
|
||||
|
@ -181,7 +181,7 @@ class Hex2Utf8:
|
||||
self.__dingbats_dict.update(dingbats_base_dict)
|
||||
self.__dingbats_dict.update(ms_dingbats_dict)
|
||||
# load dictionary for caps, and make a string for the replacement
|
||||
self.__caps_uni_dict = char_map_obj.get_char_map(map='caps_uni')
|
||||
self.__caps_uni_dict = char_map_obj.get_char_map(map = 'caps_uni')
|
||||
# # print self.__caps_uni_dict
|
||||
# don't think I'll need this
|
||||
##keys = self.__caps_uni_dict.keys()
|
||||
|
@ -11,14 +11,18 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import sys
|
||||
"""
|
||||
"""
|
||||
|
||||
class OldRtf:
|
||||
"""
|
||||
Check to see if the RTF is an older version
|
||||
Logic:
|
||||
If allowable control word/properties happen in text without being enclosed
|
||||
in brackets the file will be considered old rtf
|
||||
"""
|
||||
def __init__(self, in_file, bug_handler, run_level ):
|
||||
def __init__(self, in_file,
|
||||
bug_handler,
|
||||
run_level,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
@ -32,46 +36,46 @@ class OldRtf:
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__initiate_values()
|
||||
self.__ob_group = 0
|
||||
def __initiate_values(self):
|
||||
self.__previous_token = ''
|
||||
self.__new_found = 0
|
||||
self.__run_level = run_level
|
||||
self.__allowable = [
|
||||
'annotation' ,
|
||||
'blue______' ,
|
||||
'bold______',
|
||||
'caps______',
|
||||
'char-style' ,
|
||||
'dbl-strike' ,
|
||||
'emboss____',
|
||||
'engrave___' ,
|
||||
'font-color',
|
||||
'font-down_' ,
|
||||
'font-size_',
|
||||
'font-style',
|
||||
'font-up___',
|
||||
'footnot-mk' ,
|
||||
'green_____' ,
|
||||
'hidden____',
|
||||
'italics___',
|
||||
'outline___',
|
||||
'red_______',
|
||||
'shadow____' ,
|
||||
'small-caps',
|
||||
'strike-thr',
|
||||
'subscript_',
|
||||
'superscrip' ,
|
||||
'underlined' ,
|
||||
'annotation' ,
|
||||
'blue______' ,
|
||||
'bold______',
|
||||
'caps______',
|
||||
'char-style' ,
|
||||
'dbl-strike' ,
|
||||
'emboss____',
|
||||
'engrave___' ,
|
||||
'font-color',
|
||||
'font-down_' ,
|
||||
'font-size_',
|
||||
'font-style',
|
||||
'font-up___',
|
||||
'footnot-mk' ,
|
||||
'green_____' ,
|
||||
'hidden____',
|
||||
'italics___',
|
||||
'outline___',
|
||||
'red_______',
|
||||
'shadow____' ,
|
||||
'small-caps',
|
||||
'strike-thr',
|
||||
'subscript_',
|
||||
'superscrip' ,
|
||||
'underlined' ,
|
||||
]
|
||||
self.__state = 'before_body'
|
||||
self.__action_dict = {
|
||||
'before_body' : self.__before_body_func,
|
||||
'in_body' : self.__check_tokens_func,
|
||||
'after_pard' : self.__after_pard_func,
|
||||
}
|
||||
self.__is_old = 0
|
||||
|
||||
def __initiate_values(self):
|
||||
self.__previous_token = ''
|
||||
self.__state = 'before_body'
|
||||
self.__found_new = 0
|
||||
self.__ob_group = 0
|
||||
|
||||
def __check_tokens_func(self, line):
|
||||
if self.__inline_info in self.__allowable:
|
||||
if self.__ob_group == self.__base_ob_count:
|
||||
@ -80,48 +84,56 @@ class OldRtf:
|
||||
self.__found_new += 1
|
||||
elif self.__token_info == 'cw<pf<par-def___':
|
||||
self.__state = 'after_pard'
|
||||
|
||||
def __before_body_func(self, line):
|
||||
if self.__token_info == 'mi<mk<body-open_':
|
||||
self.__state = 'in_body'
|
||||
self.__base_ob_count = self.__ob_group
|
||||
|
||||
def __after_pard_func(self, line):
|
||||
if line[0:2] != 'cw':
|
||||
self.__state = 'in_body'
|
||||
|
||||
def check_if_old_rtf(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
1 if file is older RTf
|
||||
0 if file is newer RTF
|
||||
True if file is older RTF
|
||||
False if file is newer RTF
|
||||
"""
|
||||
|
||||
read_obj = open(self.__file, 'r')
|
||||
line = 1
|
||||
self.__initiate_values()
|
||||
line_num = 0
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
line_num += 1
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'mi<mk<body-close':
|
||||
return 0
|
||||
self.__ob_group = 0
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_group += 1
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__ob_group -= 1
|
||||
self.__cb_count = line[-5:-1]
|
||||
self.__inline_info = line[6:16]
|
||||
if self.__state == 'after_body':
|
||||
return 0
|
||||
action = self.__action_dict.get(self.__state)
|
||||
if not action:
|
||||
sys.stderr.write('No action for state!\n')
|
||||
result = action(line)
|
||||
if result == 'new_rtf':
|
||||
return 0
|
||||
elif result == 'old_rtf':
|
||||
return 1
|
||||
self.__previous_token = line[6:16]
|
||||
return 0
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
for line in read_obj:
|
||||
line_num += 1
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'mi<mk<body-close':
|
||||
return False
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_group += 1
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__ob_group -= 1
|
||||
self.__cb_count = line[-5:-1]
|
||||
self.__inline_info = line[6:16]
|
||||
if self.__state == 'after_body':
|
||||
return False
|
||||
action = self.__action_dict.get(self.__state)
|
||||
if action is None:
|
||||
try:
|
||||
sys.stderr.write('No action for this state!\n')
|
||||
except:
|
||||
pass
|
||||
result = action(line)
|
||||
if result == 'new_rtf':
|
||||
return False
|
||||
elif result == 'old_rtf':
|
||||
if self.__run_level > 3:
|
||||
sys.stderr.write(
|
||||
'Old rtf construction %s (bracket %s, line %s)\n'
|
||||
% (self.__inline_info, str(self.__ob_group), line_num)
|
||||
)
|
||||
return True
|
||||
self.__previous_token = line[6:16]
|
||||
return False
|
||||
|
@ -10,7 +10,9 @@
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, codecs
|
||||
import sys, os
|
||||
# , codecs
|
||||
|
||||
class Output:
|
||||
"""
|
||||
Output file
|
||||
@ -19,7 +21,8 @@ class Output:
|
||||
file,
|
||||
orig_file,
|
||||
output_dir = None,
|
||||
out_file = None
|
||||
out_file = None,
|
||||
no_ask = True
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
@ -33,8 +36,9 @@ class Output:
|
||||
self.__file = file
|
||||
self.__orig_file = orig_file
|
||||
self.__output_dir = output_dir
|
||||
self.__no_ask = 1
|
||||
self.__no_ask = no_ask
|
||||
self.__out_file = out_file
|
||||
|
||||
def output(self):
|
||||
"""
|
||||
Required:
|
||||
@ -45,13 +49,14 @@ class Output:
|
||||
output the line to the screen if no output file given. Otherwise, output to
|
||||
the file.
|
||||
"""
|
||||
# self.__output_xml(self.__file, self.__out_file)
|
||||
if self.__output_dir:
|
||||
self.__output_to_dir_func()
|
||||
elif self.__out_file:
|
||||
self.__output_xml(self.__file, self.__out_file)
|
||||
self.__output_to_file_func()
|
||||
# self.__output_xml(self.__file, self.__out_file)
|
||||
else:
|
||||
self.__output_to_standard_func()
|
||||
|
||||
def __output_to_dir_func(self):
|
||||
"""
|
||||
Requires:
|
||||
@ -64,32 +69,25 @@ class Output:
|
||||
"""
|
||||
base_name = os.path.basename(self.__orig_file)
|
||||
base_name, ext = os.path.splitext(base_name)
|
||||
output_file = '%s.xml' % base_name
|
||||
output_file = os.path.join(self.__output_dir, output_file)
|
||||
output_file = os.path.join(self.__output_dir, '%s.xml' % base_name)
|
||||
# change if user wants to output to a specific file
|
||||
if self.__out_file:
|
||||
output_file = os.path.join(self.__output_dir, self.__out_file)
|
||||
user_response = 'o'
|
||||
if os.path.isfile(output_file):
|
||||
if self.__no_ask:
|
||||
user_response = 'o'
|
||||
else:
|
||||
msg = 'Do you want to over-write %s?\n' % output_file
|
||||
msg += 'Type "o" to over-write.\n'
|
||||
msg += 'Type any other key to print to standard output.\n'
|
||||
sys.stderr.write(msg)
|
||||
user_response = raw_input()
|
||||
if os.path.isfile(output_file) and not self.__no_ask:
|
||||
msg = 'Do you want to overwrite %s?\n' % output_file
|
||||
msg += ('Type "o" to overwrite.\n'
|
||||
'Type any other key to print to standard output.\n')
|
||||
sys.stderr.write(msg)
|
||||
user_response = raw_input()
|
||||
if user_response == 'o':
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(output_file, 'w')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
write_obj.write(line)
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.output_file, 'w') as write_obj:
|
||||
for line in read_obj:
|
||||
write_obj.write(line)
|
||||
else:
|
||||
self.__output_to_standard_func()
|
||||
|
||||
def __output_to_file_func(self):
|
||||
"""
|
||||
Required:
|
||||
@ -99,14 +97,11 @@ class Output:
|
||||
Logic:
|
||||
read one line at a time. Output to standard
|
||||
"""
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__out_file, 'w')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
write_obj.write(line)
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__out_file, 'w') as write_obj:
|
||||
for line in read_obj:
|
||||
write_obj.write(line)
|
||||
|
||||
def __output_to_standard_func(self):
|
||||
"""
|
||||
Required:
|
||||
@ -116,26 +111,24 @@ class Output:
|
||||
Logic:
|
||||
read one line at a time. Output to standard
|
||||
"""
|
||||
read_obj = open(self.__file, 'r')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
sys.stdout.write(line)
|
||||
read_obj.close()
|
||||
def __output_xml(self, in_file, out_file):
|
||||
"""
|
||||
output the ill-formed xml file
|
||||
"""
|
||||
(utf8_encode, utf8_decode, utf8_reader, utf8_writer) = codecs.lookup("utf-8")
|
||||
write_obj = utf8_writer(open(out_file, 'w'))
|
||||
write_obj = open(out_file, 'w')
|
||||
read_obj = utf8_writer(open(in_file, 'r'))
|
||||
read_obj = open(in_file, 'r')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
if isinstance(line, type(u"")):
|
||||
line = line.encode("utf-8")
|
||||
write_obj.write(line)
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
for line in read_obj:
|
||||
sys.stdout.write(line)
|
||||
|
||||
# def __output_xml(self, in_file, out_file):
|
||||
# """
|
||||
# output the ill-formed xml file
|
||||
# """
|
||||
# (utf8_encode, utf8_decode, utf8_reader, utf8_writer) = codecs.lookup("utf-8")
|
||||
# write_obj = utf8_writer(open(out_file, 'w'))
|
||||
# write_obj = open(out_file, 'w')
|
||||
# read_obj = utf8_writer(open(in_file, 'r'))
|
||||
# read_obj = open(in_file, 'r')
|
||||
# line = 1
|
||||
# while line:
|
||||
# line = read_obj.readline()
|
||||
# if isinstance(line, type(u"")):
|
||||
# line = line.encode("utf-8")
|
||||
# write_obj.write(line)
|
||||
# read_obj.close()
|
||||
# write_obj.close()
|
||||
|
@ -11,31 +11,32 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
|
||||
class Paragraphs:
|
||||
"""
|
||||
=================
|
||||
Purpose
|
||||
=================
|
||||
Write paragraph tags for a tokenized file. (This module won't be any use to use
|
||||
to you unless you use it as part of the other modules.)
|
||||
-------------
|
||||
Method
|
||||
-------------
|
||||
RTF does not tell you when a paragraph begins. It only tells you when the
|
||||
paragraph ends.
|
||||
In order to make paragraphs out of this limited info, the parser starts in the
|
||||
body of the documents and assumes it is not in a paragraph. It looks for clues
|
||||
to begin a paragraph. Text starts a paragraph; so does an inline field or
|
||||
list-text. If an end of paragraph marker (\par) is found, then this indicates
|
||||
a blank paragraph.
|
||||
Once a paragraph is found, the state changes to 'paragraph.' In this state,
|
||||
clues are looked to for the end of a paragraph. The end of a paragraph marker
|
||||
(\par) marks the end of a paragraph. So does the end of a footnote or heading;
|
||||
a paragraph definintion; the end of a field-block; and the beginning of a
|
||||
section. (How about the end of a section or the end of a field-block?)
|
||||
=================
|
||||
Purpose
|
||||
=================
|
||||
Write paragraph tags for a tokenized file. (This module won't be of any use
|
||||
to you unless you use it as part of the other modules.)
|
||||
-------------
|
||||
Method
|
||||
-------------
|
||||
RTF does not tell you when a paragraph begins. It only tells you when the
|
||||
paragraph ends.
|
||||
In order to make paragraphs out of this limited info, the parser starts in the
|
||||
body of the documents and assumes it is not in a paragraph. It looks for clues
|
||||
to begin a paragraph. Text starts a paragraph; so does an inline field or
|
||||
list-text. If an end of paragraph marker (\par) is found, then this indicates
|
||||
a blank paragraph.
|
||||
Once a paragraph is found, the state changes to 'paragraph.' In this state,
|
||||
clues are looked to for the end of a paragraph. The end of a paragraph marker
|
||||
(\par) marks the end of a paragraph. So does the end of a footnote or heading;
|
||||
a paragraph definition; the end of a field-block; and the beginning of a
|
||||
section. (How about the end of a section or the end of a field-block?)
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
@ -60,6 +61,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
self.__write_empty_para = write_empty_para
|
||||
self.__run_level = run_level
|
||||
self.__write_to = better_mktemp()
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
@ -77,7 +79,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
self.__paragraph_dict = {
|
||||
'cw<pf<par-end___' : self.__close_para_func, # end of paragraph
|
||||
'mi<mk<headi_-end' : self.__close_para_func, # end of header or footer
|
||||
##'cw<pf<par-def___' : self.__close_para_func, # paragraph definition
|
||||
## 'cw<pf<par-def___' : self.__close_para_func, # paragraph definition
|
||||
# 'mi<mk<fld-bk-end' : self.__close_para_func, # end of field-block
|
||||
'mi<mk<fldbk-end_' : self.__close_para_func, # end of field-block
|
||||
'mi<mk<body-close' : self.__close_para_func, # end of body
|
||||
@ -99,6 +101,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
'mi<mk<pict-start' : self.__start_para_func,
|
||||
'cw<pf<page-break' : self.__empty_pgbk_func, # page break
|
||||
}
|
||||
|
||||
def __before_body_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -112,6 +115,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
if self.__token_info == 'mi<mk<body-open_':
|
||||
self.__state = 'not_paragraph'
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __not_paragraph_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -127,6 +131,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
if action:
|
||||
action(line)
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __paragraph_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -144,6 +149,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
action(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __start_para_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -160,6 +166,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
)
|
||||
self.__write_obj.write(self.__start2_marker)
|
||||
self.__state = 'paragraph'
|
||||
|
||||
def __empty_para_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -176,6 +183,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
'mi<tg<empty_____<para\n'
|
||||
)
|
||||
self.__write_obj.write(self.__end_marker) # marker for later parsing
|
||||
|
||||
def __empty_pgbk_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -188,6 +196,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
self.__write_obj.write(
|
||||
'mi<tg<empty_____<page-break\n'
|
||||
)
|
||||
|
||||
def __close_para_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -205,6 +214,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
self.__write_obj.write(self.__end_marker) # marker for later parser
|
||||
self.__write_obj.write(line)
|
||||
self.__state = 'not_paragraph'
|
||||
|
||||
def __bogus_para__def_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -215,6 +225,7 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
if a \pard occurs in a paragraph, I want to ignore it. (I believe)
|
||||
"""
|
||||
self.__write_obj.write('mi<mk<bogus-pard\n')
|
||||
|
||||
def make_paragraphs(self):
|
||||
"""
|
||||
Requires:
|
||||
@ -229,20 +240,18 @@ section. (How about the end of a section or the end of a field-block?)
|
||||
only other state is 'paragraph'.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no no matching state in module sections.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action is None:
|
||||
try:
|
||||
sys.stderr.write('no matching state in module paragraphs.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
except:
|
||||
pass
|
||||
action(line)
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "paragraphs.data")
|
||||
|
@ -11,16 +11,24 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import sys,os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
|
||||
class Preamble:
|
||||
"""
|
||||
Fix the remaining parts of the preamble. This module does very little. It
|
||||
makes sure that no text gets put in the revision or list table. In the
|
||||
future, when I understand how to interprett he revision table and list
|
||||
future, when I understand how to interpret the revision table and list
|
||||
table, I will make these methods more functional.
|
||||
"""
|
||||
def __init__(self, file, bug_handler, platform, default_font, code_page,
|
||||
copy=None, temp_dir=None):
|
||||
def __init__(self, file,
|
||||
bug_handler,
|
||||
platform,
|
||||
default_font,
|
||||
code_page,
|
||||
copy=None,
|
||||
temp_dir=None,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
file--file to parse
|
||||
@ -44,6 +52,7 @@ class Preamble:
|
||||
self.__write_to = os.path.join(temp_dir,"info_table_info.data")
|
||||
else:
|
||||
self.__write_to = "info_table_info.data"
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
@ -62,12 +71,14 @@ class Preamble:
|
||||
'mi<mk<revtbl-beg' : self.__found_revision_table_func,
|
||||
'mi<mk<body-open_' : self.__found_body_func,
|
||||
}
|
||||
|
||||
def __default_func(self, line):
|
||||
action = self.__default_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __found_rtf_head_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -84,8 +95,10 @@ class Preamble:
|
||||
'<platform>%s\n' % (self.__default_font, self.__code_page,
|
||||
self.__platform)
|
||||
)
|
||||
|
||||
def __found_list_table_func(self, line):
|
||||
self.__state = 'list_table'
|
||||
|
||||
def __list_table_func(self, line):
|
||||
if self.__token_info == 'mi<mk<listabend_':
|
||||
self.__state = 'default'
|
||||
@ -93,8 +106,10 @@ class Preamble:
|
||||
pass
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __found_revision_table_func(self, line):
|
||||
self.__state = 'revision'
|
||||
|
||||
def __revision_table_func(self, line):
|
||||
if self.__token_info == 'mi<mk<revtbl-end':
|
||||
self.__state = 'default'
|
||||
@ -102,11 +117,14 @@ class Preamble:
|
||||
pass
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __found_body_func(self, line):
|
||||
self.__state = 'body'
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __body_func(self, line):
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def fix_preamble(self):
|
||||
"""
|
||||
Requires:
|
||||
@ -119,20 +137,15 @@ class Preamble:
|
||||
the list table.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no no matching state in module preamble_rest.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action is None:
|
||||
sys.stderr.write(
|
||||
'no matching state in module preamble_rest.py\n' + self.__state + '\n')
|
||||
action(line)
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "preamble_div.data")
|
||||
|
@ -11,43 +11,44 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
|
||||
class Sections:
|
||||
"""
|
||||
=================
|
||||
Purpose
|
||||
=================
|
||||
Write section tags for a tokenized file. (This module won't be any use to use
|
||||
to you unless you use it as part of the other modules.)
|
||||
---------------
|
||||
logic
|
||||
---------------
|
||||
The tags for the first section breaks have already been written.
|
||||
RTF stores section breaks with the \sect tag. Each time this tag is
|
||||
encountered, add one to the counter.
|
||||
When I encounter the \sectd tag, I want to collect all the appropriate tokens
|
||||
that describe the section. When I reach a \pard, I know I an stop collecting
|
||||
tokens and write the section tags.
|
||||
The exception to this method occurs when sections occur in field blocks, such
|
||||
as the index. Normally, two section break occur within the index and other
|
||||
field-blocks. (If less or more section breaks occurr, this code may not work.)
|
||||
I want the sections to occurr outside of the index. That is, the index
|
||||
should be nested inside one section tag. After the index is complete, a new
|
||||
section should begin.
|
||||
In order to write the sections outside of the field blocks, I have to store
|
||||
all of the field block as a string. When I ecounter the \sect tag, add one to
|
||||
the section counter, but store this number in a list. Likewise, store the
|
||||
information describing the section in another list.
|
||||
When I reach the end of the field block, choose the first item from the
|
||||
numbered list as the section number. Choose the first item in the description
|
||||
list as the values and attributes of the section. Enclose the field string
|
||||
between the section tags.
|
||||
Start a new section outside the field-block strings. Use the second number in
|
||||
the list; use the second item in the description list.
|
||||
CHANGE (2004-04-26) No longer write sections that occurr in field-blocks.
|
||||
Instead, ingore all section information in a field-block.
|
||||
=================
|
||||
Purpose
|
||||
=================
|
||||
Write section tags for a tokenized file. (This module won't be of any use
|
||||
to you unless you use it as part of the other modules.)
|
||||
---------------
|
||||
logic
|
||||
---------------
|
||||
The tags for the first section breaks have already been written.
|
||||
RTF stores section breaks with the \sect tag. Each time this tag is
|
||||
encountered, add one to the counter.
|
||||
When I encounter the \sectd tag, I want to collect all the appropriate tokens
|
||||
that describe the section. When I reach a \pard, I know I can stop collecting
|
||||
tokens and write the section tags.
|
||||
The exception to this method occurs when sections occur in field blocks, such
|
||||
as the index. Normally, two section breaks occur within the index and other
|
||||
field-blocks. (If fewer or more section breaks occur, this code may not work.)
|
||||
I want the sections to occur outside of the index. That is, the index
|
||||
should be nested inside one section tag. After the index is complete, a new
|
||||
section should begin.
|
||||
In order to write the sections outside of the field blocks, I have to store
|
||||
all of the field block as a string. When I encounter the \sect tag, add one to
|
||||
the section counter, but store this number in a list. Likewise, store the
|
||||
information describing the section in another list.
|
||||
When I reach the end of the field block, choose the first item from the
|
||||
numbered list as the section number. Choose the first item in the description
|
||||
list as the values and attributes of the section. Enclose the field string
|
||||
between the section tags.
|
||||
Start a new section outside the field-block strings. Use the second number in
|
||||
the list; use the second item in the description list.
|
||||
CHANGE (2004-04-26) No longer write sections that occur in field-blocks.
|
||||
Instead, ignore all section information in a field-block.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_file,
|
||||
|
Loading…
x
Reference in New Issue
Block a user