Handle improper \*\csN in body without braces

This commit is contained in:
Sengian 2011-01-16 00:47:01 +01:00
parent 2e033022b7
commit fc42efda42
4 changed files with 39 additions and 24 deletions

View File

@ -375,7 +375,7 @@ class ParseRtf:
old_rtf = old_rtf_obj.check_if_old_rtf()
if old_rtf:
if self.__run_level > 5:
msg = 'older RTF\n'
msg = 'Older RTF\n'
msg += 'self.__run_level is "%s"\n' % self.__run_level
raise RtfInvalidCodeException, msg
if self.__run_level > 1:

View File

@ -48,6 +48,7 @@ class DeleteInfo:
'cw<it<listtable_',
'cw<it<revi-table',
'cw<ls<list-lev-d',
# Field allowed
'cw<fd<field-inst',
'cw<an<book-mk-st',
'cw<an<book-mk-en',
@ -86,7 +87,7 @@ class DeleteInfo:
self.__ob = line
return False
else:
# write previous bracket, since didn't fine asterisk
# write previous bracket, since didn't find asterisk
if self.__ob:
self.__write_obj.write(self.__ob)
self.__ob = 0
@ -109,7 +110,7 @@ class DeleteInfo:
If you find that you are in a delete group, and the previous
token is not an open bracket (self.__ob = 0), that means
that the delete group is nested inside another acceptable
detination group. In this case, you have alrady written
destination group. In this case, you have already written
the open bracket, so you will need to write the closed one
as well.
"""

View File

@ -15,8 +15,10 @@
# #
# #
#########################################################################
import sys, os, tempfile, re
import sys, os, tempfile, re
from calibre.ebooks.rtf2xml import field_strings, copy
class FieldsSmall:
"""
=================
@ -24,7 +26,7 @@ Purpose
=================
Write tags for bookmarks, index and toc entry fields in a tokenized file.
This module does not handle toc or index tables. (This module won't be any
use to use to you unless you use it as part of the other modules.)
use to you unless you use it as part of the other modules.)
-----------
Method
-----------
@ -55,6 +57,7 @@ file.
self.__copy = copy
self.__write_to = tempfile.mktemp()
self.__run_level = run_level
def __initiate_values(self):
"""
Initiate all values.
@ -81,6 +84,7 @@ file.
tx = 'tx<nu<__________<(.*?)'
reg_st = ob + bk_st + tx + cb
self.__book_start = re.compile(r'%s' % reg_st)
def __before_body_func(self, line):
"""
Requires:
@ -94,6 +98,7 @@ file.
if self.__token_info == 'mi<mk<body-open_':
self.__state = 'body'
self.__write_obj.write(line)
def __body_func(self, line):
"""
Requires:
@ -110,6 +115,7 @@ file.
action(line, tag)
else:
self.__write_obj.write(line)
def __found_bookmark_func(self, line, tag):
"""
Requires:
@ -125,6 +131,7 @@ file.
self.__cb_count = 0
self.__state = 'bookmark'
self.__type_of_bookmark = tag
def __bookmark_func(self, line):
"""
Requires:
@ -153,6 +160,7 @@ file.
self.__write_obj.write(line)
elif line[0:2] == 'tx':
self.__text_string += line[17:-1]
def __parse_index_func(self, my_string):
"""
Requires:
@ -201,6 +209,7 @@ file.
my_changed_string += '<sub-entry>%s' % sub_entry
my_changed_string += '\n'
return my_changed_string
def __index_see_func(self, my_string):
in_see = 0
bracket_count = 0
@ -226,6 +235,7 @@ file.
in_see = 1
changed_string += '%s\n' % line
return changed_string, see_string
def __index_bookmark_func(self, my_string):
"""
Requires:
@ -262,6 +272,7 @@ file.
in_bookmark = 1
index_string += '%s\n' % line
return index_string, bookmark_string
def __index__format_func(self, my_string):
italics = 0
bold =0
@ -273,6 +284,7 @@ file.
if token_info == 'cw<in<index-ital':
italics = 1
return italics, bold
def __parse_toc_func(self, my_string):
"""
Requires:
@ -308,6 +320,7 @@ file.
my_changed_string += '<main-entry>%s' % main_entry
my_changed_string += '\n'
return my_changed_string
def __parse_bookmark_for_toc(self, my_string):
"""
Requires:
@ -353,6 +366,7 @@ file.
in_bookmark = 1
toc_string += '%s\n' % line
return toc_string, book_start_string, book_end_string
def __parse_bookmark_func(self, my_string, type):
"""
Requires:
@ -367,6 +381,7 @@ file.
my_changed_string = ('mi<tg<empty-att_<field<type>%s'
'<number>%s<update>none\n' % (type, my_string))
return my_changed_string
def __found_toc_index_func(self, line, tag):
"""
Requires:
@ -382,6 +397,7 @@ file.
self.__cb_count = 0
self.__state = 'toc_index'
self.__tag = tag
def __toc_index_func(self, line):
"""
Requires:
@ -409,6 +425,7 @@ file.
self.__write_obj.write(line)
else:
self.__text_string += line
def fix_fields(self):
"""
Requires:
@ -423,24 +440,19 @@ file.
bookmark.
"""
self.__initiate_values()
read_obj = open(self.__file)
self.__write_obj = open(self.__write_to, 'w')
line_to_read = '1'
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack':
self.__ob_count = line[-5:-1]
if self.__token_info == 'cb<nu<clos-brack':
self.__cb_count = line[-5:-1]
action = self.__state_dict.get(self.__state)
if action == None:
sys.stderr.write('no no matching state in module fields_small.py\n')
sys.stderr.write(self.__state + '\n')
action(line)
read_obj.close()
self.__write_obj.close()
with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as self.__write_obj:
for line in read_obj:
self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack':
self.__ob_count = line[-5:-1]
if self.__token_info == 'cb<nu<clos-brack':
self.__cb_count = line[-5:-1]
action = self.__state_dict.get(self.__state)
if action is None:
sys.stderr.write('No matching state in module fields_small.py\n')
sys.stderr.write(self.__state + '\n')
action(line)
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "fields_small.data")

View File

@ -115,8 +115,8 @@ class Tokenize:
def __sub_reg_split(self,input_file):
input_file = self.__replace_spchar.mreplace(input_file)
# this is for older RTF
input_file = self.__par_exp.sub('\n\\par \n', input_file)
input_file = self.__cs_ast.sub("\g<1>", input_file)
input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
#remove \n in bin data
@ -172,6 +172,8 @@ class Tokenize:
self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
#this is for old RTF
self.__par_exp = re.compile(r'\\\n+')
# handle an improper \csN char-style that is preceded by \* but not enclosed in braces
self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)')
# self.__par_exp = re.compile(r'\\$')
#self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
#self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")