Global overhaul of rtf2xml: RTF fixes (1)

This commit is contained in:
Sengian 2010-07-31 10:47:12 +02:00
parent 8512f57866
commit 09c8f13a1f
7 changed files with 52 additions and 77 deletions

View File

@ -50,6 +50,7 @@ class RTFInput(InputFormatPlugin):
parser = ParseRtf( parser = ParseRtf(
in_file = stream, in_file = stream,
out_file = ofile, out_file = ofile,
#deb_dir = 'I:\\Calibre\\rtfdebug',
# Convert symbol fonts to unicode equivalents. Default # Convert symbol fonts to unicode equivalents. Default
# is 1 # is 1
convert_symbol = 1, convert_symbol = 1,

View File

@ -143,7 +143,7 @@ class ParseRtf:
self.__convert_wingdings = convert_wingdings self.__convert_wingdings = convert_wingdings
self.__convert_zapf = convert_zapf self.__convert_zapf = convert_zapf
self.__run_level = run_level self.__run_level = run_level
self.__exit_level = 0 #self.__exit_level = 0
self.__indent = indent self.__indent = indent
self.__replace_illegals = replace_illegals self.__replace_illegals = replace_illegals
self.__form_lists = form_lists self.__form_lists = form_lists
@ -162,8 +162,7 @@ class ParseRtf:
elif os.path.exists(the_file): elif os.path.exists(the_file):
pass # do nothing pass # do nothing
else: else:
message = "\nThe file '%s' cannot be found" % the_file msg = "\nThe file '%s' cannot be found" % the_file
msg = message
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
def __check_dir(self, the_dir): def __check_dir(self, the_dir):
"""Check to see if directory exists""" """Check to see if directory exists"""
@ -180,8 +179,7 @@ class ParseRtf:
test = codecs.open(the_file, 'r', 'ascii', 'strict') test = codecs.open(the_file, 'r', 'ascii', 'strict')
test.close() test.close()
except UnicodeError: except UnicodeError:
message= "\n%s is not a correct ascii file" % the_file msg = "\n%s is not a correct ascii file" % the_file
msg = message
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
return 1 return 1
def parse_rtf(self): def parse_rtf(self):
@ -204,27 +202,29 @@ class ParseRtf:
copy_obj.set_dir(self.__debug_dir) copy_obj.set_dir(self.__debug_dir)
copy_obj.remove_files() copy_obj.remove_files()
copy_obj.copy_file(self.__temp_file, "original_file") copy_obj.copy_file(self.__temp_file, "original_file")
# new as of 2005-08-02. Do I want this? # Function to check if bracket are well handled
if self.__debug_dir or self.__run_level > 2: if self.__debug_dir or self.__run_level > 2:
self.__check_brack_obj = check_brackets.CheckBrackets\ self.__check_brack_obj = check_brackets.CheckBrackets\
(file = self.__temp_file, (file = self.__temp_file,
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
) )
# convert Macintosh line endings to Unix line endings # convert Macintosh and Windows line endings to Unix line endings
#why do this if you don't wb after?
line_obj = line_endings.FixLineEndings( line_obj = line_endings.FixLineEndings(
in_file = self.__temp_file, in_file = self.__temp_file,
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
copy = self.__copy, copy = self.__copy,
run_level = self.__run_level, #run_level = self.__run_level,
replace_illegals = self.__replace_illegals, replace_illegals = self.__replace_illegals,
) )
return_value = line_obj.fix_endings() line_obj.fix_endings()
self.__return_code(return_value) #return_value = line_obj.fix_endings() #calibre: no return in this function, why keep it?
#self.__return_code(return_value)
tokenize_obj = tokenize.Tokenize( tokenize_obj = tokenize.Tokenize(
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
in_file = self.__temp_file, in_file = self.__temp_file,
copy = self.__copy, copy = self.__copy,)
run_level = self.__run_level,) #run_level = self.__run_level,)
tokenize_obj.tokenize() tokenize_obj.tokenize()
process_tokens_obj = process_tokens.ProcessTokens( process_tokens_obj = process_tokens.ProcessTokens(
in_file = self.__temp_file, in_file = self.__temp_file,
@ -529,7 +529,7 @@ class ParseRtf:
) )
output_obj.output() output_obj.output()
os.remove(self.__temp_file) os.remove(self.__temp_file)
return self.__exit_level #return self.__exit_level
def __bracket_match(self, file_name): def __bracket_match(self, file_name):
if self.__run_level > 2: if self.__run_level > 2:
good_br, msg = self.__check_brack_obj.check_brackets() good_br, msg = self.__check_brack_obj.check_brackets()
@ -539,26 +539,17 @@ class ParseRtf:
else: else:
msg += msg + " in file '" + file_name + "'\n" msg += msg + " in file '" + file_name + "'\n"
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
def __return_code(self, num): #def __return_code(self, num): calibre not used
if num == None: # if num == None:
return # return
if int(num) > self.__exit_level: # if int(num) > self.__exit_level:
self.__exit_level = num # self.__exit_level = num
def __make_temp_file(self,file): def __make_temp_file(self,file):
"""Make a temporary file to parse""" """Make a temporary file to parse"""
write_file="rtf_write_file" write_file="rtf_write_file"
read_obj = file if hasattr(file, 'read') else open(file,'r') read_obj = file if hasattr(file, 'read') else open(file,'r')
write_obj = open(write_file, 'w') write_obj = open(write_file, 'w')
line = "dummy" for line in read_obj:
while line:
line = read_obj.read(1000)
write_obj.write(line) write_obj.write(line)
write_obj.close() write_obj.close()
return write_file return write_file
"""
mi<tg<open______<style-sheet\n
mi<tg<close_____<style-sheet\n
mi<tg<open-att__<footnote<num>1\n
mi<tg<empty-att_<page-definition<margin>33\n
mi<tg<empty_____<para\n
"""

View File

@ -34,18 +34,16 @@ class CheckBrackets:
try: try:
last_num = self.__open_bracket_num.pop() last_num = self.__open_bracket_num.pop()
except: except:
return 0 return False
if num != last_num: if num != last_num:
return 0 return False
self.__bracket_count -= 1 self.__bracket_count -= 1
return 1 return True
def check_brackets(self): def check_brackets(self):
read_obj = open(self.__file, 'r') read_obj = open(self.__file, 'r')
line = 'dummy'
line_count = 0 line_count = 0
while line: for line in read_obj:
line_count += 1 line_count += 1
line = read_obj.readline()
self.__token_info = line[:16] self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack': if self.__token_info == 'ob<nu<open-brack':
self.open_brack(line) self.open_brack(line)

View File

@ -23,43 +23,31 @@ class FixLineEndings:
bug_handler, bug_handler,
in_file = None, in_file = None,
copy = None, copy = None,
run_level = 1, #run_level = 1, calibre why keep it?
replace_illegals = 1, replace_illegals = 1,
): ):
self.__file = in_file self.__file = in_file
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
self.__copy = copy self.__copy = copy
self.__run_level = run_level
self.__write_to = tempfile.mktemp() self.__write_to = tempfile.mktemp()
self.__replace_illegals = replace_illegals self.__replace_illegals = replace_illegals
def fix_endings(self): def fix_endings(self):
##tempFileName = tempfile.mktemp()
illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13') illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
#nums = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16, 17, 18, 19]
"""
read_obj = open(self.__file, 'r')
line = read_obj.read(1000)
regexp = re.compile(r"\r")
macintosh = regexp.search(line)
read_obj.close()
"""
# always check since I have to get rid of illegal characters # always check since I have to get rid of illegal characters
macintosh = 1 #read
if macintosh:
line = 1
read_obj = open(self.__file, 'r') read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'w') input_file = read_obj.read()
while line:
line = read_obj.read(1000)
# line = re.sub(regexp,"\n",line)
line = line.replace ('\r', '\n')
if self.__replace_illegals:
line = re.sub(illegal_regx, '', line)
# for num in nums:
# line = line.replace(chr(num), '')
write_obj.write(line )
read_obj.close() read_obj.close()
#calibre go from win and mac to unix
input_file = input_file.replace ('\r\n', '\n')
input_file = input_file.replace ('\r', '\n')
if self.__replace_illegals:
input_file = re.sub(illegal_regx, '', input_file)
#write
write_obj = open(self.__write_to, 'wb')
write_obj.write(input_file)
write_obj.close() write_obj.close()
#copy
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "line_endings.data") copy_obj.copy_file(self.__write_to, "line_endings.data")

View File

@ -645,10 +645,8 @@ class ProcessTokens:
return 'tx<nu<__________<%s\n' % token return 'tx<nu<__________<%s\n' % token
def ob_func(self, pre, token, num=None): def ob_func(self, pre, token, num=None):
self.__bracket_count += 1 self.__bracket_count += 1
##return 'ob<%04d\n' % self.__bracket_count
return 'ob<nu<open-brack<%04d\n' % self.__bracket_count return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
def cb_func(self, pre, token, num=None): def cb_func(self, pre, token, num=None):
##line = 'cb<%04d\n' % self.__bracket_count
line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
self.__bracket_count -= 1 self.__bracket_count -= 1
return line return line

View File

@ -23,7 +23,7 @@ class Tokenize:
in_file, in_file,
bug_handler, bug_handler,
copy = None, copy = None,
run_level = 1, #run_level = 1,
): ):
self.__file = in_file self.__file = in_file
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
@ -80,7 +80,7 @@ class Tokenize:
def __create_tokens(self): def __create_tokens(self):
self.__compile_expressions() self.__compile_expressions()
read_obj = open(self.__file, 'r') read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'w') write_obj = open(self.__write_to, 'wb')
line_to_read = "dummy" line_to_read = "dummy"
while line_to_read: while line_to_read:
line_to_read = read_obj.readline() line_to_read = read_obj.readline()
@ -106,7 +106,7 @@ class Tokenize:
write_obj.close() write_obj.close()
def tokenize(self): def tokenize(self):
"""Main class for handling other methods. Reads in one line \ """Main class for handling other methods. Reads in one line \
at a time, usues method self.sub_line to make basic substitutions,\ at a time, uses method self.sub_line to make basic substitutions,\
uses ? to process tokens""" uses ? to process tokens"""
self.__create_tokens() self.__create_tokens()
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)

View File

@ -4,8 +4,7 @@
Read content from txt file. Read content from txt file.
''' '''
import os import os, re
import re
from calibre import prepare_string_for_xml from calibre import prepare_string_for_xml
from calibre.ebooks.markdown import markdown from calibre.ebooks.markdown import markdown