Global overhaul of rtf2xml : RTF fixes (1)

This commit is contained in:
Sengian 2010-07-31 10:47:12 +02:00
parent 8512f57866
commit 09c8f13a1f
7 changed files with 52 additions and 77 deletions

View File

@ -50,6 +50,7 @@ class RTFInput(InputFormatPlugin):
parser = ParseRtf(
in_file = stream,
out_file = ofile,
#deb_dir = 'I:\\Calibre\\rtfdebug',
# Convert symbol fonts to unicode equivalents. Default
# is 1
convert_symbol = 1,

View File

@ -143,7 +143,7 @@ class ParseRtf:
self.__convert_wingdings = convert_wingdings
self.__convert_zapf = convert_zapf
self.__run_level = run_level
self.__exit_level = 0
#self.__exit_level = 0
self.__indent = indent
self.__replace_illegals = replace_illegals
self.__form_lists = form_lists
@ -162,8 +162,7 @@ class ParseRtf:
elif os.path.exists(the_file):
pass # do nothing
else:
message = "\nThe file '%s' cannot be found" % the_file
msg = message
msg = "\nThe file '%s' cannot be found" % the_file
raise RtfInvalidCodeException, msg
def __check_dir(self, the_dir):
"""Check to see if directory exists"""
@ -180,8 +179,7 @@ class ParseRtf:
test = codecs.open(the_file, 'r', 'ascii', 'strict')
test.close()
except UnicodeError:
message= "\n%s is not a correct ascii file" % the_file
msg = message
msg = "\n%s is not a correct ascii file" % the_file
raise RtfInvalidCodeException, msg
return 1
def parse_rtf(self):
@ -204,27 +202,29 @@ class ParseRtf:
copy_obj.set_dir(self.__debug_dir)
copy_obj.remove_files()
copy_obj.copy_file(self.__temp_file, "original_file")
# new as of 2005-08-02. Do I want this?
# Function to check that brackets are well handled
if self.__debug_dir or self.__run_level > 2:
self.__check_brack_obj = check_brackets.CheckBrackets\
(file = self.__temp_file,
bug_handler = RtfInvalidCodeException,
)
# convert Macintosh line endings to Unix line endings
# convert Macintosh and Windows line endings to Unix line endings
#why do this if you don't wb after?
line_obj = line_endings.FixLineEndings(
in_file = self.__temp_file,
bug_handler = RtfInvalidCodeException,
copy = self.__copy,
run_level = self.__run_level,
#run_level = self.__run_level,
replace_illegals = self.__replace_illegals,
)
return_value = line_obj.fix_endings()
self.__return_code(return_value)
line_obj.fix_endings()
#return_value = line_obj.fix_endings() #calibre: no return in this function, why keep it?
#self.__return_code(return_value)
tokenize_obj = tokenize.Tokenize(
bug_handler = RtfInvalidCodeException,
in_file = self.__temp_file,
copy = self.__copy,
run_level = self.__run_level,)
copy = self.__copy,)
#run_level = self.__run_level,)
tokenize_obj.tokenize()
process_tokens_obj = process_tokens.ProcessTokens(
in_file = self.__temp_file,
@ -529,7 +529,7 @@ class ParseRtf:
)
output_obj.output()
os.remove(self.__temp_file)
return self.__exit_level
#return self.__exit_level
def __bracket_match(self, file_name):
if self.__run_level > 2:
good_br, msg = self.__check_brack_obj.check_brackets()
@ -539,26 +539,17 @@ class ParseRtf:
else:
msg += msg + " in file '" + file_name + "'\n"
raise RtfInvalidCodeException, msg
def __return_code(self, num):
if num == None:
return
if int(num) > self.__exit_level:
self.__exit_level = num
#def __return_code(self, num): calibre not used
# if num == None:
# return
# if int(num) > self.__exit_level:
# self.__exit_level = num
def __make_temp_file(self,file):
"""Make a temporary file to parse"""
write_file="rtf_write_file"
read_obj = file if hasattr(file, 'read') else open(file,'r')
write_obj = open(write_file, 'w')
line = "dummy"
while line:
line = read_obj.read(1000)
for line in read_obj:
write_obj.write(line)
write_obj.close()
return write_file
"""
mi<tg<open______<style-sheet\n
mi<tg<close_____<style-sheet\n
mi<tg<open-att__<footnote<num>1\n
mi<tg<empty-att_<page-definition<margin>33\n
mi<tg<empty_____<para\n
"""

View File

@ -34,18 +34,16 @@ class CheckBrackets:
try:
last_num = self.__open_bracket_num.pop()
except:
return 0
return False
if num != last_num:
return 0
return False
self.__bracket_count -= 1
return 1
return True
def check_brackets(self):
read_obj = open(self.__file, 'r')
line = 'dummy'
line_count = 0
while line:
for line in read_obj:
line_count += 1
line = read_obj.readline()
self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack':
self.open_brack(line)

View File

@ -23,43 +23,31 @@ class FixLineEndings:
bug_handler,
in_file = None,
copy = None,
run_level = 1,
#run_level = 1, calibre why keep it?
replace_illegals = 1,
):
self.__file = in_file
self.__bug_handler = bug_handler
self.__copy = copy
self.__run_level = run_level
self.__write_to = tempfile.mktemp()
self.__replace_illegals = replace_illegals
def fix_endings(self):
##tempFileName = tempfile.mktemp()
illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
#nums = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16, 17, 18, 19]
"""
read_obj = open(self.__file, 'r')
line = read_obj.read(1000)
regexp = re.compile(r"\r")
macintosh = regexp.search(line)
read_obj.close()
"""
# always check since I have to get rid of illegal characters
macintosh = 1
if macintosh:
line = 1
#read
read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'w')
while line:
line = read_obj.read(1000)
# line = re.sub(regexp,"\n",line)
line = line.replace ('\r', '\n')
if self.__replace_illegals:
line = re.sub(illegal_regx, '', line)
# for num in nums:
# line = line.replace(chr(num), '')
write_obj.write(line )
input_file = read_obj.read()
read_obj.close()
#calibre: convert Windows and Mac line endings to Unix
input_file = input_file.replace ('\r\n', '\n')
input_file = input_file.replace ('\r', '\n')
if self.__replace_illegals:
input_file = re.sub(illegal_regx, '', input_file)
#write
write_obj = open(self.__write_to, 'wb')
write_obj.write(input_file)
write_obj.close()
#copy
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "line_endings.data")

View File

@ -645,10 +645,8 @@ class ProcessTokens:
return 'tx<nu<__________<%s\n' % token
def ob_func(self, pre, token, num=None):
self.__bracket_count += 1
##return 'ob<%04d\n' % self.__bracket_count
return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
def cb_func(self, pre, token, num=None):
##line = 'cb<%04d\n' % self.__bracket_count
line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
self.__bracket_count -= 1
return line

View File

@ -23,7 +23,7 @@ class Tokenize:
in_file,
bug_handler,
copy = None,
run_level = 1,
#run_level = 1,
):
self.__file = in_file
self.__bug_handler = bug_handler
@ -80,7 +80,7 @@ class Tokenize:
def __create_tokens(self):
self.__compile_expressions()
read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'w')
write_obj = open(self.__write_to, 'wb')
line_to_read = "dummy"
while line_to_read:
line_to_read = read_obj.readline()
@ -106,7 +106,7 @@ class Tokenize:
write_obj.close()
def tokenize(self):
"""Main class for handling other methods. Reads in one line \
at a time, usues method self.sub_line to make basic substitutions,\
at a time, uses method self.sub_line to make basic substitutions,\
uses ? to process tokens"""
self.__create_tokens()
copy_obj = copy.Copy(bug_handler = self.__bug_handler)

View File

@ -4,8 +4,7 @@
Read content from txt file.
'''
import os
import re
import os, re
from calibre import prepare_string_for_xml
from calibre.ebooks.markdown import markdown