mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Global overhaul of rtf2xml : RTF fixes (1)
This commit is contained in:
parent
8512f57866
commit
09c8f13a1f
@ -50,6 +50,7 @@ class RTFInput(InputFormatPlugin):
|
||||
parser = ParseRtf(
|
||||
in_file = stream,
|
||||
out_file = ofile,
|
||||
#deb_dir = 'I:\\Calibre\\rtfdebug',
|
||||
# Convert symbol fonts to unicode equivalents. Default
|
||||
# is 1
|
||||
convert_symbol = 1,
|
||||
|
@ -143,7 +143,7 @@ class ParseRtf:
|
||||
self.__convert_wingdings = convert_wingdings
|
||||
self.__convert_zapf = convert_zapf
|
||||
self.__run_level = run_level
|
||||
self.__exit_level = 0
|
||||
#self.__exit_level = 0
|
||||
self.__indent = indent
|
||||
self.__replace_illegals = replace_illegals
|
||||
self.__form_lists = form_lists
|
||||
@ -162,8 +162,7 @@ class ParseRtf:
|
||||
elif os.path.exists(the_file):
|
||||
pass # do nothing
|
||||
else:
|
||||
message = "\nThe file '%s' cannot be found" % the_file
|
||||
msg = message
|
||||
msg = "\nThe file '%s' cannot be found" % the_file
|
||||
raise RtfInvalidCodeException, msg
|
||||
def __check_dir(self, the_dir):
|
||||
"""Check to see if directory exists"""
|
||||
@ -180,8 +179,7 @@ class ParseRtf:
|
||||
test = codecs.open(the_file, 'r', 'ascii', 'strict')
|
||||
test.close()
|
||||
except UnicodeError:
|
||||
message= "\n%s is not a correct ascii file" % the_file
|
||||
msg = message
|
||||
msg = "\n%s is not a correct ascii file" % the_file
|
||||
raise RtfInvalidCodeException, msg
|
||||
return 1
|
||||
def parse_rtf(self):
|
||||
@ -204,27 +202,29 @@ class ParseRtf:
|
||||
copy_obj.set_dir(self.__debug_dir)
|
||||
copy_obj.remove_files()
|
||||
copy_obj.copy_file(self.__temp_file, "original_file")
|
||||
# new as of 2005-08-02. Do I want this?
|
||||
# Function to check if bracket are well handled
|
||||
if self.__debug_dir or self.__run_level > 2:
|
||||
self.__check_brack_obj = check_brackets.CheckBrackets\
|
||||
(file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
# convert Macintosh line endings to Unix line endings
|
||||
# convert Macintosh and Windows line endings to Unix line endings
|
||||
#why do this if you don't wb after?
|
||||
line_obj = line_endings.FixLineEndings(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,
|
||||
#run_level = self.__run_level,
|
||||
replace_illegals = self.__replace_illegals,
|
||||
)
|
||||
return_value = line_obj.fix_endings()
|
||||
self.__return_code(return_value)
|
||||
line_obj.fix_endings()
|
||||
#return_value = line_obj.fix_endings() #calibre: no return in this function, why keep it?
|
||||
#self.__return_code(return_value)
|
||||
tokenize_obj = tokenize.Tokenize(
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
in_file = self.__temp_file,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,)
|
||||
copy = self.__copy,)
|
||||
#run_level = self.__run_level,)
|
||||
tokenize_obj.tokenize()
|
||||
process_tokens_obj = process_tokens.ProcessTokens(
|
||||
in_file = self.__temp_file,
|
||||
@ -529,36 +529,27 @@ class ParseRtf:
|
||||
)
|
||||
output_obj.output()
|
||||
os.remove(self.__temp_file)
|
||||
return self.__exit_level
|
||||
#return self.__exit_level
|
||||
def __bracket_match(self, file_name):
|
||||
if self.__run_level > 2:
|
||||
good_br, msg = self.__check_brack_obj.check_brackets()
|
||||
if good_br:
|
||||
pass
|
||||
# sys.stderr.write( msg + ' in ' + file_name + "\n")
|
||||
#sys.stderr.write( msg + ' in ' + file_name + "\n")
|
||||
else:
|
||||
msg += msg + " in file '" + file_name + "'\n"
|
||||
raise RtfInvalidCodeException, msg
|
||||
def __return_code(self, num):
|
||||
if num == None:
|
||||
return
|
||||
if int(num) > self.__exit_level:
|
||||
self.__exit_level = num
|
||||
#def __return_code(self, num): calibre not used
|
||||
# if num == None:
|
||||
# return
|
||||
# if int(num) > self.__exit_level:
|
||||
# self.__exit_level = num
|
||||
def __make_temp_file(self,file):
|
||||
"""Make a temporary file to parse"""
|
||||
write_file="rtf_write_file"
|
||||
read_obj = file if hasattr(file, 'read') else open(file,'r')
|
||||
write_obj = open(write_file, 'w')
|
||||
line = "dummy"
|
||||
while line:
|
||||
line = read_obj.read(1000)
|
||||
write_obj.write(line )
|
||||
for line in read_obj:
|
||||
write_obj.write(line)
|
||||
write_obj.close()
|
||||
return write_file
|
||||
"""
|
||||
mi<tg<open______<style-sheet\n
|
||||
mi<tg<close_____<style-sheet\n
|
||||
mi<tg<open-att__<footnote<num>1\n
|
||||
mi<tg<empty-att_<page-definition<margin>33\n
|
||||
mi<tg<empty_____<para\n
|
||||
"""
|
||||
|
@ -34,18 +34,16 @@ class CheckBrackets:
|
||||
try:
|
||||
last_num = self.__open_bracket_num.pop()
|
||||
except:
|
||||
return 0
|
||||
return False
|
||||
if num != last_num:
|
||||
return 0
|
||||
return False
|
||||
self.__bracket_count -= 1
|
||||
return 1
|
||||
return True
|
||||
def check_brackets(self):
|
||||
read_obj = open(self.__file, 'r')
|
||||
line = 'dummy'
|
||||
line_count = 0
|
||||
while line:
|
||||
for line in read_obj:
|
||||
line_count += 1
|
||||
line = read_obj.readline()
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.open_brack(line)
|
||||
|
@ -23,45 +23,33 @@ class FixLineEndings:
|
||||
bug_handler,
|
||||
in_file = None,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
#run_level = 1, calibre why keep it?
|
||||
replace_illegals = 1,
|
||||
):
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__replace_illegals = replace_illegals
|
||||
def fix_endings(self):
|
||||
##tempFileName = tempfile.mktemp()
|
||||
illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
|
||||
#nums = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16, 17, 18, 19]
|
||||
"""
|
||||
read_obj = open(self.__file, 'r')
|
||||
line = read_obj.read(1000)
|
||||
regexp = re.compile(r"\r")
|
||||
macintosh = regexp.search(line)
|
||||
read_obj.close()
|
||||
"""
|
||||
# always check since I have to get rid of illegal characters
|
||||
macintosh = 1
|
||||
if macintosh:
|
||||
line = 1
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__write_to, 'w')
|
||||
while line:
|
||||
line = read_obj.read(1000)
|
||||
# line = re.sub(regexp,"\n",line)
|
||||
line = line.replace ('\r', '\n')
|
||||
if self.__replace_illegals:
|
||||
line = re.sub(illegal_regx, '', line)
|
||||
# for num in nums:
|
||||
# line = line.replace(chr(num), '')
|
||||
write_obj.write(line )
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "line_endings.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
#read
|
||||
read_obj = open(self.__file, 'r')
|
||||
input_file = read_obj.read()
|
||||
read_obj.close()
|
||||
#calibre go from win and mac to unix
|
||||
input_file = input_file.replace ('\r\n', '\n')
|
||||
input_file = input_file.replace ('\r', '\n')
|
||||
if self.__replace_illegals:
|
||||
input_file = re.sub(illegal_regx, '', input_file)
|
||||
#write
|
||||
write_obj = open(self.__write_to, 'wb')
|
||||
write_obj.write(input_file)
|
||||
write_obj.close()
|
||||
#copy
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "line_endings.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
|
@ -645,10 +645,8 @@ class ProcessTokens:
|
||||
return 'tx<nu<__________<%s\n' % token
|
||||
def ob_func(self, pre, token, num=None):
|
||||
self.__bracket_count += 1
|
||||
##return 'ob<%04d\n' % self.__bracket_count
|
||||
return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
|
||||
def cb_func(self, pre, token, num=None):
|
||||
##line = 'cb<%04d\n' % self.__bracket_count
|
||||
line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
|
||||
self.__bracket_count -= 1
|
||||
return line
|
||||
|
@ -23,7 +23,7 @@ class Tokenize:
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
#run_level = 1,
|
||||
):
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
@ -80,7 +80,7 @@ class Tokenize:
|
||||
def __create_tokens(self):
|
||||
self.__compile_expressions()
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__write_to, 'w')
|
||||
write_obj = open(self.__write_to, 'wb')
|
||||
line_to_read = "dummy"
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
@ -106,7 +106,7 @@ class Tokenize:
|
||||
write_obj.close()
|
||||
def tokenize(self):
|
||||
"""Main class for handling other methods. Reads in one line \
|
||||
at a time, usues method self.sub_line to make basic substitutions,\
|
||||
at a time, uses method self.sub_line to make basic substitutions,\
|
||||
uses ? to process tokens"""
|
||||
self.__create_tokens()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
|
@ -4,8 +4,7 @@
|
||||
Read content from txt file.
|
||||
'''
|
||||
|
||||
import os
|
||||
import re
|
||||
import os, re
|
||||
|
||||
from calibre import prepare_string_for_xml
|
||||
from calibre.ebooks.markdown import markdown
|
||||
|
Loading…
x
Reference in New Issue
Block a user