mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Global overhaul of rtf2xml : RTF fixes (1)
This commit is contained in:
parent
8512f57866
commit
09c8f13a1f
@ -50,6 +50,7 @@ class RTFInput(InputFormatPlugin):
|
|||||||
parser = ParseRtf(
|
parser = ParseRtf(
|
||||||
in_file = stream,
|
in_file = stream,
|
||||||
out_file = ofile,
|
out_file = ofile,
|
||||||
|
#deb_dir = 'I:\\Calibre\\rtfdebug',
|
||||||
# Convert symbol fonts to unicode equivalents. Default
|
# Convert symbol fonts to unicode equivalents. Default
|
||||||
# is 1
|
# is 1
|
||||||
convert_symbol = 1,
|
convert_symbol = 1,
|
||||||
|
@ -143,7 +143,7 @@ class ParseRtf:
|
|||||||
self.__convert_wingdings = convert_wingdings
|
self.__convert_wingdings = convert_wingdings
|
||||||
self.__convert_zapf = convert_zapf
|
self.__convert_zapf = convert_zapf
|
||||||
self.__run_level = run_level
|
self.__run_level = run_level
|
||||||
self.__exit_level = 0
|
#self.__exit_level = 0
|
||||||
self.__indent = indent
|
self.__indent = indent
|
||||||
self.__replace_illegals = replace_illegals
|
self.__replace_illegals = replace_illegals
|
||||||
self.__form_lists = form_lists
|
self.__form_lists = form_lists
|
||||||
@ -162,8 +162,7 @@ class ParseRtf:
|
|||||||
elif os.path.exists(the_file):
|
elif os.path.exists(the_file):
|
||||||
pass # do nothing
|
pass # do nothing
|
||||||
else:
|
else:
|
||||||
message = "\nThe file '%s' cannot be found" % the_file
|
msg = "\nThe file '%s' cannot be found" % the_file
|
||||||
msg = message
|
|
||||||
raise RtfInvalidCodeException, msg
|
raise RtfInvalidCodeException, msg
|
||||||
def __check_dir(self, the_dir):
|
def __check_dir(self, the_dir):
|
||||||
"""Check to see if directory exists"""
|
"""Check to see if directory exists"""
|
||||||
@ -180,8 +179,7 @@ class ParseRtf:
|
|||||||
test = codecs.open(the_file, 'r', 'ascii', 'strict')
|
test = codecs.open(the_file, 'r', 'ascii', 'strict')
|
||||||
test.close()
|
test.close()
|
||||||
except UnicodeError:
|
except UnicodeError:
|
||||||
message= "\n%s is not a correct ascii file" % the_file
|
msg = "\n%s is not a correct ascii file" % the_file
|
||||||
msg = message
|
|
||||||
raise RtfInvalidCodeException, msg
|
raise RtfInvalidCodeException, msg
|
||||||
return 1
|
return 1
|
||||||
def parse_rtf(self):
|
def parse_rtf(self):
|
||||||
@ -204,27 +202,29 @@ class ParseRtf:
|
|||||||
copy_obj.set_dir(self.__debug_dir)
|
copy_obj.set_dir(self.__debug_dir)
|
||||||
copy_obj.remove_files()
|
copy_obj.remove_files()
|
||||||
copy_obj.copy_file(self.__temp_file, "original_file")
|
copy_obj.copy_file(self.__temp_file, "original_file")
|
||||||
# new as of 2005-08-02. Do I want this?
|
# Function to check if bracket are well handled
|
||||||
if self.__debug_dir or self.__run_level > 2:
|
if self.__debug_dir or self.__run_level > 2:
|
||||||
self.__check_brack_obj = check_brackets.CheckBrackets\
|
self.__check_brack_obj = check_brackets.CheckBrackets\
|
||||||
(file = self.__temp_file,
|
(file = self.__temp_file,
|
||||||
bug_handler = RtfInvalidCodeException,
|
bug_handler = RtfInvalidCodeException,
|
||||||
)
|
)
|
||||||
# convert Macintosh line endings to Unix line endings
|
# convert Macintosh and Windows line endings to Unix line endings
|
||||||
|
#why do this if you don't wb after?
|
||||||
line_obj = line_endings.FixLineEndings(
|
line_obj = line_endings.FixLineEndings(
|
||||||
in_file = self.__temp_file,
|
in_file = self.__temp_file,
|
||||||
bug_handler = RtfInvalidCodeException,
|
bug_handler = RtfInvalidCodeException,
|
||||||
copy = self.__copy,
|
copy = self.__copy,
|
||||||
run_level = self.__run_level,
|
#run_level = self.__run_level,
|
||||||
replace_illegals = self.__replace_illegals,
|
replace_illegals = self.__replace_illegals,
|
||||||
)
|
)
|
||||||
return_value = line_obj.fix_endings()
|
line_obj.fix_endings()
|
||||||
self.__return_code(return_value)
|
#return_value = line_obj.fix_endings() #calibre: no return in this function, why keep it?
|
||||||
|
#self.__return_code(return_value)
|
||||||
tokenize_obj = tokenize.Tokenize(
|
tokenize_obj = tokenize.Tokenize(
|
||||||
bug_handler = RtfInvalidCodeException,
|
bug_handler = RtfInvalidCodeException,
|
||||||
in_file = self.__temp_file,
|
in_file = self.__temp_file,
|
||||||
copy = self.__copy,
|
copy = self.__copy,)
|
||||||
run_level = self.__run_level,)
|
#run_level = self.__run_level,)
|
||||||
tokenize_obj.tokenize()
|
tokenize_obj.tokenize()
|
||||||
process_tokens_obj = process_tokens.ProcessTokens(
|
process_tokens_obj = process_tokens.ProcessTokens(
|
||||||
in_file = self.__temp_file,
|
in_file = self.__temp_file,
|
||||||
@ -529,7 +529,7 @@ class ParseRtf:
|
|||||||
)
|
)
|
||||||
output_obj.output()
|
output_obj.output()
|
||||||
os.remove(self.__temp_file)
|
os.remove(self.__temp_file)
|
||||||
return self.__exit_level
|
#return self.__exit_level
|
||||||
def __bracket_match(self, file_name):
|
def __bracket_match(self, file_name):
|
||||||
if self.__run_level > 2:
|
if self.__run_level > 2:
|
||||||
good_br, msg = self.__check_brack_obj.check_brackets()
|
good_br, msg = self.__check_brack_obj.check_brackets()
|
||||||
@ -539,26 +539,17 @@ class ParseRtf:
|
|||||||
else:
|
else:
|
||||||
msg += msg + " in file '" + file_name + "'\n"
|
msg += msg + " in file '" + file_name + "'\n"
|
||||||
raise RtfInvalidCodeException, msg
|
raise RtfInvalidCodeException, msg
|
||||||
def __return_code(self, num):
|
#def __return_code(self, num): calibre not used
|
||||||
if num == None:
|
# if num == None:
|
||||||
return
|
# return
|
||||||
if int(num) > self.__exit_level:
|
# if int(num) > self.__exit_level:
|
||||||
self.__exit_level = num
|
# self.__exit_level = num
|
||||||
def __make_temp_file(self,file):
|
def __make_temp_file(self,file):
|
||||||
"""Make a temporary file to parse"""
|
"""Make a temporary file to parse"""
|
||||||
write_file="rtf_write_file"
|
write_file="rtf_write_file"
|
||||||
read_obj = file if hasattr(file, 'read') else open(file,'r')
|
read_obj = file if hasattr(file, 'read') else open(file,'r')
|
||||||
write_obj = open(write_file, 'w')
|
write_obj = open(write_file, 'w')
|
||||||
line = "dummy"
|
for line in read_obj:
|
||||||
while line:
|
|
||||||
line = read_obj.read(1000)
|
|
||||||
write_obj.write(line)
|
write_obj.write(line)
|
||||||
write_obj.close()
|
write_obj.close()
|
||||||
return write_file
|
return write_file
|
||||||
"""
|
|
||||||
mi<tg<open______<style-sheet\n
|
|
||||||
mi<tg<close_____<style-sheet\n
|
|
||||||
mi<tg<open-att__<footnote<num>1\n
|
|
||||||
mi<tg<empty-att_<page-definition<margin>33\n
|
|
||||||
mi<tg<empty_____<para\n
|
|
||||||
"""
|
|
||||||
|
@ -34,18 +34,16 @@ class CheckBrackets:
|
|||||||
try:
|
try:
|
||||||
last_num = self.__open_bracket_num.pop()
|
last_num = self.__open_bracket_num.pop()
|
||||||
except:
|
except:
|
||||||
return 0
|
return False
|
||||||
if num != last_num:
|
if num != last_num:
|
||||||
return 0
|
return False
|
||||||
self.__bracket_count -= 1
|
self.__bracket_count -= 1
|
||||||
return 1
|
return True
|
||||||
def check_brackets(self):
|
def check_brackets(self):
|
||||||
read_obj = open(self.__file, 'r')
|
read_obj = open(self.__file, 'r')
|
||||||
line = 'dummy'
|
|
||||||
line_count = 0
|
line_count = 0
|
||||||
while line:
|
for line in read_obj:
|
||||||
line_count += 1
|
line_count += 1
|
||||||
line = read_obj.readline()
|
|
||||||
self.__token_info = line[:16]
|
self.__token_info = line[:16]
|
||||||
if self.__token_info == 'ob<nu<open-brack':
|
if self.__token_info == 'ob<nu<open-brack':
|
||||||
self.open_brack(line)
|
self.open_brack(line)
|
||||||
|
@ -23,43 +23,31 @@ class FixLineEndings:
|
|||||||
bug_handler,
|
bug_handler,
|
||||||
in_file = None,
|
in_file = None,
|
||||||
copy = None,
|
copy = None,
|
||||||
run_level = 1,
|
#run_level = 1, calibre why keep it?
|
||||||
replace_illegals = 1,
|
replace_illegals = 1,
|
||||||
):
|
):
|
||||||
self.__file = in_file
|
self.__file = in_file
|
||||||
self.__bug_handler = bug_handler
|
self.__bug_handler = bug_handler
|
||||||
self.__copy = copy
|
self.__copy = copy
|
||||||
self.__run_level = run_level
|
|
||||||
self.__write_to = tempfile.mktemp()
|
self.__write_to = tempfile.mktemp()
|
||||||
self.__replace_illegals = replace_illegals
|
self.__replace_illegals = replace_illegals
|
||||||
def fix_endings(self):
|
def fix_endings(self):
|
||||||
##tempFileName = tempfile.mktemp()
|
|
||||||
illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
|
illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
|
||||||
#nums = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16, 17, 18, 19]
|
|
||||||
"""
|
|
||||||
read_obj = open(self.__file, 'r')
|
|
||||||
line = read_obj.read(1000)
|
|
||||||
regexp = re.compile(r"\r")
|
|
||||||
macintosh = regexp.search(line)
|
|
||||||
read_obj.close()
|
|
||||||
"""
|
|
||||||
# always check since I have to get rid of illegal characters
|
# always check since I have to get rid of illegal characters
|
||||||
macintosh = 1
|
#read
|
||||||
if macintosh:
|
|
||||||
line = 1
|
|
||||||
read_obj = open(self.__file, 'r')
|
read_obj = open(self.__file, 'r')
|
||||||
write_obj = open(self.__write_to, 'w')
|
input_file = read_obj.read()
|
||||||
while line:
|
|
||||||
line = read_obj.read(1000)
|
|
||||||
# line = re.sub(regexp,"\n",line)
|
|
||||||
line = line.replace ('\r', '\n')
|
|
||||||
if self.__replace_illegals:
|
|
||||||
line = re.sub(illegal_regx, '', line)
|
|
||||||
# for num in nums:
|
|
||||||
# line = line.replace(chr(num), '')
|
|
||||||
write_obj.write(line )
|
|
||||||
read_obj.close()
|
read_obj.close()
|
||||||
|
#calibre go from win and mac to unix
|
||||||
|
input_file = input_file.replace ('\r\n', '\n')
|
||||||
|
input_file = input_file.replace ('\r', '\n')
|
||||||
|
if self.__replace_illegals:
|
||||||
|
input_file = re.sub(illegal_regx, '', input_file)
|
||||||
|
#write
|
||||||
|
write_obj = open(self.__write_to, 'wb')
|
||||||
|
write_obj.write(input_file)
|
||||||
write_obj.close()
|
write_obj.close()
|
||||||
|
#copy
|
||||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||||
if self.__copy:
|
if self.__copy:
|
||||||
copy_obj.copy_file(self.__write_to, "line_endings.data")
|
copy_obj.copy_file(self.__write_to, "line_endings.data")
|
||||||
|
@ -645,10 +645,8 @@ class ProcessTokens:
|
|||||||
return 'tx<nu<__________<%s\n' % token
|
return 'tx<nu<__________<%s\n' % token
|
||||||
def ob_func(self, pre, token, num=None):
|
def ob_func(self, pre, token, num=None):
|
||||||
self.__bracket_count += 1
|
self.__bracket_count += 1
|
||||||
##return 'ob<%04d\n' % self.__bracket_count
|
|
||||||
return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
|
return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
|
||||||
def cb_func(self, pre, token, num=None):
|
def cb_func(self, pre, token, num=None):
|
||||||
##line = 'cb<%04d\n' % self.__bracket_count
|
|
||||||
line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
|
line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
|
||||||
self.__bracket_count -= 1
|
self.__bracket_count -= 1
|
||||||
return line
|
return line
|
||||||
|
@ -23,7 +23,7 @@ class Tokenize:
|
|||||||
in_file,
|
in_file,
|
||||||
bug_handler,
|
bug_handler,
|
||||||
copy = None,
|
copy = None,
|
||||||
run_level = 1,
|
#run_level = 1,
|
||||||
):
|
):
|
||||||
self.__file = in_file
|
self.__file = in_file
|
||||||
self.__bug_handler = bug_handler
|
self.__bug_handler = bug_handler
|
||||||
@ -80,7 +80,7 @@ class Tokenize:
|
|||||||
def __create_tokens(self):
|
def __create_tokens(self):
|
||||||
self.__compile_expressions()
|
self.__compile_expressions()
|
||||||
read_obj = open(self.__file, 'r')
|
read_obj = open(self.__file, 'r')
|
||||||
write_obj = open(self.__write_to, 'w')
|
write_obj = open(self.__write_to, 'wb')
|
||||||
line_to_read = "dummy"
|
line_to_read = "dummy"
|
||||||
while line_to_read:
|
while line_to_read:
|
||||||
line_to_read = read_obj.readline()
|
line_to_read = read_obj.readline()
|
||||||
@ -106,7 +106,7 @@ class Tokenize:
|
|||||||
write_obj.close()
|
write_obj.close()
|
||||||
def tokenize(self):
|
def tokenize(self):
|
||||||
"""Main class for handling other methods. Reads in one line \
|
"""Main class for handling other methods. Reads in one line \
|
||||||
at a time, usues method self.sub_line to make basic substitutions,\
|
at a time, uses method self.sub_line to make basic substitutions,\
|
||||||
uses ? to process tokens"""
|
uses ? to process tokens"""
|
||||||
self.__create_tokens()
|
self.__create_tokens()
|
||||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||||
|
@ -4,8 +4,7 @@
|
|||||||
Read content from txt file.
|
Read content from txt file.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import os
|
import os, re
|
||||||
import re
|
|
||||||
|
|
||||||
from calibre import prepare_string_for_xml
|
from calibre import prepare_string_for_xml
|
||||||
from calibre.ebooks.markdown import markdown
|
from calibre.ebooks.markdown import markdown
|
||||||
|
Loading…
x
Reference in New Issue
Block a user