Remove unicode preprocessing of RTF files & correct rtftoxml

Slight modification in rtftoxml
This commit is contained in:
Sengian 2011-01-05 00:21:32 +01:00
commit 5ca4d81071
8 changed files with 349 additions and 291 deletions

View File

@ -198,21 +198,21 @@ class RTFInput(InputFormatPlugin):
with open('styles.css', 'ab') as f: with open('styles.css', 'ab') as f:
f.write(css) f.write(css)
def preprocess(self, fname): # def preprocess(self, fname):
self.log('\tPreprocessing to convert unicode characters') # self.log('\tPreprocessing to convert unicode characters')
try: # try:
data = open(fname, 'rb').read() # data = open(fname, 'rb').read()
from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser # from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
tokenizer = RtfTokenizer(data) # tokenizer = RtfTokenizer(data)
tokens = RtfTokenParser(tokenizer.tokens) # tokens = RtfTokenParser(tokenizer.tokens)
data = tokens.toRTF() # data = tokens.toRTF()
fname = 'preprocessed.rtf' # fname = 'preprocessed.rtf'
with open(fname, 'wb') as f: # with open(fname, 'wb') as f:
f.write(data) # f.write(data)
except: # except:
self.log.exception( # self.log.exception(
'Failed to preprocess RTF to convert unicode sequences, ignoring...') # 'Failed to preprocess RTF to convert unicode sequences, ignoring...')
return fname # return fname
def convert_borders(self, doc): def convert_borders(self, doc):
border_styles = [] border_styles = []
@ -249,9 +249,9 @@ class RTFInput(InputFormatPlugin):
self.log = log self.log = log
self.log('Converting RTF to XML...') self.log('Converting RTF to XML...')
#Name of the preprocessed RTF file #Name of the preprocessed RTF file
fname = self.preprocess(stream.name) # fname = self.preprocess(stream.name)
try: try:
xml = self.generate_xml(fname) xml = self.generate_xml(stream.name)
except RtfInvalidCodeException, e: except RtfInvalidCodeException, e:
raise ValueError(_('This RTF file has a feature calibre does not ' raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.\n%s')%e) 'support. Convert it to HTML first and then try it.\n%s')%e)

View File

@ -18,6 +18,7 @@
# $Revision: 1.41 $ # $Revision: 1.41 $
# $Date: 2006/03/24 23:50:07 $ # $Date: 2006/03/24 23:50:07 $
import sys, os import sys, os
from calibre.ebooks.rtf2xml import headings_to_sections, \ from calibre.ebooks.rtf2xml import headings_to_sections, \
line_endings, footnote, fields_small, default_encoding, \ line_endings, footnote, fields_small, default_encoding, \
make_lists, preamble_div, header, colors, group_borders, \ make_lists, preamble_div, header, colors, group_borders, \
@ -90,7 +91,6 @@ class ParseRtf:
out_file = '', out_file = '',
out_dir = None, out_dir = None,
dtd = '', dtd = '',
#debug = 0, #why? calibre
deb_dir = None, deb_dir = None,
convert_symbol = None, convert_symbol = None,
convert_wingdings = None, convert_wingdings = None,
@ -107,6 +107,7 @@ class ParseRtf:
no_dtd = 0, no_dtd = 0,
char_data = '', char_data = '',
): ):
""" """
Requires: Requires:
'file' --file to parse 'file' --file to parse
@ -119,12 +120,11 @@ class ParseRtf:
script tries to output to the directory where the script is executed.) script tries to output to the directory where the script is executed.)
'deb_dir' --debug directory. If a debug_dir is provided, the script 'deb_dir' --debug directory. If a debug_dir is provided, the script
will copy each run through as a file to examine in the debug_dir will copy each run through as a file to examine in the debug_dir
'perl_script'--use perl to make tokens. This runs just a bit faster.
(I will probably phase this out.)
'check_brackets' -- make sure the brackets match up after each run 'check_brackets' -- make sure the brackets match up after each run
through a file. Only for debugging. through a file. Only for debugging.
Returns: Nothing Returns: Nothing
""" """
self.__file = in_file self.__file = in_file
self.__out_file = out_file self.__out_file = out_file
self.__out_dir = out_dir self.__out_dir = out_dir
@ -132,7 +132,7 @@ class ParseRtf:
self.__dtd_path = dtd self.__dtd_path = dtd
self.__check_file(in_file,"file_to_parse") self.__check_file(in_file,"file_to_parse")
self.__char_data = char_data self.__char_data = char_data
self.__debug_dir = deb_dir #self.__debug_dir = debug calibre self.__debug_dir = deb_dir
self.__check_dir(self.__temp_dir) self.__check_dir(self.__temp_dir)
self.__copy = self.__check_dir(self.__debug_dir) self.__copy = self.__check_dir(self.__debug_dir)
self.__convert_caps = convert_caps self.__convert_caps = convert_caps
@ -155,25 +155,24 @@ class ParseRtf:
if hasattr(the_file, 'read'): return if hasattr(the_file, 'read'): return
if the_file == None: if the_file == None:
if type == "file_to_parse": if type == "file_to_parse":
message = "You must provide a file for the script to work" msg = _("\nYou must provide a file for the script to work")
msg = message
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
elif os.path.exists(the_file): elif os.path.exists(the_file):
pass # do nothing pass # do nothing
else: else:
message = "The file '%s' cannot be found" % the_file msg = _("\nThe file '%s' cannot be found") % the_file
msg = message
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
def __check_dir(self, the_dir): def __check_dir(self, the_dir):
"""Check to see if directory exists""" """Check to see if directory exists"""
if not the_dir : if not the_dir :
return return
dir_exists = os.path.isdir(the_dir) dir_exists = os.path.isdir(the_dir)
if not dir_exists: if not dir_exists:
message = "%s is not a directory" % the_dir msg = _("\n%s is not a directory") % the_dir
msg = message
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
return 1 return 1
def parse_rtf(self): def parse_rtf(self):
""" """
Parse the file by calling on other classes. Parse the file by calling on other classes.
@ -194,13 +193,14 @@ class ParseRtf:
copy_obj.set_dir(self.__debug_dir) copy_obj.set_dir(self.__debug_dir)
copy_obj.remove_files() copy_obj.remove_files()
copy_obj.copy_file(self.__temp_file, "original_file") copy_obj.copy_file(self.__temp_file, "original_file")
# new as of 2005-08-02. Do I want this? # Function to check if brackets are well handled
if self.__debug_dir or self.__run_level > 2: if self.__debug_dir or self.__run_level > 2:
self.__check_brack_obj = check_brackets.CheckBrackets\ self.__check_brack_obj = check_brackets.CheckBrackets\
(file = self.__temp_file, (file = self.__temp_file,
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
) )
# convert Macintosh line endings to Unix line endings # convert Macintosh and Windows line endings to Unix line endings
#why do this if you don't wb after?
line_obj = line_endings.FixLineEndings( line_obj = line_endings.FixLineEndings(
in_file = self.__temp_file, in_file = self.__temp_file,
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
@ -208,13 +208,13 @@ class ParseRtf:
run_level = self.__run_level, run_level = self.__run_level,
replace_illegals = self.__replace_illegals, replace_illegals = self.__replace_illegals,
) )
return_value = line_obj.fix_endings() return_value = line_obj.fix_endings() #calibre return what?
self.__return_code(return_value) self.__return_code(return_value)
tokenize_obj = tokenize.Tokenize( tokenize_obj = tokenize.Tokenize(
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
in_file = self.__temp_file, in_file = self.__temp_file,
copy = self.__copy, copy = self.__copy,
run_level = self.__run_level,) run_level = self.__run_level)
tokenize_obj.tokenize() tokenize_obj.tokenize()
process_tokens_obj = process_tokens.ProcessTokens( process_tokens_obj = process_tokens.ProcessTokens(
in_file = self.__temp_file, in_file = self.__temp_file,
@ -230,11 +230,13 @@ class ParseRtf:
os.remove(self.__temp_file) os.remove(self.__temp_file)
except OSError: except OSError:
pass pass
#Check to see if the file is correct ascii
check_encoding_obj = check_encoding.CheckEncoding( check_encoding_obj = check_encoding.CheckEncoding(
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
) )
check_encoding_obj.check_encoding(self.__file) if check_encoding_obj.check_encoding(self.__file):
sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8')) sys.stderr.write(_('File "%s" does not appear to be ascii.\n') \
% self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
raise InvalidRtfException, msg raise InvalidRtfException, msg
delete_info_obj = delete_info.DeleteInfo( delete_info_obj = delete_info.DeleteInfo(
in_file = self.__temp_file, in_file = self.__temp_file,
@ -370,10 +372,10 @@ class ParseRtf:
sys.stderr.write('File could be older RTF...\n') sys.stderr.write('File could be older RTF...\n')
if found_destination: if found_destination:
if self.__run_level > 1: if self.__run_level > 1:
sys.stderr.write( sys.stderr.write(_(
'File also has newer RTF.\n' 'File also has newer RTF.\n'
'Will do the best to convert.\n' 'Will do the best to convert.\n'
) ))
add_brackets_obj = add_brackets.AddBrackets( add_brackets_obj = add_brackets.AddBrackets(
in_file = self.__temp_file, in_file = self.__temp_file,
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
@ -520,6 +522,7 @@ class ParseRtf:
output_obj.output() output_obj.output()
os.remove(self.__temp_file) os.remove(self.__temp_file)
return self.__exit_level return self.__exit_level
def __bracket_match(self, file_name): def __bracket_match(self, file_name):
if self.__run_level > 2: if self.__run_level > 2:
good_br, msg = self.__check_brack_obj.check_brackets() good_br, msg = self.__check_brack_obj.check_brackets()
@ -527,28 +530,20 @@ class ParseRtf:
pass pass
#sys.stderr.write( msg + ' in ' + file_name + "\n") #sys.stderr.write( msg + ' in ' + file_name + "\n")
else: else:
msg += msg + " in file '" + file_name + "'\n" msg = _('%s in file %s\n') % (msg, file_name)
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
def __return_code(self, num): def __return_code(self, num):
if num == None: if num == None:
return return
if int(num) > self.__exit_level: if int(num) > self.__exit_level:
self.__exit_level = num self.__exit_level = num
def __make_temp_file(self,file): def __make_temp_file(self,file):
"""Make a temporary file to parse""" """Make a temporary file to parse"""
write_file="rtf_write_file" write_file="rtf_write_file"
read_obj = file if hasattr(file, 'read') else open(file,'r') read_obj = file if hasattr(file, 'read') else open(file,'r')
write_obj = open(write_file, 'w') with open(write_file, 'wb') as write_obj:
line = "dummy" for line in read_obj:
while line:
line = read_obj.read(1000)
write_obj.write(line) write_obj.write(line)
write_obj.close()
return write_file return write_file
"""
mi<tg<open______<style-sheet\n
mi<tg<close_____<style-sheet\n
mi<tg<open-att__<footnote<num>1\n
mi<tg<empty-att_<page-definition<margin>33\n
mi<tg<empty_____<para\n
"""

View File

@ -24,38 +24,37 @@ class CheckBrackets:
self.__ob_count = 0 self.__ob_count = 0
self.__cb_count = 0 self.__cb_count = 0
self.__open_bracket_num = [] self.__open_bracket_num = []
def open_brack(self, line): def open_brack(self, line):
num = line[-5:-1] num = line[-5:-1]
self.__open_bracket_num.append(num) self.__open_bracket_num.append(num)
self.__bracket_count += 1 self.__bracket_count += 1
def close_brack(self, line): def close_brack(self, line):
num = line[-5:-1] num = line[-5:-1]
##self.__open_bracket_num.append(num)
try: try:
last_num = self.__open_bracket_num.pop() last_num = self.__open_bracket_num.pop()
except: except:
return 0 return False
if num != last_num: if num != last_num:
return 0 return False
self.__bracket_count -= 1 self.__bracket_count -= 1
return 1 return True
def check_brackets(self): def check_brackets(self):
read_obj = open(self.__file, 'r')
line = 'dummy'
line_count = 0 line_count = 0
while line: with open(self.__file, 'r') as read_obj:
for line in read_obj:
line_count += 1 line_count += 1
line = read_obj.readline()
self.__token_info = line[:16] self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack': if self.__token_info == 'ob<nu<open-brack':
self.open_brack(line) self.open_brack(line)
if self.__token_info == 'cb<nu<clos-brack': if self.__token_info == 'cb<nu<clos-brack':
right_count = self.close_brack(line) if not self.close_brack(line):
if not right_count: return (False, "closed bracket doesn't match, line %s" % line_count)
return (0, "closed bracket doesn't match, line %s" % line_count)
read_obj.close()
if self.__bracket_count != 0: if self.__bracket_count != 0:
msg = 'At end of file open and closed brackets don\'t match\n' msg = _('At end of file open and closed brackets don\'t match\n' \
msg = msg + 'total number of brackets is %s' % self.__bracket_count 'total number of brackets is %s') % self.__bracket_count
return (0, msg) return (False, msg)
return (1, "brackets match!") return (True, _("Brackets match!"))

View File

@ -1,8 +1,10 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
class CheckEncoding: class CheckEncoding:
def __init__(self, bug_handler): def __init__(self, bug_handler):
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
def __get_position_error(self, line, encoding, line_num): def __get_position_error(self, line, encoding, line_num):
char_position = 0 char_position = 0
for char in line: for char in line:
@ -10,23 +12,24 @@ class CheckEncoding:
try: try:
char.decode(encoding) char.decode(encoding)
except UnicodeError, msg: except UnicodeError, msg:
sys.stderr.write('line: %s char: %s\n' % (line_num, char_position)) sys.stderr.write(_('line: %s char: %s\n') % (line_num, char_position))
sys.stderr.write(str(msg) + '\n') sys.stderr.write(str(msg) + '\n')
def check_encoding(self, path, encoding='us-ascii'): def check_encoding(self, path, encoding='us-ascii'):
read_obj = open(path, 'r')
line_to_read = 1
line_num = 0 line_num = 0
while line_to_read: with open(path, 'r') as read_obj:
for line in read_obj:
line_num += 1 line_num += 1
line_to_read = read_obj.readline()
line = line_to_read
try: try:
line.decode(encoding) line.decode(encoding)
except UnicodeError: except UnicodeError:
if len(line) < 1000: if len(line) < 1000:
self.__get_position_error(line, encoding, line_num) self.__get_position_error(line, encoding, line_num)
else: else:
sys.stderr.write('line: %d has bad encoding\n'%line_num) sys.stderr.write(_('line: %d has bad encoding\n') % line_num)
return True
return False
if __name__ == '__main__': if __name__ == '__main__':
check_encoding_obj = CheckEncoding() check_encoding_obj = CheckEncoding()
check_encoding_obj.check_encoding(sys.argv[1]) check_encoding_obj.check_encoding(sys.argv[1])

View File

@ -23,6 +23,7 @@ class Copy:
def __init__(self, bug_handler, file = None, deb_dir = None, ): def __init__(self, bug_handler, file = None, deb_dir = None, ):
self.__file = file self.__file = file
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
def set_dir(self, deb_dir): def set_dir(self, deb_dir):
"""Set the temporary directory to write files to""" """Set the temporary directory to write files to"""
if deb_dir is None: if deb_dir is None:
@ -33,19 +34,11 @@ class Copy:
message = "%(deb_dir)s is not a directory" % vars() message = "%(deb_dir)s is not a directory" % vars()
raise self.__bug_handler , message raise self.__bug_handler , message
Copy.__dir = deb_dir Copy.__dir = deb_dir
def remove_files(self ): def remove_files(self ):
"""Remove files from directory""" """Remove files from directory"""
self.__remove_the_files(Copy.__dir) self.__remove_the_files(Copy.__dir)
"""
list_of_files = os.listdir(Copy.__dir)
list_of_files = os.listdir(the_dir)
for file in list_of_files:
rem_file = os.path.join(Copy.__dir,file)
if os.path.isdir(rem_file):
self.remove_files(rem_file)
else:
os.remove(rem_file)
"""
def __remove_the_files(self, the_dir): def __remove_the_files(self, the_dir):
"""Remove files from directory""" """Remove files from directory"""
list_of_files = os.listdir(the_dir) list_of_files = os.listdir(the_dir)
@ -58,6 +51,7 @@ class Copy:
os.remove(rem_file) os.remove(rem_file)
except OSError: except OSError:
pass pass
def copy_file(self, file, new_file): def copy_file(self, file, new_file):
""" """
Copy the file to a new name Copy the file to a new name

View File

@ -16,7 +16,10 @@
# # # #
######################################################################### #########################################################################
import os, tempfile, re import os, tempfile, re
from calibre.ebooks.rtf2xml import copy from calibre.ebooks.rtf2xml import copy
from calibre.utils.cleantext import clean_ascii_chars
class FixLineEndings: class FixLineEndings:
"""Fix line endings""" """Fix line endings"""
def __init__(self, def __init__(self,
@ -32,34 +35,21 @@ class FixLineEndings:
self.__run_level = run_level self.__run_level = run_level
self.__write_to = tempfile.mktemp() self.__write_to = tempfile.mktemp()
self.__replace_illegals = replace_illegals self.__replace_illegals = replace_illegals
def fix_endings(self): def fix_endings(self):
##tempFileName = tempfile.mktemp() #read
illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13') with open(self.__file, 'r') as read_obj:
#nums = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16, 17, 18, 19] input_file = read_obj.read()
""" #calibre go from win and mac to unix
read_obj = open(self.__file, 'r') input_file = input_file.replace ('\r\n', '\n')
line = read_obj.read(1000) input_file = input_file.replace ('\r', '\n')
regexp = re.compile(r"\r") #remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
macintosh = regexp.search(line)
read_obj.close()
"""
# always check since I have to get rid of illegal characters
macintosh = 1
if macintosh:
line = 1
read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'w')
while line:
line = read_obj.read(1000)
# line = re.sub(regexp,"\n",line)
line = line.replace ('\r', '\n')
if self.__replace_illegals: if self.__replace_illegals:
line = re.sub(illegal_regx, '', line) input_file = clean_ascii_chars(input_file)
# for num in nums: #write
# line = line.replace(chr(num), '') with open(self.__write_to, 'wb') as write_obj:
write_obj.write(line ) write_obj.write(input_file)
read_obj.close() #copy
write_obj.close()
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "line_endings.data") copy_obj.copy_file(self.__write_to, "line_endings.data")

View File

@ -16,7 +16,9 @@
# # # #
######################################################################### #########################################################################
import os, re, tempfile import os, re, tempfile
from calibre.ebooks.rtf2xml import copy, check_brackets from calibre.ebooks.rtf2xml import copy, check_brackets
class ProcessTokens: class ProcessTokens:
""" """
Process each token on a line and add information that will be useful for Process each token on a line and add information that will be useful for
@ -41,9 +43,11 @@ class ProcessTokens:
self.__bracket_count=0 self.__bracket_count=0
self.__exception_handler = exception_handler self.__exception_handler = exception_handler
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
def compile_expressions(self): def compile_expressions(self):
self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)") self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
self.__utf_exp = re.compile(r'(&.*?;)') self.__utf_exp = re.compile(r'(&.*?;)')
def initiate_token_dict(self): def initiate_token_dict(self):
self.__return_code = 0 self.__return_code = 0
self.dict_token={ self.dict_token={
@ -595,12 +599,15 @@ class ProcessTokens:
num = num[1:] # chop off leading 0, which I added num = num[1:] # chop off leading 0, which I added
num = num.upper() # the mappings store hex in caps num = num.upper() # the mappings store hex in caps
return 'tx<hx<__________<\'%s\n' % num # add an ' for the mappings return 'tx<hx<__________<\'%s\n' % num # add an ' for the mappings
def ms_sub_func(self, pre, token, num): def ms_sub_func(self, pre, token, num):
return 'tx<mc<__________<%s\n' % token return 'tx<mc<__________<%s\n' % token
def default_func(self, pre, token, num): def default_func(self, pre, token, num):
if num == None: if num == None:
num = 'true' num = 'true'
return 'cw<%s<%s<nu<%s\n' % (pre, token, num) return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
def __list_type_func(self, pre, token, num): def __list_type_func(self, pre, token, num):
type = 'arabic' type = 'arabic'
if num == None: if num == None:
@ -610,15 +617,16 @@ class ProcessTokens:
num = int(num) num = int(num)
except ValueError: except ValueError:
if self.__run_level > 3: if self.__run_level > 3:
msg = 'number "%s" cannot be converted to integer\n' % num msg = _('Number "%s" cannot be converted to integer\n') % num
raise self.__bug_handler, msg raise self.__bug_handler, msg
type = self.__number_type_dict.get(num) type = self.__number_type_dict.get(num)
if type == None: if type == None:
if self.__run_level > 3: if self.__run_level > 3:
msg = 'No type for "%s" in self.__number_type_dict\n' msg = _('No type for "%s" in self.__number_type_dict\n')
raise self.__bug_handler raise self.__bug_handler
type = 'Arabic' type = 'Arabic'
return 'cw<%s<%s<nu<%s\n' % (pre, token, type) return 'cw<%s<%s<nu<%s\n' % (pre, token, type)
def __language_func(self, pre, token, num): def __language_func(self, pre, token, num):
lang_name = self.__language_dict.get(int(re.search('[0-9]+', num).group())) lang_name = self.__language_dict.get(int(re.search('[0-9]+', num).group()))
if not lang_name: if not lang_name:
@ -627,31 +635,36 @@ class ProcessTokens:
msg = 'No entry for number "%s"' % num msg = 'No entry for number "%s"' % num
raise self.__bug_handler, msg raise self.__bug_handler, msg
return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name) return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name)
def two_part_func(self, pre, token, num): def two_part_func(self, pre, token, num):
list = token.split("<") list = token.split("<")
token = list[0] token = list[0]
num = list[1] num = list[1]
return 'cw<%s<%s<nu<%s\n' % (pre, token, num) return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
##return 'cw<nu<nu<nu<%s>num<%s\n' % (token, num) ##return 'cw<nu<nu<nu<%s>num<%s\n' % (token, num)
def divide_by_2(self, pre, token, num): def divide_by_2(self, pre, token, num):
num = self.divide_num(num, 2) num = self.divide_num(num, 2)
return 'cw<%s<%s<nu<%s\n' % (pre, token, num) return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token) ##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
def divide_by_20(self, pre, token, num): def divide_by_20(self, pre, token, num):
num = self.divide_num(num, 20) num = self.divide_num(num, 20)
return 'cw<%s<%s<nu<%s\n' % (pre, token, num) return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token) ##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
def text_func(self, pre, token, num=None): def text_func(self, pre, token, num=None):
return 'tx<nu<__________<%s\n' % token return 'tx<nu<__________<%s\n' % token
def ob_func(self, pre, token, num=None): def ob_func(self, pre, token, num=None):
self.__bracket_count += 1 self.__bracket_count += 1
##return 'ob<%04d\n' % self.__bracket_count
return 'ob<nu<open-brack<%04d\n' % self.__bracket_count return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
def cb_func(self, pre, token, num=None): def cb_func(self, pre, token, num=None):
##line = 'cb<%04d\n' % self.__bracket_count
line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
self.__bracket_count -= 1 self.__bracket_count -= 1
return line return line
def color_func(self, pre, token, num): def color_func(self, pre, token, num):
third_field = 'nu' third_field = 'nu'
if num[-1] == ';': if num[-1] == ';':
@ -662,6 +675,7 @@ class ProcessTokens:
num = "0" + num num = "0" + num
return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, num) return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, num)
##return 'cw<cl<%s<nu<nu<%s>%s<%s\n' % (third_field, token, num, token) ##return 'cw<cl<%s<nu<nu<%s>%s<%s\n' % (third_field, token, num, token)
def bool_st_func(self, pre, token, num): def bool_st_func(self, pre, token, num):
if num is None or num == '' or num == '1': if num is None or num == '' or num == '1':
return 'cw<%s<%s<nu<true\n' % (pre, token) return 'cw<%s<%s<nu<true\n' % (pre, token)
@ -674,13 +688,16 @@ class ProcessTokens:
msg += 'token is ' + token + "\n" msg += 'token is ' + token + "\n"
msg += "'" + num + "'" + "\n" msg += "'" + num + "'" + "\n"
raise self.__bug_handler, msg raise self.__bug_handler, msg
def __no_sup_sub_func(self, pre, token, num): def __no_sup_sub_func(self, pre, token, num):
the_string = 'cw<ci<subscript_<nu<false\n' the_string = 'cw<ci<subscript_<nu<false\n'
the_string += 'cw<ci<superscrip<nu<false\n' the_string += 'cw<ci<superscrip<nu<false\n'
return the_string return the_string
def divide_num(self, numerator, denominator): def divide_num(self, numerator, denominator):
try: try:
numerator = float(re.search('[0-9.]+', numerator).group()) #calibre why ignore negative number? Wrong in case of \fi
numerator = float(re.search('[0-9.\-]+', numerator).group())
except TypeError, msg: except TypeError, msg:
if self.__run_level > 3: if self.__run_level > 3:
msg = 'no number to process?\n' msg = 'no number to process?\n'
@ -698,6 +715,7 @@ class ProcessTokens:
if string_num[-2:] == ".0": if string_num[-2:] == ".0":
string_num = string_num[:-2] string_num = string_num[:-2]
return string_num return string_num
def split_let_num(self, token): def split_let_num(self, token):
match_obj = re.search(self.__num_exp,token) match_obj = re.search(self.__num_exp,token)
if match_obj != None: if match_obj != None:
@ -714,6 +732,7 @@ class ProcessTokens:
raise self.__bug_handler raise self.__bug_handler
return token, 0 return token, 0
return first, second return first, second
def convert_to_hex(self,number): def convert_to_hex(self,number):
"""Convert a string to uppercase hexadecimal""" """Convert a string to uppercase hexadecimal"""
num = int(number) num = int(number)
@ -722,6 +741,7 @@ class ProcessTokens:
return hex_num return hex_num
except: except:
raise self.__bug_handler raise self.__bug_handler
def process_cw(self, token): def process_cw(self, token):
"""Change the value of the control word by determining what dictionary """Change the value of the control word by determining what dictionary
it belongs to""" it belongs to"""
@ -737,69 +757,36 @@ class ProcessTokens:
pre, token, action = self.dict_token.get(token, (None, None, None)) pre, token, action = self.dict_token.get(token, (None, None, None))
if action: if action:
return action(pre, token, num) return action(pre, token, num)
# unused function
def initiate_token_actions(self):
self.action_for_token={
'{' : self.ob_func,
'}' : self.cb_func,
'\\' : self.process_cw,
}
# unused function
def evaluate_token(self,token):
"""Evaluate tokens. Return a value if the token is not a
control word. Otherwise, pass token onto another method
for further evaluation."""
token, action = self.dict_token.get(token[0:1])
if action:
line = action(token)
return line
else :
return 'tx<nu<nu<nu<nu<%s\n' % token
def __check_brackets(self, in_file): def __check_brackets(self, in_file):
self.__check_brack_obj = check_brackets.CheckBrackets\ self.__check_brack_obj = check_brackets.CheckBrackets\
(file = in_file) (file = in_file)
good_br = self.__check_brack_obj.check_brackets()[0] good_br = self.__check_brack_obj.check_brackets()[0]
if not good_br: if not good_br:
return 1 return 1
def process_tokens(self): def process_tokens(self):
"""Main method for handling other methods. """ """Main method for handling other methods. """
first_token = 0
second_token = 0
read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'w')
line_to_read = "dummy"
line_count = 0 line_count = 0
while line_to_read: with open(self.__file, 'r') as read_obj, open(self.__write_to, 'wb') as write_obj:
line_to_read = read_obj.readline() for line in read_obj:
token = line_to_read token = line.replace("\n","")
token = token.replace("\n","")
if not token:
continue
line_count += 1 line_count += 1
try: if line_count == 1 and token != '\\{':
token.decode('us-ascii') msg = _('Invalid RTF: document doesn\'t start with {\n')
except UnicodeError, msg:
msg = str(msg)
msg += 'Invalid RTF: File not ascii encoded.\n'
raise self.__exception_handler, msg raise self.__exception_handler, msg
if not first_token: elif line_count == 2 and token[0:4] != '\\rtf':
if token != '\\{': msg =_('Invalid RTF: document doesn\'t start with \\rtf \n')
msg = 'Invalid RTF: document doesn\'t start with {\n'
raise self.__exception_handler, msg raise self.__exception_handler, msg
first_token = 1
elif first_token and not second_token:
if token[0:4] != '\\rtf':
msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
raise self.__exception_handler, msg
second_token = 1
##token = self.evaluate_token(token) ##token = self.evaluate_token(token)
the_index = token.find('\\ ') the_index = token.find('\\ ')
if token != None and the_index > -1: if token is not None and the_index > -1:
msg ='Invalid RTF: token "\\ " not valid.\n' msg ='Invalid RTF: token "\\ " not valid.\n'
raise self.__exception_handler, msg raise self.__exception_handler, msg
elif token[0:1] == "\\": elif token[:1] == "\\":
line = self.process_cw(token) line = self.process_cw(token)
if line != None: if line is not None:
write_obj.write(line) write_obj.write(line)
else: else:
fields = re.split(self.__utf_exp, token) fields = re.split(self.__utf_exp, token)
@ -810,19 +797,20 @@ class ProcessTokens:
write_obj.write('tx<ut<__________<%s\n' % field) write_obj.write('tx<ut<__________<%s\n' % field)
else: else:
write_obj.write('tx<nu<__________<%s\n' % field) write_obj.write('tx<nu<__________<%s\n' % field)
read_obj.close()
write_obj.close()
if not line_count: if not line_count:
msg ='Invalid RTF: file appears to be empty. \n' msg =_('Invalid RTF: file appears to be empty.\n')
raise self.__exception_handler, msg raise self.__exception_handler, msg
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "processed_tokens.data") copy_obj.copy_file(self.__write_to, "processed_tokens.data")
copy_obj.rename(self.__write_to, self.__file) copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to) os.remove(self.__write_to)
bad_brackets = self.__check_brackets(self.__file) bad_brackets = self.__check_brackets(self.__file)
if bad_brackets: if bad_brackets:
msg = 'Invalid RTF: document does not have matching brackets.\n' msg = _('Invalid RTF: document does not have matching brackets.\n')
raise self.__exception_handler, msg raise self.__exception_handler, msg
else: else:
return self.__return_code return self.__return_code

View File

@ -16,7 +16,10 @@
# # # #
######################################################################### #########################################################################
import os, re, tempfile import os, re, tempfile
from calibre.ebooks.rtf2xml import copy from calibre.ebooks.rtf2xml import copy
from calibre.utils.mreplace import MReplace
class Tokenize: class Tokenize:
"""Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script""" """Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
def __init__(self, def __init__(self,
@ -28,89 +31,175 @@ class Tokenize:
self.__file = in_file self.__file = in_file
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
self.__copy = copy self.__copy = copy
self.__special_tokens = [ '_', '~', "'", '{', '}' ]
self.__write_to = tempfile.mktemp() self.__write_to = tempfile.mktemp()
def __from_ms_to_utf8(self,match_obj): self.__compile_expressions()
#variables
self.__uc_char = 0
self.__uc_bin = False
self.__uc_value = [1]
def __reini_utf8_counters(self):
    """Reset the pending unicode-escape state.

    Clears the flag marking the next token as \\bin payload to drop and
    the count of replacement characters still to be skipped after a
    \\uN token.
    """
    self.__uc_bin = False
    self.__uc_char = 0
def __remove_uc_chars(self, startchar, token):
    """Consume the ANSI replacement characters that follow a \\uN escape.

    Walks ``token`` from position ``startchar``: spaces are ignored, and
    each non-space character decrements ``self.__uc_char`` (the number of
    replacement characters the current \\ucN scope says to skip).  Returns
    the remainder of the token once the quota is exhausted, or '' when the
    token held nothing but spaces and skipped characters.
    """
    pos = startchar
    end = len(token)
    while pos < end:
        if token[pos] != " ":
            if not self.__uc_char:
                return token[pos:]
            self.__uc_char -= 1
        pos += 1
    # only spaces and skipped replacement characters were present
    return ''
def __unicode_process(self, token):
#change scope in
if token == '\{':
self.__uc_value.append(self.__uc_value[-1])
#basic error handling
self.__reini_utf8_counters()
return token
#change scope out
elif token == '\}':
self.__uc_value.pop()
self.__reini_utf8_counters()
return token
#add a uc control
elif token[:3] == '\uc':
self.__uc_value[-1] = int(token[3:])
self.__reini_utf8_counters()
return token
#bin data to slip
elif self.__uc_bin:
self.__uc_bin = False
return ''
#uc char to remove
elif self.__uc_char:
#handle \bin tag in case of uc char to skip
if token[:4] == '\bin':
self.__uc_char -=1
self.__uc_bin = True
return ''
elif token[:1] == "\\" :
self.__uc_char -=1
return ''
else:
return self.__remove_uc_chars(0, token)
#go for real \u token
match_obj = self.__utf_exp.match(token)
if match_obj is not None:
self.__reini_utf8_counters()
#get value and handle negative case
uni_char = int(match_obj.group(1)) uni_char = int(match_obj.group(1))
uni_len = len(match_obj.group(1)) + 2
if uni_char < 0: if uni_char < 0:
uni_char += 65536 uni_char += 65536
return '&#x' + str('%X' % uni_char) + ';' uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
def __neg_unicode_func(self, match_obj): self.__uc_char = self.__uc_value[-1]
neg_uni_char = int(match_obj.group(1)) * -1 #there is only an unicode char
# sys.stderr.write(str( neg_uni_char)) if len(token)<= uni_len:
uni_char = neg_uni_char + 65536 return uni_char
return '&#x' + str('%X' % uni_char) + ';' #an unicode char and something else
def __sub_line_reg(self,line): #must be after as it is splited on \
line = line.replace("\\\\", "\\backslash ") #necessary? maybe for \bin?
line = line.replace("\\~", "\\~ ") elif not self.__uc_char:
line = line.replace("\\;", "\\; ") return uni_char + token[uni_len:]
line = line.replace("&", "&amp;") #if not uc0 and chars
line = line.replace("<", "&lt;")
line = line.replace(">", "&gt;")
line = line.replace("\\~", "\\~ ")
line = line.replace("\\_", "\\_ ")
line = line.replace("\\:", "\\: ")
line = line.replace("\\-", "\\- ")
# turn into a generic token to eliminate special
# cases and make processing easier
line = line.replace("\\{", "\\ob ")
# turn into a generic token to eliminate special
# cases and make processing easier
line = line.replace("\\}", "\\cb ")
# put a backslash in front of to eliminate special cases and
# make processing easier
line = line.replace("{", "\\{")
# put a backslash in front of to eliminate special cases and
# make processing easier
line = line.replace("}", "\\}")
line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line)
##line = line.replace("\\backslash", "\\\\")
# this is for older RTF
line = re.sub(self.__par_exp, '\\par ', line)
return line
def __compile_expressions(self):
self.__ms_hex_exp = re.compile(r"\\\'(..)")
self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}")
self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
self.__par_exp = re.compile(r'\\$')
self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
def __create_tokens(self):
self.__compile_expressions()
read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'w')
line_to_read = "dummy"
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
line = line.replace("\n", "")
line = self.__sub_line_reg(line)
tokens = re.split(self.__splitexp, line)
##print tokens
for token in tokens:
if token != "":
write_obj.write(token + "\n")
"""
match_obj = re.search(self.__mixed_exp, token)
if match_obj != None:
first = match_obj.group(1)
second = match_obj.group(2)
write_obj.write(first + "\n")
write_obj.write(second + "\n")
else: else:
write_obj.write(token + "\n") return uni_char + self.__remove_uc_chars(uni_len, token)
""" #default
read_obj.close() return token
write_obj.close()
def __sub_reg_split(self, input_file):
    """Apply the whole-file substitutions, then split the text into tokens.

    Runs, in order: the one-pass literal replacements from MReplace, the
    \\'xx hex-escape rewrite, the \\upr/\\ud unicode-group reduction, and
    newline stripping inside \\bin payloads.  The result is split on the
    pre-compiled tokenizing expression; empty strings and bare newlines
    produced by the split are dropped from the returned list.
    """
    text = self.__replace_spchar.mreplace(input_file)
    text = self.__ms_hex_exp.sub("\\mshex0\g<1> ", text)
    text = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", text)
    # \bin payloads may span lines; remove the newlines they contain
    # but keep one trailing newline so the split stays aligned
    text = self.__bin_exp.sub(
        lambda m: m.group().replace('\n', '') + '\n', text)
    # split on control words/groups, discarding '' and stray newlines
    return [tok for tok in re.split(self.__splitexp, text)
            if tok and tok != '\n']
def __compile_expressions(self):
    """Build the replacement table and regexes used by __sub_reg_split.

    SIMPLE_RPL feeds MReplace for a single-pass literal substitution over
    the whole file; the compiled patterns handle \\'xx hex escapes, \\uN
    tokens, \\bin payloads, \\upr/\\ud groups and the final tokenizing
    split.

    Fix: the key ``"\\~"`` appeared twice in the dict literal; in Python
    the second occurrence silently overwrote the first (identical) entry,
    so the duplicate is removed with no behavior change.
    """
    SIMPLE_RPL = {
        "\\\\": "\\backslash ",
        "\\~": "\\~ ",
        "\\;": "\\; ",
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        "\\_": "\\_ ",
        "\\:": "\\: ",
        "\\-": "\\- ",
        # turn into a generic token to eliminate special
        # cases and make processing easier
        "\\{": "\\ob ",
        # turn into a generic token to eliminate special
        # cases and make processing easier
        "\\}": "\\cb ",
        # put a backslash in front of to eliminate special cases and
        # make processing easier
        "{": "\\{",
        # put a backslash in front of to eliminate special cases and
        # make processing easier
        "}": "\\}",
        # this is for older RTF
        # NOTE(review): as an MReplace key this matches the LITERAL text
        # backslash-backslash-dollar, whereas the old code used the regex
        # r'\\$' (a lone backslash at end of line) -- confirm intended.
        r'\\$': '\\par ',
    }
    self.__replace_spchar = MReplace(SIMPLE_RPL)
    #add ;? in case of char following \u
    self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
    self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
    self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
    #manage upr/ud situations
    self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" + \
                    r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
    #add \n in split for whole file reading
    #remove \n from endline char
    self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
def tokenize(self): def tokenize(self):
"""Main class for handling other methods. Reads in one line \ """Main class for handling other methods. Reads the file \
at a time, usues method self.sub_line to make basic substitutions,\ , uses method self.sub_reg to make basic substitutions,\
uses ? to process tokens""" and process tokens by itself"""
self.__create_tokens() #read
with open(self.__file, 'r') as read_obj:
input_file = read_obj.read()
#process simple replacements and split giving us a correct list
#remove '' and \n in the process
tokens = self.__sub_reg_split(input_file)
#correct unicode
tokens = map(self.__unicode_process, tokens)
#remove empty items created by removing \uc
tokens = filter(lambda x: len(x) > 0, tokens)
#write
with open(self.__write_to, 'wb') as write_obj:
write_obj.write('\n'.join(tokens))
#Move and copy
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "tokenize.data") copy_obj.copy_file(self.__write_to, "tokenize.data")
copy_obj.rename(self.__write_to, self.__file) copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to) os.remove(self.__write_to)
#self.__special_tokens = [ '_', '~', "'", '{', '}' ]