mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Remove unicode preprocessing of RTF files & correct rtftoxml
Slight modification in rtftoxml
This commit is contained in:
commit
5ca4d81071
@ -198,21 +198,21 @@ class RTFInput(InputFormatPlugin):
|
||||
with open('styles.css', 'ab') as f:
|
||||
f.write(css)
|
||||
|
||||
def preprocess(self, fname):
|
||||
self.log('\tPreprocessing to convert unicode characters')
|
||||
try:
|
||||
data = open(fname, 'rb').read()
|
||||
from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
|
||||
tokenizer = RtfTokenizer(data)
|
||||
tokens = RtfTokenParser(tokenizer.tokens)
|
||||
data = tokens.toRTF()
|
||||
fname = 'preprocessed.rtf'
|
||||
with open(fname, 'wb') as f:
|
||||
f.write(data)
|
||||
except:
|
||||
self.log.exception(
|
||||
'Failed to preprocess RTF to convert unicode sequences, ignoring...')
|
||||
return fname
|
||||
# def preprocess(self, fname):
|
||||
# self.log('\tPreprocessing to convert unicode characters')
|
||||
# try:
|
||||
# data = open(fname, 'rb').read()
|
||||
# from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
|
||||
# tokenizer = RtfTokenizer(data)
|
||||
# tokens = RtfTokenParser(tokenizer.tokens)
|
||||
# data = tokens.toRTF()
|
||||
# fname = 'preprocessed.rtf'
|
||||
# with open(fname, 'wb') as f:
|
||||
# f.write(data)
|
||||
# except:
|
||||
# self.log.exception(
|
||||
# 'Failed to preprocess RTF to convert unicode sequences, ignoring...')
|
||||
# return fname
|
||||
|
||||
def convert_borders(self, doc):
|
||||
border_styles = []
|
||||
@ -249,9 +249,9 @@ class RTFInput(InputFormatPlugin):
|
||||
self.log = log
|
||||
self.log('Converting RTF to XML...')
|
||||
#Name of the preprocesssed RTF file
|
||||
fname = self.preprocess(stream.name)
|
||||
# fname = self.preprocess(stream.name)
|
||||
try:
|
||||
xml = self.generate_xml(fname)
|
||||
xml = self.generate_xml(stream.name)
|
||||
except RtfInvalidCodeException, e:
|
||||
raise ValueError(_('This RTF file has a feature calibre does not '
|
||||
'support. Convert it to HTML first and then try it.\n%s')%e)
|
||||
|
@ -17,7 +17,8 @@
|
||||
#########################################################################
|
||||
# $Revision: 1.41 $
|
||||
# $Date: 2006/03/24 23:50:07 $
|
||||
import sys,os
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import headings_to_sections, \
|
||||
line_endings, footnote, fields_small, default_encoding, \
|
||||
make_lists, preamble_div, header, colors, group_borders, \
|
||||
@ -90,7 +91,6 @@ class ParseRtf:
|
||||
out_file = '',
|
||||
out_dir = None,
|
||||
dtd = '',
|
||||
#debug = 0, #why? calibre
|
||||
deb_dir = None,
|
||||
convert_symbol = None,
|
||||
convert_wingdings = None,
|
||||
@ -107,6 +107,7 @@ class ParseRtf:
|
||||
no_dtd = 0,
|
||||
char_data = '',
|
||||
):
|
||||
|
||||
"""
|
||||
Requires:
|
||||
'file' --file to parse
|
||||
@ -119,12 +120,11 @@ class ParseRtf:
|
||||
script tries to output to directory where is script is exectued.)
|
||||
'deb_dir' --debug directory. If a debug_dir is provided, the script
|
||||
will copy each run through as a file to examine in the debug_dir
|
||||
'perl_script'--use perl to make tokens. This runs just a bit faster.
|
||||
(I will probably phase this out.)
|
||||
'check_brackets' -- make sure the brackets match up after each run
|
||||
through a file. Only for debugging.
|
||||
Returns: Nothing
|
||||
"""
|
||||
|
||||
self.__file = in_file
|
||||
self.__out_file = out_file
|
||||
self.__out_dir = out_dir
|
||||
@ -132,7 +132,7 @@ class ParseRtf:
|
||||
self.__dtd_path = dtd
|
||||
self.__check_file(in_file,"file_to_parse")
|
||||
self.__char_data = char_data
|
||||
self.__debug_dir = deb_dir #self.__debug_dir = debug calibre
|
||||
self.__debug_dir = deb_dir
|
||||
self.__check_dir(self.__temp_dir)
|
||||
self.__copy = self.__check_dir(self.__debug_dir)
|
||||
self.__convert_caps = convert_caps
|
||||
@ -155,25 +155,24 @@ class ParseRtf:
|
||||
if hasattr(the_file, 'read'): return
|
||||
if the_file == None:
|
||||
if type == "file_to_parse":
|
||||
message = "You must provide a file for the script to work"
|
||||
msg = message
|
||||
msg = _("\nYou must provide a file for the script to work")
|
||||
raise RtfInvalidCodeException, msg
|
||||
elif os.path.exists(the_file):
|
||||
pass # do nothing
|
||||
else:
|
||||
message = "The file '%s' cannot be found" % the_file
|
||||
msg = message
|
||||
msg = _("\nThe file '%s' cannot be found") % the_file
|
||||
raise RtfInvalidCodeException, msg
|
||||
|
||||
def __check_dir(self, the_dir):
|
||||
"""Check to see if directory exists"""
|
||||
if not the_dir :
|
||||
return
|
||||
dir_exists = os.path.isdir(the_dir)
|
||||
if not dir_exists:
|
||||
message = "%s is not a directory" % the_dir
|
||||
msg = message
|
||||
msg = _("\n%s is not a directory") % the_dir
|
||||
raise RtfInvalidCodeException, msg
|
||||
return 1
|
||||
|
||||
def parse_rtf(self):
|
||||
"""
|
||||
Parse the file by calling on other classes.
|
||||
@ -194,13 +193,14 @@ class ParseRtf:
|
||||
copy_obj.set_dir(self.__debug_dir)
|
||||
copy_obj.remove_files()
|
||||
copy_obj.copy_file(self.__temp_file, "original_file")
|
||||
# new as of 2005-08-02. Do I want this?
|
||||
# Function to check if bracket are well handled
|
||||
if self.__debug_dir or self.__run_level > 2:
|
||||
self.__check_brack_obj = check_brackets.CheckBrackets\
|
||||
(file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
# convert Macintosh line endings to Unix line endings
|
||||
# convert Macintosh and Windows line endings to Unix line endings
|
||||
#why do this if you don't wb after?
|
||||
line_obj = line_endings.FixLineEndings(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
@ -208,13 +208,13 @@ class ParseRtf:
|
||||
run_level = self.__run_level,
|
||||
replace_illegals = self.__replace_illegals,
|
||||
)
|
||||
return_value = line_obj.fix_endings()
|
||||
return_value = line_obj.fix_endings() #calibre return what?
|
||||
self.__return_code(return_value)
|
||||
tokenize_obj = tokenize.Tokenize(
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
in_file = self.__temp_file,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,)
|
||||
run_level = self.__run_level)
|
||||
tokenize_obj.tokenize()
|
||||
process_tokens_obj = process_tokens.ProcessTokens(
|
||||
in_file = self.__temp_file,
|
||||
@ -230,11 +230,13 @@ class ParseRtf:
|
||||
os.remove(self.__temp_file)
|
||||
except OSError:
|
||||
pass
|
||||
#Check to see if the file is correct ascii
|
||||
check_encoding_obj = check_encoding.CheckEncoding(
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
check_encoding_obj.check_encoding(self.__file)
|
||||
sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
if check_encoding_obj.check_encoding(self.__file):
|
||||
sys.stderr.write(_('File "%s" does not appear to be ascii.\n') \
|
||||
% self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
|
||||
raise InvalidRtfException, msg
|
||||
delete_info_obj = delete_info.DeleteInfo(
|
||||
in_file = self.__temp_file,
|
||||
@ -370,10 +372,10 @@ class ParseRtf:
|
||||
sys.stderr.write('File could be older RTF...\n')
|
||||
if found_destination:
|
||||
if self.__run_level > 1:
|
||||
sys.stderr.write(
|
||||
sys.stderr.write(_(
|
||||
'File also has newer RTF.\n'
|
||||
'Will do the best to convert.\n'
|
||||
)
|
||||
))
|
||||
add_brackets_obj = add_brackets.AddBrackets(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
@ -520,35 +522,28 @@ class ParseRtf:
|
||||
output_obj.output()
|
||||
os.remove(self.__temp_file)
|
||||
return self.__exit_level
|
||||
|
||||
def __bracket_match(self, file_name):
|
||||
if self.__run_level > 2:
|
||||
good_br, msg = self.__check_brack_obj.check_brackets()
|
||||
if good_br:
|
||||
pass
|
||||
# sys.stderr.write( msg + ' in ' + file_name + "\n")
|
||||
#sys.stderr.write( msg + ' in ' + file_name + "\n")
|
||||
else:
|
||||
msg += msg + " in file '" + file_name + "'\n"
|
||||
msg = _('%s in file %s\n') % (msg, file_name)
|
||||
raise RtfInvalidCodeException, msg
|
||||
|
||||
def __return_code(self, num):
|
||||
if num == None:
|
||||
return
|
||||
if int(num) > self.__exit_level:
|
||||
self.__exit_level = num
|
||||
if num == None:
|
||||
return
|
||||
if int(num) > self.__exit_level:
|
||||
self.__exit_level = num
|
||||
|
||||
def __make_temp_file(self,file):
|
||||
"""Make a temporary file to parse"""
|
||||
write_file="rtf_write_file"
|
||||
read_obj = file if hasattr(file, 'read') else open(file,'r')
|
||||
write_obj = open(write_file, 'w')
|
||||
line = "dummy"
|
||||
while line:
|
||||
line = read_obj.read(1000)
|
||||
write_obj.write(line )
|
||||
write_obj.close()
|
||||
return write_file
|
||||
"""
|
||||
mi<tg<open______<style-sheet\n
|
||||
mi<tg<close_____<style-sheet\n
|
||||
mi<tg<open-att__<footnote<num>1\n
|
||||
mi<tg<empty-att_<page-definition<margin>33\n
|
||||
mi<tg<empty_____<para\n
|
||||
"""
|
||||
with open(write_file, 'wb') as write_obj:
|
||||
for line in read_obj:
|
||||
write_obj.write(line)
|
||||
return write_file
|
@ -24,38 +24,37 @@ class CheckBrackets:
|
||||
self.__ob_count = 0
|
||||
self.__cb_count = 0
|
||||
self.__open_bracket_num = []
|
||||
|
||||
def open_brack(self, line):
|
||||
num = line[-5:-1]
|
||||
self.__open_bracket_num.append(num)
|
||||
self.__bracket_count += 1
|
||||
|
||||
def close_brack(self, line):
|
||||
num = line[-5:-1]
|
||||
##self.__open_bracket_num.append(num)
|
||||
try:
|
||||
last_num = self.__open_bracket_num.pop()
|
||||
except:
|
||||
return 0
|
||||
return False
|
||||
if num != last_num:
|
||||
return 0
|
||||
return False
|
||||
self.__bracket_count -= 1
|
||||
return 1
|
||||
return True
|
||||
|
||||
def check_brackets(self):
|
||||
read_obj = open(self.__file, 'r')
|
||||
line = 'dummy'
|
||||
line_count = 0
|
||||
while line:
|
||||
line_count += 1
|
||||
line = read_obj.readline()
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.open_brack(line)
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
right_count = self.close_brack(line)
|
||||
if not right_count:
|
||||
return (0, "closed bracket doesn't match, line %s" % line_count)
|
||||
read_obj.close()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
for line in read_obj:
|
||||
line_count += 1
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.open_brack(line)
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
if not self.close_brack(line):
|
||||
return (False, "closed bracket doesn't match, line %s" % line_count)
|
||||
|
||||
if self.__bracket_count != 0:
|
||||
msg = 'At end of file open and closed brackets don\'t match\n'
|
||||
msg = msg + 'total number of brackets is %s' % self.__bracket_count
|
||||
return (0, msg)
|
||||
return (1, "brackets match!")
|
||||
msg = _('At end of file open and closed brackets don\'t match\n' \
|
||||
'total number of brackets is %s') % self.__bracket_count
|
||||
return (False, msg)
|
||||
return (True, _("Brackets match!"))
|
||||
|
@ -1,8 +1,10 @@
|
||||
#!/usr/bin/env python
|
||||
import sys
|
||||
class CheckEncoding:
|
||||
|
||||
def __init__(self, bug_handler):
|
||||
self.__bug_handler = bug_handler
|
||||
|
||||
def __get_position_error(self, line, encoding, line_num):
|
||||
char_position = 0
|
||||
for char in line:
|
||||
@ -10,23 +12,24 @@ class CheckEncoding:
|
||||
try:
|
||||
char.decode(encoding)
|
||||
except UnicodeError, msg:
|
||||
sys.stderr.write('line: %s char: %s\n' % (line_num, char_position))
|
||||
sys.stderr.write(_('line: %s char: %s\n') % (line_num, char_position))
|
||||
sys.stderr.write(str(msg) + '\n')
|
||||
|
||||
def check_encoding(self, path, encoding='us-ascii'):
|
||||
read_obj = open(path, 'r')
|
||||
line_to_read = 1
|
||||
line_num = 0
|
||||
while line_to_read:
|
||||
line_num += 1
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
try:
|
||||
line.decode(encoding)
|
||||
except UnicodeError:
|
||||
if len(line) < 1000:
|
||||
self.__get_position_error(line, encoding, line_num)
|
||||
else:
|
||||
sys.stderr.write('line: %d has bad encoding\n'%line_num)
|
||||
with open(path, 'r') as read_obj:
|
||||
for line in read_obj:
|
||||
line_num += 1
|
||||
try:
|
||||
line.decode(encoding)
|
||||
except UnicodeError:
|
||||
if len(line) < 1000:
|
||||
self.__get_position_error(line, encoding, line_num)
|
||||
else:
|
||||
sys.stderr.write(_('line: %d has bad encoding\n') % line_num)
|
||||
return True
|
||||
return False
|
||||
|
||||
if __name__ == '__main__':
|
||||
check_encoding_obj = CheckEncoding()
|
||||
check_encoding_obj.check_encoding(sys.argv[1])
|
||||
|
@ -23,6 +23,7 @@ class Copy:
|
||||
def __init__(self, bug_handler, file = None, deb_dir = None, ):
|
||||
self.__file = file
|
||||
self.__bug_handler = bug_handler
|
||||
|
||||
def set_dir(self, deb_dir):
|
||||
"""Set the temporary directory to write files to"""
|
||||
if deb_dir is None:
|
||||
@ -33,19 +34,11 @@ class Copy:
|
||||
message = "%(deb_dir)s is not a directory" % vars()
|
||||
raise self.__bug_handler , message
|
||||
Copy.__dir = deb_dir
|
||||
|
||||
def remove_files(self ):
|
||||
"""Remove files from directory"""
|
||||
self.__remove_the_files(Copy.__dir)
|
||||
"""
|
||||
list_of_files = os.listdir(Copy.__dir)
|
||||
list_of_files = os.listdir(the_dir)
|
||||
for file in list_of_files:
|
||||
rem_file = os.path.join(Copy.__dir,file)
|
||||
if os.path.isdir(rem_file):
|
||||
self.remove_files(rem_file)
|
||||
else:
|
||||
os.remove(rem_file)
|
||||
"""
|
||||
|
||||
def __remove_the_files(self, the_dir):
|
||||
"""Remove files from directory"""
|
||||
list_of_files = os.listdir(the_dir)
|
||||
@ -58,6 +51,7 @@ class Copy:
|
||||
os.remove(rem_file)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
def copy_file(self, file, new_file):
|
||||
"""
|
||||
Copy the file to a new name
|
||||
|
@ -16,7 +16,10 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import os, tempfile, re
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
|
||||
class FixLineEndings:
|
||||
"""Fix line endings"""
|
||||
def __init__(self,
|
||||
@ -32,36 +35,23 @@ class FixLineEndings:
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__replace_illegals = replace_illegals
|
||||
|
||||
def fix_endings(self):
|
||||
##tempFileName = tempfile.mktemp()
|
||||
illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
|
||||
#nums = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16, 17, 18, 19]
|
||||
"""
|
||||
read_obj = open(self.__file, 'r')
|
||||
line = read_obj.read(1000)
|
||||
regexp = re.compile(r"\r")
|
||||
macintosh = regexp.search(line)
|
||||
read_obj.close()
|
||||
"""
|
||||
# always check since I have to get rid of illegal characters
|
||||
macintosh = 1
|
||||
if macintosh:
|
||||
line = 1
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__write_to, 'w')
|
||||
while line:
|
||||
line = read_obj.read(1000)
|
||||
# line = re.sub(regexp,"\n",line)
|
||||
line = line.replace ('\r', '\n')
|
||||
if self.__replace_illegals:
|
||||
line = re.sub(illegal_regx, '', line)
|
||||
# for num in nums:
|
||||
# line = line.replace(chr(num), '')
|
||||
write_obj.write(line )
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "line_endings.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
#read
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
input_file = read_obj.read()
|
||||
#calibre go from win and mac to unix
|
||||
input_file = input_file.replace ('\r\n', '\n')
|
||||
input_file = input_file.replace ('\r', '\n')
|
||||
#remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
|
||||
if self.__replace_illegals:
|
||||
input_file = clean_ascii_chars(input_file)
|
||||
#write
|
||||
with open(self.__write_to, 'wb') as write_obj:
|
||||
write_obj.write(input_file)
|
||||
#copy
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "line_endings.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
|
@ -15,8 +15,10 @@
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, re, tempfile
|
||||
import os, re, tempfile
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy, check_brackets
|
||||
|
||||
class ProcessTokens:
|
||||
"""
|
||||
Process each token on a line and add information that will be useful for
|
||||
@ -41,9 +43,11 @@ class ProcessTokens:
|
||||
self.__bracket_count=0
|
||||
self.__exception_handler = exception_handler
|
||||
self.__bug_handler = bug_handler
|
||||
|
||||
def compile_expressions(self):
|
||||
self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
|
||||
self.__utf_exp = re.compile(r'(&.*?;)')
|
||||
|
||||
def initiate_token_dict(self):
|
||||
self.__return_code = 0
|
||||
self.dict_token={
|
||||
@ -347,7 +351,7 @@ class ProcessTokens:
|
||||
10: 'Kanji numbering without the digit character',
|
||||
11: 'Kanji numbering with the digit character',
|
||||
1246: 'phonetic Katakana characters in aiueo order',
|
||||
1346: 'phonetic katakana characters in iroha order',
|
||||
1346: 'phonetic katakana characters in iroha order',
|
||||
14: 'double byte character',
|
||||
15: 'single byte character',
|
||||
16: 'Kanji numbering 3',
|
||||
@ -392,7 +396,7 @@ class ProcessTokens:
|
||||
5121 : 'Arabic Algeria',
|
||||
15361 : 'Arabic Bahrain',
|
||||
3073 : 'Arabic Egypt',
|
||||
1 : 'Arabic General',
|
||||
1 : 'Arabic General',
|
||||
2049 : 'Arabic Iraq',
|
||||
11265 : 'Arabic Jordan',
|
||||
13313 : 'Arabic Kuwait',
|
||||
@ -417,7 +421,7 @@ class ProcessTokens:
|
||||
1059 : 'Byelorussian',
|
||||
1027 : 'Catalan',
|
||||
2052 : 'Chinese China',
|
||||
4 : 'Chinese General',
|
||||
4 : 'Chinese General',
|
||||
3076 : 'Chinese Hong Kong',
|
||||
4100 : 'Chinese Singapore',
|
||||
1028 : 'Chinese Taiwan',
|
||||
@ -431,7 +435,7 @@ class ProcessTokens:
|
||||
2057 : 'English British',
|
||||
4105 : 'English Canada',
|
||||
9225 : 'English Caribbean',
|
||||
9 : 'English General',
|
||||
9 : 'English General',
|
||||
6153 : 'English Ireland',
|
||||
8201 : 'English Jamaica',
|
||||
5129 : 'English New Zealand',
|
||||
@ -595,12 +599,15 @@ class ProcessTokens:
|
||||
num = num[1:] # chop off leading 0, which I added
|
||||
num = num.upper() # the mappings store hex in caps
|
||||
return 'tx<hx<__________<\'%s\n' % num # add an ' for the mappings
|
||||
|
||||
def ms_sub_func(self, pre, token, num):
|
||||
return 'tx<mc<__________<%s\n' % token
|
||||
|
||||
def default_func(self, pre, token, num):
|
||||
if num == None:
|
||||
num = 'true'
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
|
||||
def __list_type_func(self, pre, token, num):
|
||||
type = 'arabic'
|
||||
if num == None:
|
||||
@ -610,15 +617,16 @@ class ProcessTokens:
|
||||
num = int(num)
|
||||
except ValueError:
|
||||
if self.__run_level > 3:
|
||||
msg = 'number "%s" cannot be converted to integer\n' % num
|
||||
msg = _('Number "%s" cannot be converted to integer\n') % num
|
||||
raise self.__bug_handler, msg
|
||||
type = self.__number_type_dict.get(num)
|
||||
if type == None:
|
||||
if self.__run_level > 3:
|
||||
msg = 'No type for "%s" in self.__number_type_dict\n'
|
||||
msg = _('No type for "%s" in self.__number_type_dict\n')
|
||||
raise self.__bug_handler
|
||||
type = 'Arabic'
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, type)
|
||||
|
||||
def __language_func(self, pre, token, num):
|
||||
lang_name = self.__language_dict.get(int(re.search('[0-9]+', num).group()))
|
||||
if not lang_name:
|
||||
@ -627,31 +635,36 @@ class ProcessTokens:
|
||||
msg = 'No entry for number "%s"' % num
|
||||
raise self.__bug_handler, msg
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name)
|
||||
|
||||
def two_part_func(self, pre, token, num):
|
||||
list = token.split("<")
|
||||
token = list[0]
|
||||
num = list[1]
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
##return 'cw<nu<nu<nu<%s>num<%s\n' % (token, num)
|
||||
|
||||
def divide_by_2(self, pre, token, num):
|
||||
num = self.divide_num(num, 2)
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
|
||||
|
||||
def divide_by_20(self, pre, token, num):
|
||||
num = self.divide_num(num, 20)
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
|
||||
|
||||
def text_func(self, pre, token, num=None):
|
||||
return 'tx<nu<__________<%s\n' % token
|
||||
|
||||
def ob_func(self, pre, token, num=None):
|
||||
self.__bracket_count += 1
|
||||
##return 'ob<%04d\n' % self.__bracket_count
|
||||
return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
|
||||
|
||||
def cb_func(self, pre, token, num=None):
|
||||
##line = 'cb<%04d\n' % self.__bracket_count
|
||||
line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
|
||||
self.__bracket_count -= 1
|
||||
return line
|
||||
|
||||
def color_func(self, pre, token, num):
|
||||
third_field = 'nu'
|
||||
if num[-1] == ';':
|
||||
@ -662,6 +675,7 @@ class ProcessTokens:
|
||||
num = "0" + num
|
||||
return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, num)
|
||||
##return 'cw<cl<%s<nu<nu<%s>%s<%s\n' % (third_field, token, num, token)
|
||||
|
||||
def bool_st_func(self, pre, token, num):
|
||||
if num is None or num == '' or num == '1':
|
||||
return 'cw<%s<%s<nu<true\n' % (pre, token)
|
||||
@ -674,13 +688,16 @@ class ProcessTokens:
|
||||
msg += 'token is ' + token + "\n"
|
||||
msg += "'" + num + "'" + "\n"
|
||||
raise self.__bug_handler, msg
|
||||
|
||||
def __no_sup_sub_func(self, pre, token, num):
|
||||
the_string = 'cw<ci<subscript_<nu<false\n'
|
||||
the_string += 'cw<ci<superscrip<nu<false\n'
|
||||
return the_string
|
||||
|
||||
def divide_num(self, numerator, denominator):
|
||||
try:
|
||||
numerator = float(re.search('[0-9.]+', numerator).group())
|
||||
#calibre why ignore negative number? Wrong in case of \fi
|
||||
numerator = float(re.search('[0-9.\-]+', numerator).group())
|
||||
except TypeError, msg:
|
||||
if self.__run_level > 3:
|
||||
msg = 'no number to process?\n'
|
||||
@ -698,6 +715,7 @@ class ProcessTokens:
|
||||
if string_num[-2:] == ".0":
|
||||
string_num = string_num[:-2]
|
||||
return string_num
|
||||
|
||||
def split_let_num(self, token):
|
||||
match_obj = re.search(self.__num_exp,token)
|
||||
if match_obj != None:
|
||||
@ -714,6 +732,7 @@ class ProcessTokens:
|
||||
raise self.__bug_handler
|
||||
return token, 0
|
||||
return first, second
|
||||
|
||||
def convert_to_hex(self,number):
|
||||
"""Convert a string to uppercase hexidecimal"""
|
||||
num = int(number)
|
||||
@ -722,6 +741,7 @@ class ProcessTokens:
|
||||
return hex_num
|
||||
except:
|
||||
raise self.__bug_handler
|
||||
|
||||
def process_cw(self, token):
|
||||
"""Change the value of the control word by determining what dictionary
|
||||
it belongs to"""
|
||||
@ -737,92 +757,60 @@ class ProcessTokens:
|
||||
pre, token, action = self.dict_token.get(token, (None, None, None))
|
||||
if action:
|
||||
return action(pre, token, num)
|
||||
# unused function
|
||||
def initiate_token_actions(self):
|
||||
self.action_for_token={
|
||||
'{' : self.ob_func,
|
||||
'}' : self.cb_func,
|
||||
'\\' : self.process_cw,
|
||||
}
|
||||
# unused function
|
||||
def evaluate_token(self,token):
|
||||
"""Evaluate tokens. Return a value if the token is not a
|
||||
control word. Otherwise, pass token onto another method
|
||||
for further evaluation."""
|
||||
token, action = self.dict_token.get(token[0:1])
|
||||
if action:
|
||||
line = action(token)
|
||||
return line
|
||||
else :
|
||||
return 'tx<nu<nu<nu<nu<%s\n' % token
|
||||
|
||||
def __check_brackets(self, in_file):
|
||||
self.__check_brack_obj = check_brackets.CheckBrackets\
|
||||
(file = in_file)
|
||||
good_br = self.__check_brack_obj.check_brackets()[0]
|
||||
if not good_br:
|
||||
return 1
|
||||
|
||||
def process_tokens(self):
|
||||
"""Main method for handling other methods. """
|
||||
first_token = 0
|
||||
second_token = 0
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = "dummy"
|
||||
line_count = 0
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
token = line_to_read
|
||||
token = token.replace("\n","")
|
||||
if not token:
|
||||
continue
|
||||
line_count += 1
|
||||
try:
|
||||
token.decode('us-ascii')
|
||||
except UnicodeError, msg:
|
||||
msg = str(msg)
|
||||
msg += 'Invalid RTF: File not ascii encoded.\n'
|
||||
raise self.__exception_handler, msg
|
||||
if not first_token:
|
||||
if token != '\\{':
|
||||
msg = 'Invalid RTF: document doesn\'t start with {\n'
|
||||
with open(self.__file, 'r') as read_obj, open(self.__write_to, 'wb') as write_obj:
|
||||
for line in read_obj:
|
||||
token = line.replace("\n","")
|
||||
line_count += 1
|
||||
if line_count == 1 and token != '\\{':
|
||||
msg = _('Invalid RTF: document doesn\'t start with {\n')
|
||||
raise self.__exception_handler, msg
|
||||
elif line_count == 2 and token[0:4] != '\\rtf':
|
||||
msg =_('Invalid RTF: document doesn\'t start with \\rtf \n')
|
||||
raise self.__exception_handler, msg
|
||||
|
||||
##token = self.evaluate_token(token)
|
||||
the_index = token.find('\\ ')
|
||||
if token is not None and the_index > -1:
|
||||
msg ='Invalid RTF: token "\\ " not valid.\n'
|
||||
raise self.__exception_handler, msg
|
||||
first_token = 1
|
||||
elif first_token and not second_token:
|
||||
if token[0:4] != '\\rtf':
|
||||
msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
|
||||
raise self.__exception_handler, msg
|
||||
second_token = 1
|
||||
##token = self.evaluate_token(token)
|
||||
the_index = token.find('\\ ')
|
||||
if token != None and the_index > -1:
|
||||
msg ='Invalid RTF: token "\\ " not valid. \n'
|
||||
raise self.__exception_handler, msg
|
||||
elif token[0:1] == "\\":
|
||||
line = self.process_cw(token)
|
||||
if line != None:
|
||||
write_obj.write(line)
|
||||
else:
|
||||
fields = re.split(self.__utf_exp, token)
|
||||
for field in fields:
|
||||
if not field:
|
||||
continue
|
||||
if field[0:1] == '&':
|
||||
write_obj.write('tx<ut<__________<%s\n' % field)
|
||||
else:
|
||||
write_obj.write('tx<nu<__________<%s\n' % field)
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
elif token[:1] == "\\":
|
||||
line = self.process_cw(token)
|
||||
if line is not None:
|
||||
write_obj.write(line)
|
||||
else:
|
||||
fields = re.split(self.__utf_exp, token)
|
||||
for field in fields:
|
||||
if not field:
|
||||
continue
|
||||
if field[0:1] == '&':
|
||||
write_obj.write('tx<ut<__________<%s\n' % field)
|
||||
else:
|
||||
write_obj.write('tx<nu<__________<%s\n' % field)
|
||||
|
||||
if not line_count:
|
||||
msg ='Invalid RTF: file appears to be empty. \n'
|
||||
msg =_('Invalid RTF: file appears to be empty.\n')
|
||||
raise self.__exception_handler, msg
|
||||
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "processed_tokens.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
|
||||
bad_brackets = self.__check_brackets(self.__file)
|
||||
if bad_brackets:
|
||||
msg = 'Invalid RTF: document does not have matching brackets.\n'
|
||||
msg = _('Invalid RTF: document does not have matching brackets.\n')
|
||||
raise self.__exception_handler, msg
|
||||
else:
|
||||
return self.__return_code
|
||||
return self.__return_code
|
@ -16,7 +16,10 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import os, re, tempfile
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.utils.mreplace import MReplace
|
||||
|
||||
class Tokenize:
|
||||
"""Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
|
||||
def __init__(self,
|
||||
@ -28,89 +31,175 @@ class Tokenize:
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__special_tokens = [ '_', '~', "'", '{', '}' ]
|
||||
self.__write_to = tempfile.mktemp()
|
||||
def __from_ms_to_utf8(self,match_obj):
|
||||
uni_char = int(match_obj.group(1))
|
||||
if uni_char < 0:
|
||||
uni_char += 65536
|
||||
return '&#x' + str('%X' % uni_char) + ';'
|
||||
def __neg_unicode_func(self, match_obj):
|
||||
neg_uni_char = int(match_obj.group(1)) * -1
|
||||
# sys.stderr.write(str( neg_uni_char))
|
||||
uni_char = neg_uni_char + 65536
|
||||
return '&#x' + str('%X' % uni_char) + ';'
|
||||
def __sub_line_reg(self,line):
|
||||
line = line.replace("\\\\", "\\backslash ")
|
||||
line = line.replace("\\~", "\\~ ")
|
||||
line = line.replace("\\;", "\\; ")
|
||||
line = line.replace("&", "&")
|
||||
line = line.replace("<", "<")
|
||||
line = line.replace(">", ">")
|
||||
line = line.replace("\\~", "\\~ ")
|
||||
line = line.replace("\\_", "\\_ ")
|
||||
line = line.replace("\\:", "\\: ")
|
||||
line = line.replace("\\-", "\\- ")
|
||||
# turn into a generic token to eliminate special
|
||||
# cases and make processing easier
|
||||
line = line.replace("\\{", "\\ob ")
|
||||
# turn into a generic token to eliminate special
|
||||
# cases and make processing easier
|
||||
line = line.replace("\\}", "\\cb ")
|
||||
# put a backslash in front of to eliminate special cases and
|
||||
# make processing easier
|
||||
line = line.replace("{", "\\{")
|
||||
# put a backslash in front of to eliminate special cases and
|
||||
# make processing easier
|
||||
line = line.replace("}", "\\}")
|
||||
line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
|
||||
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
|
||||
line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line)
|
||||
##line = line.replace("\\backslash", "\\\\")
|
||||
# this is for older RTF
|
||||
line = re.sub(self.__par_exp, '\\par ', line)
|
||||
return line
|
||||
def __compile_expressions(self):
|
||||
self.__ms_hex_exp = re.compile(r"\\\'(..)")
|
||||
self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}")
|
||||
self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
|
||||
self.__par_exp = re.compile(r'\\$')
|
||||
self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
|
||||
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
|
||||
def __create_tokens(self):
|
||||
self.__compile_expressions()
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = "dummy"
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
line = line.replace("\n", "")
|
||||
line = self.__sub_line_reg(line)
|
||||
tokens = re.split(self.__splitexp, line)
|
||||
##print tokens
|
||||
for token in tokens:
|
||||
if token != "":
|
||||
write_obj.write(token + "\n")
|
||||
"""
|
||||
match_obj = re.search(self.__mixed_exp, token)
|
||||
if match_obj != None:
|
||||
first = match_obj.group(1)
|
||||
second = match_obj.group(2)
|
||||
write_obj.write(first + "\n")
|
||||
write_obj.write(second + "\n")
|
||||
else:
|
||||
write_obj.write(token + "\n")
|
||||
"""
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
#variables
# NOTE(review): diff fragment -- in the full file these initializations
# belong to the class constructor, not to module level.
# number of replacement characters still to skip after a \u escape
self.__uc_char = 0
# True when the next token is \bin payload that must be dropped
self.__uc_bin = False
# stack of \uc values, one entry per RTF brace scope (RTF default is 1)
self.__uc_value = [1]
def __reini_utf8_counters(self):
    """Reset the unicode-skip state: no replacement characters pending
    and no \\bin payload expected."""
    self.__uc_char, self.__uc_bin = 0, False
def __remove_uc_chars(self, startchar, token):
    """Strip the \\uc replacement characters from *token*.

    Starting at *startchar*, spaces are passed over and each remaining
    character consumes one unit of ``self.__uc_char``; the rest of the
    token (from the first character that no longer needs skipping) is
    returned.  Returns '' when the whole tail is spaces and skipped
    characters.
    """
    pos = startchar
    end = len(token)
    while pos < end:
        ch = token[pos]
        if ch != " ":
            if not self.__uc_char:
                # nothing left to skip: keep the remainder
                return token[pos:]
            self.__uc_char -= 1
        pos += 1
    # only spaces and skipped characters remained
    return ''
def __unicode_process(self, token):
    """Handle RTF unicode for a single token.

    Maintains the per-brace-scope \\uc skip count, drops the
    replacement characters (or \\bin payload) that follow a \\u escape,
    and converts \\uNNNN escapes to XML character references.
    Returns the (possibly rewritten) token, or '' when the token is
    consumed entirely.
    """
    # scope in: the new brace scope inherits the current \uc value
    if token == '\\{':
        self.__uc_value.append(self.__uc_value[-1])
        # basic error handling: a scope change aborts any pending skip
        self.__reini_utf8_counters()
        return token
    # scope out: restore the enclosing scope's \uc value
    elif token == '\\}':
        self.__uc_value.pop()
        self.__reini_utf8_counters()
        return token
    # \ucN control: N replacement chars follow each subsequent \u escape
    elif token[:3] == '\\uc':
        self.__uc_value[-1] = int(token[3:])
        self.__reini_utf8_counters()
        return token
    # binary payload pending from a \bin seen below: drop it
    elif self.__uc_bin:
        self.__uc_bin = False
        return ''
    # replacement characters after a \u escape still need skipping
    elif self.__uc_char:
        # BUGFIX: was token[:4] == '\bin' -- '\b' is a backspace escape,
        # so the literal was the 3-char string '\x08in' and could never
        # equal a 4-char slice; \bin payloads were never skipped.
        if token[:4] == '\\bin':
            # \bin counts as one replacement char; its payload is
            # dropped on the next call via __uc_bin
            self.__uc_char -= 1
            self.__uc_bin = True
            return ''
        elif token[:1] == "\\":
            # any other control word counts as a single skipped char
            self.__uc_char -= 1
            return ''
        else:
            return self.__remove_uc_chars(0, token)
    # real \u token: convert to an XML character reference
    match_obj = self.__utf_exp.match(token)
    if match_obj is not None:
        self.__reini_utf8_counters()
        # get value and handle the negative (16-bit two's complement) case
        uni_char = int(match_obj.group(1))
        uni_len = len(match_obj.group(1)) + 2
        if uni_char < 0:
            uni_char += 65536
        uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
        self.__uc_char = self.__uc_value[-1]
        # the token is only the unicode escape
        if len(token) <= uni_len:
            return uni_char
        # unicode escape followed by text, nothing to skip (\uc0)
        # NOTE(review): uni_len ignores the optional delimiter space the
        # regex may have matched, so that space can leak into the output
        # here -- verify against real documents
        elif not self.__uc_char:
            return uni_char + token[uni_len:]
        # unicode escape followed by replacement chars to strip
        else:
            return uni_char + self.__remove_uc_chars(uni_len, token)
    # default: pass the token through unchanged
    return token
def __sub_reg_split(self, input_file):
    """Apply whole-file substitutions and split the text into tokens.

    Runs the literal replacements (MReplace), normalizes hex escapes
    and upr/ud groups, collapses newlines inside \\bin payloads, then
    splits on ``self.__splitexp``.  Returns the token list with empty
    strings and bare newlines removed.
    """
    input_file = self.__replace_spchar.mreplace(input_file)
    # raw-string templates keep \g<1> valid for the re module -- a bare
    # \g inside a normal string literal is an invalid escape and an
    # error in re templates on modern Python; output is unchanged
    input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
    input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
    # remove \n inside \bin payloads so each payload splits as one token
    input_file = self.__bin_exp.sub(
        lambda x: x.group().replace('\n', '') + '\n', input_file)
    # split into tokens
    tokens = re.split(self.__splitexp, input_file)
    # remove empty tokens and stray newlines
    return filter(lambda x: len(x) > 0 and x != '\n', tokens)
def __compile_expressions(self):
    """Build the literal-replacement table and the compiled regular
    expressions used by __sub_reg_split and __unicode_process."""
    SIMPLE_RPL = {
        "\\\\": "\\backslash ",
        "\\~": "\\~ ",
        "\\;": "\\; ",
        # escape XML special characters for the downstream XML output
        # (the extraction of this diff had un-escaped these entities;
        # identity replacements would be pointless)
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        "\\_": "\\_ ",
        "\\:": "\\: ",
        "\\-": "\\- ",
        # turn braces escapes into generic tokens to eliminate special
        # cases and make processing easier
        "\\{": "\\ob ",
        "\\}": "\\cb ",
        # put a backslash in front of bare braces to eliminate special
        # cases and make processing easier
        "{": "\\{",
        "}": "\\}",
        # this is for older RTF
        # NOTE(review): literal key '\\$' -- looks like a leftover regex
        # pattern (backslash at end of line -> \par); verify against the
        # MReplace semantics
        r'\\$': '\\par ',
    }
    # a duplicate "\\~" entry in the original dict literal was removed
    self.__replace_spchar = MReplace(SIMPLE_RPL)
    # \'hh -- hex escape, exactly two hex digits
    self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
    # \uN with optional trailing delimiter space
    self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
    # \binN followed by its 0/1 payload (newlines allowed inside)
    self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
    # manage upr/ud situations: keep only the \ud (unicode) branch
    self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" +
        r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
    # splitter: escaped braces, newlines (kept for \bin handling), or a
    # control word with an optional non-newline whitespace delimiter
    self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
def tokenize(self):
    """Main entry point.

    Reads the whole RTF file, uses __sub_reg_split for the basic
    substitutions and splitting, fixes unicode tokens, then writes the
    token stream back over the original file (one token per line).
    """
    # NOTE: the redundant call to the old line-based __create_tokens()
    # was removed -- its output was immediately overwritten by the
    # whole-file pass below.
    # read
    with open(self.__file, 'r') as read_obj:
        input_file = read_obj.read()
    # process simple replacements and split, giving a correct token
    # list ('' and '\n' are removed in the process)
    tokens = self.__sub_reg_split(input_file)
    # correct unicode escapes
    tokens = map(self.__unicode_process, tokens)
    # remove empty items created by removing \uc replacement chars
    tokens = filter(lambda x: len(x) > 0, tokens)
    # write
    with open(self.__write_to, 'wb') as write_obj:
        write_obj.write('\n'.join(tokens))
    # move and copy (debug support), then replace the input file
    copy_obj = copy.Copy(bug_handler = self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "tokenize.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
Loading…
x
Reference in New Issue
Block a user