diff --git a/resources/templates/rtf.xsl b/resources/templates/rtf.xsl
index ea1fc71172..6db1c0388d 100644
--- a/resources/templates/rtf.xsl
+++ b/resources/templates/rtf.xsl
@@ -287,7 +287,7 @@
]
-
+
@@ -297,7 +297,7 @@
-
+
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 714a5b656f..ba13668eb7 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -77,7 +77,15 @@ class RTFInput(InputFormatPlugin):
def generate_xml(self, stream):
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
- ofile = 'out.xml'
+ ofile = 'dataxml.xml'
+ run_lev, debug_dir = 1, None
+ if getattr(self.opts, 'debug_pipeline', None) is not None:
+ try:
+                debug_dir = 'rtfdebug'
+                os.mkdir(debug_dir)
+                run_lev = 4
+ except:
+ pass
parser = ParseRtf(
in_file = stream,
out_file = ofile,
@@ -115,43 +123,45 @@ class RTFInput(InputFormatPlugin):
# Write or do not write paragraphs. Default is 0.
empty_paragraphs = 1,
+
+ #debug
+ deb_dir = debug_dir,
+ run_level = run_lev,
)
parser.parse_rtf()
- ans = open('out.xml').read()
- os.remove('out.xml')
- return ans
+ with open(ofile, 'rb') as f:
+ return f.read()
def extract_images(self, picts):
+ import imghdr
self.log('Extracting images...')
+ with open(picts, 'rb') as f:
+ raw = f.read()
+ picts = filter(len, re.findall(r'\{\\pict([^}]+)\}', raw))
+ hex = re.compile(r'[^a-fA-F0-9]')
+ encs = [hex.sub('', pict) for pict in picts]
+
count = 0
- raw = open(picts, 'rb').read()
- starts = []
- for match in re.finditer(r'\{\\pict([^}]+)\}', raw):
- starts.append(match.start(1))
-
imap = {}
-
- for start in starts:
- pos, bc = start, 1
- while bc > 0:
- if raw[pos] == '}': bc -= 1
- elif raw[pos] == '{': bc += 1
- pos += 1
- pict = raw[start:pos+1]
- enc = re.sub(r'[^a-zA-Z0-9]', '', pict)
+ for enc in encs:
if len(enc) % 2 == 1:
enc = enc[:-1]
data = enc.decode('hex')
+ fmt = imghdr.what(None, data)
+ if fmt is None:
+ fmt = 'wmf'
count += 1
- name = (('%4d'%count).replace(' ', '0'))+'.wmf'
- open(name, 'wb').write(data)
+ name = '%04d.%s' % (count, fmt)
+ with open(name, 'wb') as f:
+ f.write(data)
imap[count] = name
#open(name+'.hex', 'wb').write(enc)
return self.convert_images(imap)
def convert_images(self, imap):
- for count, val in imap.items():
+ self.default_img = None
+ for count, val in imap.iteritems():
try:
imap[count] = self.convert_image(val)
except:
@@ -159,6 +169,8 @@ class RTFInput(InputFormatPlugin):
return imap
def convert_image(self, name):
+ if not name.endswith('.wmf'):
+ return name
try:
return self.rasterize_wmf(name)
except:
@@ -167,16 +179,18 @@ class RTFInput(InputFormatPlugin):
def replace_wmf(self, name):
from calibre.ebooks import calibre_cover
- data = calibre_cover('Conversion of WMF images is not supported',
+ if self.default_img is None:
+ self.default_img = calibre_cover('Conversion of WMF images is not supported',
'Use Microsoft Word or OpenOffice to save this RTF file'
' as HTML and convert that in calibre.', title_size=36,
author_size=20)
name = name.replace('.wmf', '.jpg')
with open(name, 'wb') as f:
- f.write(data)
+ f.write(self.default_img)
return name
def rasterize_wmf(self, name):
+ raise ValueError('Conversion of WMF images not supported')
from calibre.utils.wmf import extract_raster_image
with open(name, 'rb') as f:
data = f.read()
@@ -212,27 +226,27 @@ class RTFInput(InputFormatPlugin):
css += '\n'+'\n'.join(font_size_classes)
css += '\n' +'\n'.join(color_classes)
- for cls, val in border_styles.items():
+ for cls, val in border_styles.iteritems():
css += '\n\n.%s {\n%s\n}'%(cls, val)
with open('styles.css', 'ab') as f:
f.write(css)
- def preprocess(self, fname):
- self.log('\tPreprocessing to convert unicode characters')
- try:
- data = open(fname, 'rb').read()
- from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
- tokenizer = RtfTokenizer(data)
- tokens = RtfTokenParser(tokenizer.tokens)
- data = tokens.toRTF()
- fname = 'preprocessed.rtf'
- with open(fname, 'wb') as f:
- f.write(data)
- except:
- self.log.exception(
- 'Failed to preprocess RTF to convert unicode sequences, ignoring...')
- return fname
+ # def preprocess(self, fname):
+ # self.log('\tPreprocessing to convert unicode characters')
+ # try:
+ # data = open(fname, 'rb').read()
+ # from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
+ # tokenizer = RtfTokenizer(data)
+ # tokens = RtfTokenParser(tokenizer.tokens)
+ # data = tokens.toRTF()
+ # fname = 'preprocessed.rtf'
+ # with open(fname, 'wb') as f:
+ # f.write(data)
+ # except:
+ # self.log.exception(
+ # 'Failed to preprocess RTF to convert unicode sequences, ignoring...')
+ # return fname
def convert_borders(self, doc):
border_styles = []
@@ -269,17 +283,14 @@ class RTFInput(InputFormatPlugin):
self.log = log
self.log('Converting RTF to XML...')
#Name of the preprocesssed RTF file
- fname = self.preprocess(stream.name)
+ # fname = self.preprocess(stream.name)
try:
- xml = self.generate_xml(fname)
+ xml = self.generate_xml(stream.name)
except RtfInvalidCodeException, e:
+ raise
raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.\n%s')%e)
- '''dataxml = open('dataxml.xml', 'w')
- dataxml.write(xml)
- dataxml.close'''
-
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
if d:
imap = {}
diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py
index 7b89407f79..cdd9a3d088 100755
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@@ -17,7 +17,8 @@
#########################################################################
# $Revision: 1.41 $
# $Date: 2006/03/24 23:50:07 $
-import sys,os
+import sys, os
+
from calibre.ebooks.rtf2xml import headings_to_sections, \
line_endings, footnote, fields_small, default_encoding, \
make_lists, preamble_div, header, colors, group_borders, \
@@ -90,7 +91,6 @@ class ParseRtf:
out_file = '',
out_dir = None,
dtd = '',
- #debug = 0, #why? calibre
deb_dir = None,
convert_symbol = None,
convert_wingdings = None,
@@ -107,6 +107,7 @@ class ParseRtf:
no_dtd = 0,
char_data = '',
):
+
"""
Requires:
'file' --file to parse
@@ -119,12 +120,11 @@ class ParseRtf:
script tries to output to directory where is script is exectued.)
'deb_dir' --debug directory. If a debug_dir is provided, the script
will copy each run through as a file to examine in the debug_dir
- 'perl_script'--use perl to make tokens. This runs just a bit faster.
- (I will probably phase this out.)
'check_brackets' -- make sure the brackets match up after each run
through a file. Only for debugging.
Returns: Nothing
"""
+
self.__file = in_file
self.__out_file = out_file
self.__out_dir = out_dir
@@ -132,7 +132,7 @@ class ParseRtf:
self.__dtd_path = dtd
self.__check_file(in_file,"file_to_parse")
self.__char_data = char_data
- self.__debug_dir = deb_dir #self.__debug_dir = debug calibre
+ self.__debug_dir = deb_dir
self.__check_dir(self.__temp_dir)
self.__copy = self.__check_dir(self.__debug_dir)
self.__convert_caps = convert_caps
@@ -155,25 +155,24 @@ class ParseRtf:
if hasattr(the_file, 'read'): return
if the_file == None:
if type == "file_to_parse":
- message = "You must provide a file for the script to work"
- msg = message
+ msg = "\nYou must provide a file for the script to work"
raise RtfInvalidCodeException, msg
elif os.path.exists(the_file):
pass # do nothing
else:
- message = "The file '%s' cannot be found" % the_file
- msg = message
+ msg = "\nThe file '%s' cannot be found" % the_file
raise RtfInvalidCodeException, msg
+
def __check_dir(self, the_dir):
"""Check to see if directory exists"""
if not the_dir :
return
dir_exists = os.path.isdir(the_dir)
if not dir_exists:
- message = "%s is not a directory" % the_dir
- msg = message
+ msg = "\n%s is not a directory" % the_dir
raise RtfInvalidCodeException, msg
return 1
+
def parse_rtf(self):
"""
Parse the file by calling on other classes.
@@ -194,13 +193,14 @@ class ParseRtf:
copy_obj.set_dir(self.__debug_dir)
copy_obj.remove_files()
copy_obj.copy_file(self.__temp_file, "original_file")
- # new as of 2005-08-02. Do I want this?
+ # Function to check if bracket are well handled
if self.__debug_dir or self.__run_level > 2:
self.__check_brack_obj = check_brackets.CheckBrackets\
(file = self.__temp_file,
bug_handler = RtfInvalidCodeException,
)
- # convert Macintosh line endings to Unix line endings
+ #convert Macintosh and Windows line endings to Unix line endings
+ #why do this if you don't wb after?
line_obj = line_endings.FixLineEndings(
in_file = self.__temp_file,
bug_handler = RtfInvalidCodeException,
@@ -208,13 +208,13 @@ class ParseRtf:
run_level = self.__run_level,
replace_illegals = self.__replace_illegals,
)
- return_value = line_obj.fix_endings()
+ return_value = line_obj.fix_endings() #calibre return what?
self.__return_code(return_value)
tokenize_obj = tokenize.Tokenize(
bug_handler = RtfInvalidCodeException,
in_file = self.__temp_file,
copy = self.__copy,
- run_level = self.__run_level,)
+ run_level = self.__run_level)
tokenize_obj.tokenize()
process_tokens_obj = process_tokens.ProcessTokens(
in_file = self.__temp_file,
@@ -230,12 +230,25 @@ class ParseRtf:
os.remove(self.__temp_file)
except OSError:
pass
+ #Check to see if the file is correctly encoded
+ encode_obj = default_encoding.DefaultEncoding(
+ in_file = self.__temp_file,
+ run_level = self.__run_level,
+ bug_handler = RtfInvalidCodeException,
+ check_raw = True,
+ )
+ platform, code_page, default_font_num = encode_obj.find_default_encoding()
check_encoding_obj = check_encoding.CheckEncoding(
- bug_handler = RtfInvalidCodeException,
- )
- check_encoding_obj.check_encoding(self.__file)
- sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
- raise InvalidRtfException, msg
+ bug_handler = RtfInvalidCodeException,
+ )
+ enc = encode_obj.get_codepage()
+ if enc != 'mac_roman':
+ enc = 'cp' + enc
+ if check_encoding_obj.check_encoding(self.__file, enc):
+ file_name = self.__file if isinstance(self.__file, str) \
+ else self.__file.encode('utf-8')
+ msg = 'File %s does not appear to be correctly encoded.\n' % file_name
+ raise InvalidRtfException, msg
delete_info_obj = delete_info.DeleteInfo(
in_file = self.__temp_file,
copy = self.__copy,
@@ -508,6 +521,7 @@ class ParseRtf:
indent = self.__indent,
run_level = self.__run_level,
no_dtd = self.__no_dtd,
+ encoding = encode_obj.get_codepage(),
bug_handler = RtfInvalidCodeException,
)
tags_obj.convert_to_tags()
@@ -520,35 +534,28 @@ class ParseRtf:
output_obj.output()
os.remove(self.__temp_file)
return self.__exit_level
+
def __bracket_match(self, file_name):
if self.__run_level > 2:
good_br, msg = self.__check_brack_obj.check_brackets()
if good_br:
pass
- # sys.stderr.write( msg + ' in ' + file_name + "\n")
+ #sys.stderr.write( msg + ' in ' + file_name + "\n")
else:
- msg += msg + " in file '" + file_name + "'\n"
+ msg = '%s in file %s\n' % (msg, file_name)
raise RtfInvalidCodeException, msg
+
def __return_code(self, num):
- if num == None:
- return
- if int(num) > self.__exit_level:
- self.__exit_level = num
+ if num == None:
+ return
+ if int(num) > self.__exit_level:
+ self.__exit_level = num
+
def __make_temp_file(self,file):
"""Make a temporary file to parse"""
write_file="rtf_write_file"
read_obj = file if hasattr(file, 'read') else open(file,'r')
- write_obj = open(write_file, 'w')
- line = "dummy"
- while line:
- line = read_obj.read(1000)
- write_obj.write(line )
- write_obj.close()
+ with open(write_file, 'wb') as write_obj:
+ for line in read_obj:
+ write_obj.write(line)
return write_file
- """
-mi1\n
-mi33\n
-mi' % info)
+
def __empty_func(self, line):
"""
Print out empty tag and newlines when needed.
@@ -85,10 +96,11 @@ class ConvertToTags:
self.__write_new_line()
if info in self.__two_new_line:
self.__write_extra_new_line()
+
def __open_att_func(self, line):
"""
Process lines for open tags that have attributes.
- The important infor is between [17:-1]. Take this info and split it
+ The important info is between [17:-1]. Take this info and split it
with the delimeter '<'. The first token in this group is the element
name. The rest are attributes, separated fromt their values by '>'. So
read each token one at a time, and split them by '>'.
@@ -119,6 +131,7 @@ class ConvertToTags:
self.__write_new_line()
if element_name in self.__two_new_line:
self.__write_extra_new_line()
+
def __empty_att_func(self, line):
"""
Same as the __open_att_func, except a '/' is placed at the end of the tag.
@@ -143,6 +156,7 @@ class ConvertToTags:
self.__write_new_line()
if element_name in self.__two_new_line:
self.__write_extra_new_line()
+
def __close_func(self, line):
"""
Print out the closed tag and new lines, if appropriate.
@@ -156,6 +170,7 @@ class ConvertToTags:
self.__write_new_line()
if info in self.__two_new_line:
self.__write_extra_new_line()
+
def __text_func(self, line):
"""
Simply print out the information between [17:-1]
@@ -163,6 +178,7 @@ class ConvertToTags:
#tx')
+ #keep maximum compatibility with previous version
+ check_encoding_obj = check_encoding.CheckEncoding(
+ bug_handler=self.__bug_handler)
+
+ if not check_encoding_obj.check_encoding(self.__file, verbose=False):
+ self.__write_obj.write('')
+ elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
+ self.__write_obj.write('' % self.__encoding)
+ else:
+ self.__write_obj.write('')
+ sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
+ ' hope for the best')
self.__new_line = 0
self.__write_new_line()
if self.__no_dtd:
@@ -207,6 +237,7 @@ class ConvertToTags:
)
self.__new_line = 0
self.__write_new_line()
+
def convert_to_tags(self):
"""
Read in the file one line at a time. Get the important info, between
@@ -222,18 +253,14 @@ class ConvertToTags:
an empty tag function.
"""
self.__initiate_values()
- read_obj = open(self.__file, 'r')
self.__write_obj = open(self.__write_to, 'w')
self.__write_dec()
- line_to_read = 1
- while line_to_read:
- line_to_read = read_obj.readline()
- line = line_to_read
- self.__token_info = line[:16]
- action = self.__state_dict.get(self.__token_info)
- if action != None:
- action(line)
- read_obj.close()
+ with open(self.__file, 'r') as read_obj:
+ for line in read_obj:
+ self.__token_info = line[:16]
+ action = self.__state_dict.get(self.__token_info)
+ if action is not None:
+ action(line)
self.__write_obj.close()
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
diff --git a/src/calibre/ebooks/rtf2xml/copy.py b/src/calibre/ebooks/rtf2xml/copy.py
index ff029c1841..1b620b9fbf 100755
--- a/src/calibre/ebooks/rtf2xml/copy.py
+++ b/src/calibre/ebooks/rtf2xml/copy.py
@@ -23,6 +23,7 @@ class Copy:
def __init__(self, bug_handler, file = None, deb_dir = None, ):
self.__file = file
self.__bug_handler = bug_handler
+
def set_dir(self, deb_dir):
"""Set the temporary directory to write files to"""
if deb_dir is None:
@@ -33,19 +34,11 @@ class Copy:
message = "%(deb_dir)s is not a directory" % vars()
raise self.__bug_handler , message
Copy.__dir = deb_dir
+
def remove_files(self ):
"""Remove files from directory"""
self.__remove_the_files(Copy.__dir)
- """
- list_of_files = os.listdir(Copy.__dir)
- list_of_files = os.listdir(the_dir)
- for file in list_of_files:
- rem_file = os.path.join(Copy.__dir,file)
- if os.path.isdir(rem_file):
- self.remove_files(rem_file)
- else:
- os.remove(rem_file)
- """
+
def __remove_the_files(self, the_dir):
"""Remove files from directory"""
list_of_files = os.listdir(the_dir)
@@ -58,6 +51,7 @@ class Copy:
os.remove(rem_file)
except OSError:
pass
+
def copy_file(self, file, new_file):
"""
Copy the file to a new name
diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py
index b932b465d0..53887e0d90 100755
--- a/src/calibre/ebooks/rtf2xml/default_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/default_encoding.py
@@ -1,61 +1,142 @@
#########################################################################
# #
-# #
# copyright 2002 Paul Henry Tremblay #
# #
-# This program is distributed in the hope that it will be useful, #
-# but WITHOUT ANY WARRANTY; without even the implied warranty of #
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
-# General Public License for more details. #
-# #
-# You should have received a copy of the GNU General Public License #
-# along with this program; if not, write to the Free Software #
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
-# 02111-1307 USA #
-# #
-# #
#########################################################################
+
+'''
+Codepages as to RTF 1.9.1:
+ 437 United States IBM
+ 708 Arabic (ASMO 708)
+ 709 Arabic (ASMO 449+, BCON V4)
+ 710 Arabic (transparent Arabic)
+ 711 Arabic (Nafitha Enhanced)
+ 720 Arabic (transparent ASMO)
+ 819 Windows 3.1 (United States and Western Europe)
+ 850 IBM multilingual
+ 852 Eastern European
+ 860 Portuguese
+ 862 Hebrew
+ 863 French Canadian
+ 864 Arabic
+ 865 Norwegian
+ 866 Soviet Union
+ 874 Thai
+ 932 Japanese
+ 936 Simplified Chinese
+ 949 Korean
+ 950 Traditional Chinese
+ 1250 Eastern European
+ 1251 Cyrillic
+ 1252 Western European
+ 1253 Greek
+ 1254 Turkish
+ 1255 Hebrew
+ 1256 Arabic
+ 1257 Baltic
+ 1258 Vietnamese
+ 1361 Johab
+ 10000 MAC Roman
+ 10001 MAC Japan
+ 10004 MAC Arabic
+ 10005 MAC Hebrew
+ 10006 MAC Greek
+ 10007 MAC Cyrillic
+ 10029 MAC Latin2
+ 10081 MAC Turkish
+ 57002 Devanagari
+ 57003 Bengali
+ 57004 Tamil
+ 57005 Telugu
+ 57006 Assamese
+ 57007 Oriya
+ 57008 Kannada
+ 57009 Malayalam
+ 57010 Gujarati
+ 57011 Punjabi
+'''
+import re
+
class DefaultEncoding:
"""
Find the default encoding for the doc
"""
- def __init__(self, in_file, bug_handler, run_level = 1,):
- """
- Required:
- 'file'
- Returns:
- nothing
- """
+ def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False):
self.__file = in_file
self.__bug_handler = bug_handler
+ self.__platform = 'Windows'
+ self.__default_num = 'not-defined'
+ self.__code_page = '1252'
+ self.__datafetched = False
+ self.__fetchraw = check_raw
+
def find_default_encoding(self):
- platform = 'Windows'
- default_num = 'not-defined'
- code_page = 'ansicpg1252'
- read_obj = open(self.__file, 'r')
- line_to_read = 1
- while line_to_read:
- line_to_read = read_obj.readline()
- line = line_to_read
- self.__token_info = line[:16]
- if self.__token_info == 'mi 3:
msg = 'flag problem\n'
raise self.__bug_handler, msg
- return 1
+ return True
elif self.__token_info in self.__allowable :
if self.__ob:
self.__write_obj.write(self.__ob)
@@ -132,85 +138,81 @@ class DeleteInfo:
self.__state = 'default'
else:
pass
- return 1
+ return True
elif self.__token_info == 'cw 5:
- msg = 'After an asterisk, and found neither an allowable or non-allowble token\n'
- msg += 'token is "%s"\n' % self.__token_info
- raise self.__bug_handler
+ msg = ('After an asterisk, and found neither an allowable or non-allowable token\n\
+ token is "%s"\n') % self.__token_info
+ raise self.__bug_handler, msg
if not self.__ob:
- self.__write_cb = 1
+ self.__write_cb = True
self.__ob = 0
self.__state = 'delete'
self.__cb_count = 0
- return 0
+ return False
+
def __found_list_func(self, line):
"""
print out control words in this group
"""
self.__state = 'list'
+
def __list_func(self, line):
"""
Check to see if the group has ended.
- Return 1 for all control words.
- Return 0 otherwise.
+ Return True for all control words.
+ Return False otherwise.
"""
if self.__delete_count == self.__cb_count and self.__token_info ==\
'cb%s\n' % self.__footnote_count)
self.__first_line = 0
+
def __in_footnote_func(self, line):
"""Handle all tokens that are part of footnote"""
if self.__first_line:
@@ -68,6 +72,7 @@ class Footnote:
'mi ci
- 'annotation' : 'annotation',
+ 'annotation' : 'annotation',
'blue______' : 'blue',
'bold______' : 'bold',
- 'caps______' : 'caps',
- 'char-style' : 'character-style',
- 'dbl-strike' : 'double-strike-through',
+ 'caps______' : 'caps',
+ 'char-style' : 'character-style',
+ 'dbl-strike' : 'double-strike-through',
'emboss____' : 'emboss',
'engrave___' : 'engrave',
'font-color' : 'font-color',
@@ -96,7 +97,7 @@ class Inline:
'font-size_' : 'font-size',
'font-style' : 'font-style',
'font-up___' : 'superscript',
- 'footnot-mk' : 'footnote-marker',
+ 'footnot-mk' : 'footnote-marker',
'green_____' : 'green',
'hidden____' : 'hidden',
'italics___' : 'italics',
@@ -107,9 +108,10 @@ class Inline:
'strike-thr' : 'strike-through',
'subscript_' : 'subscript',
'superscrip' : 'superscript',
- 'underlined' : 'underlined',
+ 'underlined' : 'underlined',
}
self.__caps_list = ['false']
+
def __set_list_func(self, line):
"""
Requires:
@@ -128,6 +130,7 @@ class Inline:
self.__place = 'in_list'
self.__inline_list = self.__list_inline_list
self.__groups_in_waiting = self.__groups_in_waiting_list
+
def __default_func(self, line):
"""
Requires:
@@ -140,8 +143,8 @@ class Inline:
action = self.__default_dict.get(self.__token_info)
if action:
action(line)
- if self.__token_info != 'cw%s' % (the_key, the_dict[the_key]))
self.__write_obj.write('\n')
self.__groups_in_waiting[0] = 0
+
def __end_para_func(self, line):
"""
Requires:
@@ -342,6 +346,7 @@ class Inline:
self.__write_obj.write('mi%s' % (the_key, the_dict[the_key]))
self.__write_obj.write('\n')
self.__groups_in_waiting[0] = 0
+
def __found_field_func(self, line):
"""
Just a default function to make sure I don't prematurely exit
default state
"""
pass
+
def form_tags(self):
"""
Requires:
@@ -386,32 +393,27 @@ class Inline:
the state.
"""
self.__initiate_values()
- read_obj = open(self.__file, 'r')
- self.__write_obj = open(self.__write_to, 'w')
- line_to_read = 1
- while line_to_read:
- line_to_read = read_obj.readline()
- line = line_to_read
- token = line[0:-1]
- self.__token_info = ''
- if token == 'tx 1:
- sys.stderr.write('Removing files from old pict directory...\n')
- all_files = os.listdir(self.__dir_name)
- for the_file in all_files:
- the_file = os.path.join(self.__dir_name, the_file)
- try:
- os.remove(the_file)
- except OSError:
- pass
- if self.__run_level > 1:
- sys.stderr.write('Files removed.\n')
+ if self.__run_level > 1:
+ sys.stderr.write('Removing files from old pict directory...\n')
+ all_files = os.listdir(self.__dir_name)
+ for the_file in all_files:
+ the_file = os.path.join(self.__dir_name, the_file)
+ try:
+ os.remove(the_file)
+ except OSError:
+ pass
+ if self.__run_level > 1:
+ sys.stderr.write('Files removed.\n')
def __create_pict_file(self):
"""Create a file for all the pict data to be written to.
"""
self.__pict_file = os.path.join(self.__dir_name, 'picts.rtf')
- write_pic_obj = open(self.__pict_file, 'w')
- write_pic_obj.close()
self.__write_pic_obj = open(self.__pict_file, 'a')
def __in_pict_func(self, line):
if self.__cb_count == self.__pict_br_count:
- self.__in_pict = 0
+ self.__in_pict = False
self.__write_pic_obj.write("}\n")
- return 1
+ return True
else:
action = self.__pict_dict.get(self.__token_info)
if action:
- line = action(line)
- self.__write_pic_obj.write(line)
- return 0
+ self.__write_pic_obj.write(action(line))
+ return False
def __default(self, line, write_obj):
"""Determine if each token marks the beginning of pict data.
@@ -142,53 +128,50 @@ class Pict:
write_obj.write('mi ml
'*' : ('ml', 'asterisk__', self.default_func),
':' : ('ml', 'colon_____', self.default_func),
@@ -73,7 +78,6 @@ class ProcessTokens:
'backslash' : ('nu', '\\', self.text_func),
'ob' : ('nu', '{', self.text_func),
'cb' : ('nu', '}', self.text_func),
- 'line' : ('nu', 'hard-lineb', self.default_func), #calibre
#'line' : ('nu', ' ', self.text_func), calibre
# paragraph formatting => pf
'page' : ('pf', 'page-break', self.default_func),
@@ -159,15 +163,17 @@ class ProcessTokens:
'rtf' : ('ri', 'rtf_______', self.default_func),
'deff' : ('ri', 'deflt-font', self.default_func),
'mac' : ('ri', 'macintosh_', self.default_func),
+ 'pc' : ('ri', 'pc________', self.default_func),
+ 'pca' : ('ri', 'pca_______', self.default_func),
'ansi' : ('ri', 'ansi______', self.default_func),
'ansicpg' : ('ri', 'ansi-codpg', self.default_func),
# notes => nt
'footnote' : ('nt', 'footnote__', self.default_func),
'ftnalt' : ('nt', 'type______ an
- 'tc' : ('an', 'toc_______', self.default_func),
+ 'tc' : ('an', 'toc_______', self.default_func),
'bkmkstt' : ('an', 'book-mk-st', self.default_func),
- 'bkmkstart' : ('an', 'book-mk-st', self.default_func),
+ 'bkmkstart' : ('an', 'book-mk-st', self.default_func),
'bkmkend' : ('an', 'book-mk-en', self.default_func),
'xe' : ('an', 'index-mark', self.default_func),
'rxe' : ('an', 'place_____', self.default_func),
@@ -347,7 +353,7 @@ class ProcessTokens:
10: 'Kanji numbering without the digit character',
11: 'Kanji numbering with the digit character',
1246: 'phonetic Katakana characters in aiueo order',
- 1346: 'phonetic katakana characters in iroha order',
+ 1346: 'phonetic katakana characters in iroha order',
14: 'double byte character',
15: 'single byte character',
16: 'Kanji numbering 3',
@@ -392,7 +398,7 @@ class ProcessTokens:
5121 : 'Arabic Algeria',
15361 : 'Arabic Bahrain',
3073 : 'Arabic Egypt',
- 1 : 'Arabic General',
+ 1 : 'Arabic General',
2049 : 'Arabic Iraq',
11265 : 'Arabic Jordan',
13313 : 'Arabic Kuwait',
@@ -417,7 +423,7 @@ class ProcessTokens:
1059 : 'Byelorussian',
1027 : 'Catalan',
2052 : 'Chinese China',
- 4 : 'Chinese General',
+ 4 : 'Chinese General',
3076 : 'Chinese Hong Kong',
4100 : 'Chinese Singapore',
1028 : 'Chinese Taiwan',
@@ -431,7 +437,7 @@ class ProcessTokens:
2057 : 'English British',
4105 : 'English Canada',
9225 : 'English Caribbean',
- 9 : 'English General',
+ 9 : 'English General',
6153 : 'English Ireland',
8201 : 'English Jamaica',
5129 : 'English New Zealand',
@@ -595,30 +601,37 @@ class ProcessTokens:
num = num[1:] # chop off leading 0, which I added
num = num.upper() # the mappings store hex in caps
return 'tx 3:
- msg = 'number "%s" cannot be converted to integer\n' % num
+ msg = 'Number "%s" cannot be converted to integer\n' % num
raise self.__bug_handler, msg
type = self.__number_type_dict.get(num)
- if type == None:
+ if type is None:
if self.__run_level > 3:
msg = 'No type for "%s" in self.__number_type_dict\n'
raise self.__bug_handler
type = 'Arabic'
return 'cw<%s<%snum<%s\n' % (token, num)
+
def divide_by_2(self, pre, token, num):
num = self.divide_num(num, 2)
return 'cw<%s<%s%s<%s\n' % (token, num, token)
+
def divide_by_20(self, pre, token, num):
num = self.divide_num(num, 20)
return 'cw<%s<%s%s<%s\n' % (token, num, token)
+
def text_func(self, pre, token, num=None):
return 'tx%s<%s\n' % (third_field, token, num, token)
+
def bool_st_func(self, pre, token, num):
if num is None or num == '' or num == '1':
return 'cw<%s<%sfalse<%s\n' % (token, token)
else:
- msg = 'boolean should have some value module process tokens\n'
- msg += 'token is ' + token + "\n"
- msg += "'" + num + "'" + "\n"
+ msg = "boolean should have some value module process tokens\ntoken is %s\n'%s'\n" % (token, num)
raise self.__bug_handler, msg
+
def __no_sup_sub_func(self, pre, token, num):
the_string = 'cw 3:
- msg = 'no number to process?\n'
- msg += 'this indicates that the token '
- msg += ' \(\\li\) should have a number and does not\n'
- msg += 'numerator is "%s"\n' % numerator
- msg += 'denominator is "%s"\n' % denominator
+ msg = ('No number to process?\nthis indicates that the token \(\\li\) \
+ should have a number and does not\nnumerator is \
+ "%s"\ndenominator is "%s"\n') % (numerator, denominator)
raise self.__bug_handler, msg
if 5 > self.__return_code:
self.__return_code = 5
@@ -698,9 +716,10 @@ class ProcessTokens:
if string_num[-2:] == ".0":
string_num = string_num[:-2]
return string_num
+
def split_let_num(self, token):
match_obj = re.search(self.__num_exp,token)
- if match_obj != None:
+ if match_obj is not None:
first = match_obj.group(1)
second = match_obj.group(2)
if not second:
@@ -714,6 +733,7 @@ class ProcessTokens:
raise self.__bug_handler
return token, 0
return first, second
+
def convert_to_hex(self,number):
"""Convert a string to uppercase hexidecimal"""
num = int(number)
@@ -722,6 +742,7 @@ class ProcessTokens:
return hex_num
except:
raise self.__bug_handler
+
def process_cw(self, token):
"""Change the value of the control word by determining what dictionary
it belongs to"""
@@ -737,89 +758,62 @@ class ProcessTokens:
pre, token, action = self.dict_token.get(token, (None, None, None))
if action:
return action(pre, token, num)
- # unused function
- def initiate_token_actions(self):
- self.action_for_token={
- '{' : self.ob_func,
- '}' : self.cb_func,
- '\\' : self.process_cw,
- }
- # unused function
- def evaluate_token(self,token):
- """Evaluate tokens. Return a value if the token is not a
- control word. Otherwise, pass token onto another method
- for further evaluation."""
- token, action = self.dict_token.get(token[0:1])
- if action:
- line = action(token)
- return line
- else :
- return 'tx -1:
- msg ='Invalid RTF: token "\\ " not valid. \n'
- raise self.__exception_handler, msg
- elif token[0:1] == "\\":
- line = self.process_cw(token)
- if line != None:
- write_obj.write(line)
- else:
- fields = re.split(self.__utf_exp, token)
- for field in fields:
- if not field:
- continue
- if field[0:1] == '&':
- write_obj.write('tx -1:
+ msg = 'Invalid RTF: token "\\ " not valid.\n'
+ raise self.__exception_handler, msg
+ elif token[:1] == "\\":
+ try:
+ token.decode('us-ascii')
+ except UnicodeError, msg:
+ msg = 'Invalid RTF: Tokens not ascii encoded.\n%s' % str(msg)
+ raise self.__exception_handler, msg
+ line = self.process_cw(token)
+ if line is not None:
+ write_obj.write(line)
else:
- write_obj.write('tx", ">")
- line = line.replace("\\~", "\\~ ")
- line = line.replace("\\_", "\\_ ")
- line = line.replace("\\:", "\\: ")
- line = line.replace("\\-", "\\- ")
- # turn into a generic token to eliminate special
- # cases and make processing easier
- line = line.replace("\\{", "\\ob ")
- # turn into a generic token to eliminate special
- # cases and make processing easier
- line = line.replace("\\}", "\\cb ")
- # put a backslash in front of to eliminate special cases and
- # make processing easier
- line = line.replace("{", "\\{")
- # put a backslash in front of to eliminate special cases and
- # make processing easier
- line = line.replace("}", "\\}")
- line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
- # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
- line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line)
- ##line = line.replace("\\backslash", "\\\\")
- # this is for older RTF
- line = re.sub(self.__par_exp, '\\par ', line)
- return line
- def __compile_expressions(self):
- self.__ms_hex_exp = re.compile(r"\\\'(..)")
- self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}")
- self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
- self.__par_exp = re.compile(r'\\$')
- self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
- ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
- def __create_tokens(self):
self.__compile_expressions()
- read_obj = open(self.__file, 'r')
- write_obj = open(self.__write_to, 'w')
- line_to_read = "dummy"
- while line_to_read:
- line_to_read = read_obj.readline()
- line = line_to_read
- line = line.replace("\n", "")
- line = self.__sub_line_reg(line)
- tokens = re.split(self.__splitexp, line)
- ##print tokens
- for token in tokens:
- if token != "":
- write_obj.write(token + "\n")
- """
- match_obj = re.search(self.__mixed_exp, token)
- if match_obj != None:
- first = match_obj.group(1)
- second = match_obj.group(2)
- write_obj.write(first + "\n")
- write_obj.write(second + "\n")
- else:
- write_obj.write(token + "\n")
- """
- read_obj.close()
- write_obj.close()
+ #variables
+ self.__uc_char = 0
+ self.__uc_bin = False
+ self.__uc_value = [1]
+
+ def __reini_utf8_counters(self):
+ self.__uc_char = 0
+ self.__uc_bin = False
+
+ def __remove_uc_chars(self, startchar, token):
+ for i in xrange(startchar, len(token)):
+ if token[i] == " ":
+ continue
+ elif self.__uc_char:
+ self.__uc_char -= 1
+ else:
+ return token[i:]
+ #if only " " and char to skip
+ return ''
+
+ def __unicode_process(self, token):
+ #change scope in
+ if token == '\{':
+ self.__uc_value.append(self.__uc_value[-1])
+ #basic error handling
+ self.__reini_utf8_counters()
+ return token
+ #change scope out
+ elif token == '\}':
+ self.__uc_value.pop()
+ self.__reini_utf8_counters()
+ return token
+ #add a uc control
+ elif token[:3] == '\uc':
+ self.__uc_value[-1] = int(token[3:])
+ self.__reini_utf8_counters()
+ return token
+ #bin data to slip
+ elif self.__uc_bin:
+ self.__uc_bin = False
+ return ''
+ #uc char to remove
+ elif self.__uc_char:
+ #handle \bin tag in case of uc char to skip
+ if token[:4] == '\bin':
+ self.__uc_char -=1
+ self.__uc_bin = True
+ return ''
+ elif token[:1] == "\\" :
+ self.__uc_char -=1
+ return ''
+ else:
+ return self.__remove_uc_chars(0, token)
+ #go for real \u token
+ match_obj = self.__utf_exp.match(token)
+ if match_obj is not None:
+ self.__reini_utf8_counters()
+ #get value and handle negative case
+ uni_char = int(match_obj.group(1))
+ uni_len = len(match_obj.group(1)) + 2
+ if uni_char < 0:
+ uni_char += 65536
+ uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
+ self.__uc_char = self.__uc_value[-1]
+ #there is only an unicode char
+ if len(token)<= uni_len:
+ return uni_char
+ #an unicode char and something else
+ #must be after as it is splited on \
+ #necessary? maybe for \bin?
+ elif not self.__uc_char:
+ return uni_char + token[uni_len:]
+ #if not uc0 and chars
+ else:
+ return uni_char + self.__remove_uc_chars(uni_len, token)
+ #default
+ return token
+
+ def __sub_reg_split(self,input_file):
+ input_file = self.__replace_spchar.mreplace(input_file)
+ input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
+ input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
+ #remove \n in bin data
+ input_file = self.__bin_exp.sub(lambda x: \
+ x.group().replace('\n', '') + '\n', input_file)
+ #split
+ tokens = re.split(self.__splitexp, input_file)
+ #remove empty tokens and \n
+ return filter(lambda x: len(x) > 0 and x != '\n', tokens)
+ #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
+ # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
+ # this is for older RTF
+ #line = re.sub(self.__par_exp, '\\par ', line)
+ #return filter(lambda x: len(x) > 0, \
+ #(self.__remove_line.sub('', x) for x in tokens))
+
+ def __compile_expressions(self):
+ SIMPLE_RPL = {
+ "\\\\": "\\backslash ",
+ "\\~": "\\~ ",
+ "\\;": "\\; ",
+ "&": "&",
+ "<": "<",
+ ">": ">",
+ "\\~": "\\~ ",
+ "\\_": "\\_ ",
+ "\\:": "\\: ",
+ "\\-": "\\- ",
+ # turn into a generic token to eliminate special
+ # cases and make processing easier
+ "\\{": "\\ob ",
+ # turn into a generic token to eliminate special
+ # cases and make processing easier
+ "\\}": "\\cb ",
+ # put a backslash in front of to eliminate special cases and
+ # make processing easier
+ "{": "\\{",
+ # put a backslash in front of to eliminate special cases and
+ # make processing easier
+ "}": "\\}",
+ # this is for older RTF
+ r'\\$': '\\par ',
+ }
+ self.__replace_spchar = MReplace(SIMPLE_RPL)
+ #add ;? in case of char following \u
+ self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
+ self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
+ self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
+ #manage upr/ud situations
+ self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" + \
+ r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
+ #add \n in split for whole file reading
+ #why keep backslash whereas \is replaced before?
+ #remove \n from endline char
+ self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
+ #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
+ #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
+ #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
+ #self.__par_exp = re.compile(r'\\$')
+ #self.__remove_line = re.compile(r'\n+')
+ #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
+ ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
+
def tokenize(self):
- """Main class for handling other methods. Reads in one line \
- at a time, usues method self.sub_line to make basic substitutions,\
- uses ? to process tokens"""
- self.__create_tokens()
+ """Main class for handling other methods. Reads the file \
+ , uses method self.sub_reg to make basic substitutions,\
+ and process tokens by itself"""
+ #read
+ with open(self.__file, 'r') as read_obj:
+ input_file = read_obj.read()
+
+ #process simple replacements and split giving us a correct list
+ #remove '' and \n in the process
+ tokens = self.__sub_reg_split(input_file)
+ #correct unicode
+ tokens = map(self.__unicode_process, tokens)
+ #remove empty items created by removing \uc
+ tokens = filter(lambda x: len(x) > 0, tokens)
+
+ #write
+ with open(self.__write_to, 'wb') as write_obj:
+ write_obj.write('\n'.join(tokens))
+ #Move and copy
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "tokenize.data")
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)
+
+ #self.__special_tokens = [ '_', '~', "'", '{', '}' ]
\ No newline at end of file