diff --git a/resources/templates/rtf.xsl b/resources/templates/rtf.xsl index ea1fc71172..6db1c0388d 100644 --- a/resources/templates/rtf.xsl +++ b/resources/templates/rtf.xsl @@ -287,7 +287,7 @@ ] - + @@ -297,7 +297,7 @@ - + diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 714a5b656f..ba13668eb7 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -77,7 +77,15 @@ class RTFInput(InputFormatPlugin): def generate_xml(self, stream): from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf - ofile = 'out.xml' + ofile = 'dataxml.xml' + run_lev, debug_dir = 1, None + if getattr(self.opts, 'debug_pipeline', None) is not None: + try: + os.mkdir(debug_dir) + debug_dir = 'rtfdebug' + run_lev = 4 + except: + pass parser = ParseRtf( in_file = stream, out_file = ofile, @@ -115,43 +123,45 @@ class RTFInput(InputFormatPlugin): # Write or do not write paragraphs. Default is 0. empty_paragraphs = 1, + + #debug + deb_dir = debug_dir, + run_level = run_lev, ) parser.parse_rtf() - ans = open('out.xml').read() - os.remove('out.xml') - return ans + with open(ofile, 'rb') as f: + return f.read() def extract_images(self, picts): + import imghdr self.log('Extracting images...') + with open(picts, 'rb') as f: + raw = f.read() + picts = filter(len, re.findall(r'\{\\pict([^}]+)\}', raw)) + hex = re.compile(r'[^a-fA-F0-9]') + encs = [hex.sub('', pict) for pict in picts] + count = 0 - raw = open(picts, 'rb').read() - starts = [] - for match in re.finditer(r'\{\\pict([^}]+)\}', raw): - starts.append(match.start(1)) - imap = {} - - for start in starts: - pos, bc = start, 1 - while bc > 0: - if raw[pos] == '}': bc -= 1 - elif raw[pos] == '{': bc += 1 - pos += 1 - pict = raw[start:pos+1] - enc = re.sub(r'[^a-zA-Z0-9]', '', pict) + for enc in encs: if len(enc) % 2 == 1: enc = enc[:-1] data = enc.decode('hex') + fmt = imghdr.what(None, data) + if fmt is None: + fmt = 'wmf' count += 1 - name = (('%4d'%count).replace(' ', '0'))+'.wmf' - open(name, 'wb').write(data) + name = '%04d.%s' % (count, fmt) + with open(name, 'wb') as f: + f.write(data) imap[count] = name #open(name+'.hex', 'wb').write(enc) return self.convert_images(imap) def convert_images(self, imap): - for count, val in imap.items(): + self.default_img = None + for count, val in imap.iteritems(): try: imap[count] = self.convert_image(val) except: @@ -159,6 +169,8 @@ class RTFInput(InputFormatPlugin): return imap def convert_image(self, name): + if not name.endswith('.wmf'): + return name try: return self.rasterize_wmf(name) except: @@ -167,16 +179,18 @@ class RTFInput(InputFormatPlugin): def replace_wmf(self, name): from calibre.ebooks import calibre_cover - data = calibre_cover('Conversion of WMF images is not supported', + if self.default_img is None: + self.default_img = calibre_cover('Conversion of WMF images is not supported', 'Use Microsoft Word or OpenOffice to save this RTF file' ' as HTML and convert that in calibre.', title_size=36, author_size=20) name = name.replace('.wmf', '.jpg') with open(name, 'wb') as f: - f.write(data) + f.write(self.default_img) return name def rasterize_wmf(self, name): + raise ValueError('Conversion of WMF images not supported') from calibre.utils.wmf import extract_raster_image with open(name, 'rb') as f: data = f.read() @@ -212,27 +226,27 @@ class RTFInput(InputFormatPlugin): css += '\n'+'\n'.join(font_size_classes) css += '\n' +'\n'.join(color_classes) - for cls, val in border_styles.items(): + for cls, val in border_styles.iteritems(): css += '\n\n.%s {\n%s\n}'%(cls, val) with open('styles.css', 'ab') as f: f.write(css) - def preprocess(self, fname): - self.log('\tPreprocessing to convert unicode characters') - try: - data = open(fname, 'rb').read() - from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser - tokenizer = RtfTokenizer(data) - tokens = RtfTokenParser(tokenizer.tokens) - data = tokens.toRTF() - fname = 'preprocessed.rtf' - with open(fname, 'wb') as f: - f.write(data) - except: - self.log.exception( - 'Failed to preprocess RTF to convert unicode sequences, ignoring...') - return fname + # def preprocess(self, fname): + # self.log('\tPreprocessing to convert unicode characters') + # try: + # data = open(fname, 'rb').read() + # from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser + # tokenizer = RtfTokenizer(data) + # tokens = RtfTokenParser(tokenizer.tokens) + # data = tokens.toRTF() + # fname = 'preprocessed.rtf' + # with open(fname, 'wb') as f: + # f.write(data) + # except: + # self.log.exception( + # 'Failed to preprocess RTF to convert unicode sequences, ignoring...') + # return fname def convert_borders(self, doc): border_styles = [] @@ -269,17 +283,14 @@ class RTFInput(InputFormatPlugin): self.log = log self.log('Converting RTF to XML...') #Name of the preprocesssed RTF file - fname = self.preprocess(stream.name) + # fname = self.preprocess(stream.name) try: - xml = self.generate_xml(fname) + xml = self.generate_xml(stream.name) except RtfInvalidCodeException, e: + raise raise ValueError(_('This RTF file has a feature calibre does not ' 'support. Convert it to HTML first and then try it.\n%s')%e) - '''dataxml = open('dataxml.xml', 'w') - dataxml.write(xml) - dataxml.close''' - d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) if d: imap = {} diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 7b89407f79..cdd9a3d088 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -17,7 +17,8 @@ ######################################################################### # $Revision: 1.41 $ # $Date: 2006/03/24 23:50:07 $ -import sys,os +import sys, os + from calibre.ebooks.rtf2xml import headings_to_sections, \ line_endings, footnote, fields_small, default_encoding, \ make_lists, preamble_div, header, colors, group_borders, \ @@ -90,7 +91,6 @@ class ParseRtf: out_file = '', out_dir = None, dtd = '', - #debug = 0, #why? calibre deb_dir = None, convert_symbol = None, convert_wingdings = None, @@ -107,6 +107,7 @@ class ParseRtf: no_dtd = 0, char_data = '', ): + """ Requires: 'file' --file to parse @@ -119,12 +120,11 @@ class ParseRtf: script tries to output to directory where is script is exectued.) 'deb_dir' --debug directory. If a debug_dir is provided, the script will copy each run through as a file to examine in the debug_dir - 'perl_script'--use perl to make tokens. This runs just a bit faster. - (I will probably phase this out.) 'check_brackets' -- make sure the brackets match up after each run through a file. Only for debugging. Returns: Nothing """ + self.__file = in_file self.__out_file = out_file self.__out_dir = out_dir @@ -132,7 +132,7 @@ class ParseRtf: self.__dtd_path = dtd self.__check_file(in_file,"file_to_parse") self.__char_data = char_data - self.__debug_dir = deb_dir #self.__debug_dir = debug calibre + self.__debug_dir = deb_dir self.__check_dir(self.__temp_dir) self.__copy = self.__check_dir(self.__debug_dir) self.__convert_caps = convert_caps @@ -155,25 +155,24 @@ class ParseRtf: if hasattr(the_file, 'read'): return if the_file == None: if type == "file_to_parse": - message = "You must provide a file for the script to work" - msg = message + msg = "\nYou must provide a file for the script to work" raise RtfInvalidCodeException, msg elif os.path.exists(the_file): pass # do nothing else: - message = "The file '%s' cannot be found" % the_file - msg = message + msg = "\nThe file '%s' cannot be found" % the_file raise RtfInvalidCodeException, msg + def __check_dir(self, the_dir): """Check to see if directory exists""" if not the_dir : return dir_exists = os.path.isdir(the_dir) if not dir_exists: - message = "%s is not a directory" % the_dir - msg = message + msg = "\n%s is not a directory" % the_dir raise RtfInvalidCodeException, msg return 1 + def parse_rtf(self): """ Parse the file by calling on other classes. @@ -194,13 +193,14 @@ class ParseRtf: copy_obj.set_dir(self.__debug_dir) copy_obj.remove_files() copy_obj.copy_file(self.__temp_file, "original_file") - # new as of 2005-08-02. Do I want this? + # Function to check if bracket are well handled if self.__debug_dir or self.__run_level > 2: self.__check_brack_obj = check_brackets.CheckBrackets\ (file = self.__temp_file, bug_handler = RtfInvalidCodeException, ) - # convert Macintosh line endings to Unix line endings + #convert Macintosh and Windows line endings to Unix line endings + #why do this if you don't wb after? line_obj = line_endings.FixLineEndings( in_file = self.__temp_file, bug_handler = RtfInvalidCodeException, @@ -208,13 +208,13 @@ class ParseRtf: run_level = self.__run_level, replace_illegals = self.__replace_illegals, ) - return_value = line_obj.fix_endings() + return_value = line_obj.fix_endings() #calibre return what? self.__return_code(return_value) tokenize_obj = tokenize.Tokenize( bug_handler = RtfInvalidCodeException, in_file = self.__temp_file, copy = self.__copy, - run_level = self.__run_level,) + run_level = self.__run_level) tokenize_obj.tokenize() process_tokens_obj = process_tokens.ProcessTokens( in_file = self.__temp_file, @@ -230,12 +230,25 @@ class ParseRtf: os.remove(self.__temp_file) except OSError: pass + #Check to see if the file is correctly encoded + encode_obj = default_encoding.DefaultEncoding( + in_file = self.__temp_file, + run_level = self.__run_level, + bug_handler = RtfInvalidCodeException, + check_raw = True, + ) + platform, code_page, default_font_num = encode_obj.find_default_encoding() check_encoding_obj = check_encoding.CheckEncoding( - bug_handler = RtfInvalidCodeException, - ) - check_encoding_obj.check_encoding(self.__file) - sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8')) - raise InvalidRtfException, msg + bug_handler = RtfInvalidCodeException, + ) + enc = encode_obj.get_codepage() + if enc != 'mac_roman': + enc = 'cp' + enc + if check_encoding_obj.check_encoding(self.__file, enc): + file_name = self.__file if isinstance(self.__file, str) \ + else self.__file.encode('utf-8') + msg = 'File %s does not appear to be correctly encoded.\n' % file_name + raise InvalidRtfException, msg delete_info_obj = delete_info.DeleteInfo( in_file = self.__temp_file, copy = self.__copy, @@ -508,6 +521,7 @@ class ParseRtf: indent = self.__indent, run_level = self.__run_level, no_dtd = self.__no_dtd, + encoding = encode_obj.get_codepage(), bug_handler = RtfInvalidCodeException, ) tags_obj.convert_to_tags() @@ -520,35 +534,28 @@ class ParseRtf: output_obj.output() os.remove(self.__temp_file) return self.__exit_level + def __bracket_match(self, file_name): if self.__run_level > 2: good_br, msg = self.__check_brack_obj.check_brackets() if good_br: pass - # sys.stderr.write( msg + ' in ' + file_name + "\n") + #sys.stderr.write( msg + ' in ' + file_name + "\n") else: - msg += msg + " in file '" + file_name + "'\n" + msg = '%s in file %s\n' % (msg, file_name) raise RtfInvalidCodeException, msg + def __return_code(self, num): - if num == None: - return - if int(num) > self.__exit_level: - self.__exit_level = num + if num == None: + return + if int(num) > self.__exit_level: + self.__exit_level = num + def __make_temp_file(self,file): """Make a temporary file to parse""" write_file="rtf_write_file" read_obj = file if hasattr(file, 'read') else open(file,'r') - write_obj = open(write_file, 'w') - line = "dummy" - while line: - line = read_obj.read(1000) - write_obj.write(line ) - write_obj.close() + with open(write_file, 'wb') as write_obj: + for line in read_obj: + write_obj.write(line) return write_file - """ -mi1\n -mi33\n -mi' % info) + def __empty_func(self, line): """ Print out empty tag and newlines when needed. @@ -85,10 +96,11 @@ class ConvertToTags: self.__write_new_line() if info in self.__two_new_line: self.__write_extra_new_line() + def __open_att_func(self, line): """ Process lines for open tags that have attributes. - The important infor is between [17:-1]. Take this info and split it + The important info is between [17:-1]. Take this info and split it with the delimeter '<'. The first token in this group is the element name. The rest are attributes, separated fromt their values by '>'. So read each token one at a time, and split them by '>'. @@ -119,6 +131,7 @@ class ConvertToTags: self.__write_new_line() if element_name in self.__two_new_line: self.__write_extra_new_line() + def __empty_att_func(self, line): """ Same as the __open_att_func, except a '/' is placed at the end of the tag. @@ -143,6 +156,7 @@ class ConvertToTags: self.__write_new_line() if element_name in self.__two_new_line: self.__write_extra_new_line() + def __close_func(self, line): """ Print out the closed tag and new lines, if appropriate. @@ -156,6 +170,7 @@ class ConvertToTags: self.__write_new_line() if info in self.__two_new_line: self.__write_extra_new_line() + def __text_func(self, line): """ Simply print out the information between [17:-1] @@ -163,6 +178,7 @@ class ConvertToTags: #tx') + #keep maximum compatibility with previous version + check_encoding_obj = check_encoding.CheckEncoding( + bug_handler=self.__bug_handler) + + if not check_encoding_obj.check_encoding(self.__file, verbose=False): + self.__write_obj.write('') + elif not check_encoding_obj.check_encoding(self.__file, self.__encoding): + self.__write_obj.write('' % self.__encoding) + else: + self.__write_obj.write('') + sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and' + ' hope for the best') self.__new_line = 0 self.__write_new_line() if self.__no_dtd: @@ -207,6 +237,7 @@ class ConvertToTags: ) self.__new_line = 0 self.__write_new_line() + def convert_to_tags(self): """ Read in the file one line at a time. Get the important info, between @@ -222,18 +253,14 @@ class ConvertToTags: an empty tag function. """ self.__initiate_values() - read_obj = open(self.__file, 'r') self.__write_obj = open(self.__write_to, 'w') self.__write_dec() - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - action = self.__state_dict.get(self.__token_info) - if action != None: - action(line) - read_obj.close() + with open(self.__file, 'r') as read_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__state_dict.get(self.__token_info) + if action is not None: + action(line) self.__write_obj.close() copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: diff --git a/src/calibre/ebooks/rtf2xml/copy.py b/src/calibre/ebooks/rtf2xml/copy.py index ff029c1841..1b620b9fbf 100755 --- a/src/calibre/ebooks/rtf2xml/copy.py +++ b/src/calibre/ebooks/rtf2xml/copy.py @@ -23,6 +23,7 @@ class Copy: def __init__(self, bug_handler, file = None, deb_dir = None, ): self.__file = file self.__bug_handler = bug_handler + def set_dir(self, deb_dir): """Set the temporary directory to write files to""" if deb_dir is None: @@ -33,19 +34,11 @@ class Copy: message = "%(deb_dir)s is not a directory" % vars() raise self.__bug_handler , message Copy.__dir = deb_dir + def remove_files(self ): """Remove files from directory""" self.__remove_the_files(Copy.__dir) - """ - list_of_files = os.listdir(Copy.__dir) - list_of_files = os.listdir(the_dir) - for file in list_of_files: - rem_file = os.path.join(Copy.__dir,file) - if os.path.isdir(rem_file): - self.remove_files(rem_file) - else: - os.remove(rem_file) - """ + def __remove_the_files(self, the_dir): """Remove files from directory""" list_of_files = os.listdir(the_dir) @@ -58,6 +51,7 @@ class Copy: os.remove(rem_file) except OSError: pass + def copy_file(self, file, new_file): """ Copy the file to a new name diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index b932b465d0..53887e0d90 100755 --- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -1,61 +1,142 @@ ######################################################################### # # -# # # copyright 2002 Paul Henry Tremblay # # # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # -# General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, write to the Free Software # -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA # -# 02111-1307 USA # -# # -# # ######################################################################### + +''' +Codepages as to RTF 1.9.1: + 437 United States IBM + 708 Arabic (ASMO 708) + 709 Arabic (ASMO 449+, BCON V4) + 710 Arabic (transparent Arabic) + 711 Arabic (Nafitha Enhanced) + 720 Arabic (transparent ASMO) + 819 Windows 3.1 (United States and Western Europe) + 850 IBM multilingual + 852 Eastern European + 860 Portuguese + 862 Hebrew + 863 French Canadian + 864 Arabic + 865 Norwegian + 866 Soviet Union + 874 Thai + 932 Japanese + 936 Simplified Chinese + 949 Korean + 950 Traditional Chinese + 1250 Eastern European + 1251 Cyrillic + 1252 Western European + 1253 Greek + 1254 Turkish + 1255 Hebrew + 1256 Arabic + 1257 Baltic + 1258 Vietnamese + 1361 Johab + 10000 MAC Roman + 10001 MAC Japan + 10004 MAC Arabic + 10005 MAC Hebrew + 10006 MAC Greek + 10007 MAC Cyrillic + 10029 MAC Latin2 + 10081 MAC Turkish + 57002 Devanagari + 57003 Bengali + 57004 Tamil + 57005 Telugu + 57006 Assamese + 57007 Oriya + 57008 Kannada + 57009 Malayalam + 57010 Gujarati + 57011 Punjabi +''' +import re + class DefaultEncoding: """ Find the default encoding for the doc """ - def __init__(self, in_file, bug_handler, run_level = 1,): - """ - Required: - 'file' - Returns: - nothing - """ + def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False): self.__file = in_file self.__bug_handler = bug_handler + self.__platform = 'Windows' + self.__default_num = 'not-defined' + self.__code_page = '1252' + self.__datafetched = False + self.__fetchraw = check_raw + def find_default_encoding(self): - platform = 'Windows' - default_num = 'not-defined' - code_page = 'ansicpg1252' - read_obj = open(self.__file, 'r') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - if self.__token_info == 'mi 3: msg = 'flag problem\n' raise self.__bug_handler, msg - return 1 + return True elif self.__token_info in self.__allowable : if self.__ob: self.__write_obj.write(self.__ob) @@ -132,85 +138,81 @@ class DeleteInfo: self.__state = 'default' else: pass - return 1 + return True elif self.__token_info == 'cw 5: - msg = 'After an asterisk, and found neither an allowable or non-allowble token\n' - msg += 'token is "%s"\n' % self.__token_info - raise self.__bug_handler + msg = ('After an asterisk, and found neither an allowable or non-allowable token\n\ + token is "%s"\n') % self.__token_info + raise self.__bug_handler, msg if not self.__ob: - self.__write_cb = 1 + self.__write_cb = True self.__ob = 0 self.__state = 'delete' self.__cb_count = 0 - return 0 + return False + def __found_list_func(self, line): """ print out control words in this group """ self.__state = 'list' + def __list_func(self, line): """ Check to see if the group has ended. - Return 1 for all control words. - Return 0 otherwise. + Return True for all control words. + Return False otherwise. """ if self.__delete_count == self.__cb_count and self.__token_info ==\ 'cb%s\n' % self.__footnote_count) self.__first_line = 0 + def __in_footnote_func(self, line): """Handle all tokens that are part of footnote""" if self.__first_line: @@ -68,6 +72,7 @@ class Footnote: 'mi ci - 'annotation' : 'annotation', + 'annotation' : 'annotation', 'blue______' : 'blue', 'bold______' : 'bold', - 'caps______' : 'caps', - 'char-style' : 'character-style', - 'dbl-strike' : 'double-strike-through', + 'caps______' : 'caps', + 'char-style' : 'character-style', + 'dbl-strike' : 'double-strike-through', 'emboss____' : 'emboss', 'engrave___' : 'engrave', 'font-color' : 'font-color', @@ -96,7 +97,7 @@ class Inline: 'font-size_' : 'font-size', 'font-style' : 'font-style', 'font-up___' : 'superscript', - 'footnot-mk' : 'footnote-marker', + 'footnot-mk' : 'footnote-marker', 'green_____' : 'green', 'hidden____' : 'hidden', 'italics___' : 'italics', @@ -107,9 +108,10 @@ class Inline: 'strike-thr' : 'strike-through', 'subscript_' : 'subscript', 'superscrip' : 'superscript', - 'underlined' : 'underlined', + 'underlined' : 'underlined', } self.__caps_list = ['false'] + def __set_list_func(self, line): """ Requires: @@ -128,6 +130,7 @@ class Inline: self.__place = 'in_list' self.__inline_list = self.__list_inline_list self.__groups_in_waiting = self.__groups_in_waiting_list + def __default_func(self, line): """ Requires: @@ -140,8 +143,8 @@ class Inline: action = self.__default_dict.get(self.__token_info) if action: action(line) - if self.__token_info != 'cw%s' % (the_key, the_dict[the_key])) self.__write_obj.write('\n') self.__groups_in_waiting[0] = 0 + def __end_para_func(self, line): """ Requires: @@ -342,6 +346,7 @@ class Inline: self.__write_obj.write('mi%s' % (the_key, the_dict[the_key])) self.__write_obj.write('\n') self.__groups_in_waiting[0] = 0 + def __found_field_func(self, line): """ Just a default function to make sure I don't prematurely exit default state """ pass + def form_tags(self): """ Requires: @@ -386,32 +393,27 @@ class Inline: the state. """ self.__initiate_values() - read_obj = open(self.__file, 'r') - self.__write_obj = open(self.__write_to, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - token = line[0:-1] - self.__token_info = '' - if token == 'tx 1: - sys.stderr.write('Removing files from old pict directory...\n') - all_files = os.listdir(self.__dir_name) - for the_file in all_files: - the_file = os.path.join(self.__dir_name, the_file) - try: - os.remove(the_file) - except OSError: - pass - if self.__run_level > 1: - sys.stderr.write('Files removed.\n') + if self.__run_level > 1: + sys.stderr.write('Removing files from old pict directory...\n') + all_files = os.listdir(self.__dir_name) + for the_file in all_files: + the_file = os.path.join(self.__dir_name, the_file) + try: + os.remove(the_file) + except OSError: + pass + if self.__run_level > 1: + sys.stderr.write('Files removed.\n') def __create_pict_file(self): """Create a file for all the pict data to be written to. """ self.__pict_file = os.path.join(self.__dir_name, 'picts.rtf') - write_pic_obj = open(self.__pict_file, 'w') - write_pic_obj.close() self.__write_pic_obj = open(self.__pict_file, 'a') def __in_pict_func(self, line): if self.__cb_count == self.__pict_br_count: - self.__in_pict = 0 + self.__in_pict = False self.__write_pic_obj.write("}\n") - return 1 + return True else: action = self.__pict_dict.get(self.__token_info) if action: - line = action(line) - self.__write_pic_obj.write(line) - return 0 + self.__write_pic_obj.write(action(line)) + return False def __default(self, line, write_obj): """Determine if each token marks the beginning of pict data. @@ -142,53 +128,50 @@ class Pict: write_obj.write('mi ml '*' : ('ml', 'asterisk__', self.default_func), ':' : ('ml', 'colon_____', self.default_func), @@ -73,7 +78,6 @@ class ProcessTokens: 'backslash' : ('nu', '\\', self.text_func), 'ob' : ('nu', '{', self.text_func), 'cb' : ('nu', '}', self.text_func), - 'line' : ('nu', 'hard-lineb', self.default_func), #calibre #'line' : ('nu', ' ', self.text_func), calibre # paragraph formatting => pf 'page' : ('pf', 'page-break', self.default_func), @@ -159,15 +163,17 @@ class ProcessTokens: 'rtf' : ('ri', 'rtf_______', self.default_func), 'deff' : ('ri', 'deflt-font', self.default_func), 'mac' : ('ri', 'macintosh_', self.default_func), + 'pc' : ('ri', 'pc________', self.default_func), + 'pca' : ('ri', 'pca_______', self.default_func), 'ansi' : ('ri', 'ansi______', self.default_func), 'ansicpg' : ('ri', 'ansi-codpg', self.default_func), # notes => nt 'footnote' : ('nt', 'footnote__', self.default_func), 'ftnalt' : ('nt', 'type______ an - 'tc' : ('an', 'toc_______', self.default_func), + 'tc' : ('an', 'toc_______', self.default_func), 'bkmkstt' : ('an', 'book-mk-st', self.default_func), - 'bkmkstart' : ('an', 'book-mk-st', self.default_func), + 'bkmkstart' : ('an', 'book-mk-st', self.default_func), 'bkmkend' : ('an', 'book-mk-en', self.default_func), 'xe' : ('an', 'index-mark', self.default_func), 'rxe' : ('an', 'place_____', self.default_func), @@ -347,7 +353,7 @@ class ProcessTokens: 10: 'Kanji numbering without the digit character', 11: 'Kanji numbering with the digit character', 1246: 'phonetic Katakana characters in aiueo order', - 1346: 'phonetic katakana characters in iroha order', + 1346: 'phonetic katakana characters in iroha order', 14: 'double byte character', 15: 'single byte character', 16: 'Kanji numbering 3', @@ -392,7 +398,7 @@ class ProcessTokens: 5121 : 'Arabic Algeria', 15361 : 'Arabic Bahrain', 3073 : 'Arabic Egypt', - 1 : 'Arabic General', + 1 : 'Arabic General', 2049 : 'Arabic Iraq', 11265 : 'Arabic Jordan', 13313 : 'Arabic Kuwait', @@ -417,7 +423,7 @@ class ProcessTokens: 1059 : 'Byelorussian', 1027 : 'Catalan', 2052 : 'Chinese China', - 4 : 'Chinese General', + 4 : 'Chinese General', 3076 : 'Chinese Hong Kong', 4100 : 'Chinese Singapore', 1028 : 'Chinese Taiwan', @@ -431,7 +437,7 @@ class ProcessTokens: 2057 : 'English British', 4105 : 'English Canada', 9225 : 'English Caribbean', - 9 : 'English General', + 9 : 'English General', 6153 : 'English Ireland', 8201 : 'English Jamaica', 5129 : 'English New Zealand', @@ -595,30 +601,37 @@ class ProcessTokens: num = num[1:] # chop off leading 0, which I added num = num.upper() # the mappings store hex in caps return 'tx 3: - msg = 'number "%s" cannot be converted to integer\n' % num + msg = 'Number "%s" cannot be converted to integer\n' % num raise self.__bug_handler, msg type = self.__number_type_dict.get(num) - if type == None: + if type is None: if self.__run_level > 3: msg = 'No type for "%s" in self.__number_type_dict\n' raise self.__bug_handler type = 'Arabic' return 'cw<%s<%snum<%s\n' % (token, num) + def divide_by_2(self, pre, token, num): num = self.divide_num(num, 2) return 'cw<%s<%s%s<%s\n' % (token, num, token) + def divide_by_20(self, pre, token, num): num = self.divide_num(num, 20) return 'cw<%s<%s%s<%s\n' % (token, num, token) + def text_func(self, pre, token, num=None): return 'tx%s<%s\n' % (third_field, token, num, token) + def bool_st_func(self, pre, token, num): if num is None or num == '' or num == '1': return 'cw<%s<%sfalse<%s\n' % (token, token) else: - msg = 'boolean should have some value module process tokens\n' - msg += 'token is ' + token + "\n" - msg += "'" + num + "'" + "\n" + msg = "boolean should have some value module process tokens\ntoken is %s\n'%s'\n" % (token, num) raise self.__bug_handler, msg + def __no_sup_sub_func(self, pre, token, num): the_string = 'cw 3: - msg = 'no number to process?\n' - msg += 'this indicates that the token ' - msg += ' \(\\li\) should have a number and does not\n' - msg += 'numerator is "%s"\n' % numerator - msg += 'denominator is "%s"\n' % denominator + msg = ('No number to process?\nthis indicates that the token \(\\li\) \ + should have a number and does not\nnumerator is \ + "%s"\ndenominator is "%s"\n') % (numerator, denominator) raise self.__bug_handler, msg if 5 > self.__return_code: self.__return_code = 5 @@ -698,9 +716,10 @@ class ProcessTokens: if string_num[-2:] == ".0": string_num = string_num[:-2] return string_num + def split_let_num(self, token): match_obj = re.search(self.__num_exp,token) - if match_obj != None: + if match_obj is not None: first = match_obj.group(1) second = match_obj.group(2) if not second: @@ -714,6 +733,7 @@ class ProcessTokens: raise self.__bug_handler return token, 0 return first, second + def convert_to_hex(self,number): """Convert a string to uppercase hexidecimal""" num = int(number) @@ -722,6 +742,7 @@ class ProcessTokens: return hex_num except: raise self.__bug_handler + def process_cw(self, token): """Change the value of the control word by determining what dictionary it belongs to""" @@ -737,89 +758,62 @@ class ProcessTokens: pre, token, action = self.dict_token.get(token, (None, None, None)) if action: return action(pre, token, num) - # unused function - def initiate_token_actions(self): - self.action_for_token={ - '{' : self.ob_func, - '}' : self.cb_func, - '\\' : self.process_cw, - } - # unused function - def evaluate_token(self,token): - """Evaluate tokens. Return a value if the token is not a - control word. Otherwise, pass token onto another method - for further evaluation.""" - token, action = self.dict_token.get(token[0:1]) - if action: - line = action(token) - return line - else : - return 'tx -1: - msg ='Invalid RTF: token "\\ " not valid. \n' - raise self.__exception_handler, msg - elif token[0:1] == "\\": - line = self.process_cw(token) - if line != None: - write_obj.write(line) - else: - fields = re.split(self.__utf_exp, token) - for field in fields: - if not field: - continue - if field[0:1] == '&': - write_obj.write('tx -1: + msg = 'Invalid RTF: token "\\ " not valid.\n' + raise self.__exception_handler, msg + elif token[:1] == "\\": + try: + token.decode('us-ascii') + except UnicodeError, msg: + msg = 'Invalid RTF: Tokens not ascii encoded.\n%s' % str(msg) + raise self.__exception_handler, msg + line = self.process_cw(token) + if line is not None: + write_obj.write(line) else: - write_obj.write('tx", ">") - line = line.replace("\\~", "\\~ ") - line = line.replace("\\_", "\\_ ") - line = line.replace("\\:", "\\: ") - line = line.replace("\\-", "\\- ") - # turn into a generic token to eliminate special - # cases and make processing easier - line = line.replace("\\{", "\\ob ") - # turn into a generic token to eliminate special - # cases and make processing easier - line = line.replace("\\}", "\\cb ") - # put a backslash in front of to eliminate special cases and - # make processing easier - line = line.replace("{", "\\{") - # put a backslash in front of to eliminate special cases and - # make processing easier - line = line.replace("}", "\\}") - line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line) - # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line) - line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line) - ##line = line.replace("\\backslash", "\\\\") - # this is for older RTF - line = re.sub(self.__par_exp, '\\par ', line) - return line - def __compile_expressions(self): - self.__ms_hex_exp = re.compile(r"\\\'(..)") - self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}") - self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)") - self.__par_exp = re.compile(r'\\$') - self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)") - ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)") - def __create_tokens(self): self.__compile_expressions() - read_obj = open(self.__file, 'r') - write_obj = open(self.__write_to, 'w') - line_to_read = "dummy" - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - line = line.replace("\n", "") - line = self.__sub_line_reg(line) - tokens = re.split(self.__splitexp, line) - ##print tokens - for token in tokens: - if token != "": - write_obj.write(token + "\n") - """ - match_obj = re.search(self.__mixed_exp, token) - if match_obj != None: - first = match_obj.group(1) - second = match_obj.group(2) - write_obj.write(first + "\n") - write_obj.write(second + "\n") - else: - write_obj.write(token + "\n") - """ - read_obj.close() - write_obj.close() + #variables + self.__uc_char = 0 + self.__uc_bin = False + self.__uc_value = [1] + + def __reini_utf8_counters(self): + self.__uc_char = 0 + self.__uc_bin = False + + def __remove_uc_chars(self, startchar, token): + for i in xrange(startchar, len(token)): + if token[i] == " ": + continue + elif self.__uc_char: + self.__uc_char -= 1 + else: + return token[i:] + #if only " " and char to skip + return '' + + def __unicode_process(self, token): + #change scope in + if token == '\{': + self.__uc_value.append(self.__uc_value[-1]) + #basic error handling + self.__reini_utf8_counters() + return token + #change scope out + elif token == '\}': + self.__uc_value.pop() + self.__reini_utf8_counters() + return token + #add a uc control + elif token[:3] == '\uc': + self.__uc_value[-1] = int(token[3:]) + self.__reini_utf8_counters() + return token + #bin data to slip + elif self.__uc_bin: + self.__uc_bin = False + return '' + #uc char to remove + elif self.__uc_char: + #handle \bin tag in case of uc char to skip + if token[:4] == '\bin': + self.__uc_char -=1 + self.__uc_bin = True + return '' + elif token[:1] == "\\" : + self.__uc_char -=1 + return '' + else: + return self.__remove_uc_chars(0, token) + #go for real \u token + match_obj = self.__utf_exp.match(token) + if match_obj is not None: + self.__reini_utf8_counters() + #get value and handle negative case + uni_char = int(match_obj.group(1)) + uni_len = len(match_obj.group(1)) + 2 + if uni_char < 0: + uni_char += 65536 + uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace') + self.__uc_char = self.__uc_value[-1] + #there is only an unicode char + if len(token)<= uni_len: + return uni_char + #an unicode char and something else + #must be after as it is splited on \ + #necessary? maybe for \bin? + elif not self.__uc_char: + return uni_char + token[uni_len:] + #if not uc0 and chars + else: + return uni_char + self.__remove_uc_chars(uni_len, token) + #default + return token + + def __sub_reg_split(self,input_file): + input_file = self.__replace_spchar.mreplace(input_file) + input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file) + input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file) + #remove \n in bin data + input_file = self.__bin_exp.sub(lambda x: \ + x.group().replace('\n', '') + '\n', input_file) + #split + tokens = re.split(self.__splitexp, input_file) + #remove empty tokens and \n + return filter(lambda x: len(x) > 0 and x != '\n', tokens) + #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file) + # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line) + # this is for older RTF + #line = re.sub(self.__par_exp, '\\par ', line) + #return filter(lambda x: len(x) > 0, \ + #(self.__remove_line.sub('', x) for x in tokens)) + + def __compile_expressions(self): + SIMPLE_RPL = { + "\\\\": "\\backslash ", + "\\~": "\\~ ", + "\\;": "\\; ", + "&": "&", + "<": "<", + ">": ">", + "\\~": "\\~ ", + "\\_": "\\_ ", + "\\:": "\\: ", + "\\-": "\\- ", + # turn into a generic token to eliminate special + # cases and make processing easier + "\\{": "\\ob ", + # turn into a generic token to eliminate special + # cases and make processing easier + "\\}": "\\cb ", + # put a backslash in front of to eliminate special cases and + # make processing easier + "{": "\\{", + # put a backslash in front of to eliminate special cases and + # make processing easier + "}": "\\}", + # this is for older RTF + r'\\$': '\\par ', + } + self.__replace_spchar = MReplace(SIMPLE_RPL) + #add ;? in case of char following \u + self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)" + self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?") + self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+") + #manage upr/ud situations + self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" + \ + r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}") + #add \n in split for whole file reading + #why keep backslash whereas \is replaced before? + #remove \n from endline char + self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)") + #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}") + #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})") + #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)") + #self.__par_exp = re.compile(r'\\$') + #self.__remove_line = re.compile(r'\n+') + #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)") + ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)") + def tokenize(self): - """Main class for handling other methods. Reads in one line \ - at a time, usues method self.sub_line to make basic substitutions,\ - uses ? to process tokens""" - self.__create_tokens() + """Main class for handling other methods. Reads the file \ + , uses method self.sub_reg to make basic substitutions,\ + and process tokens by itself""" + #read + with open(self.__file, 'r') as read_obj: + input_file = read_obj.read() + + #process simple replacements and split giving us a correct list + #remove '' and \n in the process + tokens = self.__sub_reg_split(input_file) + #correct unicode + tokens = map(self.__unicode_process, tokens) + #remove empty items created by removing \uc + tokens = filter(lambda x: len(x) > 0, tokens) + + #write + with open(self.__write_to, 'wb') as write_obj: + write_obj.write('\n'.join(tokens)) + #Move and copy copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "tokenize.data") copy_obj.rename(self.__write_to, self.__file) os.remove(self.__write_to) + + #self.__special_tokens = [ '_', '~', "'", '{', '}' ] \ No newline at end of file