diff --git a/resources/templates/rtf.xsl b/resources/templates/rtf.xsl index 6db1c0388d..0f91d7f4ac 100644 --- a/resources/templates/rtf.xsl +++ b/resources/templates/rtf.xsl @@ -98,7 +98,7 @@ - + @@ -220,7 +220,7 @@ - + padding-left: @@ -256,15 +256,15 @@ ; - + text-decoration:underline ; - + text-align: justify; @@ -314,7 +314,6 @@ - @@ -452,6 +451,15 @@ + + + + + + + + + diff --git a/src/calibre/ebooks/metadata/rtf.py b/src/calibre/ebooks/metadata/rtf.py index c20d880a2f..70f6effcda 100644 --- a/src/calibre/ebooks/metadata/rtf.py +++ b/src/calibre/ebooks/metadata/rtf.py @@ -93,7 +93,7 @@ def get_metadata(stream): stream.seek(0) cpg = detect_codepage(stream) stream.seek(0) - + title_match = title_pat.search(block) if title_match is not None: title = decode(title_match.group(1).strip(), cpg) @@ -162,7 +162,8 @@ def set_metadata(stream, options): index = src.rindex('}') return src[:index] + r'{\ '[:-1] + name + ' ' + val + '}}' src, pos = get_document_info(stream) - if not src: + print 'I was thre' + if src is not None: create_metadata(stream, options) else: olen = len(src) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index f08aa76605..c1e649851b 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -41,7 +41,7 @@ border_style_map = { class InlineClass(etree.XSLTExtension): - FMTS = ('italics', 'bold', 'underlined', 'strike-through', 'small-caps') + FMTS = ('italics', 'bold', 'strike-through', 'small-caps') def __init__(self, log): etree.XSLTExtension.__init__(self) @@ -54,6 +54,9 @@ class InlineClass(etree.XSLTExtension): for x in self.FMTS: if input_node.get(x, None) == 'true': classes.append(x) + #underlined is special + if input_node.get('underlined', 'false') != 'false': + classes.append('underlined') fs = input_node.get('font-size', False) if fs: if fs not in self.font_sizes: @@ -78,12 +81,13 @@ class RTFInput(InputFormatPlugin): def generate_xml(self, stream): from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf ofile = 'dataxml.xml' - run_lev, debug_dir = 1, None + run_lev, debug_dir, indent_out = 1, None, 0 if getattr(self.opts, 'debug_pipeline', None) is not None: try: - os.mkdir(debug_dir) + os.mkdir('rtfdebug') debug_dir = 'rtfdebug' run_lev = 4 + indent_out = 1 self.log('Running RTFParser in debug mode') except: self.log.warn('Impossible to run RTFParser in debug mode') @@ -108,7 +112,7 @@ class RTFInput(InputFormatPlugin): # Indent resulting XML. # Default is 0 (no indent). - indent = 1, + indent = indent_out, # Form lists from RTF. Default is 1. form_lists = 1, @@ -157,7 +161,8 @@ class RTFInput(InputFormatPlugin): with open(name, 'wb') as f: f.write(data) imap[count] = name - #open(name+'.hex', 'wb').write(enc) + # with open(name+'.hex', 'wb') as f: + # f.write(enc) return self.convert_images(imap) def convert_images(self, imap): @@ -319,4 +324,6 @@ class RTFInput(InputFormatPlugin): opf.render(open('metadata.opf', 'wb')) return os.path.abspath('metadata.opf') - +#ebook-convert "bad.rtf" test.epub -v -d "E:\Mes eBooks\Developpement\debug" +# os.makedirs("E:\\Mes eBooks\\Developpement\\rtfdebug") +# debug_dir = "E:\\Mes eBooks\\Developpement\\rtfdebug" diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 9ba282129f..e442a1c496 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -372,17 +372,17 @@ class ParseRtf: old_rtf = old_rtf_obj.check_if_old_rtf() if old_rtf: if self.__run_level > 5: - msg = 'older RTF\n' + msg = 'Older RTF\n' msg += 'self.__run_level is "%s"\n' % self.__run_level raise RtfInvalidCodeException, msg if self.__run_level > 1: - sys.stderr.write('File could be older RTF...\n') + sys.stderr.write(_('File could be older RTF...\n')) if found_destination: if self.__run_level > 1: - sys.stderr.write( + sys.stderr.write(_( 'File also has newer RTF.\n' 'Will do the best to convert.\n' - ) + )) add_brackets_obj = add_brackets.AddBrackets( in_file = self.__temp_file, bug_handler = RtfInvalidCodeException, diff --git a/src/calibre/ebooks/rtf2xml/check_brackets.py b/src/calibre/ebooks/rtf2xml/check_brackets.py index 35c7ede435..3b353a65c0 100755 --- a/src/calibre/ebooks/rtf2xml/check_brackets.py +++ b/src/calibre/ebooks/rtf2xml/check_brackets.py @@ -53,4 +53,3 @@ class CheckBrackets: 'total number of brackets is %s') % self.__bracket_count return (False, msg) return (True, "Brackets match!") - diff --git a/src/calibre/ebooks/rtf2xml/configure_txt.py b/src/calibre/ebooks/rtf2xml/configure_txt.py index cd4c2558b7..27f06d0d19 100755 --- a/src/calibre/ebooks/rtf2xml/configure_txt.py +++ b/src/calibre/ebooks/rtf2xml/configure_txt.py @@ -25,7 +25,7 @@ class Configure: if self.__show_config_file and self.__configuration_file: sys.stderr.write('configuration file is "%s"\n' % self.__configuration_file) if self.__show_config_file and not self.__configuration_file: - sys.stderr.write('No configuraiton file found; using default vaules\n') + sys.stderr.write('No configuraiton file found; using default values\n') if self.__configuration_file: read_obj = open(self.__configuration_file, 'r') line_to_read = 1 diff --git a/src/calibre/ebooks/rtf2xml/delete_info.py b/src/calibre/ebooks/rtf2xml/delete_info.py index d508887d01..74e6b2aba3 100755 --- a/src/calibre/ebooks/rtf2xml/delete_info.py +++ b/src/calibre/ebooks/rtf2xml/delete_info.py @@ -43,6 +43,7 @@ class DeleteInfo: 'cw%s' '%snone\n' % (type, my_string)) return my_changed_string + def __found_toc_index_func(self, line, tag): """ Requires: @@ -377,6 +392,7 @@ file. self.__cb_count = 0 self.__state = 'toc_index' self.__tag = tag + def __toc_index_func(self, line): """ Requires: @@ -404,6 +420,7 @@ file. self.__write_obj.write(line) else: self.__text_string += line + def fix_fields(self): """ Requires: @@ -418,24 +435,19 @@ file. bookmark. """ self.__initiate_values() - read_obj = open(self.__file) - self.__write_obj = open(self.__write_to, 'w') - line_to_read = '1' - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - if self.__token_info == 'ob 1: self.__caps_list.pop() else: - sys.stderr.write('Module is hex_2_utf8\n') - sys.stderr.write('method is __end_caps_func\n') - sys.stderr.write('caps list should be more than one?\n') #self.__in_caps not set + sys.stderr.write('Module is hex_2_utf8\n' + 'method is __end_caps_func\n' + 'caps list should be more than one?\n') #self.__in_caps not set def __text_func(self, line): """ @@ -486,8 +482,7 @@ class Hex2Utf8: hex_num = '\'%s' % hex_num converted = self.__current_dict.get(hex_num) if converted is None: - sys.stderr.write('module is hex_2_ut8\n') - sys.stderr.write('method is __text_func\n') + sys.stderr.write('module is hex_2_ut8\nmethod is __text_func\n') sys.stderr.write('no hex value for "%s"\n' % hex_num) else: the_string += converted @@ -543,16 +538,15 @@ class Hex2Utf8: def __convert_body(self): self.__state = 'body' with open(self.__file, 'r') as read_obj: - self.__write_obj = open(self.__write_to, 'w') - for line in read_obj: - self.__token_info = line[:16] - action = self.__body_state_dict.get(self.__state) - if action is None: - sys.stderr.write('error no state found in hex_2_utf8', - self.__state - ) - action(line) - self.__write_obj.close() + with open(self.__write_to, 'w') as self.__write_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__body_state_dict.get(self.__state) + if action is None: + sys.stderr.write('error no state found in hex_2_utf8', + self.__state + ) + action(line) copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "body_utf_convert.data") diff --git a/src/calibre/ebooks/rtf2xml/info.py b/src/calibre/ebooks/rtf2xml/info.py index e9c9d5c38d..8eabe25c18 100755 --- a/src/calibre/ebooks/rtf2xml/info.py +++ b/src/calibre/ebooks/rtf2xml/info.py @@ -68,7 +68,6 @@ class Info: 'cw tb + # underline + # can't see why it isn't a char info: 'ul'=>'ci' + 'ul' : ('ci', 'underlined tb 'trowd' : ('tb', 'row-def___', self.default_func), 'cell' : ('tb', 'cell______', self.default_func), 'row' : ('tb', 'row_______', self.default_func), @@ -274,25 +294,6 @@ class ProcessTokens: 'paperh' : ('pa', 'paper-hght', self.divide_by_20), # annotation => an 'annotation' : ('an', 'annotation', self.default_func), - # underline - 'ul' : ('ul', 'underlined bd 'trbrdrh' : ('bd', 'bor-t-r-hi', self.default_func), 'trbrdrv' : ('bd', 'bor-t-r-vi', self.default_func), @@ -757,7 +758,7 @@ class ProcessTokens: def process_cw(self, token): """Change the value of the control word by determining what dictionary it belongs to""" - special = [ '*', ':', '}', '{', '~', '_', '-', ';' ] + special = [ '*', ':', '}', '{', '~', '_', '-', ';' ] ##if token != "{" or token != "}": token = token[1:] # strip off leading \ token = token.replace(" ", "") @@ -793,7 +794,7 @@ class ProcessTokens: raise self.__exception_handler, msg the_index = token.find('\\ ') - if token is not None and the_index > -1: + if token is not None and the_index > -1: msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\ % line_count raise self.__exception_handler, msg @@ -832,4 +833,4 @@ class ProcessTokens: msg = '\nInvalid RTF: document does not have matching brackets.\n' raise self.__exception_handler, msg else: - return self.__return_code + return self.__return_code \ No newline at end of file diff --git a/src/calibre/ebooks/rtf2xml/sections.py b/src/calibre/ebooks/rtf2xml/sections.py index 13bf2c2ddc..a315729525 100755 --- a/src/calibre/ebooks/rtf2xml/sections.py +++ b/src/calibre/ebooks/rtf2xml/sections.py @@ -496,7 +496,7 @@ Instead, ingore all section information in a field-block. self.__token_info = line[:16] action = self.__state_dict.get(self.__state) if action == None: - sys.stderr.write('no no matching state in module sections.py\n') + sys.stderr.write('no matching state in module sections.py\n') sys.stderr.write(self.__state + '\n') action(line) read_obj.close() diff --git a/src/calibre/ebooks/rtf2xml/styles.py b/src/calibre/ebooks/rtf2xml/styles.py index 55f86e4208..7fcbfb24a3 100755 --- a/src/calibre/ebooks/rtf2xml/styles.py +++ b/src/calibre/ebooks/rtf2xml/styles.py @@ -103,8 +103,6 @@ class Styles: 'sect-note_' : 'endnotes-in-section', # list=> ls 'list-text_' : 'list-text', - # this line must be wrong because it duplicates an earlier one - 'list-text_' : 'list-text', 'list______' : 'list', 'list-lev-d' : 'list-level-definition', 'list-cardi' : 'list-cardinal-numbering', diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py index deac793111..10d3fbba6f 100755 --- a/src/calibre/ebooks/rtf2xml/tokenize.py +++ b/src/calibre/ebooks/rtf2xml/tokenize.py @@ -114,6 +114,7 @@ class Tokenize: # this is for older RTF input_file = self.__par_exp.sub('\n\\par \n', input_file) input_file = self.__cwdigit_exp.sub("\g<1>\n\g<2>", input_file) + input_file = self.__cs_ast.sub("\g<1>", input_file) input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file) input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file) #remove \n in bin data @@ -163,6 +164,8 @@ class Tokenize: self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)") #this is for old RTF self.__par_exp = re.compile(r'(\\\n+|\\ )') + #handle improper cs char-style with \* before without { + self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)') #handle cw using a digit as argument and without space as delimiter self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)") diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 4037ee1be7..4cff648fa5 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -12,6 +12,7 @@ import os, re from calibre import prepare_string_for_xml, isbytestring from calibre.ebooks.metadata.opf2 import OPFCreator + from calibre.ebooks.conversion.preprocess import DocAnalysis from calibre.utils.cleantext import clean_ascii_chars diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index 51aefe214b..0f5a31e1d7 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -477,17 +477,17 @@ class BIBTEX(CatalogPlugin): # {{{ if opts.bibfile_enc in bibfile_enc : bibfile_enc = opts.bibfile_enc else : - log(" WARNING: incorrect --choose-encoding flag, revert to default") + log.warn("Incorrect --choose-encoding flag, revert to default") bibfile_enc = bibfile_enc[0] if opts.bibfile_enctag in bibfile_enctag : bibfile_enctag = opts.bibfile_enctag else : - log(" WARNING: incorrect --choose-encoding-configuration flag, revert to default") + log.warn("Incorrect --choose-encoding-configuration flag, revert to default") bibfile_enctag = bibfile_enctag[0] if opts.bib_entry in bib_entry : bib_entry = opts.bib_entry else : - log(" WARNING: incorrect --entry-type flag, revert to default") + log.warn("Incorrect --entry-type flag, revert to default") bib_entry = bib_entry[0] if opts.verbose: @@ -544,7 +544,7 @@ class BIBTEX(CatalogPlugin): # {{{ elif opts.impcit == 'True' : citation_bibtex= True else : - log(" WARNING: incorrect --create-citation, revert to default") + log.warn("Incorrect --create-citation, revert to default") citation_bibtex= True else : citation_bibtex= opts.impcit @@ -556,7 +556,7 @@ class BIBTEX(CatalogPlugin): # {{{ elif opts.addfiles == 'True' : addfiles_bibtex = True else : - log(" WARNING: incorrect --add-files-path, revert to default") + log.warn("Incorrect --add-files-path, revert to default") addfiles_bibtex= True else : addfiles_bibtex = opts.addfiles @@ -574,7 +574,7 @@ class BIBTEX(CatalogPlugin): # {{{ if bib_entry == 'book' : nb_books = len(filter(check_entry_book_valid, data)) if nb_books < nb_entries : - log(" WARNING: only %d entries in %d are book compatible" % (nb_books, nb_entries)) + log.warn("Only %d entries in %d are book compatible" % (nb_books, nb_entries)) nb_entries = nb_books # If connected device, add 'On Device' values to data