Correct problems with tag splitting in RTFParser, some encoding refactoring & move all encodings to UTF-8 or US-ASCII for lxml

This commit is contained in:
Sengian 2011-01-31 08:29:42 +01:00
parent 23f2fc6202
commit ed4da14df0
8 changed files with 101 additions and 82 deletions

View File

@ -85,6 +85,7 @@ class RTFInput(InputFormatPlugin):
debug_dir = 'rtfdebug'
run_lev = 4
indent_out = 1
self.log('Running RTFParser in debug mode')
except:
pass
parser = ParseRtf(
@ -233,22 +234,6 @@ class RTFInput(InputFormatPlugin):
with open('styles.css', 'ab') as f:
f.write(css)
# def preprocess(self, fname):
# self.log('\tPreprocessing to convert unicode characters')
# try:
# data = open(fname, 'rb').read()
# from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
# tokenizer = RtfTokenizer(data)
# tokens = RtfTokenParser(tokenizer.tokens)
# data = tokens.toRTF()
# fname = 'preprocessed.rtf'
# with open(fname, 'wb') as f:
# f.write(data)
# except:
# self.log.exception(
# 'Failed to preprocess RTF to convert unicode sequences, ignoring...')
# return fname
def convert_borders(self, doc):
border_styles = []
style_map = {}
@ -283,8 +268,6 @@ class RTFInput(InputFormatPlugin):
self.opts = options
self.log = log
self.log('Converting RTF to XML...')
#Name of the preprocesssed RTF file
# fname = self.preprocess(stream.name)
try:
xml = self.generate_xml(stream.name)
except RtfInvalidCodeException, e:
@ -339,3 +322,5 @@ class RTFInput(InputFormatPlugin):
return os.path.abspath('metadata.opf')
#ebook-convert "bad.rtf" test.epub -v -d "D:\Mes eBooks\Developpement\debug"
# os.makedirs('D:\\Mes eBooks\\Developpement\\rtfdebug')
# debug_dir = 'D:\\Mes eBooks\\Developpement\\rtfdebug'

View File

@ -238,6 +238,8 @@ class ParseRtf:
bug_handler = RtfInvalidCodeException,
)
enc = 'cp' + encode_obj.get_codepage()
if enc == 'cp10000':
enc = 'mac_roman'
msg = 'Exception in token processing'
if check_encoding_obj.check_encoding(self.__file, enc):
file_name = self.__file if isinstance(self.__file, str) \

View File

@ -15,8 +15,10 @@
# #
# #
#########################################################################
import sys, os, tempfile, re
import sys, os, tempfile, re
from calibre.ebooks.rtf2xml import copy
class Colors:
"""
Change lines with color info from color numbers to the actual color names.
@ -40,8 +42,10 @@ class Colors:
self.__file = in_file
self.__copy = copy
self.__bug_handler = bug_handler
self.__line = 0
self.__write_to = tempfile.mktemp()
self.__run_level = run_level
def __initiate_values(self):
"""
Initiate all values.
@ -61,6 +65,7 @@ class Colors:
self.__color_num = 1
self.__line_color_exp = re.compile(r'bdr-color_:(\d+)')
# cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2
def __before_color_func(self, line):
"""
Requires:
@ -76,6 +81,7 @@ class Colors:
if self.__token_info == 'mi<mk<clrtbl-beg':
self.__state = 'in_color_table'
self.__write_obj.write(line)
def __default_color_func(self, line):
"""
Requires:
@ -87,6 +93,7 @@ class Colors:
"""
hex_num = line[-3:-1]
self.__color_string += hex_num
def __blue_func(self, line):
"""
Requires:
@ -109,6 +116,7 @@ class Colors:
)
self.__color_num += 1
self.__color_string = '#'
def __in_color_func(self, line):
"""
Requires:
@ -127,12 +135,13 @@ class Colors:
self.__state = 'after_color_table'
else:
action = self.__state_dict.get(self.__token_info)
if action == None:
if action is None:
sys.stderr.write('in module colors.py\n'
'function is self.__in_color_func\n'
'no action for %s' % self.__token_info
)
action(line)
def __after_color_func(self, line):
"""
Check the to see if it contains color info. If it does, extract the
@ -180,6 +189,7 @@ class Colors:
else:
self.__write_obj.write(line)
# cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2
def __sub_from_line_color(self, match_obj):
num = match_obj.group(1)
try:
@ -191,25 +201,27 @@ class Colors:
else:
return 'bdr-color_:no-value'
hex_num = self.__figure_num(num)
return_value = 'bdr-color_:%s' % hex_num
return return_value
return 'bdr-color_:%s' % hex_num
def __figure_num(self, num):
if num == 0:
hex_num = 'false'
else:
hex_num = self.__color_dict.get(num)
if hex_num == None:
if self.__run_level > 3:
msg = 'no value in self.__color_dict for key %s\n' % num
raise self.__bug_hanlder, msg
if hex_num == None:
if hex_num is None:
hex_num = '0'
if self.__run_level > 5:
msg = 'no value in self.__color_dict' \
'for key %s at line %d\n' % (num, self.__line)
raise self.__bug_handler, msg
return hex_num
def __do_nothing_func(self, line):
    """
    Deliberately ignore ``line`` and return nothing.

    Bad RTF will have text in the color table; this no-op lets such a
    line be handed to a handler without anything being written or any
    state being changed.
    (Presumably it is registered in the color-table dispatch dict, which
    is not visible from this block -- confirm against __initiate_values.)
    """
def convert_colors(self):
"""
Requires:
@ -226,20 +238,16 @@ class Colors:
info, and substitute the number with the hex number.
"""
self.__initiate_values()
read_obj = open(self.__file, 'r')
self.__write_obj = open(self.__write_to, 'w')
line_to_read = 1
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
self.__token_info = line[:16]
action = self.__state_dict.get(self.__state)
if action == None:
sys.stderr.write('no no matching state in module fonts.py\n')
sys.stderr.write(self.__state + '\n')
action(line)
read_obj.close()
self.__write_obj.close()
with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as self.__write_obj:
for line in read_obj:
self.__line+=1
self.__token_info = line[:16]
action = self.__state_dict.get(self.__state)
if action is None:
sys.stderr.write('no matching state in module fonts.py\n')
sys.stderr.write(self.__state + '\n')
action(line)
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "color.data")

View File

@ -33,13 +33,13 @@ class ConvertToTags:
self.__copy = copy
self.__dtd_path = dtd_path
self.__no_dtd = no_dtd
if encoding != 'mac_roman':
self.__encoding = 'cp' + encoding
else:
self.__encoding = 'cp' + encoding
if encoding == 'mac_roman':
self.__encoding = 'mac_roman'
self.__indent = indent
self.__run_level = run_level
self.__write_to = tempfile.mktemp()
self.__convert_utf = False
def __initiate_values(self):
"""
@ -213,7 +213,8 @@ class ConvertToTags:
if not check_encoding_obj.check_encoding(self.__file, verbose=False):
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
self.__write_obj.write('<?xml version="1.0" encoding="%s" ?>' % self.__encoding)
self.__write_obj.write('<?xml version="1.0" encoding="UTF-8" ?>')
self.__convert_utf = True
else:
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
@ -253,15 +254,28 @@ class ConvertToTags:
an empty tag function.
"""
self.__initiate_values()
self.__write_obj = open(self.__write_to, 'w')
self.__write_dec()
with open(self.__file, 'r') as read_obj:
for line in read_obj:
self.__token_info = line[:16]
action = self.__state_dict.get(self.__token_info)
if action is not None:
action(line)
with open(self.__write_to, 'w') as self.__write_obj:
self.__write_dec()
with open(self.__file, 'r') as read_obj:
for line in read_obj:
self.__token_info = line[:16]
action = self.__state_dict.get(self.__token_info)
if action is not None:
action(line)
self.__write_obj.close()
#convert all encodings to UTF8 to avoid unsupported encodings in lxml
if self.__convert_utf:
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
copy_obj.rename(self.__write_to, self.__file)
with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as write_obj:
file = read_obj.read()
try:
file = file.decode(self.__encoding)
write_obj.write(file.encode('utf-8'))
except:
sys.stderr.write('Conversion to UTF-8 is not possible,'
' encoding should be very carefully checked')
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "convert_to_tags.data")

View File

@ -75,12 +75,16 @@ class DefaultEncoding:
self._encoding()
self.__datafetched = True
code_page = 'ansicpg' + self.__code_page
if self.__code_page == '10000':
self.__code_page = 'mac_roman'
return self.__platform, code_page, self.__default_num
def get_codepage(self):
    """
    Return the code page found for the document.

    The data is fetched lazily: if it has not been fetched yet,
    self._encoding() is called once and the instance is marked as
    fetched.  A stored code page of '10000' is replaced (on the instance
    as well as in the returned value) by the name 'mac_roman'.
    """
    already_fetched = self.__datafetched
    if not already_fetched:
        # First access: let _encoding() populate the instance, then
        # remember that it ran so later calls skip the work.
        self._encoding()
        self.__datafetched = True
    uses_mac_number = (self.__code_page == '10000')
    if uses_mac_number:
        # Swap the numeric identifier for the name 'mac_roman'.
        self.__code_page = 'mac_roman'
    return self.__code_page
def get_platform(self):

View File

@ -16,7 +16,9 @@
# #
#########################################################################
import sys, os, tempfile
from calibre.ebooks.rtf2xml import copy
class Fonts:
"""
Change lines with font info from font numbers to the actual font names.
@ -45,6 +47,7 @@ class Fonts:
self.__default_font_num = default_font_num
self.__write_to = tempfile.mktemp()
self.__run_level = run_level
def __initiate_values(self):
"""
Initiate all values.
@ -67,6 +70,7 @@ class Fonts:
self.__font_table = {}
# individual font written
self.__wrote_ind_font = 0
def __default_func(self, line):
"""
Requires:
@ -79,6 +83,7 @@ class Fonts:
if self.__token_info == 'mi<mk<fonttb-beg':
self.__state = 'font_table'
self.__write_obj.write(line)
def __font_table_func(self, line):
"""
Requires:
@ -101,6 +106,7 @@ class Fonts:
self.__font_num = self.__default_font_num
self.__text_line = ''
##self.__write_obj.write(line)
def __font_in_table_func(self, line):
"""
Requires:
@ -138,6 +144,7 @@ class Fonts:
elif self.__token_info == 'mi<mk<fonttb-end':
self.__found_end_font_table_func()
self.__state = 'after_font_table'
def __found_end_font_table_func(self):
"""
Required:
@ -150,7 +157,8 @@ class Fonts:
if not self.__wrote_ind_font:
self.__write_obj.write(
'mi<tg<empty-att_'
'<font-in-table<name>Times<num>0\n' )
'<font-in-table<name>Times<num>0\n')
def __after_font_table_func(self, line):
"""
Required:
@ -169,7 +177,7 @@ class Fonts:
if self.__token_info == 'cw<ci<font-style':
font_num = line[20:-1]
font_name = self.__font_table.get(font_num)
if font_name == None:
if font_name is None:
if self.__run_level > 3:
msg = 'no value for %s in self.__font_table\n' % font_num
raise self.__bug_handler, msg
@ -182,6 +190,7 @@ class Fonts:
)
else:
self.__write_obj.write(line)
def convert_fonts(self):
"""
Required:
@ -197,20 +206,15 @@ class Fonts:
info. Substitute a font name for a font number.
"""
self.__initiate_values()
read_obj = open(self.__file, 'r')
self.__write_obj = open(self.__write_to, 'w')
line_to_read = 1
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
self.__token_info = line[:16]
action = self.__state_dict.get(self.__state)
if action == None:
sys.stderr.write('no no matching state in module fonts.py\n')
sys.stderr.write(self.__state + '\n')
action(line)
read_obj.close()
self.__write_obj.close()
with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as self.__write_obj:
for line in read_obj:
self.__token_info = line[:16]
action = self.__state_dict.get(self.__state)
if action is None:
sys.stderr.write('no matching state in module fonts.py\n' \
+ self.__state + '\n')
action(line)
default_font_name = self.__font_table.get(self.__default_font_num)
if not default_font_name:
default_font_name = 'Not Defined'

View File

@ -41,7 +41,7 @@ class GetCharMap:
def get_char_map(self, map):
if map == 'ansicpg0':
map = 'ansicpg1250'
if map in ('ansicpg10000', '10000'):
if map == 'ansicpg10000':
map = 'mac_roman'
found_map = False
map_dict = {}

View File

@ -115,6 +115,7 @@ class Tokenize:
def __sub_reg_split(self,input_file):
input_file = self.__replace_spchar.mreplace(input_file)
# this is for older RTF
input_file = self.__par_exp.sub('\n\\par \n', input_file)
input_file = self.__cs_ast.sub("\g<1>", input_file)
input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
@ -126,12 +127,6 @@ class Tokenize:
tokens = re.split(self.__splitexp, input_file)
#remove empty tokens and \n
return filter(lambda x: len(x) > 0 and x != '\n', tokens)
#input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
# this is for older RTF
#line = re.sub(self.__par_exp, '\\par ', line)
#return filter(lambda x: len(x) > 0, \
#(self.__remove_line.sub('', x) for x in tokens))
def __compile_expressions(self):
SIMPLE_RPL = {
@ -160,7 +155,7 @@ class Tokenize:
}
self.__replace_spchar = MReplace(SIMPLE_RPL)
#add ;? in case of char following \u
self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
#manage upr/ud situations
@ -174,14 +169,21 @@ class Tokenize:
self.__par_exp = re.compile(r'\\\n+')
#handle improper cs char-style with \* before without {
self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)')
# self.__par_exp = re.compile(r'\\$')
#handle cw using a digit as argument and without space as delimiter
self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)")
#self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
#self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
#self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
#self.__remove_line = re.compile(r'\n+')
#self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
def __correct_spliting(self, token):
    """
    Separate a control word from text glued directly after its digits.

    The token is searched with self.__cwdigit_exp (a control word ending
    in digits/hyphens followed by characters that are not a digit, a
    space or a backslash).  When nothing matches, the token is returned
    untouched.  Otherwise the result is the control word, a newline, and
    the matched trailing text.

    NOTE(review): only the two matched groups are returned; any part of
    the token before or after the match is not included in the result.
    """
    found = re.search(self.__cwdigit_exp, token)
    if found is not None:
        control_word, glued_text = found.group(1, 2)
        return '\n'.join((control_word, glued_text))
    return token
def tokenize(self):
"""Main class for handling other methods. Reads the file \
, uses method self.sub_reg to make basic substitutions,\
@ -197,6 +199,8 @@ class Tokenize:
tokens = map(self.__unicode_process, tokens)
#remove empty items created by removing \uc
tokens = filter(lambda x: len(x) > 0, tokens)
#handles bothersome cases
tokens = map(self.__correct_spliting, tokens)
#write
with open(self.__write_to, 'wb') as write_obj:
@ -205,8 +209,6 @@ class Tokenize:
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "tokenize.data")
# if self.__out_file:
# self.__file = self.__out_file
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)