mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Correct problems with tag splitting in RTFParser, some encoding refactoring & move all encodings to UTF-8 or US-ASCII for lxml
This commit is contained in:
parent
23f2fc6202
commit
ed4da14df0
@ -85,6 +85,7 @@ class RTFInput(InputFormatPlugin):
|
||||
debug_dir = 'rtfdebug'
|
||||
run_lev = 4
|
||||
indent_out = 1
|
||||
self.log('Running RTFParser in debug mode')
|
||||
except:
|
||||
pass
|
||||
parser = ParseRtf(
|
||||
@ -233,22 +234,6 @@ class RTFInput(InputFormatPlugin):
|
||||
with open('styles.css', 'ab') as f:
|
||||
f.write(css)
|
||||
|
||||
# def preprocess(self, fname):
|
||||
# self.log('\tPreprocessing to convert unicode characters')
|
||||
# try:
|
||||
# data = open(fname, 'rb').read()
|
||||
# from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
|
||||
# tokenizer = RtfTokenizer(data)
|
||||
# tokens = RtfTokenParser(tokenizer.tokens)
|
||||
# data = tokens.toRTF()
|
||||
# fname = 'preprocessed.rtf'
|
||||
# with open(fname, 'wb') as f:
|
||||
# f.write(data)
|
||||
# except:
|
||||
# self.log.exception(
|
||||
# 'Failed to preprocess RTF to convert unicode sequences, ignoring...')
|
||||
# return fname
|
||||
|
||||
def convert_borders(self, doc):
|
||||
border_styles = []
|
||||
style_map = {}
|
||||
@ -283,8 +268,6 @@ class RTFInput(InputFormatPlugin):
|
||||
self.opts = options
|
||||
self.log = log
|
||||
self.log('Converting RTF to XML...')
|
||||
#Name of the preprocesssed RTF file
|
||||
# fname = self.preprocess(stream.name)
|
||||
try:
|
||||
xml = self.generate_xml(stream.name)
|
||||
except RtfInvalidCodeException, e:
|
||||
@ -339,3 +322,5 @@ class RTFInput(InputFormatPlugin):
|
||||
return os.path.abspath('metadata.opf')
|
||||
|
||||
#ebook-convert "bad.rtf" test.epub -v -d "D:\Mes eBooks\Developpement\debug"
|
||||
# os.makedirs('D:\\Mes eBooks\\Developpement\\rtfdebug')
|
||||
# debug_dir = 'D:\\Mes eBooks\\Developpement\\rtfdebug'
|
@ -238,6 +238,8 @@ class ParseRtf:
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
enc = 'cp' + encode_obj.get_codepage()
|
||||
if enc == 'cp10000':
|
||||
enc = 'mac_roman'
|
||||
msg = 'Exception in token processing'
|
||||
if check_encoding_obj.check_encoding(self.__file, enc):
|
||||
file_name = self.__file if isinstance(self.__file, str) \
|
||||
|
@ -15,8 +15,10 @@
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile, re
|
||||
import sys, os, tempfile, re
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
|
||||
class Colors:
|
||||
"""
|
||||
Change lines with color info from color numbers to the actual color names.
|
||||
@ -40,8 +42,10 @@ class Colors:
|
||||
self.__file = in_file
|
||||
self.__copy = copy
|
||||
self.__bug_handler = bug_handler
|
||||
self.__line = 0
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__run_level = run_level
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
@ -61,6 +65,7 @@ class Colors:
|
||||
self.__color_num = 1
|
||||
self.__line_color_exp = re.compile(r'bdr-color_:(\d+)')
|
||||
# cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2
|
||||
|
||||
def __before_color_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -76,6 +81,7 @@ class Colors:
|
||||
if self.__token_info == 'mi<mk<clrtbl-beg':
|
||||
self.__state = 'in_color_table'
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __default_color_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -87,6 +93,7 @@ class Colors:
|
||||
"""
|
||||
hex_num = line[-3:-1]
|
||||
self.__color_string += hex_num
|
||||
|
||||
def __blue_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -109,6 +116,7 @@ class Colors:
|
||||
)
|
||||
self.__color_num += 1
|
||||
self.__color_string = '#'
|
||||
|
||||
def __in_color_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -127,12 +135,13 @@ class Colors:
|
||||
self.__state = 'after_color_table'
|
||||
else:
|
||||
action = self.__state_dict.get(self.__token_info)
|
||||
if action == None:
|
||||
if action is None:
|
||||
sys.stderr.write('in module colors.py\n'
|
||||
'function is self.__in_color_func\n'
|
||||
'no action for %s' % self.__token_info
|
||||
)
|
||||
action(line)
|
||||
|
||||
def __after_color_func(self, line):
|
||||
"""
|
||||
Check the to see if it contains color info. If it does, extract the
|
||||
@ -180,6 +189,7 @@ class Colors:
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
# cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2
|
||||
|
||||
def __sub_from_line_color(self, match_obj):
|
||||
num = match_obj.group(1)
|
||||
try:
|
||||
@ -191,25 +201,27 @@ class Colors:
|
||||
else:
|
||||
return 'bdr-color_:no-value'
|
||||
hex_num = self.__figure_num(num)
|
||||
return_value = 'bdr-color_:%s' % hex_num
|
||||
return return_value
|
||||
return 'bdr-color_:%s' % hex_num
|
||||
|
||||
def __figure_num(self, num):
    """
    Translate a colour-table index into the value written to the output.

    Requires:
        num -- the colour number taken from a ``bdr-color_:<n>`` token.
    Returns:
        'false' when num is 0, otherwise the entry stored for num in
        self.__color_dict, or '0' when the table has no such entry.
    Raises:
        self.__bug_handler when the entry is missing and
        self.__run_level is greater than 5.
    """
    if num == 0:
        return 'false'
    hex_num = self.__color_dict.get(num)
    if hex_num is None:
        if self.__run_level > 5:
            # Only the strictest run levels treat a missing colour as
            # fatal; report the input line to make the bad RTF findable.
            msg = ('no value in self.__color_dict '
                   'for key %s at line %d\n' % (num, self.__line))
            # Call form of raise is valid on both Python 2 and Python 3.
            raise self.__bug_handler(msg)
        # Lenient mode: fall back to a placeholder colour value.
        hex_num = '0'
    return hex_num
|
||||
|
||||
def __do_nothing_func(self, line):
    """
    Deliberately ignore line.

    Bad RTF will have text in the color table; this handler lets such
    tokens be consumed without producing any output.
    """
    return None
|
||||
|
||||
def convert_colors(self):
|
||||
"""
|
||||
Requires:
|
||||
@ -226,20 +238,16 @@ class Colors:
|
||||
info, and substitute the number with the hex number.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no no matching state in module fonts.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
for line in read_obj:
|
||||
self.__line+=1
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action is None:
|
||||
sys.stderr.write('no matching state in module fonts.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "color.data")
|
||||
|
@ -33,13 +33,13 @@ class ConvertToTags:
|
||||
self.__copy = copy
|
||||
self.__dtd_path = dtd_path
|
||||
self.__no_dtd = no_dtd
|
||||
if encoding != 'mac_roman':
|
||||
self.__encoding = 'cp' + encoding
|
||||
else:
|
||||
self.__encoding = 'cp' + encoding
|
||||
if encoding == 'mac_roman':
|
||||
self.__encoding = 'mac_roman'
|
||||
self.__indent = indent
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__convert_utf = False
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
@ -213,7 +213,8 @@ class ConvertToTags:
|
||||
if not check_encoding_obj.check_encoding(self.__file, verbose=False):
|
||||
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
|
||||
elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
|
||||
self.__write_obj.write('<?xml version="1.0" encoding="%s" ?>' % self.__encoding)
|
||||
self.__write_obj.write('<?xml version="1.0" encoding="UTF-8" ?>')
|
||||
self.__convert_utf = True
|
||||
else:
|
||||
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
|
||||
sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
|
||||
@ -253,15 +254,28 @@ class ConvertToTags:
|
||||
an empty tag function.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
self.__write_dec()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__token_info)
|
||||
if action is not None:
|
||||
action(line)
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
self.__write_dec()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__token_info)
|
||||
if action is not None:
|
||||
action(line)
|
||||
self.__write_obj.close()
|
||||
#convert all encodings to UTF8 to avoid unsupported encodings in lxml
|
||||
if self.__convert_utf:
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__write_to, 'w') as write_obj:
|
||||
file = read_obj.read()
|
||||
try:
|
||||
file = file.decode(self.__encoding)
|
||||
write_obj.write(file.encode('utf-8'))
|
||||
except:
|
||||
sys.stderr.write('Conversion to UTF-8 is not possible,'
|
||||
' encoding should be very carefully checked')
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "convert_to_tags.data")
|
||||
|
@ -75,12 +75,16 @@ class DefaultEncoding:
|
||||
self._encoding()
|
||||
self.__datafetched = True
|
||||
code_page = 'ansicpg' + self.__code_page
|
||||
if self.__code_page == '10000':
|
||||
self.__code_page = 'mac_roman'
|
||||
return self.__platform, code_page, self.__default_num
|
||||
|
||||
def get_codepage(self):
    """
    Return the code page of the document as a string.

    The value is fetched once (via self._encoding()) and cached; code
    page '10000' is reported as 'mac_roman'.

    The stored self.__code_page is left untouched: get_encoding() builds
    'ansicpg' + self.__code_page from it, so overwriting the number with
    'mac_roman' here would make a later call yield 'ansicpgmac_roman'.
    """
    if not self.__datafetched:
        self._encoding()
        self.__datafetched = True
    if self.__code_page == '10000':
        return 'mac_roman'
    return self.__code_page
|
||||
|
||||
def get_platform(self):
|
||||
|
@ -16,7 +16,9 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
|
||||
class Fonts:
|
||||
"""
|
||||
Change lines with font info from font numbers to the actual font names.
|
||||
@ -45,6 +47,7 @@ class Fonts:
|
||||
self.__default_font_num = default_font_num
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__run_level = run_level
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
@ -67,6 +70,7 @@ class Fonts:
|
||||
self.__font_table = {}
|
||||
# individual font written
|
||||
self.__wrote_ind_font = 0
|
||||
|
||||
def __default_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -79,6 +83,7 @@ class Fonts:
|
||||
if self.__token_info == 'mi<mk<fonttb-beg':
|
||||
self.__state = 'font_table'
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __font_table_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -101,6 +106,7 @@ class Fonts:
|
||||
self.__font_num = self.__default_font_num
|
||||
self.__text_line = ''
|
||||
##self.__write_obj.write(line)
|
||||
|
||||
def __font_in_table_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -138,6 +144,7 @@ class Fonts:
|
||||
elif self.__token_info == 'mi<mk<fonttb-end':
|
||||
self.__found_end_font_table_func()
|
||||
self.__state = 'after_font_table'
|
||||
|
||||
def __found_end_font_table_func(self):
|
||||
"""
|
||||
Required:
|
||||
@ -150,7 +157,8 @@ class Fonts:
|
||||
if not self.__wrote_ind_font:
|
||||
self.__write_obj.write(
|
||||
'mi<tg<empty-att_'
|
||||
'<font-in-table<name>Times<num>0\n' )
|
||||
'<font-in-table<name>Times<num>0\n')
|
||||
|
||||
def __after_font_table_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -169,7 +177,7 @@ class Fonts:
|
||||
if self.__token_info == 'cw<ci<font-style':
|
||||
font_num = line[20:-1]
|
||||
font_name = self.__font_table.get(font_num)
|
||||
if font_name == None:
|
||||
if font_name is None:
|
||||
if self.__run_level > 3:
|
||||
msg = 'no value for %s in self.__font_table\n' % font_num
|
||||
raise self.__bug_handler, msg
|
||||
@ -182,6 +190,7 @@ class Fonts:
|
||||
)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def convert_fonts(self):
|
||||
"""
|
||||
Required:
|
||||
@ -197,20 +206,15 @@ class Fonts:
|
||||
info. Substitute a font name for a font number.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('no no matching state in module fonts.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action is None:
|
||||
sys.stderr.write('no matching state in module fonts.py\n' \
|
||||
+ self.__state + '\n')
|
||||
action(line)
|
||||
default_font_name = self.__font_table.get(self.__default_font_num)
|
||||
if not default_font_name:
|
||||
default_font_name = 'Not Defined'
|
||||
|
@ -41,7 +41,7 @@ class GetCharMap:
|
||||
def get_char_map(self, map):
|
||||
if map == 'ansicpg0':
|
||||
map = 'ansicpg1250'
|
||||
if map in ('ansicpg10000', '10000'):
|
||||
if map == 'ansicpg10000':
|
||||
map = 'mac_roman'
|
||||
found_map = False
|
||||
map_dict = {}
|
||||
|
@ -115,6 +115,7 @@ class Tokenize:
|
||||
|
||||
def __sub_reg_split(self,input_file):
|
||||
input_file = self.__replace_spchar.mreplace(input_file)
|
||||
# this is for older RTF
|
||||
input_file = self.__par_exp.sub('\n\\par \n', input_file)
|
||||
input_file = self.__cs_ast.sub("\g<1>", input_file)
|
||||
input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
|
||||
@ -126,12 +127,6 @@ class Tokenize:
|
||||
tokens = re.split(self.__splitexp, input_file)
|
||||
#remove empty tokens and \n
|
||||
return filter(lambda x: len(x) > 0 and x != '\n', tokens)
|
||||
#input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
|
||||
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
|
||||
# this is for older RTF
|
||||
#line = re.sub(self.__par_exp, '\\par ', line)
|
||||
#return filter(lambda x: len(x) > 0, \
|
||||
#(self.__remove_line.sub('', x) for x in tokens))
|
||||
|
||||
def __compile_expressions(self):
|
||||
SIMPLE_RPL = {
|
||||
@ -160,7 +155,7 @@ class Tokenize:
|
||||
}
|
||||
self.__replace_spchar = MReplace(SIMPLE_RPL)
|
||||
#add ;? in case of char following \u
|
||||
self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
|
||||
self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
|
||||
self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
|
||||
self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
|
||||
#manage upr/ud situations
|
||||
@ -174,14 +169,21 @@ class Tokenize:
|
||||
self.__par_exp = re.compile(r'\\\n+')
|
||||
#handle improper cs char-style with \* before without {
|
||||
self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)')
|
||||
# self.__par_exp = re.compile(r'\\$')
|
||||
#handle cw using a digit as argument and without space as delimiter
|
||||
self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)")
|
||||
#self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
|
||||
#self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
|
||||
#self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
|
||||
#self.__remove_line = re.compile(r'\n+')
|
||||
#self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
|
||||
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
|
||||
|
||||
def __correct_spliting(self, token):
    """
    Split a control word that runs straight into the text following it.

    When self.__cwdigit_exp matches inside token, the result is the two
    captured groups joined by a newline; any part of token outside those
    two groups is not included. A token without a match is returned
    unchanged.
    """
    found = re.search(self.__cwdigit_exp, token)
    if found is not None:
        control_word, trailing = found.group(1, 2)
        return '\n'.join((control_word, trailing))
    return token
|
||||
|
||||
def tokenize(self):
|
||||
"""Main class for handling other methods. Reads the file \
|
||||
, uses method self.sub_reg to make basic substitutions,\
|
||||
@ -197,6 +199,8 @@ class Tokenize:
|
||||
tokens = map(self.__unicode_process, tokens)
|
||||
#remove empty items created by removing \uc
|
||||
tokens = filter(lambda x: len(x) > 0, tokens)
|
||||
#handles bothersome cases
|
||||
tokens = map(self.__correct_spliting, tokens)
|
||||
|
||||
#write
|
||||
with open(self.__write_to, 'wb') as write_obj:
|
||||
@ -205,8 +209,6 @@ class Tokenize:
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "tokenize.data")
|
||||
# if self.__out_file:
|
||||
# self.__file = self.__out_file
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user