RTF Input: Various code cleanups. Go back to trying to handle unicode mappings without pre-processing (Fixes #8171 (Unsupported charsets ie non ascii in RTF)). Fix bug in handling super/sub scripts.

This commit is contained in:
Kovid Goyal 2011-01-12 17:23:53 -07:00
parent 3e9e655674
commit 7fde6cbead
19 changed files with 904 additions and 715 deletions

View File

@ -287,7 +287,7 @@
<xsl:value-of select="count(preceding::rtf:footnote) + 1"/>
<xsl:text>]</xsl:text>
</xsl:when>
<xsl:when test="(@superscript = 'true')">
<xsl:when test="(@superscript)">
<xsl:element name="sup">
<xsl:element name="span">
<xsl:attribute name="class">
@ -297,7 +297,7 @@
</xsl:element>
</xsl:element>
</xsl:when>
<xsl:when test="(@underscript = 'true')">
<xsl:when test="(@underscript or @subscript)">
<xsl:element name="sub">
<xsl:element name="span">
<xsl:attribute name="class">

View File

@ -77,7 +77,15 @@ class RTFInput(InputFormatPlugin):
def generate_xml(self, stream):
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
ofile = 'out.xml'
ofile = 'dataxml.xml'
run_lev, debug_dir = 1, None
if getattr(self.opts, 'debug_pipeline', None) is not None:
try:
os.mkdir(debug_dir)
debug_dir = 'rtfdebug'
run_lev = 4
except:
pass
parser = ParseRtf(
in_file = stream,
out_file = ofile,
@ -115,43 +123,45 @@ class RTFInput(InputFormatPlugin):
# Write or do not write paragraphs. Default is 0.
empty_paragraphs = 1,
#debug
deb_dir = debug_dir,
run_level = run_lev,
)
parser.parse_rtf()
ans = open('out.xml').read()
os.remove('out.xml')
return ans
with open(ofile, 'rb') as f:
return f.read()
def extract_images(self, picts):
import imghdr
self.log('Extracting images...')
with open(picts, 'rb') as f:
raw = f.read()
picts = filter(len, re.findall(r'\{\\pict([^}]+)\}', raw))
hex = re.compile(r'[^a-fA-F0-9]')
encs = [hex.sub('', pict) for pict in picts]
count = 0
raw = open(picts, 'rb').read()
starts = []
for match in re.finditer(r'\{\\pict([^}]+)\}', raw):
starts.append(match.start(1))
imap = {}
for start in starts:
pos, bc = start, 1
while bc > 0:
if raw[pos] == '}': bc -= 1
elif raw[pos] == '{': bc += 1
pos += 1
pict = raw[start:pos+1]
enc = re.sub(r'[^a-zA-Z0-9]', '', pict)
for enc in encs:
if len(enc) % 2 == 1:
enc = enc[:-1]
data = enc.decode('hex')
fmt = imghdr.what(None, data)
if fmt is None:
fmt = 'wmf'
count += 1
name = (('%4d'%count).replace(' ', '0'))+'.wmf'
open(name, 'wb').write(data)
name = '%04d.%s' % (count, fmt)
with open(name, 'wb') as f:
f.write(data)
imap[count] = name
#open(name+'.hex', 'wb').write(enc)
return self.convert_images(imap)
def convert_images(self, imap):
for count, val in imap.items():
self.default_img = None
for count, val in imap.iteritems():
try:
imap[count] = self.convert_image(val)
except:
@ -159,6 +169,8 @@ class RTFInput(InputFormatPlugin):
return imap
def convert_image(self, name):
if not name.endswith('.wmf'):
return name
try:
return self.rasterize_wmf(name)
except:
@ -167,16 +179,18 @@ class RTFInput(InputFormatPlugin):
def replace_wmf(self, name):
from calibre.ebooks import calibre_cover
data = calibre_cover('Conversion of WMF images is not supported',
if self.default_img is None:
self.default_img = calibre_cover('Conversion of WMF images is not supported',
'Use Microsoft Word or OpenOffice to save this RTF file'
' as HTML and convert that in calibre.', title_size=36,
author_size=20)
name = name.replace('.wmf', '.jpg')
with open(name, 'wb') as f:
f.write(data)
f.write(self.default_img)
return name
def rasterize_wmf(self, name):
raise ValueError('Conversion of WMF images not supported')
from calibre.utils.wmf import extract_raster_image
with open(name, 'rb') as f:
data = f.read()
@ -212,27 +226,27 @@ class RTFInput(InputFormatPlugin):
css += '\n'+'\n'.join(font_size_classes)
css += '\n' +'\n'.join(color_classes)
for cls, val in border_styles.items():
for cls, val in border_styles.iteritems():
css += '\n\n.%s {\n%s\n}'%(cls, val)
with open('styles.css', 'ab') as f:
f.write(css)
def preprocess(self, fname):
self.log('\tPreprocessing to convert unicode characters')
try:
data = open(fname, 'rb').read()
from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
tokenizer = RtfTokenizer(data)
tokens = RtfTokenParser(tokenizer.tokens)
data = tokens.toRTF()
fname = 'preprocessed.rtf'
with open(fname, 'wb') as f:
f.write(data)
except:
self.log.exception(
'Failed to preprocess RTF to convert unicode sequences, ignoring...')
return fname
# def preprocess(self, fname):
# self.log('\tPreprocessing to convert unicode characters')
# try:
# data = open(fname, 'rb').read()
# from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
# tokenizer = RtfTokenizer(data)
# tokens = RtfTokenParser(tokenizer.tokens)
# data = tokens.toRTF()
# fname = 'preprocessed.rtf'
# with open(fname, 'wb') as f:
# f.write(data)
# except:
# self.log.exception(
# 'Failed to preprocess RTF to convert unicode sequences, ignoring...')
# return fname
def convert_borders(self, doc):
border_styles = []
@ -269,17 +283,14 @@ class RTFInput(InputFormatPlugin):
self.log = log
self.log('Converting RTF to XML...')
#Name of the preprocessed RTF file
fname = self.preprocess(stream.name)
# fname = self.preprocess(stream.name)
try:
xml = self.generate_xml(fname)
xml = self.generate_xml(stream.name)
except RtfInvalidCodeException, e:
raise
raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.\n%s')%e)
'''dataxml = open('dataxml.xml', 'w')
dataxml.write(xml)
dataxml.close'''
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
if d:
imap = {}

View File

@ -17,7 +17,8 @@
#########################################################################
# $Revision: 1.41 $
# $Date: 2006/03/24 23:50:07 $
import sys,os
import sys, os
from calibre.ebooks.rtf2xml import headings_to_sections, \
line_endings, footnote, fields_small, default_encoding, \
make_lists, preamble_div, header, colors, group_borders, \
@ -90,7 +91,6 @@ class ParseRtf:
out_file = '',
out_dir = None,
dtd = '',
#debug = 0, #why? calibre
deb_dir = None,
convert_symbol = None,
convert_wingdings = None,
@ -107,6 +107,7 @@ class ParseRtf:
no_dtd = 0,
char_data = '',
):
"""
Requires:
'file' --file to parse
@ -119,12 +120,11 @@ class ParseRtf:
script tries to output to directory where the script is executed.)
'deb_dir' --debug directory. If a debug_dir is provided, the script
will copy each run through as a file to examine in the debug_dir
'perl_script'--use perl to make tokens. This runs just a bit faster.
(I will probably phase this out.)
'check_brackets' -- make sure the brackets match up after each run
through a file. Only for debugging.
Returns: Nothing
"""
self.__file = in_file
self.__out_file = out_file
self.__out_dir = out_dir
@ -132,7 +132,7 @@ class ParseRtf:
self.__dtd_path = dtd
self.__check_file(in_file,"file_to_parse")
self.__char_data = char_data
self.__debug_dir = deb_dir #self.__debug_dir = debug calibre
self.__debug_dir = deb_dir
self.__check_dir(self.__temp_dir)
self.__copy = self.__check_dir(self.__debug_dir)
self.__convert_caps = convert_caps
@ -155,25 +155,24 @@ class ParseRtf:
if hasattr(the_file, 'read'): return
if the_file == None:
if type == "file_to_parse":
message = "You must provide a file for the script to work"
msg = message
msg = "\nYou must provide a file for the script to work"
raise RtfInvalidCodeException, msg
elif os.path.exists(the_file):
pass # do nothing
else:
message = "The file '%s' cannot be found" % the_file
msg = message
msg = "\nThe file '%s' cannot be found" % the_file
raise RtfInvalidCodeException, msg
def __check_dir(self, the_dir):
"""Check to see if directory exists"""
if not the_dir :
return
dir_exists = os.path.isdir(the_dir)
if not dir_exists:
message = "%s is not a directory" % the_dir
msg = message
msg = "\n%s is not a directory" % the_dir
raise RtfInvalidCodeException, msg
return 1
def parse_rtf(self):
"""
Parse the file by calling on other classes.
@ -194,13 +193,14 @@ class ParseRtf:
copy_obj.set_dir(self.__debug_dir)
copy_obj.remove_files()
copy_obj.copy_file(self.__temp_file, "original_file")
# new as of 2005-08-02. Do I want this?
# Function to check if bracket are well handled
if self.__debug_dir or self.__run_level > 2:
self.__check_brack_obj = check_brackets.CheckBrackets\
(file = self.__temp_file,
bug_handler = RtfInvalidCodeException,
)
# convert Macintosh line endings to Unix line endings
#convert Macintosh and Windows line endings to Unix line endings
#why do this if you don't wb after?
line_obj = line_endings.FixLineEndings(
in_file = self.__temp_file,
bug_handler = RtfInvalidCodeException,
@ -208,13 +208,13 @@ class ParseRtf:
run_level = self.__run_level,
replace_illegals = self.__replace_illegals,
)
return_value = line_obj.fix_endings()
return_value = line_obj.fix_endings() #calibre return what?
self.__return_code(return_value)
tokenize_obj = tokenize.Tokenize(
bug_handler = RtfInvalidCodeException,
in_file = self.__temp_file,
copy = self.__copy,
run_level = self.__run_level,)
run_level = self.__run_level)
tokenize_obj.tokenize()
process_tokens_obj = process_tokens.ProcessTokens(
in_file = self.__temp_file,
@ -230,12 +230,25 @@ class ParseRtf:
os.remove(self.__temp_file)
except OSError:
pass
#Check to see if the file is correctly encoded
encode_obj = default_encoding.DefaultEncoding(
in_file = self.__temp_file,
run_level = self.__run_level,
bug_handler = RtfInvalidCodeException,
check_raw = True,
)
platform, code_page, default_font_num = encode_obj.find_default_encoding()
check_encoding_obj = check_encoding.CheckEncoding(
bug_handler = RtfInvalidCodeException,
)
check_encoding_obj.check_encoding(self.__file)
sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
raise InvalidRtfException, msg
bug_handler = RtfInvalidCodeException,
)
enc = encode_obj.get_codepage()
if enc != 'mac_roman':
enc = 'cp' + enc
if check_encoding_obj.check_encoding(self.__file, enc):
file_name = self.__file if isinstance(self.__file, str) \
else self.__file.encode('utf-8')
msg = 'File %s does not appear to be correctly encoded.\n' % file_name
raise InvalidRtfException, msg
delete_info_obj = delete_info.DeleteInfo(
in_file = self.__temp_file,
copy = self.__copy,
@ -508,6 +521,7 @@ class ParseRtf:
indent = self.__indent,
run_level = self.__run_level,
no_dtd = self.__no_dtd,
encoding = encode_obj.get_codepage(),
bug_handler = RtfInvalidCodeException,
)
tags_obj.convert_to_tags()
@ -520,35 +534,28 @@ class ParseRtf:
output_obj.output()
os.remove(self.__temp_file)
return self.__exit_level
def __bracket_match(self, file_name):
if self.__run_level > 2:
good_br, msg = self.__check_brack_obj.check_brackets()
if good_br:
pass
# sys.stderr.write( msg + ' in ' + file_name + "\n")
#sys.stderr.write( msg + ' in ' + file_name + "\n")
else:
msg += msg + " in file '" + file_name + "'\n"
msg = '%s in file %s\n' % (msg, file_name)
raise RtfInvalidCodeException, msg
def __return_code(self, num):
if num == None:
return
if int(num) > self.__exit_level:
self.__exit_level = num
if num == None:
return
if int(num) > self.__exit_level:
self.__exit_level = num
def __make_temp_file(self,file):
"""Make a temporary file to parse"""
write_file="rtf_write_file"
read_obj = file if hasattr(file, 'read') else open(file,'r')
write_obj = open(write_file, 'w')
line = "dummy"
while line:
line = read_obj.read(1000)
write_obj.write(line )
write_obj.close()
with open(write_file, 'wb') as write_obj:
for line in read_obj:
write_obj.write(line)
return write_file
"""
mi<tg<open______<style-sheet\n
mi<tg<close_____<style-sheet\n
mi<tg<open-att__<footnote<num>1\n
mi<tg<empty-att_<page-definition<margin>33\n
mi<tg<empty_____<para\n
"""

View File

@ -24,38 +24,38 @@ class CheckBrackets:
self.__ob_count = 0
self.__cb_count = 0
self.__open_bracket_num = []
def open_brack(self, line):
num = line[-5:-1]
self.__open_bracket_num.append(num)
self.__bracket_count += 1
def close_brack(self, line):
num = line[-5:-1]
##self.__open_bracket_num.append(num)
try:
last_num = self.__open_bracket_num.pop()
except:
return 0
return False
if num != last_num:
return 0
return False
self.__bracket_count -= 1
return 1
return True
def check_brackets(self):
read_obj = open(self.__file, 'r')
line = 'dummy'
line_count = 0
while line:
line_count += 1
line = read_obj.readline()
self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack':
self.open_brack(line)
if self.__token_info == 'cb<nu<clos-brack':
right_count = self.close_brack(line)
if not right_count:
return (0, "closed bracket doesn't match, line %s" % line_count)
read_obj.close()
with open(self.__file, 'r') as read_obj:
for line in read_obj:
line_count += 1
self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack':
self.open_brack(line)
if self.__token_info == 'cb<nu<clos-brack':
if not self.close_brack(line):
return (False, "closed bracket doesn't match, line %s" % line_count)
if self.__bracket_count != 0:
msg = 'At end of file open and closed brackets don\'t match\n'
msg = msg + 'total number of brackets is %s' % self.__bracket_count
return (0, msg)
return (1, "brackets match!")
msg = ('At end of file open and closed brackets don\'t match\n' \
'total number of brackets is %s') % self.__bracket_count
return (False, msg)
return (True, "Brackets match!")

View File

@ -1,8 +1,11 @@
#!/usr/bin/env python
import sys
class CheckEncoding:
def __init__(self, bug_handler):
self.__bug_handler = bug_handler
def __get_position_error(self, line, encoding, line_num):
char_position = 0
for char in line:
@ -12,21 +15,23 @@ class CheckEncoding:
except UnicodeError, msg:
sys.stderr.write('line: %s char: %s\n' % (line_num, char_position))
sys.stderr.write(str(msg) + '\n')
def check_encoding(self, path, encoding='us-ascii'):
read_obj = open(path, 'r')
line_to_read = 1
def check_encoding(self, path, encoding='us-ascii', verbose=True):
line_num = 0
while line_to_read:
line_num += 1
line_to_read = read_obj.readline()
line = line_to_read
try:
line.decode(encoding)
except UnicodeError:
if len(line) < 1000:
self.__get_position_error(line, encoding, line_num)
else:
sys.stderr.write('line: %d has bad encoding\n'%line_num)
with open(path, 'r') as read_obj:
for line in read_obj:
line_num += 1
try:
line.decode(encoding)
except UnicodeError:
if verbose:
if len(line) < 1000:
self.__get_position_error(line, encoding, line_num)
else:
sys.stderr.write('line: %d has bad encoding\n' % line_num)
return True
return False
if __name__ == '__main__':
    # Script entry point: check the file named on the command line against
    # the default ('us-ascii') encoding.
    # CheckEncoding.__init__ takes a mandatory bug_handler argument; calling
    # it with no arguments raised TypeError before any checking was done.
    check_encoding_obj = CheckEncoding(bug_handler=Exception)
    check_encoding_obj.check_encoding(sys.argv[1])

View File

@ -16,7 +16,9 @@
# #
#########################################################################
import os, tempfile
from calibre.ebooks.rtf2xml import copy
class CombineBorders:
"""Combine borders in RTF tokens to make later processing easier"""
def __init__(self,
@ -32,28 +34,31 @@ class CombineBorders:
self.__state = 'default'
self.__bord_pos = 'default'
self.__bord_att = []
def found_bd(self, line):
#cw<bd<bor-t-r-vi
self.__state = 'border'
self.__bord_pos = line[6:16]
def __default_func(self, line):
#cw<bd<bor-t-r-vi
if self.__first_five == 'cw<bd':
self.found_bd(line)
return ''
return line
def end_border(self, line, write_obj):
joiner = "|"
border_string = joiner.join(self.__bord_att)
border_string = "|".join(self.__bord_att)
self.__bord_att = []
write_obj.write('cw<bd<%s<nu<%s\n' % (self.__bord_pos,
border_string))
border_string))
self.__state = 'default'
self.__bord_string = ''
if self.__first_five == 'cw<bd':
self. found_bd(line)
else:
write_obj.write(line)
def add_to_border_desc(self, line):
#cw<bt<bdr-hair__<nu<true
#cw<bt<bdr-linew<nu<0.50
@ -65,26 +70,22 @@ class CombineBorders:
else:
num = ':' + num
self.__bord_att.append(border_desc + num)
def __border_func(self, line, write_obj):
if self.__first_five != 'cw<bt':
self.end_border(line, write_obj)
else:
self.add_to_border_desc(line)
def combine_borders(self):
read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'w')
line_to_read = 'dummy'
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
self.__first_five = line[0:5]
if self.__state == 'border':
self.__border_func(line, write_obj)
else:
to_print = self.__default_func(line)
write_obj.write(to_print)
read_obj.close()
write_obj.close()
with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as write_obj:
for line in read_obj:
self.__first_five = line[0:5]
if self.__state == 'border':
self.__border_func(line, write_obj)
else:
write_obj.write(self.__default_func(line))
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "combine_borders.data")

View File

@ -1,6 +1,9 @@
import os, tempfile
from calibre.ebooks.rtf2xml import copy
import os, tempfile, sys
from calibre.ebooks.rtf2xml import copy, check_encoding
public_dtd = 'rtf2xml1.0.dtd'
class ConvertToTags:
"""
Convert file to XML
@ -10,6 +13,7 @@ class ConvertToTags:
bug_handler,
dtd_path,
no_dtd,
encoding,
indent = None,
copy = None,
run_level = 1,
@ -29,9 +33,14 @@ class ConvertToTags:
self.__copy = copy
self.__dtd_path = dtd_path
self.__no_dtd = no_dtd
if encoding != 'mac_roman':
self.__encoding = 'cp' + encoding
else:
self.__encoding = 'mac_roman'
self.__indent = indent
self.__run_level = run_level
self.__write_to = tempfile.mktemp()
def __initiate_values(self):
"""
Set values, including those for the dictionary.
@ -61,6 +70,7 @@ class ConvertToTags:
'tx<ut<__________' : self.__text_func,
'mi<tg<empty_____' : self.__empty_func,
}
def __open_func(self, line):
"""
Print the opening tag and newlines when needed.
@ -73,6 +83,7 @@ class ConvertToTags:
if info in self.__two_new_line:
self.__write_extra_new_line()
self.__write_obj.write('<%s>' % info)
def __empty_func(self, line):
"""
Print out empty tag and newlines when needed.
@ -85,10 +96,11 @@ class ConvertToTags:
self.__write_new_line()
if info in self.__two_new_line:
self.__write_extra_new_line()
def __open_att_func(self, line):
"""
Process lines for open tags that have attributes.
The important infor is between [17:-1]. Take this info and split it
The important info is between [17:-1]. Take this info and split it
with the delimiter '<'. The first token in this group is the element
name. The rest are attributes, separated from their values by '>'. So
read each token one at a time, and split them by '>'.
@ -119,6 +131,7 @@ class ConvertToTags:
self.__write_new_line()
if element_name in self.__two_new_line:
self.__write_extra_new_line()
def __empty_att_func(self, line):
"""
Same as the __open_att_func, except a '/' is placed at the end of the tag.
@ -143,6 +156,7 @@ class ConvertToTags:
self.__write_new_line()
if element_name in self.__two_new_line:
self.__write_extra_new_line()
def __close_func(self, line):
"""
Print out the closed tag and new lines, if appropriate.
@ -156,6 +170,7 @@ class ConvertToTags:
self.__write_new_line()
if info in self.__two_new_line:
self.__write_extra_new_line()
def __text_func(self, line):
"""
Simply print out the information between [17:-1]
@ -163,6 +178,7 @@ class ConvertToTags:
#tx<nu<__________<Normal;
# change this!
self.__write_obj.write(line[17:-1])
def __write_extra_new_line(self):
"""
Print out extra new lines if the new lines have not exceeded two. If
@ -172,8 +188,10 @@ class ConvertToTags:
return
if self.__new_line < 2:
self.__write_obj.write('\n')
def __default_func(self, line):
pass
def __write_new_line(self):
"""
Print out a new line if a new line has not already been printed out.
@ -183,11 +201,23 @@ class ConvertToTags:
if not self.__new_line:
self.__write_obj.write('\n')
self.__new_line += 1
def __write_dec(self):
"""
Write the XML declaration at the top of the document.
"""
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
#keep maximum compatibility with previous version
check_encoding_obj = check_encoding.CheckEncoding(
bug_handler=self.__bug_handler)
if not check_encoding_obj.check_encoding(self.__file, verbose=False):
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
self.__write_obj.write('<?xml version="1.0" encoding="%s" ?>' % self.__encoding)
else:
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
' hope for the best')
self.__new_line = 0
self.__write_new_line()
if self.__no_dtd:
@ -207,6 +237,7 @@ class ConvertToTags:
)
self.__new_line = 0
self.__write_new_line()
def convert_to_tags(self):
"""
Read in the file one line at a time. Get the important info, between
@ -222,18 +253,14 @@ class ConvertToTags:
an empty tag function.
"""
self.__initiate_values()
read_obj = open(self.__file, 'r')
self.__write_obj = open(self.__write_to, 'w')
self.__write_dec()
line_to_read = 1
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
self.__token_info = line[:16]
action = self.__state_dict.get(self.__token_info)
if action != None:
action(line)
read_obj.close()
with open(self.__file, 'r') as read_obj:
for line in read_obj:
self.__token_info = line[:16]
action = self.__state_dict.get(self.__token_info)
if action is not None:
action(line)
self.__write_obj.close()
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:

View File

@ -23,6 +23,7 @@ class Copy:
def __init__(self, bug_handler, file = None, deb_dir = None, ):
self.__file = file
self.__bug_handler = bug_handler
def set_dir(self, deb_dir):
"""Set the temporary directory to write files to"""
if deb_dir is None:
@ -33,19 +34,11 @@ class Copy:
message = "%(deb_dir)s is not a directory" % vars()
raise self.__bug_handler , message
Copy.__dir = deb_dir
def remove_files(self ):
"""Remove files from directory"""
self.__remove_the_files(Copy.__dir)
"""
list_of_files = os.listdir(Copy.__dir)
list_of_files = os.listdir(the_dir)
for file in list_of_files:
rem_file = os.path.join(Copy.__dir,file)
if os.path.isdir(rem_file):
self.remove_files(rem_file)
else:
os.remove(rem_file)
"""
def __remove_the_files(self, the_dir):
"""Remove files from directory"""
list_of_files = os.listdir(the_dir)
@ -58,6 +51,7 @@ class Copy:
os.remove(rem_file)
except OSError:
pass
def copy_file(self, file, new_file):
"""
Copy the file to a new name

View File

@ -1,61 +1,142 @@
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
#########################################################################
'''
Codepages as to RTF 1.9.1:
437 United States IBM
708 Arabic (ASMO 708)
709 Arabic (ASMO 449+, BCON V4)
710 Arabic (transparent Arabic)
711 Arabic (Nafitha Enhanced)
720 Arabic (transparent ASMO)
819 Windows 3.1 (United States and Western Europe)
850 IBM multilingual
852 Eastern European
860 Portuguese
862 Hebrew
863 French Canadian
864 Arabic
865 Norwegian
866 Soviet Union
874 Thai
932 Japanese
936 Simplified Chinese
949 Korean
950 Traditional Chinese
1250 Eastern European
1251 Cyrillic
1252 Western European
1253 Greek
1254 Turkish
1255 Hebrew
1256 Arabic
1257 Baltic
1258 Vietnamese
1361 Johab
10000 MAC Roman
10001 MAC Japan
10004 MAC Arabic
10005 MAC Hebrew
10006 MAC Greek
10007 MAC Cyrillic
10029 MAC Latin2
10081 MAC Turkish
57002 Devanagari
57003 Bengali
57004 Tamil
57005 Telugu
57006 Assamese
57007 Oriya
57008 Kannada
57009 Malayalam
57010 Gujarati
57011 Punjabi
'''
import re
class DefaultEncoding:
"""
Find the default encoding for the doc
"""
def __init__(self, in_file, bug_handler, run_level = 1,):
"""
Required:
'file'
Returns:
nothing
"""
def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False):
self.__file = in_file
self.__bug_handler = bug_handler
self.__platform = 'Windows'
self.__default_num = 'not-defined'
self.__code_page = '1252'
self.__datafetched = False
self.__fetchraw = check_raw
def find_default_encoding(self):
platform = 'Windows'
default_num = 'not-defined'
code_page = 'ansicpg1252'
read_obj = open(self.__file, 'r')
line_to_read = 1
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
self.__token_info = line[:16]
if self.__token_info == 'mi<mk<rtfhed-end':
break
if self.__token_info == 'cw<ri<ansi-codpg':
#cw<ri<ansi-codpg<nu<10000
num = line[20:-1]
if not num:
num = '1252'
code_page = 'ansicpg' + num
if self.__token_info == 'cw<ri<macintosh_':
platform = 'Macintosh'
if self.__token_info == 'cw<ri<deflt-font':
default_num = line[20:-1]
#cw<ri<deflt-font<nu<0
#action = self.__state_dict.get(self.__state)
#if action == None:
#print self.__state
#action(line)
read_obj.close()
if platform == 'Macintosh':
code_page = 'mac_roman'
return platform, code_page, default_num
if not self.__datafetched:
self._encoding()
self.__datafetched = True
if self.__platform == 'Macintosh':
code_page = self.__code_page
else:
code_page = 'ansicpg' + self.__code_page
return self.__platform, code_page, self.__default_num
def get_codepage(self):
if not self.__datafetched:
self._encoding()
self.__datafetched = True
return self.__code_page
def get_platform(self):
if not self.__datafetched:
self._encoding()
self.__datafetched = True
return self.__platform
def _encoding(self):
# Read self.__file line by line and update the platform, code page and
# default font number attributes from what is found there.
with open(self.__file, 'r') as read_obj:
# Token mode (check_raw was False): the first 16 characters of each line
# are compared against token names.
if not self.__fetchraw:
for line in read_obj:
self.__token_info = line[:16]
# 'mi<mk<rtfhed-end' stops the scan (presumably the end of the RTF header).
if self.__token_info == 'mi<mk<rtfhed-end':
break
# Keep the code page number unless it is 0, in which case '1252' is used.
if self.__token_info == 'cw<ri<ansi-codpg':
#cw<ri<ansi-codpg<nu<10000
self.__code_page = line[20:-1] if int(line[20:-1]) \
else '1252'
# Platform tokens set both the platform name and a matching code page.
if self.__token_info == 'cw<ri<macintosh_':
self.__platform = 'Macintosh'
self.__code_page = 'mac_roman'
elif self.__token_info == 'cw<ri<pc________':
self.__platform = 'IBMPC'
self.__code_page = '437'
elif self.__token_info == 'cw<ri<pca_______':
self.__platform = 'OS/2'
self.__code_page = '850'
if self.__token_info == 'cw<ri<deflt-font':
self.__default_num = line[20:-1]
#cw<ri<deflt-font<nu<0
else:
# Raw mode (check_raw was True): search each line of the file with
# regular expressions for \ansicpgN and for \mac, \pc, \ansi or \pca.
fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
for line in read_obj:
if fenccp.search(line):
cp = fenccp.search(line).group(1)
# NOTE(review): cp is stored only when int(cp) == 0, whereas the token
# branch above keeps any non-zero value and maps 0 to '1252'. The test
# looks inverted -- confirm the intent (and how callers build a codec
# name from the result) before changing it.
if not int(cp):
self.__code_page = cp
break
# \mac, \pc and \pca each select a code page; a matched \ansi leaves the
# current value unchanged. The platform attribute is not touched here.
if fenc.search(line):
enc = fenc.search(line).group(1)
if enc == 'mac':
self.__code_page = 'mac_roman'
elif enc == 'pc':
self.__code_page = '437'
elif enc == 'pca':
self.__code_page = '850'
# if __name__ == '__main__':
# encode_obj = DefaultEncoding(
# in_file = sys.argv[1],
# bug_handler = Exception,
# check_raw = True,
# )
# print encode_obj.get_codepage()

View File

@ -16,7 +16,9 @@
# #
#########################################################################
import sys, os, tempfile
from calibre.ebooks.rtf2xml import copy
class DeleteInfo:
"""Delete unnecessary destination groups"""
def __init__(self,
@ -29,17 +31,18 @@ class DeleteInfo:
self.__bug_handler = bug_handler
self.__copy = copy
self.__write_to = tempfile.mktemp()
self.__bracket_count=0
self.__bracket_count= 0
self.__ob_count = 0
self.__cb_count = 0
self.__after_asterisk = 0
self.__delete = 0
# self.__after_asterisk = False
# self.__delete = 0
self.__initiate_allow()
self.__ob = 0
self.__write_cb = 0
self.__write_cb = False
self.__run_level = run_level
self.__found_delete = 0
self.__list = 0
self.__found_delete = False
# self.__list = False
def __initiate_allow(self):
"""
Initiate a list of destination groups which should be printed out.
@ -66,9 +69,10 @@ class DeleteInfo:
self.__state_dict = {
'default' : self.__default_func,
'after_asterisk' : self.__asterisk_func,
'delete' : self.__delete_func,
'delete' : self.__delete_func,
'list' : self.__list_func,
}
def __default_func(self,line):
"""Handle lines when in no special state. Look for an asterisk to
begin a special state. Otherwise, print out line."""
@ -81,27 +85,29 @@ class DeleteInfo:
if self.__ob:
self.__write_obj.write(self.__ob)
self.__ob = line
return 0
return False
else:
# write previous bracket, since didn't find asterisk
if self.__ob:
self.__write_obj.write(self.__ob)
self.__ob = 0
return 1
return True
def __delete_func(self,line):
"""Handle lines when in delete state. Don't print out lines
unless the state has ended."""
if self.__delete_count == self.__cb_count:
self.__state = 'default'
if self.__write_cb:
self.__write_cb = 0
return 1
return 0
self.__write_cb = True
return True
return False
def __asterisk_func(self,line):
"""
Determine whether to delete info in group
Note on self.__cb flag.
If you find that you are in a delete group, and the preivous
If you find that you are in a delete group, and the previous
token is not an open bracket (self.__ob = 0), that means
that the delete group is nested inside another acceptable
destination group. In this case, you have already written
@ -110,21 +116,21 @@ class DeleteInfo:
"""
# Test for {\*}, in which case don't enter
# delete state
self.__after_asterisk = 0 # only enter this function once
self.__found_delete = 1
# self.__after_asterisk = False # only enter this function once
self.__found_delete = True
if self.__token_info == 'cb<nu<clos-brack':
if self.__delete_count == self.__cb_count:
self.__state = 'default'
self.__ob = 0
# changed this because haven't printed out start
return 0
return False
else:
# not sure what happens here!
# believe I have a '{\*}
if self.__run_level > 3:
msg = 'flag problem\n'
raise self.__bug_handler, msg
return 1
return True
elif self.__token_info in self.__allowable :
if self.__ob:
self.__write_obj.write(self.__ob)
@ -132,85 +138,81 @@ class DeleteInfo:
self.__state = 'default'
else:
pass
return 1
return True
elif self.__token_info == 'cw<ls<list______':
self.__ob = 0
self.__found_list_func(line)
elif self.__token_info in self.__not_allowable:
if not self.__ob:
self.__write_cb = 1
self.__write_cb = True
self.__ob = 0
self.__state = 'delete'
self.__cb_count = 0
return 0
return False
else:
if self.__run_level > 5:
msg = 'After an asterisk, and found neither an allowable or non-allowble token\n'
msg += 'token is "%s"\n' % self.__token_info
raise self.__bug_handler
msg = ('After an asterisk, and found neither an allowable or non-allowable token\n\
token is "%s"\n') % self.__token_info
raise self.__bug_handler, msg
if not self.__ob:
self.__write_cb = 1
self.__write_cb = True
self.__ob = 0
self.__state = 'delete'
self.__cb_count = 0
return 0
return False
def __found_list_func(self, line):
"""
print out control words in this group
"""
self.__state = 'list'
def __list_func(self, line):
"""
Check to see if the group has ended.
Return 1 for all control words.
Return 0 otherwise.
Return True for all control words.
Return False otherwise.
"""
if self.__delete_count == self.__cb_count and self.__token_info ==\
'cb<nu<clos-brack':
self.__state = 'default'
if self.__write_cb:
self.__write_cb = 0
return 1
return 0
self.__write_cb = False
return True
return False
elif line[0:2] == 'cw':
return 1
return True
else:
return 0
return False
def delete_info(self):
"""Main method for handling other methods. Read one line in at
a time, and determine wheter to print the line based on the state."""
line_to_read = 'dummy'
read_obj = open(self.__file, 'r')
self.__write_obj = open(self.__write_to, 'w')
while line_to_read:
#ob<nu<open-brack<0001
to_print =1
line_to_read = read_obj.readline()
line = line_to_read
self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack':
self.__ob_count = line[-5:-1]
if self.__token_info == 'cb<nu<clos-brack':
self.__cb_count = line[-5:-1]
action = self.__state_dict.get(self.__state)
if not action:
sys.stderr.write('No action in dictionary state is "%s" \n'
% self.__state)
to_print = action(line)
"""
if self.__after_asterisk:
to_print = self.__asterisk_func(line)
elif self.__list:
self.__in_list_func(line)
elif self.__delete:
to_print = self.__delete_func(line)
else:
to_print = self.__default_func(line)
"""
if to_print:
self.__write_obj.write(line)
self.__write_obj.close()
read_obj.close()
a time, and determine whether to print the line based on the state."""
with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as self.__write_obj:
for line in read_obj:
#ob<nu<open-brack<0001
to_print = True
self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack':
self.__ob_count = line[-5:-1]
if self.__token_info == 'cb<nu<clos-brack':
self.__cb_count = line[-5:-1]
action = self.__state_dict.get(self.__state)
if not action:
sys.stderr.write(_('No action in dictionary state is "%s" \n')
% self.__state)
to_print = action(line)
# if self.__after_asterisk:
# to_print = self.__asterisk_func(line)
# elif self.__list:
# self.__in_list_func(line)
# elif self.__delete:
# to_print = self.__delete_func(line)
# else:
# to_print = self.__default_func(line)
if to_print:
self.__write_obj.write(line)
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "delete_info.data")

View File

@ -16,7 +16,9 @@
# #
#########################################################################
import os, tempfile
from calibre.ebooks.rtf2xml import copy
class Footnote:
"""
Two public methods are available. The first separates all of the
@ -35,6 +37,7 @@ class Footnote:
self.__copy = copy
self.__write_to = tempfile.mktemp()
self.__found_a_footnote = 0
def __first_line_func(self, line):
"""
Print the tag info for footnotes. Check whether footnote is an
@ -47,6 +50,7 @@ class Footnote:
self.__write_to_foot_obj.write(
'mi<tg<open-att__<footnote<num>%s\n' % self.__footnote_count)
self.__first_line = 0
def __in_footnote_func(self, line):
"""Handle all tokens that are part of footnote"""
if self.__first_line:
@ -68,6 +72,7 @@ class Footnote:
'mi<mk<footnt-clo\n')
else:
self.__write_to_foot_obj.write(line)
def __found_footnote(self, line):
""" Found a footnote"""
self.__found_a_footnote = 1
@ -81,6 +86,7 @@ class Footnote:
'mi<mk<footnt-ind<%04d\n' % self.__footnote_count)
self.__write_to_foot_obj.write(
'mi<mk<footnt-ope<%04d\n' % self.__footnote_count)
def __default_sep(self, line):
"""Handle all tokens that are not footnote tokens"""
if self.__token_info == 'cw<nt<footnote__':
@ -91,6 +97,7 @@ class Footnote:
self.__write_obj.write(
'tx<nu<__________<%s\n' % num
)
def __initiate_sep_values(self):
"""
initiate counters for separate_footnotes method.
@ -102,6 +109,7 @@ class Footnote:
self.__in_footnote = 0
self.__first_line = 0 #have not processed the first line of footnote
self.__footnote_count = 0
def separate_footnotes(self):
"""
Separate all the footnotes in an RTF file and put them at the bottom,
@ -111,58 +119,50 @@ class Footnote:
bottom of the main file.
"""
self.__initiate_sep_values()
read_obj = open(self.__file)
self.__write_obj = open(self.__write_to, 'w')
self.__footnote_holder = tempfile.mktemp()
self.__write_to_foot_obj = open(self.__footnote_holder, 'w')
line_to_read = 1
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
self.__token_info = line[:16]
# keep track of opening and closing brackets
if self.__token_info == 'ob<nu<open-brack':
self.__ob_count = line[-5:-1]
if self.__token_info == 'cb<nu<clos-brack':
self.__cb_count = line[-5:-1]
# In the middle of footnote text
if self.__in_footnote:
self.__in_footnote_func(line)
# not in the middle of footnote text
else:
self.__default_sep(line)
self.__write_obj.close()
read_obj.close()
self.__write_to_foot_obj.close()
read_obj = open(self.__footnote_holder, 'r')
write_obj = open(self.__write_to, 'a')
write_obj.write(
'mi<mk<sect-close\n'
'mi<mk<body-close\n'
'mi<tg<close_____<section\n'
'mi<tg<close_____<body\n'
'mi<tg<close_____<doc\n'
'mi<mk<footnt-beg\n')
line = 1
while line:
line = read_obj.readline()
write_obj.write(line)
write_obj.write(
'mi<mk<footnt-end\n')
read_obj.close()
write_obj.close()
with open(self.__file) as read_obj:
with open(self.__write_to, 'w') as self.__write_obj:
with open(self.__footnote_holder, 'w') as self.__write_to_foot_obj:
for line in read_obj:
self.__token_info = line[:16]
# keep track of opening and closing brackets
if self.__token_info == 'ob<nu<open-brack':
self.__ob_count = line[-5:-1]
if self.__token_info == 'cb<nu<clos-brack':
self.__cb_count = line[-5:-1]
# In the middle of footnote text
if self.__in_footnote:
self.__in_footnote_func(line)
# not in the middle of footnote text
else:
self.__default_sep(line)
with open(self.__footnote_holder, 'r') as read_obj:
with open(self.__write_to, 'a') as write_obj:
write_obj.write(
'mi<mk<sect-close\n'
'mi<mk<body-close\n'
'mi<tg<close_____<section\n'
'mi<tg<close_____<body\n'
'mi<tg<close_____<doc\n'
'mi<mk<footnt-beg\n')
for line in read_obj:
write_obj.write(line)
write_obj.write(
'mi<mk<footnt-end\n')
os.remove(self.__footnote_holder)
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "footnote_separate.data")
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)
def update_info(self, file, copy):
    """
    Store a new file and copy setting on the instance.

    Marked as unused by its original author.
    """
    self.__copy = copy
    self.__file = file
def __get_foot_body_func(self, line):
"""
Process lines in main body and look for beginning of footnotes.
@ -172,6 +172,7 @@ class Footnote:
self.__state = 'foot'
else:
self.__write_obj.write(line)
def __get_foot_foot_func(self, line):
"""
Copy footnotes from bottom of file to a separate, temporary file.
@ -180,6 +181,7 @@ class Footnote:
self.__state = 'body'
else:
self.__write_to_foot_obj.write(line)
def __get_footnotes(self):
"""
Private method to remove footnotes from main file. Read one line from
@ -188,21 +190,16 @@ class Footnote:
These two functions do the work of separating the footnotes from the
body.
"""
read_obj = open(self.__file)
self.__write_obj = open(self.__write_to, 'w')
# self.__write_to = "footnote_info.data"
self.__write_to_foot_obj = open(self.__footnote_holder, 'w')
line = 1
while line:
line = read_obj.readline()
self.__token_info = line[:16]
if self.__state == 'body':
self.__get_foot_body_func(line)
elif self.__state == 'foot':
self.__get_foot_foot_func(line)
read_obj.close()
self.__write_obj.close()
self.__write_to_foot_obj.close()
with open(self.__file) as read_obj:
with open(self.__write_to, 'w') as self.__write_obj:
with open(self.__footnote_holder, 'w') as self.__write_to_foot_obj:
for line in read_obj:
self.__token_info = line[:16]
if self.__state == 'body':
self.__get_foot_body_func(line)
elif self.__state == 'foot':
self.__get_foot_foot_func(line)
def __get_foot_from_temp(self, num):
"""
Private method for joining footnotes to body. This method reads from
@ -213,9 +210,7 @@ class Footnote:
look_for = 'mi<mk<footnt-ope<' + num + '\n'
found_foot = 0
string_to_return = ''
line = 1
while line:
line = self.__read_from_foot_obj.readline()
for line in self.__read_from_foot_obj:
if found_foot:
if line == 'mi<mk<footnt-clo\n':
return string_to_return
@ -223,6 +218,7 @@ class Footnote:
else:
if line == look_for:
found_foot = 1
def __join_from_temp(self):
"""
Private method for rejoining footnotes to body. Read from the
@ -232,16 +228,14 @@ class Footnote:
print out to the third file.
If no footnote marker is found, simply print out the token (line).
"""
self.__read_from_foot_obj = open(self.__footnote_holder, 'r')
read_obj = open(self.__write_to, 'r')
self.__write_obj = open(self.__write_to2, 'w')
line = 1
while line:
line = read_obj.readline()
if line[:16] == 'mi<mk<footnt-ind':
line = self.__get_foot_from_temp(line[17:-1])
self.__write_obj.write(line)
read_obj.close()
with open(self.__footnote_holder, 'r') as self.__read_from_foot_obj:
with open(self.__write_to, 'r') as read_obj:
with open(self.__write_to2, 'w') as self.__write_obj:
for line in read_obj:
if line[:16] == 'mi<mk<footnt-ind':
line = self.__get_foot_from_temp(line[17:-1])
self.__write_obj.write(line)
def join_footnotes(self):
"""
Join the footnotes from the bottom of the file and put them in their
@ -258,8 +252,8 @@ class Footnote:
self.__state = 'body'
self.__get_footnotes()
self.__join_from_temp()
self.__write_obj.close()
self.__read_from_foot_obj.close()
# self.__write_obj.close()
# self.__read_from_foot_obj.close()
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to2, "footnote_joined.data")

View File

@ -43,27 +43,28 @@ class GetCharMap:
def get_char_map(self, map):
    """
    Build a lookup dictionary for one character set.

    The character file is scanned from its beginning for the section
    that starts at the line containing ``<map>`` and ends at the line
    containing ``</map>``. Each non-blank line inside the section is
    split on ':'; the second field (with the escape ``\\colon`` turned
    back into ':') is the key and the fourth field is the value.

    map -- name of the character set. 'ansicpg0' is read as
    'ansicpg1250'; 'ansicpg10000' and '10000' are read as 'mac_roman'.

    Returns the dictionary built from the section.
    Raises the bug handler exception when the opening tag is not found.
    """
    if map == 'ansicpg0':
        map = 'ansicpg1250'
    if map in ('ansicpg10000', '10000'):
        map = 'mac_roman'
    # the tags depend only on the map name, so build them once
    begin_element = '<%s>' % map
    end_element = '</%s>' % map
    found_map = False
    map_dict = {}
    self.__char_file.seek(0)
    for line in self.__char_file:
        if not line.strip():
            continue
        if not found_map:
            if begin_element in line:
                found_map = True
            continue
        if end_element in line:
            break
        fields = line.split(':')
        # str.replace returns a new string; the result used to be thrown
        # away, which left the escaped form in the key
        key = fields[1].replace('\\colon', ':')
        map_dict[key] = fields[3]
    if not found_map:
        msg = 'no map found\nmap is "%s"\n' % (map,)
        raise self.__bug_handler(msg)
    return map_dict

View File

@ -54,10 +54,10 @@ class Hex2Utf8:
'convert_to_caps'--whether to convert caps to utf-8
Returns:
nothing
"""
"""
self.__file = in_file
self.__copy = copy
if area_to_convert != 'preamble' and area_to_convert != 'body':
if area_to_convert not in ('preamble', 'body'):
msg = (
'Developer error! Wrong flag.\n'
'in module "hex_2_utf8.py\n'
@ -79,7 +79,8 @@ class Hex2Utf8:
self.__write_to = tempfile.mktemp()
self.__bug_handler = bug_handler
self.__invalid_rtf_handler = invalid_rtf_handler
def update_values( self,
def update_values(self,
file,
area_to_convert,
char_file,
@ -132,6 +133,7 @@ class Hex2Utf8:
# self.__convert_symbol = 0
# self.__convert_wingdings = 0
# self.__convert_zapf = 0
def __initiate_values(self):
"""
Required:
@ -191,6 +193,7 @@ class Hex2Utf8:
'body' : self.__body_func,
'mi<mk<body-open_' : self.__found_body_func,
'tx<hx<__________' : self.__hex_text_func,
# 'tx<nu<__________' : self.__text_func,
}
self.__body_state_dict = {
'preamble' : self.__preamble_for_body_func,
@ -209,6 +212,7 @@ class Hex2Utf8:
}
self.__caps_list = ['false']
self.__font_list = ['not-defined']
def __hex_text_func(self, line):
"""
Required:
@ -218,12 +222,12 @@ class Hex2Utf8:
token is in the dictionary, then check if the value starts with a
"&". If it does, then tag the result as utf text. Otherwise, tag it
as normal text.
If the nex_num is not in the dictionary, then a mistake has been
If the hex_num is not in the dictionary, then a mistake has been
made.
"""
hex_num = line[17:-1]
converted = self.__current_dict.get(hex_num)
if converted != None:
if converted is not None:
# tag as utf-8
if converted[0:1] == "&":
font = self.__current_dict_name
@ -263,42 +267,43 @@ class Hex2Utf8:
# msg += 'dictionary is %s\n' % self.__current_dict_name
msg = 'Character "&#x%s;" does not appear to be valid (or is a control character)\n' % token
raise self.__bug_handler, msg
def __found_body_func(self, line):
self.__state = 'body'
self.__write_obj.write(line)
def __body_func(self, line):
"""
When parsing preamble
"""
self.__write_obj.write(line)
def __preamble_func(self, line):
action = self.__preamble_state_dict.get(self.__token_info)
if action != None:
if action is not None:
action(line)
else:
self.__write_obj.write(line)
def __convert_preamble(self):
    """
    Run every line of the input file through the preamble state machine.

    Each line's first 16 characters become the current token and the
    handler registered for the current state processes the line, writing
    to a temporary file. Afterwards the temporary file is optionally
    copied as "preamble_utf_convert.data" (when the copy flag is set),
    handed to ``copy_obj.rename`` to replace the input file, and removed.
    """
    self.__state = 'preamble'
    with open(self.__file, 'r') as read_obj:
        # the handlers write through self.__write_obj; the context
        # manager closes it even when a handler raises
        with open(self.__write_to, 'w') as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                action = self.__preamble_state_dict.get(self.__state)
                if action is None:
                    # write() takes a single string, and there is no
                    # handler to call: report the state and skip the line
                    sys.stderr.write(
                        _('error no state found in hex_2_utf8')
                        + ' "%s"\n' % self.__state)
                    continue
                action(line)
    copy_obj = copy.Copy(bug_handler = self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "preamble_utf_convert.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def __preamble_for_body_func(self, line):
"""
Required:
@ -311,6 +316,7 @@ class Hex2Utf8:
if self.__token_info == 'mi<mk<body-open_':
self.__found_body_func(line)
self.__write_obj.write(line)
def __body_for_body_func(self, line):
"""
Required:
@ -321,10 +327,11 @@ class Hex2Utf8:
Used when parsing the body.
"""
action = self.__in_body_dict.get(self.__token_info)
if action != None:
if action is not None:
action(line)
else:
self.__write_obj.write(line)
def __start_font_func(self, line):
"""
Required:
@ -348,6 +355,7 @@ class Hex2Utf8:
else:
self.__current_dict_name = 'default'
self.__current_dict = self.__def_dict
def __end_font_func(self, line):
"""
Required:
@ -376,6 +384,7 @@ class Hex2Utf8:
else:
self.__current_dict_name = 'default'
self.__current_dict = self.__def_dict
def __start_special_font_func_old(self, line):
"""
Required:
@ -398,6 +407,7 @@ class Hex2Utf8:
self.__current_dict.append(self.__dingbats_dict)
self.__special_fonts_found += 1
self.__current_dict_name = 'Zapf Dingbats'
def __end_special_font_func(self, line):
"""
Required:
@ -416,6 +426,7 @@ class Hex2Utf8:
self.__current_dict.pop()
self.__special_fonts_found -= 1
self.__dict_name = 'default'
def __start_caps_func_old(self, line):
"""
Required:
@ -427,6 +438,7 @@ class Hex2Utf8:
self.__in_caps to 1
"""
self.__in_caps = 1
def __start_caps_func(self, line):
"""
Required:
@ -440,6 +452,7 @@ class Hex2Utf8:
self.__in_caps = 1
value = line[17:-1]
self.__caps_list.append(value)
def __end_caps_func(self, line):
"""
Required:
@ -455,7 +468,8 @@ class Hex2Utf8:
else:
sys.stderr.write('Module is hex_2_utf8\n')
sys.stderr.write('method is __end_caps_func\n')
sys.stderr.write('caps list should be more than one?\n')
sys.stderr.write('caps list should be more than one?\n') #self.__in_caps not set
def __text_func(self, line):
"""
Required:
@ -466,9 +480,8 @@ class Hex2Utf8:
if in caps, convert. Otherwise, print out.
"""
text = line[17:-1]
if self.__current_dict_name == 'Symbol'\
or self.__current_dict_name == 'Wingdings'\
or self.__current_dict_name == 'Zapf Dingbats':
# print line
if self.__current_dict_name in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
the_string = ''
for letter in text:
hex_num = hex(ord(letter))
@ -477,21 +490,21 @@ class Hex2Utf8:
hex_num = hex_num[2:]
hex_num = '\'%s' % hex_num
converted = self.__current_dict.get(hex_num)
if converted == None:
if converted is None:
sys.stderr.write('module is hex_2_ut8\n')
sys.stderr.write('method is __text_func\n')
sys.stderr.write('no hex value for "%s"\n' % hex_num)
else:
the_string += converted
self.__write_obj.write('tx<nu<__________<%s\n' % the_string)
# print the_string
else:
if self.__caps_list[-1] == 'true' \
and self.__convert_caps\
and self.__current_dict_name != 'Symbol'\
and self.__current_dict_name != 'Wingdings'\
and self.__current_dict_name != 'Zapf Dingbats':
and self.__current_dict_name not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
text = text.upper()
self.__write_obj.write('tx<nu<__________<%s\n' % text)
def __utf_to_caps_func(self, line):
"""
Required:
@ -506,6 +519,7 @@ class Hex2Utf8:
# utf_text = utf_text.upper()
utf_text = self.__utf_token_to_caps_func(utf_text)
self.__write_obj.write('tx<ut<__________<%s\n' % utf_text)
def __utf_token_to_caps_func(self, char_entity):
"""
Required:
@ -530,28 +544,26 @@ class Hex2Utf8:
return char_entity
else:
return converted
def __convert_body(self):
    """
    Run every line of the input file through the body state machine.

    Each line's first 16 characters become the current token and the
    handler registered for the current state processes the line, writing
    to a temporary file. Afterwards the temporary file is optionally
    copied as "body_utf_convert.data" (when the copy flag is set), handed
    to ``copy_obj.rename`` to replace the input file, and removed.
    """
    self.__state = 'body'
    with open(self.__file, 'r') as read_obj:
        # the handlers write through self.__write_obj; the context
        # manager closes it even when a handler raises
        with open(self.__write_to, 'w') as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                action = self.__body_state_dict.get(self.__state)
                if action is None:
                    # write() takes a single string, and there is no
                    # handler to call: report the state and skip the line
                    sys.stderr.write(
                        'error no state found in hex_2_utf8 "%s"\n'
                        % self.__state)
                    continue
                action(line)
    copy_obj = copy.Copy(bug_handler = self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "body_utf_convert.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def convert_hex_2_utf8(self):
self.__initiate_values()
if self.__area_to_convert == 'preamble':

View File

@ -1,5 +1,7 @@
import sys, os, tempfile
from calibre.ebooks.rtf2xml import copy
"""
States.
1. default
@ -36,6 +38,7 @@ class Inline:
self.__copy = copy
self.__run_level = run_level
self.__write_to = tempfile.mktemp()
def __initiate_values(self):
"""
Initiate all values.
@ -51,7 +54,6 @@ class Inline:
'tx<ut<__________' : self.__found_text_func,
'mi<mk<inline-fld' : self.__found_text_func,
'text' : self.__found_text_func,
'cw<nu<hard-lineb' : self.__found_text_func, #calibre
'cb<nu<clos-brack' : self.__close_bracket_func,
'mi<mk<par-end___' : self.__end_para_func,
'mi<mk<footnt-ope' : self.__end_para_func,
@ -63,7 +65,6 @@ class Inline:
'tx<hx<__________' : self.__found_text_func,
'tx<ut<__________' : self.__found_text_func,
'text' : self.__found_text_func,
'cw<nu<hard-lineb' : self.__found_text_func, #calibre
'mi<mk<inline-fld' : self.__found_text_func,
'ob<nu<open-brack': self.__found_open_bracket_func,
'mi<mk<par-end___' : self.__end_para_func,
@ -83,12 +84,12 @@ class Inline:
self.__in_para = 0 # not in paragraph
self.__char_dict = {
# character info => ci
'annotation' : 'annotation',
'annotation' : 'annotation',
'blue______' : 'blue',
'bold______' : 'bold',
'caps______' : 'caps',
'char-style' : 'character-style',
'dbl-strike' : 'double-strike-through',
'caps______' : 'caps',
'char-style' : 'character-style',
'dbl-strike' : 'double-strike-through',
'emboss____' : 'emboss',
'engrave___' : 'engrave',
'font-color' : 'font-color',
@ -96,7 +97,7 @@ class Inline:
'font-size_' : 'font-size',
'font-style' : 'font-style',
'font-up___' : 'superscript',
'footnot-mk' : 'footnote-marker',
'footnot-mk' : 'footnote-marker',
'green_____' : 'green',
'hidden____' : 'hidden',
'italics___' : 'italics',
@ -107,9 +108,10 @@ class Inline:
'strike-thr' : 'strike-through',
'subscript_' : 'subscript',
'superscrip' : 'superscript',
'underlined' : 'underlined',
'underlined' : 'underlined',
}
self.__caps_list = ['false']
def __set_list_func(self, line):
"""
Requires:
@ -128,6 +130,7 @@ class Inline:
self.__place = 'in_list'
self.__inline_list = self.__list_inline_list
self.__groups_in_waiting = self.__groups_in_waiting_list
def __default_func(self, line):
"""
Requires:
@ -140,8 +143,8 @@ class Inline:
action = self.__default_dict.get(self.__token_info)
if action:
action(line)
if self.__token_info != 'cw<nu<hard-lineb': #calibre
self.__write_obj.write(line)
self.__write_obj.write(line)
def __found_open_bracket_func(self, line):
"""
Requires:
@ -156,6 +159,7 @@ class Inline:
self.__groups_in_waiting[0] += 1
self.__inline_list.append({})
self.__inline_list[-1]['contains_inline'] = 0
def __after_open_bracket_func(self, line):
"""
Requires:
@ -176,6 +180,7 @@ class Inline:
self.__state = 'default' # a non control word?
action(line)
self.__write_obj.write(line)
def __handle_control_word(self, line):
"""
Required:
@ -206,6 +211,7 @@ class Inline:
elif char_value == 'Zapf Dingbats':
self.__write_obj.write('mi<mk<font-dingb\n')
"""
def __close_bracket_func(self, line):
"""
Requires:
@ -244,6 +250,7 @@ class Inline:
self.__inline_list.pop()
if self.__groups_in_waiting[0] != 0:
self.__groups_in_waiting[0] -= 1
def __found_text_func(self, line):
"""
Required:
@ -257,7 +264,6 @@ class Inline:
Text can mark the start of a paragraph.
If already in a paragraph, check to see if any groups are waiting
to be added. If so, use another method to write these groups.
3. If not check if hardline break, then write
"""
if self.__place == 'in_list':
self.__write_inline()
@ -265,12 +271,9 @@ class Inline:
if not self.__in_para:
self.__in_para = 1
self.__start_para_func(line)
else:
if self.__token_info == 'cw<nu<hard-lineb': #calibre
self.__write_obj.write('mi<tg<empty_____<hardline-break\n')
if self.__groups_in_waiting[0] != 0:
elif self.__groups_in_waiting[0] != 0:
self.__write_inline()
def __write_inline(self):
"""
Required:
@ -314,6 +317,7 @@ class Inline:
self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
self.__write_obj.write('\n')
self.__groups_in_waiting[0] = 0
def __end_para_func(self, line):
"""
Requires:
@ -342,6 +346,7 @@ class Inline:
self.__write_obj.write('mi<mk<caps-end__\n')
self.__write_obj.write('mi<tg<close_____<inline\n')
self.__in_para = 0
def __start_para_func(self, line):
"""
Requires:
@ -369,12 +374,14 @@ class Inline:
self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
self.__write_obj.write('\n')
self.__groups_in_waiting[0] = 0
def __found_field_func(self, line):
"""
Just a default function to make sure I don't prematurely exit
default state
"""
pass
def form_tags(self):
"""
Requires:
@ -386,32 +393,27 @@ class Inline:
the state.
"""
self.__initiate_values()
read_obj = open(self.__file, 'r')
self.__write_obj = open(self.__write_to, 'w')
line_to_read = 1
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
token = line[0:-1]
self.__token_info = ''
if token == 'tx<mc<__________<rdblquote'\
or token == 'tx<mc<__________<ldblquote'\
or token == 'tx<mc<__________<lquote'\
or token == 'tx<mc<__________<rquote'\
or token == 'tx<mc<__________<emdash'\
or token == 'tx<mc<__________<endash'\
or token == 'tx<mc<__________<bullet':
self.__token_info = 'text'
else:
self.__token_info = line[:16]
self.__set_list_func(line)
action = self.__state_dict.get(self.__state)
if action == None:
sys.stderr.write('No matching state in module inline_for_lists.py\n')
sys.stderr.write(self.__state + '\n')
action(line)
read_obj.close()
self.__write_obj.close()
with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as self.__write_obj:
for line in read_obj:
token = line[0:-1]
self.__token_info = ''
if token == 'tx<mc<__________<rdblquote'\
or token == 'tx<mc<__________<ldblquote'\
or token == 'tx<mc<__________<lquote'\
or token == 'tx<mc<__________<rquote'\
or token == 'tx<mc<__________<emdash'\
or token == 'tx<mc<__________<endash'\
or token == 'tx<mc<__________<bullet':
self.__token_info = 'text'
else:
self.__token_info = line[:16]
self.__set_list_func(line)
action = self.__state_dict.get(self.__state)
if action is None:
sys.stderr.write('No matching state in module inline_for_lists.py\n')
sys.stderr.write(self.__state + '\n')
action(line)
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "inline.data")

View File

@ -15,8 +15,11 @@
# #
# #
#########################################################################
import os, tempfile, re
import os, tempfile
from calibre.ebooks.rtf2xml import copy
from calibre.utils.cleantext import clean_ascii_chars
class FixLineEndings:
"""Fix line endings"""
def __init__(self,
@ -32,36 +35,23 @@ class FixLineEndings:
self.__run_level = run_level
self.__write_to = tempfile.mktemp()
self.__replace_illegals = replace_illegals
def fix_endings(self):
    """
    Convert all line endings of the input file to LF.

    The whole file is read, CR LF pairs and lone CRs are replaced by LF,
    and, when ``replace_illegals`` was requested, the text is passed
    through ``clean_ascii_chars``. The result is written to a temporary
    file, optionally copied as "line_endings.data" (when the copy flag
    is set), handed to ``copy_obj.rename`` to replace the input file,
    and the temporary file is removed.
    """
    # read in binary mode to match the binary write below; a text-mode
    # read on Windows would already alter the line endings (and may stop
    # at a Ctrl-Z byte) before the data gets here
    with open(self.__file, 'rb') as read_obj:
        input_file = read_obj.read()
    # calibre: go from win and mac to unix
    input_file = input_file.replace(b'\r\n', b'\n')
    input_file = input_file.replace(b'\r', b'\n')
    # remove invalid ASCII control characters
    # (original note: 0 to 8 and 11-14 to 24-26-27)
    if self.__replace_illegals:
        input_file = clean_ascii_chars(input_file)
    with open(self.__write_to, 'wb') as write_obj:
        write_obj.write(input_file)
    copy_obj = copy.Copy(bug_handler = self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "line_endings.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)

View File

@ -16,7 +16,9 @@
# #
#########################################################################
import sys, os, tempfile
from calibre.ebooks.rtf2xml import copy
class Pict:
"""Process graphic information"""
def __init__(self,
@ -36,13 +38,11 @@ class Pict:
self.__ob_count = 0
self.__cb_count = 0
self.__pict_count = 0
self.__in_pict = 0
self.__already_found_pict = 0
self.__in_pict = False
self.__already_found_pict = False
self.__orig_file = orig_file
self.__initiate_pict_dict()
self.__out_file = out_file
# this is left over
self.__no_ask = 1
def __initiate_pict_dict(self):
self.__pict_dict = {
@ -71,57 +71,43 @@ class Pict:
self.__out_file))
else:
dir_name = os.path.dirname(self.__orig_file)
# self.__output_to_file_func()
self.__dir_name = base_name + "_rtf_pict_dir/"
self.__dir_name = os.path.join(dir_name, self.__dir_name)
if not os.path.isdir(self.__dir_name):
try:
os.mkdir(self.__dir_name)
except OSError, msg:
msg = str(msg)
msg += "Couldn't make directory '%s':\n" % (self.__dir_name)
msg = "%sCouldn't make directory '%s':\n" % (str(msg), self.__dir_name)
raise self.__bug_handler
else:
if self.__no_ask:
user_response = 'r'
else:
msg = 'Do you want to remove all files in %s?\n' % self.__dir_name
msg += 'Type "r" to remove.\n'
msg += 'Type any other key to keep files in place.\n'
sys.stderr.write(msg)
user_response = raw_input()
if user_response == 'r':
if self.__run_level > 1:
sys.stderr.write('Removing files from old pict directory...\n')
all_files = os.listdir(self.__dir_name)
for the_file in all_files:
the_file = os.path.join(self.__dir_name, the_file)
try:
os.remove(the_file)
except OSError:
pass
if self.__run_level > 1:
sys.stderr.write('Files removed.\n')
if self.__run_level > 1:
sys.stderr.write('Removing files from old pict directory...\n')
all_files = os.listdir(self.__dir_name)
for the_file in all_files:
the_file = os.path.join(self.__dir_name, the_file)
try:
os.remove(the_file)
except OSError:
pass
if self.__run_level > 1:
sys.stderr.write('Files removed.\n')
def __create_pict_file(self):
"""Create the file that all the pict data is written to.

The file is 'picts.rtf' inside self.__dir_name. As listed here it is
first opened in 'w' mode and closed again, which leaves it empty, and
then reopened in append mode; the append handle stays open as
self.__write_pic_obj for the other methods to write to.
"""
self.__pict_file = os.path.join(self.__dir_name, 'picts.rtf')
# open-and-close in 'w' mode creates the file or truncates an old one
write_pic_obj = open(self.__pict_file, 'w')
write_pic_obj.close()
# NOTE(review): nothing in this method closes this handle; process_pict
# closes it only when a picture was found
self.__write_pic_obj = open(self.__pict_file, 'a')
def __in_pict_func(self, line):
if self.__cb_count == self.__pict_br_count:
self.__in_pict = 0
self.__in_pict = False
self.__write_pic_obj.write("}\n")
return 1
return True
else:
action = self.__pict_dict.get(self.__token_info)
if action:
line = action(line)
self.__write_pic_obj.write(line)
return 0
self.__write_pic_obj.write(action(line))
return False
def __default(self, line, write_obj):
"""Determine if each token marks the beginning of pict data.
@ -142,53 +128,50 @@ class Pict:
write_obj.write('mi<mk<pict-end__\n')
if not self.__already_found_pict:
self.__create_pict_file()
self.__already_found_pict=1;
self.__already_found_pict=True;
self.__print_rtf_header()
self.__in_pict = 1
self.__pict_br_count = self.__ob_count
self.__cb_count = 0
self.__write_pic_obj.write("{\\pict\n")
return 0
return 1
return False
return True
def __print_rtf_header(self):
"""Print to pict file the necessary RTF data for the file to be
recognized as an RTF file.
"""
self.__write_pic_obj.write("{\\rtf1 \n")
self.__write_pic_obj.write("{\\fonttbl\\f0\\null;} \n")
self.__write_pic_obj.write("{\\colortbl\\red255\\green255\\blue255;} \n")
self.__write_pic_obj.write("\\pard \n")
self.__write_pic_obj.write("{\\rtf1 \n{\\fonttbl\\f0\\null;} \n")
self.__write_pic_obj.write("{\\colortbl\\red255\\green255\\blue255;} \n\\pard \n")
def process_pict(self):
self.__make_dir()
read_obj = open(self.__file)
write_obj = open(self.__write_to, 'w')
line_to_read = 'dummy'
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack':
self.__ob_count = line[-5:-1]
if self.__token_info == 'cb<nu<clos-brack':
self.__cb_count = line[-5:-1]
if not self.__in_pict:
to_print = self.__default(line, write_obj)
if to_print :
write_obj.write(line)
else:
to_print = self.__in_pict_func(line)
if to_print :
write_obj.write(line)
if self.__already_found_pict:
self.__write_pic_obj.write("}\n")
self.__write_pic_obj.close()
read_obj.close()
write_obj.close()
with open(self.__file) as read_obj:
with open(self.__write_to, 'w') as write_obj:
for line in read_obj:
self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack':
self.__ob_count = line[-5:-1]
if self.__token_info == 'cb<nu<clos-brack':
self.__cb_count = line[-5:-1]
if not self.__in_pict:
to_print = self.__default(line, write_obj)
if to_print :
write_obj.write(line)
else:
to_print = self.__in_pict_func(line)
if to_print :
write_obj.write(line)
if self.__already_found_pict:
self.__write_pic_obj.write("}\n")
self.__write_pic_obj.close()
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "pict.data")
try:
copy_obj.copy_file(self.__pict_file, "pict.rtf")
except:
pass
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)
if self.__pict_count == 0:

View File

@ -15,8 +15,10 @@
# #
# #
#########################################################################
import os, re, tempfile
import os, re, tempfile
from calibre.ebooks.rtf2xml import copy, check_brackets
class ProcessTokens:
"""
Process each token on a line and add information that will be useful for
@ -41,14 +43,16 @@ class ProcessTokens:
self.__bracket_count=0
self.__exception_handler = exception_handler
self.__bug_handler = bug_handler
def compile_expressions(self):
    """
    Pre-compile the regular expressions kept on the instance.

    ``__num_exp`` captures a run of letters followed by the rest of the
    string; ``__utf_exp`` captures a shortest '&...;' span.
    """
    letters_then_rest = r"([a-zA-Z]+)(.*)"
    entity = r'(&.*?;)'
    self.__num_exp = re.compile(letters_then_rest)
    self.__utf_exp = re.compile(entity)
def initiate_token_dict(self):
self.__return_code = 0
self.dict_token={
# unicode
'mshex' : ('nu', '__________', self.__ms_hex_func),
'mshex' : ('nu', '__________', self.__ms_hex_func),
# brackets
'{' : ('nu', '{', self.ob_func),
'}' : ('nu', '}', self.cb_func),
@ -66,6 +70,7 @@ class ProcessTokens:
';' : ('mc', ';', self.ms_sub_func),
# this must be wrong
'-' : ('mc', '-', self.ms_sub_func),
'line' : ('mi', 'hardline-break', self.hardline_func), #calibre
# misc => ml
'*' : ('ml', 'asterisk__', self.default_func),
':' : ('ml', 'colon_____', self.default_func),
@ -73,7 +78,6 @@ class ProcessTokens:
'backslash' : ('nu', '\\', self.text_func),
'ob' : ('nu', '{', self.text_func),
'cb' : ('nu', '}', self.text_func),
'line' : ('nu', 'hard-lineb', self.default_func), #calibre
#'line' : ('nu', ' ', self.text_func), calibre
# paragraph formatting => pf
'page' : ('pf', 'page-break', self.default_func),
@ -159,15 +163,17 @@ class ProcessTokens:
'rtf' : ('ri', 'rtf_______', self.default_func),
'deff' : ('ri', 'deflt-font', self.default_func),
'mac' : ('ri', 'macintosh_', self.default_func),
'pc' : ('ri', 'pc________', self.default_func),
'pca' : ('ri', 'pca_______', self.default_func),
'ansi' : ('ri', 'ansi______', self.default_func),
'ansicpg' : ('ri', 'ansi-codpg', self.default_func),
# notes => nt
'footnote' : ('nt', 'footnote__', self.default_func),
'ftnalt' : ('nt', 'type______<endnote', self.two_part_func),
# anchor => an
'tc' : ('an', 'toc_______', self.default_func),
'tc' : ('an', 'toc_______', self.default_func),
'bkmkstt' : ('an', 'book-mk-st', self.default_func),
'bkmkstart' : ('an', 'book-mk-st', self.default_func),
'bkmkstart' : ('an', 'book-mk-st', self.default_func),
'bkmkend' : ('an', 'book-mk-en', self.default_func),
'xe' : ('an', 'index-mark', self.default_func),
'rxe' : ('an', 'place_____', self.default_func),
@ -347,7 +353,7 @@ class ProcessTokens:
10: 'Kanji numbering without the digit character',
11: 'Kanji numbering with the digit character',
1246: 'phonetic Katakana characters in aiueo order',
1346: 'phonetic katakana characters in iroha order',
1346: 'phonetic katakana characters in iroha order',
14: 'double byte character',
15: 'single byte character',
16: 'Kanji numbering 3',
@ -392,7 +398,7 @@ class ProcessTokens:
5121 : 'Arabic Algeria',
15361 : 'Arabic Bahrain',
3073 : 'Arabic Egypt',
1 : 'Arabic General',
1 : 'Arabic General',
2049 : 'Arabic Iraq',
11265 : 'Arabic Jordan',
13313 : 'Arabic Kuwait',
@ -417,7 +423,7 @@ class ProcessTokens:
1059 : 'Byelorussian',
1027 : 'Catalan',
2052 : 'Chinese China',
4 : 'Chinese General',
4 : 'Chinese General',
3076 : 'Chinese Hong Kong',
4100 : 'Chinese Singapore',
1028 : 'Chinese Taiwan',
@ -431,7 +437,7 @@ class ProcessTokens:
2057 : 'English British',
4105 : 'English Canada',
9225 : 'English Caribbean',
9 : 'English General',
9 : 'English General',
6153 : 'English Ireland',
8201 : 'English Jamaica',
5129 : 'English New Zealand',
@ -595,30 +601,37 @@ class ProcessTokens:
num = num[1:] # chop off leading 0, which I added
num = num.upper() # the mappings store hex in caps
return 'tx<hx<__________<\'%s\n' % num # add an ' for the mappings
def ms_sub_func(self, pre, token, num):
    """Return *token* as a ``tx<mc`` line; *pre* and *num* are not used."""
    line_template = 'tx<mc<__________<%s\n'
    return line_template % token
def hardline_func(self, pre, token, num):
    """Return *token* as an ``mi<tg<empty_____`` line; *pre* and *num* are not used."""
    line_template = 'mi<tg<empty_____<%s\n'
    return line_template % token
def default_func(self, pre, token, num):
    """Build the generic control-word line ``cw<pre<token<nu<num``.

    A control word that carries no argument (``num is None``) is reported
    with the value ``'true'``.
    """
    # Identity test only: the superseded ``== None`` comparison is dropped.
    if num is None:
        num = 'true'
    return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
def __list_type_func(self, pre, token, num):
type = 'arabic'
if num == None:
if num is None:
type = 'Arabic'
else:
try:
num = int(num)
except ValueError:
if self.__run_level > 3:
msg = 'number "%s" cannot be converted to integer\n' % num
msg = 'Number "%s" cannot be converted to integer\n' % num
raise self.__bug_handler, msg
type = self.__number_type_dict.get(num)
if type == None:
if type is None:
if self.__run_level > 3:
msg = 'No type for "%s" in self.__number_type_dict\n'
raise self.__bug_handler
type = 'Arabic'
return 'cw<%s<%s<nu<%s\n' % (pre, token, type)
def __language_func(self, pre, token, num):
lang_name = self.__language_dict.get(int(re.search('[0-9]+', num).group()))
if not lang_name:
@ -627,31 +640,36 @@ class ProcessTokens:
msg = 'No entry for number "%s"' % num
raise self.__bug_handler, msg
return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name)
def two_part_func(self, pre, token, num):
    """Split a ``name<value`` token and emit it as a control-word line.

    The incoming *num* is ignored; the value comes from the second
    ``<``-separated field of *token*.
    """
    pieces = token.split("<")
    name, value = pieces[0], pieces[1]
    return 'cw<%s<%s<nu<%s\n' % (pre, name, value)
def divide_by_2(self, pre, token, num):
    """Emit a control-word line whose value is ``self.divide_num(num, 2)``."""
    halved = self.divide_num(num, 2)
    fields = (pre, token, halved)
    return 'cw<%s<%s<nu<%s\n' % fields
def divide_by_20(self, pre, token, num):
    """Emit a control-word line whose value is ``self.divide_num(num, 20)``."""
    scaled = self.divide_num(num, 20)
    fields = (pre, token, scaled)
    return 'cw<%s<%s<nu<%s\n' % fields
def text_func(self, pre, token, num=None):
    """Return *token* as a plain ``tx<nu`` text line; *pre* and *num* are not used."""
    line_template = 'tx<nu<__________<%s\n'
    return line_template % token
def ob_func(self, pre, token, num=None):
    """Count one more open bracket and emit the open-bracket line.

    The line carries the new nesting depth, zero-padded to four digits.
    """
    depth = self.__bracket_count + 1
    self.__bracket_count = depth
    return 'ob<nu<open-brack<%04d\n' % depth
def cb_func(self, pre, token, num=None):
    """Emit the close-bracket line for the current depth, then count one less.

    The line carries the depth *before* the decrement, zero-padded to four
    digits.
    """
    depth = self.__bracket_count
    closing_line = 'cb<nu<clos-brack<%04d\n' % depth
    self.__bracket_count = depth - 1
    return closing_line
def color_func(self, pre, token, num):
third_field = 'nu'
if num[-1] == ';':
@ -662,6 +680,7 @@ class ProcessTokens:
num = "0" + num
return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, num)
##return 'cw<cl<%s<nu<nu<%s>%s<%s\n' % (third_field, token, num, token)
def bool_st_func(self, pre, token, num):
if num is None or num == '' or num == '1':
return 'cw<%s<%s<nu<true\n' % (pre, token)
@ -670,24 +689,23 @@ class ProcessTokens:
return 'cw<%s<%s<nu<false\n' % (pre, token)
##return 'cw<nu<nu<nu<%s>false<%s\n' % (token, token)
else:
msg = 'boolean should have some value module process tokens\n'
msg += 'token is ' + token + "\n"
msg += "'" + num + "'" + "\n"
msg = "boolean should have some value module process tokens\ntoken is %s\n'%s'\n" % (token, num)
raise self.__bug_handler, msg
def __no_sup_sub_func(self, pre, token, num):
the_string = 'cw<ci<subscript_<nu<false\n'
the_string += 'cw<ci<superscrip<nu<false\n'
return the_string
def divide_num(self, numerator, denominator):
try:
numerator = float(re.search('[0-9.]+', numerator).group())
#calibre why ignore negative number? Wrong in case of \fi
numerator = float(re.search('[0-9.\-]+', numerator).group())
except TypeError, msg:
if self.__run_level > 3:
msg = 'no number to process?\n'
msg += 'this indicates that the token '
msg += ' \(\\li\) should have a number and does not\n'
msg += 'numerator is "%s"\n' % numerator
msg += 'denominator is "%s"\n' % denominator
msg = ('No number to process?\nthis indicates that the token \(\\li\) \
should have a number and does not\nnumerator is \
"%s"\ndenominator is "%s"\n') % (numerator, denominator)
raise self.__bug_handler, msg
if 5 > self.__return_code:
self.__return_code = 5
@ -698,9 +716,10 @@ class ProcessTokens:
if string_num[-2:] == ".0":
string_num = string_num[:-2]
return string_num
def split_let_num(self, token):
match_obj = re.search(self.__num_exp,token)
if match_obj != None:
if match_obj is not None:
first = match_obj.group(1)
second = match_obj.group(2)
if not second:
@ -714,6 +733,7 @@ class ProcessTokens:
raise self.__bug_handler
return token, 0
return first, second
def convert_to_hex(self,number):
"""Convert a string to uppercase hexidecimal"""
num = int(number)
@ -722,6 +742,7 @@ class ProcessTokens:
return hex_num
except:
raise self.__bug_handler
def process_cw(self, token):
"""Change the value of the control word by determining what dictionary
it belongs to"""
@ -737,89 +758,62 @@ class ProcessTokens:
pre, token, action = self.dict_token.get(token, (None, None, None))
if action:
return action(pre, token, num)
# unused function
def initiate_token_actions(self):
    """Create ``self.action_for_token``, mapping a leading character to its handler.

    ``{`` and ``}`` map to the bracket handlers, a backslash to
    ``process_cw``.
    """
    handlers = {}
    handlers['{'] = self.ob_func
    handlers['}'] = self.cb_func
    handlers['\\'] = self.process_cw
    self.action_for_token = handlers
# unused function
def evaluate_token(self,token):
"""Evaluate tokens. Return a value if the token is not a
control word. Otherwise, pass token onto another method
for further evaluation."""
# Dispatch on the first character of the token only.
# NOTE(review): the dict_token entries visible in this file are 3-tuples
# (prefix, name, handler) and .get() has no default here, so this two-name
# unpacking looks like it would raise for both hits and misses. The handlers
# visible here also take (pre, token, num), not a single argument. The
# method is marked unused; confirm before reviving it.
token, action = self.dict_token.get(token[0:1])
if action:
line = action(token)
return line
else :
# No handler found: emit the token as a text line.
return 'tx<nu<nu<nu<nu<%s\n' % token
def __check_brackets(self, in_file):
    """Run the bracket checker on *in_file*.

    Returns 1 when the first element of the checker's result is falsy,
    otherwise ``None``. The checker object is kept on the instance.
    """
    checker = check_brackets.CheckBrackets(file = in_file)
    self.__check_brack_obj = checker
    brackets_ok = checker.check_brackets()[0]
    if brackets_ok:
        return None
    return 1
def process_tokens(self):
"""Main method for handling other methods. """
first_token = 0
second_token = 0
read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'w')
line_to_read = "dummy"
line_count = 0
while line_to_read:
line_to_read = read_obj.readline()
token = line_to_read
token = token.replace("\n","")
if not token:
continue
line_count += 1
try:
token.decode('us-ascii')
except UnicodeError, msg:
msg = str(msg)
msg += 'Invalid RTF: File not ascii encoded.\n'
raise self.__exception_handler, msg
if not first_token:
if token != '\\{':
msg = 'Invalid RTF: document doesn\'t start with {\n'
raise self.__exception_handler, msg
first_token = 1
elif first_token and not second_token:
if token[0:4] != '\\rtf':
msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
raise self.__exception_handler, msg
second_token = 1
##token = self.evaluate_token(token)
the_index = token.find('\\ ')
if token != None and the_index > -1:
msg ='Invalid RTF: token "\\ " not valid. \n'
raise self.__exception_handler, msg
elif token[0:1] == "\\":
line = self.process_cw(token)
if line != None:
write_obj.write(line)
else:
fields = re.split(self.__utf_exp, token)
for field in fields:
if not field:
continue
if field[0:1] == '&':
write_obj.write('tx<ut<__________<%s\n' % field)
with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'wb') as write_obj:
for line in read_obj:
token = line.replace("\n","")
line_count += 1
if line_count == 1 and token != '\\{':
msg = 'Invalid RTF: document doesn\'t start with {\n'
raise self.__exception_handler, msg
elif line_count == 2 and token[0:4] != '\\rtf':
msg = 'Invalid RTF: document doesn\'t start with \\rtf \n'
raise self.__exception_handler, msg
the_index = token.find('\\ ')
if token is not None and the_index > -1:
msg = 'Invalid RTF: token "\\ " not valid.\n'
raise self.__exception_handler, msg
elif token[:1] == "\\":
try:
token.decode('us-ascii')
except UnicodeError, msg:
msg = 'Invalid RTF: Tokens not ascii encoded.\n%s' % str(msg)
raise self.__exception_handler, msg
line = self.process_cw(token)
if line is not None:
write_obj.write(line)
else:
write_obj.write('tx<nu<__________<%s\n' % field)
read_obj.close()
write_obj.close()
fields = re.split(self.__utf_exp, token)
for field in fields:
if not field:
continue
if field[0:1] == '&':
write_obj.write('tx<ut<__________<%s\n' % field)
else:
write_obj.write('tx<nu<__________<%s\n' % field)
if not line_count:
msg ='Invalid RTF: file appears to be empty. \n'
msg = 'Invalid RTF: file appears to be empty.\n'
raise self.__exception_handler, msg
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "processed_tokens.data")
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)
bad_brackets = self.__check_brackets(self.__file)
if bad_brackets:
msg = 'Invalid RTF: document does not have matching brackets.\n'

View File

@ -16,7 +16,10 @@
# #
#########################################################################
import os, tempfile
from calibre.ebooks.rtf2xml import copy
from calibre.utils.cleantext import clean_ascii_chars
class ReplaceIllegals:
"""
Replace illegal lower ASCII characters
@ -30,21 +33,14 @@ class ReplaceIllegals:
self.__copy = copy
self.__run_level = run_level
self.__write_to = tempfile.mktemp()
def replace_illegals(self):
"""
"""
nums = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 18, 19]
read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'w')
line_to_read = 1
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
for num in nums:
line = line.replace(chr(num), '')
write_obj.write(line)
read_obj.close()
write_obj.close()
with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as write_obj:
for line in read_obj:
write_obj.write(clean_ascii_chars(line))
copy_obj = copy.Copy()
if self.__copy:
copy_obj.copy_file(self.__write_to, "replace_illegals.data")

View File

@ -16,7 +16,10 @@
# #
#########################################################################
import os, re, tempfile
from calibre.ebooks.rtf2xml import copy
from calibre.utils.mreplace import MReplace
class Tokenize:
"""Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
def __init__(self,
@ -28,89 +31,175 @@ class Tokenize:
self.__file = in_file
self.__bug_handler = bug_handler
self.__copy = copy
self.__special_tokens = [ '_', '~', "'", '{', '}' ]
self.__write_to = tempfile.mktemp()
def __from_ms_to_utf8(self,match_obj):
uni_char = int(match_obj.group(1))
if uni_char < 0:
uni_char += 65536
return '&#x' + str('%X' % uni_char) + ';'
def __neg_unicode_func(self, match_obj):
neg_uni_char = int(match_obj.group(1)) * -1
# sys.stderr.write(str( neg_uni_char))
uni_char = neg_uni_char + 65536
return '&#x' + str('%X' % uni_char) + ';'
def __sub_line_reg(self,line):
# Apply, in this exact order, the literal and regex substitutions that
# prepare one input line for splitting into tokens, and return the result.
# The order matters: escaped sequences are rewritten before the bare
# characters they contain.
# An escaped backslash becomes a named control word first, so the later
# replacements cannot misread it.
line = line.replace("\\\\", "\\backslash ")
line = line.replace("\\~", "\\~ ")
line = line.replace("\\;", "\\; ")
# XML-escape the markup characters.
line = line.replace("&", "&amp;")
line = line.replace("<", "&lt;")
line = line.replace(">", "&gt;")
# NOTE(review): this repeats the "\\~" replacement made above, so the text
# after a "\~" ends up with two spaces - confirm whether that is intended.
line = line.replace("\\~", "\\~ ")
line = line.replace("\\_", "\\_ ")
line = line.replace("\\:", "\\: ")
line = line.replace("\\-", "\\- ")
# turn into a generic token to eliminate special
# cases and make processing easier
line = line.replace("\\{", "\\ob ")
# turn into a generic token to eliminate special
# cases and make processing easier
line = line.replace("\\}", "\\cb ")
# put a backslash in front of to eliminate special cases and
# make processing easier
line = line.replace("{", "\\{")
# put a backslash in front of to eliminate special cases and
# make processing easier
line = line.replace("}", "\\}")
# Replace each match of __utf_exp with the string __from_ms_to_utf8 returns.
line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
# Rewrite each match of __ms_hex_exp as "\mshex0" + captured text + space.
line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line)
##line = line.replace("\\backslash", "\\\\")
# this is for older RTF
line = re.sub(self.__par_exp, '\\par ', line)
return line
def __compile_expressions(self):
    """Compile the regular expressions used by the line-by-line tokenizer."""
    build = re.compile
    # a backslash, an apostrophe, then any two characters
    self.__ms_hex_exp = build(r"\\\'(..)")
    # \u followed by 3-6 digits, optional minus sign, optional trailing space
    self.__utf_exp = build(r"\\u(-?\d{3,6}) {0,1}")
    # token boundaries: escaped specials, bare brackets, control words
    self.__splitexp = build(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
    # a backslash at the very end of the line
    self.__par_exp = build(r'\\$')
    # letters + digits followed by at least one non-digit
    self.__mixed_exp = build(r"(\\[a-zA-Z]+\d+)(\D+)")
def __create_tokens(self):
    """Tokenize ``self.__file`` line by line into ``self.__write_to``.

    Each input line has its newline removed, goes through
    ``__sub_line_reg`` and is split on ``__splitexp``; every non-empty
    piece is written on its own line.
    """
    self.__compile_expressions()
    # Context managers close both files even if a substitution or a write
    # raises; the explicit close() calls were skipped in that case.
    with open(self.__file, 'r') as read_obj:
        with open(self.__write_to, 'w') as write_obj:
            for line in read_obj:
                line = self.__sub_line_reg(line.replace("\n", ""))
                for token in re.split(self.__splitexp, line):
                    if token != "":
                        write_obj.write(token + "\n")
#variables
self.__uc_char = 0
self.__uc_bin = False
self.__uc_value = [1]
def __reini_utf8_counters(self):
self.__uc_char = 0
self.__uc_bin = False
def __remove_uc_chars(self, startchar, token):
for i in xrange(startchar, len(token)):
if token[i] == " ":
continue
elif self.__uc_char:
self.__uc_char -= 1
else:
return token[i:]
#if only " " and char to skip
return ''
def __unicode_process(self, token):
#change scope in
if token == '\{':
self.__uc_value.append(self.__uc_value[-1])
#basic error handling
self.__reini_utf8_counters()
return token
#change scope out
elif token == '\}':
self.__uc_value.pop()
self.__reini_utf8_counters()
return token
#add a uc control
elif token[:3] == '\uc':
self.__uc_value[-1] = int(token[3:])
self.__reini_utf8_counters()
return token
#bin data to slip
elif self.__uc_bin:
self.__uc_bin = False
return ''
#uc char to remove
elif self.__uc_char:
#handle \bin tag in case of uc char to skip
if token[:4] == '\bin':
self.__uc_char -=1
self.__uc_bin = True
return ''
elif token[:1] == "\\" :
self.__uc_char -=1
return ''
else:
return self.__remove_uc_chars(0, token)
#go for real \u token
match_obj = self.__utf_exp.match(token)
if match_obj is not None:
self.__reini_utf8_counters()
#get value and handle negative case
uni_char = int(match_obj.group(1))
uni_len = len(match_obj.group(1)) + 2
if uni_char < 0:
uni_char += 65536
uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
self.__uc_char = self.__uc_value[-1]
#there is only an unicode char
if len(token)<= uni_len:
return uni_char
#an unicode char and something else
#must be after as it is splited on \
#necessary? maybe for \bin?
elif not self.__uc_char:
return uni_char + token[uni_len:]
#if not uc0 and chars
else:
return uni_char + self.__remove_uc_chars(uni_len, token)
#default
return token
def __sub_reg_split(self,input_file):
    """Apply the substitutions to the whole text and split it into tokens.

    Steps, in order: the ``__replace_spchar`` replacements, the hex-escape
    rewrite, the upr/ud rewrite, joining ``\\bin`` data onto one line, then
    a split on ``__splitexp``. Empty pieces and bare newlines are left out
    of the returned list.
    """
    text = self.__replace_spchar.mreplace(input_file)
    text = self.__ms_hex_exp.sub("\\mshex0\g<1> ", text)
    text = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", text)

    def _join_bin_lines(match):
        #remove \n in bin data
        return match.group().replace('\n', '') + '\n'

    text = self.__bin_exp.sub(_join_bin_lines, text)
    #split
    pieces = re.split(self.__splitexp, text)
    #remove empty tokens and \n
    return [piece for piece in pieces if len(piece) > 0 and piece != '\n']
def __compile_expressions(self):
    """Build the replacement table and compile the tokenizer's regular expressions.

    Sets ``__replace_spchar`` (an ``MReplace`` over the table below) and the
    compiled patterns ``__ms_hex_exp``, ``__utf_exp``, ``__bin_exp``,
    ``__utf_ud`` and ``__splitexp``.
    """
    # The "\\~" entry used to be listed twice with the same value; a dict
    # keeps only one, so the duplicate is dropped.
    SIMPLE_RPL = {
        "\\\\": "\\backslash ",
        "\\~": "\\~ ",
        "\\;": "\\; ",
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        "\\_": "\\_ ",
        "\\:": "\\: ",
        "\\-": "\\- ",
        # turn into a generic token to eliminate special
        # cases and make processing easier
        "\\{": "\\ob ",
        # turn into a generic token to eliminate special
        # cases and make processing easier
        "\\}": "\\cb ",
        # put a backslash in front of to eliminate special cases and
        # make processing easier
        "{": "\\{",
        # put a backslash in front of to eliminate special cases and
        # make processing easier
        "}": "\\}",
        # this is for older RTF
        # NOTE(review): this key is written like a regular expression
        # (backslash at end of line). Whether MReplace treats keys as
        # patterns or as literal text is not visible here - confirm it
        # still matches what the old r'\\$' substitution matched.
        r'\\$': '\\par ',
    }
    self.__replace_spchar = MReplace(SIMPLE_RPL)
    # a backslash, an apostrophe, then exactly two hex digits
    self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
    # \u followed by 3-6 digits, optional minus sign, optional trailing space
    self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
    # \bin with its length, then the data made of 0, 1 and newlines
    self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
    # manage upr/ud situations
    # NOTE(review): in this raw pattern "\\*" means "zero or more
    # backslashes", not a literal "\*" - confirm it matches "\{\*\ud".
    self.__utf_ud = re.compile(
        r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?"
        r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
    # The whole file is read at once, so "\n" is a separator of its own,
    # and the optional whitespace after a control word excludes "\n".
    self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
def tokenize(self):
"""Main class for handling other methods. Reads in one line \
at a time, usues method self.sub_line to make basic substitutions,\
uses ? to process tokens"""
# NOTE(review): the docstring above and this call describe the line-by-line
# implementation, while everything below reads and tokenizes the whole file
# again. This looks like a leftover of the previous version - confirm
# whether the call is still needed (e.g. for compiling the expressions).
self.__create_tokens()
"""Main class for handling other methods. Reads the file \
, uses method self.sub_reg to make basic substitutions,\
and process tokens by itself"""
#read
with open(self.__file, 'r') as read_obj:
input_file = read_obj.read()
#process simple replacements and split giving us a correct list
#remove '' and \n in the process
tokens = self.__sub_reg_split(input_file)
#correct unicode
# __unicode_process keeps state between calls, so tokens must be handled in
# their original order.
tokens = map(self.__unicode_process, tokens)
#remove empty items created by removing \uc
tokens = filter(lambda x: len(x) > 0, tokens)
#write
# One token per line; no newline is written after the last token.
with open(self.__write_to, 'wb') as write_obj:
write_obj.write('\n'.join(tokens))
#Move and copy
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
# Keep a copy of the intermediate result when copying was requested.
if self.__copy:
copy_obj.copy_file(self.__write_to, "tokenize.data")
# Pass the temporary output to rename() with the input path as target, then
# remove the temporary file.
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)
#self.__special_tokens = [ '_', '~', "'", '{', '}' ]