mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
RTF Input: Various code cleanups. Go back to trying to handle unicode mappings without pre-processing (Fixes #8171 (Unsupported charsets ie non ascii in RTF)). Fix bug in handling super/sub scripts.
This commit is contained in:
parent
3e9e655674
commit
7fde6cbead
@ -287,7 +287,7 @@
|
||||
<xsl:value-of select="count(preceding::rtf:footnote) + 1"/>
|
||||
<xsl:text>]</xsl:text>
|
||||
</xsl:when>
|
||||
<xsl:when test="(@superscript = 'true')">
|
||||
<xsl:when test="(@superscript)">
|
||||
<xsl:element name="sup">
|
||||
<xsl:element name="span">
|
||||
<xsl:attribute name="class">
|
||||
@ -297,7 +297,7 @@
|
||||
</xsl:element>
|
||||
</xsl:element>
|
||||
</xsl:when>
|
||||
<xsl:when test="(@underscript = 'true')">
|
||||
<xsl:when test="(@underscript or @subscript)">
|
||||
<xsl:element name="sub">
|
||||
<xsl:element name="span">
|
||||
<xsl:attribute name="class">
|
||||
|
@ -77,7 +77,15 @@ class RTFInput(InputFormatPlugin):
|
||||
|
||||
def generate_xml(self, stream):
|
||||
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
|
||||
ofile = 'out.xml'
|
||||
ofile = 'dataxml.xml'
|
||||
run_lev, debug_dir = 1, None
|
||||
if getattr(self.opts, 'debug_pipeline', None) is not None:
|
||||
try:
|
||||
os.mkdir(debug_dir)
|
||||
debug_dir = 'rtfdebug'
|
||||
run_lev = 4
|
||||
except:
|
||||
pass
|
||||
parser = ParseRtf(
|
||||
in_file = stream,
|
||||
out_file = ofile,
|
||||
@ -115,43 +123,45 @@ class RTFInput(InputFormatPlugin):
|
||||
|
||||
# Write or do not write paragraphs. Default is 0.
|
||||
empty_paragraphs = 1,
|
||||
|
||||
#debug
|
||||
deb_dir = debug_dir,
|
||||
run_level = run_lev,
|
||||
)
|
||||
parser.parse_rtf()
|
||||
ans = open('out.xml').read()
|
||||
os.remove('out.xml')
|
||||
return ans
|
||||
with open(ofile, 'rb') as f:
|
||||
return f.read()
|
||||
|
||||
def extract_images(self, picts):
|
||||
import imghdr
|
||||
self.log('Extracting images...')
|
||||
|
||||
with open(picts, 'rb') as f:
|
||||
raw = f.read()
|
||||
picts = filter(len, re.findall(r'\{\\pict([^}]+)\}', raw))
|
||||
hex = re.compile(r'[^a-fA-F0-9]')
|
||||
encs = [hex.sub('', pict) for pict in picts]
|
||||
|
||||
count = 0
|
||||
raw = open(picts, 'rb').read()
|
||||
starts = []
|
||||
for match in re.finditer(r'\{\\pict([^}]+)\}', raw):
|
||||
starts.append(match.start(1))
|
||||
|
||||
imap = {}
|
||||
|
||||
for start in starts:
|
||||
pos, bc = start, 1
|
||||
while bc > 0:
|
||||
if raw[pos] == '}': bc -= 1
|
||||
elif raw[pos] == '{': bc += 1
|
||||
pos += 1
|
||||
pict = raw[start:pos+1]
|
||||
enc = re.sub(r'[^a-zA-Z0-9]', '', pict)
|
||||
for enc in encs:
|
||||
if len(enc) % 2 == 1:
|
||||
enc = enc[:-1]
|
||||
data = enc.decode('hex')
|
||||
fmt = imghdr.what(None, data)
|
||||
if fmt is None:
|
||||
fmt = 'wmf'
|
||||
count += 1
|
||||
name = (('%4d'%count).replace(' ', '0'))+'.wmf'
|
||||
open(name, 'wb').write(data)
|
||||
name = '%04d.%s' % (count, fmt)
|
||||
with open(name, 'wb') as f:
|
||||
f.write(data)
|
||||
imap[count] = name
|
||||
#open(name+'.hex', 'wb').write(enc)
|
||||
return self.convert_images(imap)
|
||||
|
||||
def convert_images(self, imap):
|
||||
for count, val in imap.items():
|
||||
self.default_img = None
|
||||
for count, val in imap.iteritems():
|
||||
try:
|
||||
imap[count] = self.convert_image(val)
|
||||
except:
|
||||
@ -159,6 +169,8 @@ class RTFInput(InputFormatPlugin):
|
||||
return imap
|
||||
|
||||
def convert_image(self, name):
|
||||
if not name.endswith('.wmf'):
|
||||
return name
|
||||
try:
|
||||
return self.rasterize_wmf(name)
|
||||
except:
|
||||
@ -167,16 +179,18 @@ class RTFInput(InputFormatPlugin):
|
||||
|
||||
def replace_wmf(self, name):
|
||||
from calibre.ebooks import calibre_cover
|
||||
data = calibre_cover('Conversion of WMF images is not supported',
|
||||
if self.default_img is None:
|
||||
self.default_img = calibre_cover('Conversion of WMF images is not supported',
|
||||
'Use Microsoft Word or OpenOffice to save this RTF file'
|
||||
' as HTML and convert that in calibre.', title_size=36,
|
||||
author_size=20)
|
||||
name = name.replace('.wmf', '.jpg')
|
||||
with open(name, 'wb') as f:
|
||||
f.write(data)
|
||||
f.write(self.default_img)
|
||||
return name
|
||||
|
||||
def rasterize_wmf(self, name):
|
||||
raise ValueError('Conversion of WMF images not supported')
|
||||
from calibre.utils.wmf import extract_raster_image
|
||||
with open(name, 'rb') as f:
|
||||
data = f.read()
|
||||
@ -212,27 +226,27 @@ class RTFInput(InputFormatPlugin):
|
||||
css += '\n'+'\n'.join(font_size_classes)
|
||||
css += '\n' +'\n'.join(color_classes)
|
||||
|
||||
for cls, val in border_styles.items():
|
||||
for cls, val in border_styles.iteritems():
|
||||
css += '\n\n.%s {\n%s\n}'%(cls, val)
|
||||
|
||||
with open('styles.css', 'ab') as f:
|
||||
f.write(css)
|
||||
|
||||
def preprocess(self, fname):
|
||||
self.log('\tPreprocessing to convert unicode characters')
|
||||
try:
|
||||
data = open(fname, 'rb').read()
|
||||
from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
|
||||
tokenizer = RtfTokenizer(data)
|
||||
tokens = RtfTokenParser(tokenizer.tokens)
|
||||
data = tokens.toRTF()
|
||||
fname = 'preprocessed.rtf'
|
||||
with open(fname, 'wb') as f:
|
||||
f.write(data)
|
||||
except:
|
||||
self.log.exception(
|
||||
'Failed to preprocess RTF to convert unicode sequences, ignoring...')
|
||||
return fname
|
||||
# def preprocess(self, fname):
|
||||
# self.log('\tPreprocessing to convert unicode characters')
|
||||
# try:
|
||||
# data = open(fname, 'rb').read()
|
||||
# from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
|
||||
# tokenizer = RtfTokenizer(data)
|
||||
# tokens = RtfTokenParser(tokenizer.tokens)
|
||||
# data = tokens.toRTF()
|
||||
# fname = 'preprocessed.rtf'
|
||||
# with open(fname, 'wb') as f:
|
||||
# f.write(data)
|
||||
# except:
|
||||
# self.log.exception(
|
||||
# 'Failed to preprocess RTF to convert unicode sequences, ignoring...')
|
||||
# return fname
|
||||
|
||||
def convert_borders(self, doc):
|
||||
border_styles = []
|
||||
@ -269,17 +283,14 @@ class RTFInput(InputFormatPlugin):
|
||||
self.log = log
|
||||
self.log('Converting RTF to XML...')
|
||||
#Name of the preprocessed RTF file
|
||||
fname = self.preprocess(stream.name)
|
||||
# fname = self.preprocess(stream.name)
|
||||
try:
|
||||
xml = self.generate_xml(fname)
|
||||
xml = self.generate_xml(stream.name)
|
||||
except RtfInvalidCodeException, e:
|
||||
raise
|
||||
raise ValueError(_('This RTF file has a feature calibre does not '
|
||||
'support. Convert it to HTML first and then try it.\n%s')%e)
|
||||
|
||||
'''dataxml = open('dataxml.xml', 'w')
|
||||
dataxml.write(xml)
|
||||
dataxml.close'''
|
||||
|
||||
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
|
||||
if d:
|
||||
imap = {}
|
||||
|
@ -17,7 +17,8 @@
|
||||
#########################################################################
|
||||
# $Revision: 1.41 $
|
||||
# $Date: 2006/03/24 23:50:07 $
|
||||
import sys,os
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import headings_to_sections, \
|
||||
line_endings, footnote, fields_small, default_encoding, \
|
||||
make_lists, preamble_div, header, colors, group_borders, \
|
||||
@ -90,7 +91,6 @@ class ParseRtf:
|
||||
out_file = '',
|
||||
out_dir = None,
|
||||
dtd = '',
|
||||
#debug = 0, #why? calibre
|
||||
deb_dir = None,
|
||||
convert_symbol = None,
|
||||
convert_wingdings = None,
|
||||
@ -107,6 +107,7 @@ class ParseRtf:
|
||||
no_dtd = 0,
|
||||
char_data = '',
|
||||
):
|
||||
|
||||
"""
|
||||
Requires:
|
||||
'file' --file to parse
|
||||
@ -119,12 +120,11 @@ class ParseRtf:
|
||||
script tries to output to the directory where the script is executed.)
|
||||
'deb_dir' --debug directory. If a debug_dir is provided, the script
|
||||
will copy each run through as a file to examine in the debug_dir
|
||||
'perl_script'--use perl to make tokens. This runs just a bit faster.
|
||||
(I will probably phase this out.)
|
||||
'check_brackets' -- make sure the brackets match up after each run
|
||||
through a file. Only for debugging.
|
||||
Returns: Nothing
|
||||
"""
|
||||
|
||||
self.__file = in_file
|
||||
self.__out_file = out_file
|
||||
self.__out_dir = out_dir
|
||||
@ -132,7 +132,7 @@ class ParseRtf:
|
||||
self.__dtd_path = dtd
|
||||
self.__check_file(in_file,"file_to_parse")
|
||||
self.__char_data = char_data
|
||||
self.__debug_dir = deb_dir #self.__debug_dir = debug calibre
|
||||
self.__debug_dir = deb_dir
|
||||
self.__check_dir(self.__temp_dir)
|
||||
self.__copy = self.__check_dir(self.__debug_dir)
|
||||
self.__convert_caps = convert_caps
|
||||
@ -155,25 +155,24 @@ class ParseRtf:
|
||||
if hasattr(the_file, 'read'): return
|
||||
if the_file == None:
|
||||
if type == "file_to_parse":
|
||||
message = "You must provide a file for the script to work"
|
||||
msg = message
|
||||
msg = "\nYou must provide a file for the script to work"
|
||||
raise RtfInvalidCodeException, msg
|
||||
elif os.path.exists(the_file):
|
||||
pass # do nothing
|
||||
else:
|
||||
message = "The file '%s' cannot be found" % the_file
|
||||
msg = message
|
||||
msg = "\nThe file '%s' cannot be found" % the_file
|
||||
raise RtfInvalidCodeException, msg
|
||||
|
||||
def __check_dir(self, the_dir):
|
||||
"""Check to see if directory exists"""
|
||||
if not the_dir :
|
||||
return
|
||||
dir_exists = os.path.isdir(the_dir)
|
||||
if not dir_exists:
|
||||
message = "%s is not a directory" % the_dir
|
||||
msg = message
|
||||
msg = "\n%s is not a directory" % the_dir
|
||||
raise RtfInvalidCodeException, msg
|
||||
return 1
|
||||
|
||||
def parse_rtf(self):
|
||||
"""
|
||||
Parse the file by calling on other classes.
|
||||
@ -194,13 +193,14 @@ class ParseRtf:
|
||||
copy_obj.set_dir(self.__debug_dir)
|
||||
copy_obj.remove_files()
|
||||
copy_obj.copy_file(self.__temp_file, "original_file")
|
||||
# new as of 2005-08-02. Do I want this?
|
||||
# Function to check if bracket are well handled
|
||||
if self.__debug_dir or self.__run_level > 2:
|
||||
self.__check_brack_obj = check_brackets.CheckBrackets\
|
||||
(file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
# convert Macintosh line endings to Unix line endings
|
||||
#convert Macintosh and Windows line endings to Unix line endings
|
||||
#why do this if you don't wb after?
|
||||
line_obj = line_endings.FixLineEndings(
|
||||
in_file = self.__temp_file,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
@ -208,13 +208,13 @@ class ParseRtf:
|
||||
run_level = self.__run_level,
|
||||
replace_illegals = self.__replace_illegals,
|
||||
)
|
||||
return_value = line_obj.fix_endings()
|
||||
return_value = line_obj.fix_endings() #calibre return what?
|
||||
self.__return_code(return_value)
|
||||
tokenize_obj = tokenize.Tokenize(
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
in_file = self.__temp_file,
|
||||
copy = self.__copy,
|
||||
run_level = self.__run_level,)
|
||||
run_level = self.__run_level)
|
||||
tokenize_obj.tokenize()
|
||||
process_tokens_obj = process_tokens.ProcessTokens(
|
||||
in_file = self.__temp_file,
|
||||
@ -230,12 +230,25 @@ class ParseRtf:
|
||||
os.remove(self.__temp_file)
|
||||
except OSError:
|
||||
pass
|
||||
#Check to see if the file is correctly encoded
|
||||
encode_obj = default_encoding.DefaultEncoding(
|
||||
in_file = self.__temp_file,
|
||||
run_level = self.__run_level,
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
check_raw = True,
|
||||
)
|
||||
platform, code_page, default_font_num = encode_obj.find_default_encoding()
|
||||
check_encoding_obj = check_encoding.CheckEncoding(
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
check_encoding_obj.check_encoding(self.__file)
|
||||
sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
|
||||
raise InvalidRtfException, msg
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
enc = encode_obj.get_codepage()
|
||||
if enc != 'mac_roman':
|
||||
enc = 'cp' + enc
|
||||
if check_encoding_obj.check_encoding(self.__file, enc):
|
||||
file_name = self.__file if isinstance(self.__file, str) \
|
||||
else self.__file.encode('utf-8')
|
||||
msg = 'File %s does not appear to be correctly encoded.\n' % file_name
|
||||
raise InvalidRtfException, msg
|
||||
delete_info_obj = delete_info.DeleteInfo(
|
||||
in_file = self.__temp_file,
|
||||
copy = self.__copy,
|
||||
@ -508,6 +521,7 @@ class ParseRtf:
|
||||
indent = self.__indent,
|
||||
run_level = self.__run_level,
|
||||
no_dtd = self.__no_dtd,
|
||||
encoding = encode_obj.get_codepage(),
|
||||
bug_handler = RtfInvalidCodeException,
|
||||
)
|
||||
tags_obj.convert_to_tags()
|
||||
@ -520,35 +534,28 @@ class ParseRtf:
|
||||
output_obj.output()
|
||||
os.remove(self.__temp_file)
|
||||
return self.__exit_level
|
||||
|
||||
def __bracket_match(self, file_name):
|
||||
if self.__run_level > 2:
|
||||
good_br, msg = self.__check_brack_obj.check_brackets()
|
||||
if good_br:
|
||||
pass
|
||||
# sys.stderr.write( msg + ' in ' + file_name + "\n")
|
||||
#sys.stderr.write( msg + ' in ' + file_name + "\n")
|
||||
else:
|
||||
msg += msg + " in file '" + file_name + "'\n"
|
||||
msg = '%s in file %s\n' % (msg, file_name)
|
||||
raise RtfInvalidCodeException, msg
|
||||
|
||||
def __return_code(self, num):
|
||||
if num == None:
|
||||
return
|
||||
if int(num) > self.__exit_level:
|
||||
self.__exit_level = num
|
||||
if num == None:
|
||||
return
|
||||
if int(num) > self.__exit_level:
|
||||
self.__exit_level = num
|
||||
|
||||
def __make_temp_file(self,file):
|
||||
"""Make a temporary file to parse"""
|
||||
write_file="rtf_write_file"
|
||||
read_obj = file if hasattr(file, 'read') else open(file,'r')
|
||||
write_obj = open(write_file, 'w')
|
||||
line = "dummy"
|
||||
while line:
|
||||
line = read_obj.read(1000)
|
||||
write_obj.write(line )
|
||||
write_obj.close()
|
||||
with open(write_file, 'wb') as write_obj:
|
||||
for line in read_obj:
|
||||
write_obj.write(line)
|
||||
return write_file
|
||||
"""
|
||||
mi<tg<open______<style-sheet\n
|
||||
mi<tg<close_____<style-sheet\n
|
||||
mi<tg<open-att__<footnote<num>1\n
|
||||
mi<tg<empty-att_<page-definition<margin>33\n
|
||||
mi<tg<empty_____<para\n
|
||||
"""
|
||||
|
@ -24,38 +24,38 @@ class CheckBrackets:
|
||||
self.__ob_count = 0
|
||||
self.__cb_count = 0
|
||||
self.__open_bracket_num = []
|
||||
|
||||
def open_brack(self, line):
|
||||
num = line[-5:-1]
|
||||
self.__open_bracket_num.append(num)
|
||||
self.__bracket_count += 1
|
||||
|
||||
def close_brack(self, line):
|
||||
num = line[-5:-1]
|
||||
##self.__open_bracket_num.append(num)
|
||||
try:
|
||||
last_num = self.__open_bracket_num.pop()
|
||||
except:
|
||||
return 0
|
||||
return False
|
||||
if num != last_num:
|
||||
return 0
|
||||
return False
|
||||
self.__bracket_count -= 1
|
||||
return 1
|
||||
return True
|
||||
|
||||
def check_brackets(self):
|
||||
read_obj = open(self.__file, 'r')
|
||||
line = 'dummy'
|
||||
line_count = 0
|
||||
while line:
|
||||
line_count += 1
|
||||
line = read_obj.readline()
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.open_brack(line)
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
right_count = self.close_brack(line)
|
||||
if not right_count:
|
||||
return (0, "closed bracket doesn't match, line %s" % line_count)
|
||||
read_obj.close()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
for line in read_obj:
|
||||
line_count += 1
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.open_brack(line)
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
if not self.close_brack(line):
|
||||
return (False, "closed bracket doesn't match, line %s" % line_count)
|
||||
|
||||
if self.__bracket_count != 0:
|
||||
msg = 'At end of file open and closed brackets don\'t match\n'
|
||||
msg = msg + 'total number of brackets is %s' % self.__bracket_count
|
||||
return (0, msg)
|
||||
return (1, "brackets match!")
|
||||
msg = ('At end of file open and closed brackets don\'t match\n' \
|
||||
'total number of brackets is %s') % self.__bracket_count
|
||||
return (False, msg)
|
||||
return (True, "Brackets match!")
|
||||
|
||||
|
@ -1,8 +1,11 @@
|
||||
#!/usr/bin/env python
|
||||
import sys
|
||||
|
||||
class CheckEncoding:
|
||||
|
||||
def __init__(self, bug_handler):
|
||||
self.__bug_handler = bug_handler
|
||||
|
||||
def __get_position_error(self, line, encoding, line_num):
|
||||
char_position = 0
|
||||
for char in line:
|
||||
@ -12,21 +15,23 @@ class CheckEncoding:
|
||||
except UnicodeError, msg:
|
||||
sys.stderr.write('line: %s char: %s\n' % (line_num, char_position))
|
||||
sys.stderr.write(str(msg) + '\n')
|
||||
def check_encoding(self, path, encoding='us-ascii'):
|
||||
read_obj = open(path, 'r')
|
||||
line_to_read = 1
|
||||
|
||||
def check_encoding(self, path, encoding='us-ascii', verbose=True):
|
||||
line_num = 0
|
||||
while line_to_read:
|
||||
line_num += 1
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
try:
|
||||
line.decode(encoding)
|
||||
except UnicodeError:
|
||||
if len(line) < 1000:
|
||||
self.__get_position_error(line, encoding, line_num)
|
||||
else:
|
||||
sys.stderr.write('line: %d has bad encoding\n'%line_num)
|
||||
with open(path, 'r') as read_obj:
|
||||
for line in read_obj:
|
||||
line_num += 1
|
||||
try:
|
||||
line.decode(encoding)
|
||||
except UnicodeError:
|
||||
if verbose:
|
||||
if len(line) < 1000:
|
||||
self.__get_position_error(line, encoding, line_num)
|
||||
else:
|
||||
sys.stderr.write('line: %d has bad encoding\n' % line_num)
|
||||
return True
|
||||
return False
|
||||
|
||||
if __name__ == '__main__':
|
||||
check_encoding_obj = CheckEncoding()
|
||||
check_encoding_obj.check_encoding(sys.argv[1])
|
||||
|
@ -16,7 +16,9 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import os, tempfile
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
|
||||
class CombineBorders:
|
||||
"""Combine borders in RTF tokens to make later processing easier"""
|
||||
def __init__(self,
|
||||
@ -32,28 +34,31 @@ class CombineBorders:
|
||||
self.__state = 'default'
|
||||
self.__bord_pos = 'default'
|
||||
self.__bord_att = []
|
||||
|
||||
def found_bd(self, line):
|
||||
#cw<bd<bor-t-r-vi
|
||||
self.__state = 'border'
|
||||
self.__bord_pos = line[6:16]
|
||||
|
||||
def __default_func(self, line):
|
||||
#cw<bd<bor-t-r-vi
|
||||
if self.__first_five == 'cw<bd':
|
||||
self.found_bd(line)
|
||||
return ''
|
||||
return line
|
||||
|
||||
def end_border(self, line, write_obj):
|
||||
joiner = "|"
|
||||
border_string = joiner.join(self.__bord_att)
|
||||
border_string = "|".join(self.__bord_att)
|
||||
self.__bord_att = []
|
||||
write_obj.write('cw<bd<%s<nu<%s\n' % (self.__bord_pos,
|
||||
border_string))
|
||||
border_string))
|
||||
self.__state = 'default'
|
||||
self.__bord_string = ''
|
||||
if self.__first_five == 'cw<bd':
|
||||
self. found_bd(line)
|
||||
else:
|
||||
write_obj.write(line)
|
||||
|
||||
def add_to_border_desc(self, line):
|
||||
#cw<bt<bdr-hair__<nu<true
|
||||
#cw<bt<bdr-linew<nu<0.50
|
||||
@ -65,26 +70,22 @@ class CombineBorders:
|
||||
else:
|
||||
num = ':' + num
|
||||
self.__bord_att.append(border_desc + num)
|
||||
|
||||
def __border_func(self, line, write_obj):
|
||||
if self.__first_five != 'cw<bt':
|
||||
self.end_border(line, write_obj)
|
||||
else:
|
||||
self.add_to_border_desc(line)
|
||||
|
||||
def combine_borders(self):
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 'dummy'
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__first_five = line[0:5]
|
||||
if self.__state == 'border':
|
||||
self.__border_func(line, write_obj)
|
||||
else:
|
||||
to_print = self.__default_func(line)
|
||||
write_obj.write(to_print)
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__write_to, 'w') as write_obj:
|
||||
for line in read_obj:
|
||||
self.__first_five = line[0:5]
|
||||
if self.__state == 'border':
|
||||
self.__border_func(line, write_obj)
|
||||
else:
|
||||
write_obj.write(self.__default_func(line))
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "combine_borders.data")
|
||||
|
@ -1,6 +1,9 @@
|
||||
import os, tempfile
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
import os, tempfile, sys
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy, check_encoding
|
||||
|
||||
public_dtd = 'rtf2xml1.0.dtd'
|
||||
|
||||
class ConvertToTags:
|
||||
"""
|
||||
Convert file to XML
|
||||
@ -10,6 +13,7 @@ class ConvertToTags:
|
||||
bug_handler,
|
||||
dtd_path,
|
||||
no_dtd,
|
||||
encoding,
|
||||
indent = None,
|
||||
copy = None,
|
||||
run_level = 1,
|
||||
@ -29,9 +33,14 @@ class ConvertToTags:
|
||||
self.__copy = copy
|
||||
self.__dtd_path = dtd_path
|
||||
self.__no_dtd = no_dtd
|
||||
if encoding != 'mac_roman':
|
||||
self.__encoding = 'cp' + encoding
|
||||
else:
|
||||
self.__encoding = 'mac_roman'
|
||||
self.__indent = indent
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Set values, including those for the dictionary.
|
||||
@ -61,6 +70,7 @@ class ConvertToTags:
|
||||
'tx<ut<__________' : self.__text_func,
|
||||
'mi<tg<empty_____' : self.__empty_func,
|
||||
}
|
||||
|
||||
def __open_func(self, line):
|
||||
"""
|
||||
Print the opening tag and newlines when needed.
|
||||
@ -73,6 +83,7 @@ class ConvertToTags:
|
||||
if info in self.__two_new_line:
|
||||
self.__write_extra_new_line()
|
||||
self.__write_obj.write('<%s>' % info)
|
||||
|
||||
def __empty_func(self, line):
|
||||
"""
|
||||
Print out empty tag and newlines when needed.
|
||||
@ -85,10 +96,11 @@ class ConvertToTags:
|
||||
self.__write_new_line()
|
||||
if info in self.__two_new_line:
|
||||
self.__write_extra_new_line()
|
||||
|
||||
def __open_att_func(self, line):
|
||||
"""
|
||||
Process lines for open tags that have attributes.
|
||||
The important infor is between [17:-1]. Take this info and split it
|
||||
The important info is between [17:-1]. Take this info and split it
|
||||
with the delimiter '<'. The first token in this group is the element
|
||||
name. The rest are attributes, separated from their values by '>'. So
|
||||
read each token one at a time, and split them by '>'.
|
||||
@ -119,6 +131,7 @@ class ConvertToTags:
|
||||
self.__write_new_line()
|
||||
if element_name in self.__two_new_line:
|
||||
self.__write_extra_new_line()
|
||||
|
||||
def __empty_att_func(self, line):
|
||||
"""
|
||||
Same as the __open_att_func, except a '/' is placed at the end of the tag.
|
||||
@ -143,6 +156,7 @@ class ConvertToTags:
|
||||
self.__write_new_line()
|
||||
if element_name in self.__two_new_line:
|
||||
self.__write_extra_new_line()
|
||||
|
||||
def __close_func(self, line):
|
||||
"""
|
||||
Print out the closed tag and new lines, if appropriate.
|
||||
@ -156,6 +170,7 @@ class ConvertToTags:
|
||||
self.__write_new_line()
|
||||
if info in self.__two_new_line:
|
||||
self.__write_extra_new_line()
|
||||
|
||||
def __text_func(self, line):
|
||||
"""
|
||||
Simply print out the information between [17:-1]
|
||||
@ -163,6 +178,7 @@ class ConvertToTags:
|
||||
#tx<nu<__________<Normal;
|
||||
# change this!
|
||||
self.__write_obj.write(line[17:-1])
|
||||
|
||||
def __write_extra_new_line(self):
|
||||
"""
|
||||
Print out extra new lines if the new lines have not exceeded two. If
|
||||
@ -172,8 +188,10 @@ class ConvertToTags:
|
||||
return
|
||||
if self.__new_line < 2:
|
||||
self.__write_obj.write('\n')
|
||||
|
||||
def __default_func(self, line):
|
||||
pass
|
||||
|
||||
def __write_new_line(self):
|
||||
"""
|
||||
Print out a new line if a new line has not already been printed out.
|
||||
@ -183,11 +201,23 @@ class ConvertToTags:
|
||||
if not self.__new_line:
|
||||
self.__write_obj.write('\n')
|
||||
self.__new_line += 1
|
||||
|
||||
def __write_dec(self):
|
||||
"""
|
||||
Write the XML declaration at the top of the document.
|
||||
"""
|
||||
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
|
||||
#keep maximum compatibility with previous version
|
||||
check_encoding_obj = check_encoding.CheckEncoding(
|
||||
bug_handler=self.__bug_handler)
|
||||
|
||||
if not check_encoding_obj.check_encoding(self.__file, verbose=False):
|
||||
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
|
||||
elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
|
||||
self.__write_obj.write('<?xml version="1.0" encoding="%s" ?>' % self.__encoding)
|
||||
else:
|
||||
self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
|
||||
sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
|
||||
' hope for the best')
|
||||
self.__new_line = 0
|
||||
self.__write_new_line()
|
||||
if self.__no_dtd:
|
||||
@ -207,6 +237,7 @@ class ConvertToTags:
|
||||
)
|
||||
self.__new_line = 0
|
||||
self.__write_new_line()
|
||||
|
||||
def convert_to_tags(self):
|
||||
"""
|
||||
Read in the file one line at a time. Get the important info, between
|
||||
@ -222,18 +253,14 @@ class ConvertToTags:
|
||||
an empty tag function.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
self.__write_dec()
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__token_info)
|
||||
if action != None:
|
||||
action(line)
|
||||
read_obj.close()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__token_info)
|
||||
if action is not None:
|
||||
action(line)
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
|
@ -23,6 +23,7 @@ class Copy:
|
||||
def __init__(self, bug_handler, file = None, deb_dir = None, ):
|
||||
self.__file = file
|
||||
self.__bug_handler = bug_handler
|
||||
|
||||
def set_dir(self, deb_dir):
|
||||
"""Set the temporary directory to write files to"""
|
||||
if deb_dir is None:
|
||||
@ -33,19 +34,11 @@ class Copy:
|
||||
message = "%(deb_dir)s is not a directory" % vars()
|
||||
raise self.__bug_handler , message
|
||||
Copy.__dir = deb_dir
|
||||
|
||||
def remove_files(self ):
|
||||
"""Remove files from directory"""
|
||||
self.__remove_the_files(Copy.__dir)
|
||||
"""
|
||||
list_of_files = os.listdir(Copy.__dir)
|
||||
list_of_files = os.listdir(the_dir)
|
||||
for file in list_of_files:
|
||||
rem_file = os.path.join(Copy.__dir,file)
|
||||
if os.path.isdir(rem_file):
|
||||
self.remove_files(rem_file)
|
||||
else:
|
||||
os.remove(rem_file)
|
||||
"""
|
||||
|
||||
def __remove_the_files(self, the_dir):
|
||||
"""Remove files from directory"""
|
||||
list_of_files = os.listdir(the_dir)
|
||||
@ -58,6 +51,7 @@ class Copy:
|
||||
os.remove(rem_file)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
def copy_file(self, file, new_file):
|
||||
"""
|
||||
Copy the file to a new name
|
||||
|
@ -1,61 +1,142 @@
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License #
|
||||
# along with this program; if not, write to the Free Software #
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
||||
# 02111-1307 USA #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
|
||||
'''
|
||||
Codepages as to RTF 1.9.1:
|
||||
437 United States IBM
|
||||
708 Arabic (ASMO 708)
|
||||
709 Arabic (ASMO 449+, BCON V4)
|
||||
710 Arabic (transparent Arabic)
|
||||
711 Arabic (Nafitha Enhanced)
|
||||
720 Arabic (transparent ASMO)
|
||||
819 Windows 3.1 (United States and Western Europe)
|
||||
850 IBM multilingual
|
||||
852 Eastern European
|
||||
860 Portuguese
|
||||
862 Hebrew
|
||||
863 French Canadian
|
||||
864 Arabic
|
||||
865 Norwegian
|
||||
866 Soviet Union
|
||||
874 Thai
|
||||
932 Japanese
|
||||
936 Simplified Chinese
|
||||
949 Korean
|
||||
950 Traditional Chinese
|
||||
1250 Eastern European
|
||||
1251 Cyrillic
|
||||
1252 Western European
|
||||
1253 Greek
|
||||
1254 Turkish
|
||||
1255 Hebrew
|
||||
1256 Arabic
|
||||
1257 Baltic
|
||||
1258 Vietnamese
|
||||
1361 Johab
|
||||
10000 MAC Roman
|
||||
10001 MAC Japan
|
||||
10004 MAC Arabic
|
||||
10005 MAC Hebrew
|
||||
10006 MAC Greek
|
||||
10007 MAC Cyrillic
|
||||
10029 MAC Latin2
|
||||
10081 MAC Turkish
|
||||
57002 Devanagari
|
||||
57003 Bengali
|
||||
57004 Tamil
|
||||
57005 Telugu
|
||||
57006 Assamese
|
||||
57007 Oriya
|
||||
57008 Kannada
|
||||
57009 Malayalam
|
||||
57010 Gujarati
|
||||
57011 Punjabi
|
||||
'''
|
||||
import re
|
||||
|
||||
class DefaultEncoding:
|
||||
"""
|
||||
Find the default encoding for the doc
|
||||
"""
|
||||
def __init__(self, in_file, bug_handler, run_level = 1,):
|
||||
"""
|
||||
Required:
|
||||
'file'
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False):
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__platform = 'Windows'
|
||||
self.__default_num = 'not-defined'
|
||||
self.__code_page = '1252'
|
||||
self.__datafetched = False
|
||||
self.__fetchraw = check_raw
|
||||
|
||||
def find_default_encoding(self):
|
||||
platform = 'Windows'
|
||||
default_num = 'not-defined'
|
||||
code_page = 'ansicpg1252'
|
||||
read_obj = open(self.__file, 'r')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'mi<mk<rtfhed-end':
|
||||
break
|
||||
if self.__token_info == 'cw<ri<ansi-codpg':
|
||||
#cw<ri<ansi-codpg<nu<10000
|
||||
num = line[20:-1]
|
||||
if not num:
|
||||
num = '1252'
|
||||
code_page = 'ansicpg' + num
|
||||
if self.__token_info == 'cw<ri<macintosh_':
|
||||
platform = 'Macintosh'
|
||||
if self.__token_info == 'cw<ri<deflt-font':
|
||||
default_num = line[20:-1]
|
||||
#cw<ri<deflt-font<nu<0
|
||||
#action = self.__state_dict.get(self.__state)
|
||||
#if action == None:
|
||||
#print self.__state
|
||||
#action(line)
|
||||
read_obj.close()
|
||||
if platform == 'Macintosh':
|
||||
code_page = 'mac_roman'
|
||||
return platform, code_page, default_num
|
||||
if not self.__datafetched:
|
||||
self._encoding()
|
||||
self.__datafetched = True
|
||||
if self.__platform == 'Macintosh':
|
||||
code_page = self.__code_page
|
||||
else:
|
||||
code_page = 'ansicpg' + self.__code_page
|
||||
return self.__platform, code_page, self.__default_num
|
||||
|
||||
def get_codepage(self):
|
||||
if not self.__datafetched:
|
||||
self._encoding()
|
||||
self.__datafetched = True
|
||||
return self.__code_page
|
||||
|
||||
def get_platform(self):
    """
    Return the stored platform name, scanning the file first if that has
    not been done yet.
    """
    if not self.__datafetched:
        self._encoding()
        self.__datafetched = True
    return self.__platform
|
||||
|
||||
def _encoding(self):
    """
    Scan the file and update the stored platform, code page and default
    font number.

    Token mode (check_raw false): read tokenized lines up to the
    'mi<mk<rtfhed-end' marker and pick up the code page, platform and
    default font tokens.

    Raw mode (check_raw true): search each line with regular expressions
    for \\ansicpgN or one of \\mac, \\pc, \\pca. Only the code page is
    updated in this mode; scanning stops at the first \\ansicpgN.
    """
    with open(self.__file, 'r') as read_obj:
        if not self.__fetchraw:
            for line in read_obj:
                self.__token_info = line[:16]
                if self.__token_info == 'mi<mk<rtfhed-end':
                    break
                if self.__token_info == 'cw<ri<ansi-codpg':
                    # cw<ri<ansi-codpg<nu<10000
                    num = line[20:-1]
                    # An empty, non-numeric or zero value falls back to
                    # 1252 instead of raising ValueError from int().
                    if num.isdigit() and int(num):
                        self.__code_page = num
                    else:
                        self.__code_page = '1252'
                if self.__token_info == 'cw<ri<macintosh_':
                    self.__platform = 'Macintosh'
                    self.__code_page = 'mac_roman'
                elif self.__token_info == 'cw<ri<pc________':
                    self.__platform = 'IBMPC'
                    self.__code_page = '437'
                elif self.__token_info == 'cw<ri<pca_______':
                    self.__platform = 'OS/2'
                    self.__code_page = '850'
                if self.__token_info == 'cw<ri<deflt-font':
                    # cw<ri<deflt-font<nu<0
                    self.__default_num = line[20:-1]
        else:
            fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
            fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
            for line in read_obj:
                cp_match = fenccp.search(line)
                if cp_match:
                    cp = cp_match.group(1)
                    # The test used to be inverted ("if not int(cp)"), so
                    # a real code page was never stored. \ansicpg0 keeps
                    # the current value.
                    if int(cp):
                        self.__code_page = cp
                    break
                enc_match = fenc.search(line)
                if enc_match:
                    enc = enc_match.group(1)
                    if enc == 'mac':
                        self.__code_page = 'mac_roman'
                    elif enc == 'pc':
                        self.__code_page = '437'
                    elif enc == 'pca':
                        self.__code_page = '850'
|
||||
|
||||
# if __name__ == '__main__':
|
||||
# encode_obj = DefaultEncoding(
|
||||
# in_file = sys.argv[1],
|
||||
# bug_handler = Exception,
|
||||
# check_raw = True,
|
||||
# )
|
||||
# print encode_obj.get_codepage()
|
||||
|
@ -16,7 +16,9 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
|
||||
class DeleteInfo:
|
||||
"""Delelet unecessary destination groups"""
|
||||
def __init__(self,
|
||||
@ -29,17 +31,18 @@ class DeleteInfo:
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__bracket_count=0
|
||||
self.__bracket_count= 0
|
||||
self.__ob_count = 0
|
||||
self.__cb_count = 0
|
||||
self.__after_asterisk = 0
|
||||
self.__delete = 0
|
||||
# self.__after_asterisk = False
|
||||
# self.__delete = 0
|
||||
self.__initiate_allow()
|
||||
self.__ob = 0
|
||||
self.__write_cb = 0
|
||||
self.__write_cb = False
|
||||
self.__run_level = run_level
|
||||
self.__found_delete = 0
|
||||
self.__list = 0
|
||||
self.__found_delete = False
|
||||
# self.__list = False
|
||||
|
||||
def __initiate_allow(self):
|
||||
"""
|
||||
Initiate a list of destination groups which should be printed out.
|
||||
@ -66,9 +69,10 @@ class DeleteInfo:
|
||||
self.__state_dict = {
|
||||
'default' : self.__default_func,
|
||||
'after_asterisk' : self.__asterisk_func,
|
||||
'delete' : self.__delete_func,
|
||||
'delete' : self.__delete_func,
|
||||
'list' : self.__list_func,
|
||||
}
|
||||
|
||||
def __default_func(self,line):
|
||||
"""Handle lines when in no special state. Look for an asterisk to
|
||||
begin a special state. Otherwise, print out line."""
|
||||
@ -81,27 +85,29 @@ class DeleteInfo:
|
||||
if self.__ob:
|
||||
self.__write_obj.write(self.__ob)
|
||||
self.__ob = line
|
||||
return 0
|
||||
return False
|
||||
else:
|
||||
# write previous bracket, since didn't fine asterisk
|
||||
if self.__ob:
|
||||
self.__write_obj.write(self.__ob)
|
||||
self.__ob = 0
|
||||
return 1
|
||||
return True
|
||||
|
||||
def __delete_func(self,line):
    """Handle lines when in delete state. Don't print out lines
    unless the state has ended.

    Returns True when the current line should be written, which only
    happens for the line that ends the delete state while the write-cb
    flag is set. Returns False otherwise.
    """
    if self.__delete_count == self.__cb_count:
        self.__state = 'default'
        if self.__write_cb:
            # Reset the flag once it has been used; setting it to True
            # here (as the cleanup did) left it permanently set.
            self.__write_cb = False
            return True
    return False
|
||||
|
||||
def __asterisk_func(self,line):
|
||||
"""
|
||||
Determine whether to delete info in group
|
||||
Note on self.__cb flag.
|
||||
If you find that you are in a delete group, and the preivous
|
||||
If you find that you are in a delete group, and the previous
|
||||
token in not an open bracket (self.__ob = 0), that means
|
||||
that the delete group is nested inside another acceptable
|
||||
detination group. In this case, you have alrady written
|
||||
@ -110,21 +116,21 @@ class DeleteInfo:
|
||||
"""
|
||||
# Test for {\*}, in which case don't enter
|
||||
# delete state
|
||||
self.__after_asterisk = 0 # only enter this function once
|
||||
self.__found_delete = 1
|
||||
# self.__after_asterisk = False # only enter this function once
|
||||
self.__found_delete = True
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
if self.__delete_count == self.__cb_count:
|
||||
self.__state = 'default'
|
||||
self.__ob = 0
|
||||
# changed this because haven't printed out start
|
||||
return 0
|
||||
return False
|
||||
else:
|
||||
# not sure what happens here!
|
||||
# believe I have a '{\*}
|
||||
if self.__run_level > 3:
|
||||
msg = 'flag problem\n'
|
||||
raise self.__bug_handler, msg
|
||||
return 1
|
||||
return True
|
||||
elif self.__token_info in self.__allowable :
|
||||
if self.__ob:
|
||||
self.__write_obj.write(self.__ob)
|
||||
@ -132,85 +138,81 @@ class DeleteInfo:
|
||||
self.__state = 'default'
|
||||
else:
|
||||
pass
|
||||
return 1
|
||||
return True
|
||||
elif self.__token_info == 'cw<ls<list______':
|
||||
self.__ob = 0
|
||||
self.__found_list_func(line)
|
||||
elif self.__token_info in self.__not_allowable:
|
||||
if not self.__ob:
|
||||
self.__write_cb = 1
|
||||
self.__write_cb = True
|
||||
self.__ob = 0
|
||||
self.__state = 'delete'
|
||||
self.__cb_count = 0
|
||||
return 0
|
||||
return False
|
||||
else:
|
||||
if self.__run_level > 5:
|
||||
msg = 'After an asterisk, and found neither an allowable or non-allowble token\n'
|
||||
msg += 'token is "%s"\n' % self.__token_info
|
||||
raise self.__bug_handler
|
||||
msg = ('After an asterisk, and found neither an allowable or non-allowable token\n\
|
||||
token is "%s"\n') % self.__token_info
|
||||
raise self.__bug_handler, msg
|
||||
if not self.__ob:
|
||||
self.__write_cb = 1
|
||||
self.__write_cb = True
|
||||
self.__ob = 0
|
||||
self.__state = 'delete'
|
||||
self.__cb_count = 0
|
||||
return 0
|
||||
return False
|
||||
|
||||
def __found_list_func(self, line):
    """
    Switch the state machine to the 'list' state.

    'line' is accepted for signature compatibility and is not used.
    Returns nothing.
    """
    self.__state = 'list'
|
||||
|
||||
def __list_func(self, line):
    """
    Check to see if the group has ended.

    On the closing bracket whose count matches the stored delete count,
    return to the 'default' state; that line is written only when the
    write-cb flag is set, and the flag is then cleared.
    Return True for all control words.
    Return False otherwise.
    """
    if self.__delete_count == self.__cb_count and self.__token_info ==\
            'cb<nu<clos-brack':
        self.__state = 'default'
        if self.__write_cb:
            self.__write_cb = False
            return True
        return False
    elif line[0:2] == 'cw':
        return True
    else:
        return False
|
||||
|
||||
def delete_info(self):
|
||||
"""Main method for handling other methods. Read one line in at
|
||||
a time, and determine wheter to print the line based on the state."""
|
||||
line_to_read = 'dummy'
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
while line_to_read:
|
||||
#ob<nu<open-brack<0001
|
||||
to_print =1
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if not action:
|
||||
sys.stderr.write('No action in dictionary state is "%s" \n'
|
||||
% self.__state)
|
||||
to_print = action(line)
|
||||
"""
|
||||
if self.__after_asterisk:
|
||||
to_print = self.__asterisk_func(line)
|
||||
elif self.__list:
|
||||
self.__in_list_func(line)
|
||||
elif self.__delete:
|
||||
to_print = self.__delete_func(line)
|
||||
else:
|
||||
to_print = self.__default_func(line)
|
||||
"""
|
||||
if to_print:
|
||||
self.__write_obj.write(line)
|
||||
self.__write_obj.close()
|
||||
read_obj.close()
|
||||
a time, and determine whether to print the line based on the state."""
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
for line in read_obj:
|
||||
#ob<nu<open-brack<0001
|
||||
to_print = True
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if not action:
|
||||
sys.stderr.write(_('No action in dictionary state is "%s" \n')
|
||||
% self.__state)
|
||||
to_print = action(line)
|
||||
# if self.__after_asterisk:
|
||||
# to_print = self.__asterisk_func(line)
|
||||
# elif self.__list:
|
||||
# self.__in_list_func(line)
|
||||
# elif self.__delete:
|
||||
# to_print = self.__delete_func(line)
|
||||
# else:
|
||||
# to_print = self.__default_func(line)
|
||||
if to_print:
|
||||
self.__write_obj.write(line)
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "delete_info.data")
|
||||
|
@ -16,7 +16,9 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import os, tempfile
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
|
||||
class Footnote:
|
||||
"""
|
||||
Two public methods are available. The first separates all of the
|
||||
@ -35,6 +37,7 @@ class Footnote:
|
||||
self.__copy = copy
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__found_a_footnote = 0
|
||||
|
||||
def __first_line_func(self, line):
|
||||
"""
|
||||
Print the tag info for footnotes. Check whether footnote is an
|
||||
@ -47,6 +50,7 @@ class Footnote:
|
||||
self.__write_to_foot_obj.write(
|
||||
'mi<tg<open-att__<footnote<num>%s\n' % self.__footnote_count)
|
||||
self.__first_line = 0
|
||||
|
||||
def __in_footnote_func(self, line):
|
||||
"""Handle all tokens that are part of footnote"""
|
||||
if self.__first_line:
|
||||
@ -68,6 +72,7 @@ class Footnote:
|
||||
'mi<mk<footnt-clo\n')
|
||||
else:
|
||||
self.__write_to_foot_obj.write(line)
|
||||
|
||||
def __found_footnote(self, line):
|
||||
""" Found a footnote"""
|
||||
self.__found_a_footnote = 1
|
||||
@ -81,6 +86,7 @@ class Footnote:
|
||||
'mi<mk<footnt-ind<%04d\n' % self.__footnote_count)
|
||||
self.__write_to_foot_obj.write(
|
||||
'mi<mk<footnt-ope<%04d\n' % self.__footnote_count)
|
||||
|
||||
def __default_sep(self, line):
|
||||
"""Handle all tokens that are not footnote tokens"""
|
||||
if self.__token_info == 'cw<nt<footnote__':
|
||||
@ -91,6 +97,7 @@ class Footnote:
|
||||
self.__write_obj.write(
|
||||
'tx<nu<__________<%s\n' % num
|
||||
)
|
||||
|
||||
def __initiate_sep_values(self):
|
||||
"""
|
||||
initiate counters for separate_footnotes method.
|
||||
@ -102,6 +109,7 @@ class Footnote:
|
||||
self.__in_footnote = 0
|
||||
self.__first_line = 0 #have not processed the first line of footnote
|
||||
self.__footnote_count = 0
|
||||
|
||||
def separate_footnotes(self):
|
||||
"""
|
||||
Separate all the footnotes in an RTF file and put them at the bottom,
|
||||
@ -111,58 +119,50 @@ class Footnote:
|
||||
bottom of the main file.
|
||||
"""
|
||||
self.__initiate_sep_values()
|
||||
read_obj = open(self.__file)
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
self.__footnote_holder = tempfile.mktemp()
|
||||
self.__write_to_foot_obj = open(self.__footnote_holder, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
# keep track of opening and closing brackets
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
# In the middle of footnote text
|
||||
if self.__in_footnote:
|
||||
self.__in_footnote_func(line)
|
||||
# not in the middle of footnote text
|
||||
else:
|
||||
self.__default_sep(line)
|
||||
self.__write_obj.close()
|
||||
read_obj.close()
|
||||
self.__write_to_foot_obj.close()
|
||||
read_obj = open(self.__footnote_holder, 'r')
|
||||
write_obj = open(self.__write_to, 'a')
|
||||
write_obj.write(
|
||||
'mi<mk<sect-close\n'
|
||||
'mi<mk<body-close\n'
|
||||
'mi<tg<close_____<section\n'
|
||||
'mi<tg<close_____<body\n'
|
||||
'mi<tg<close_____<doc\n'
|
||||
'mi<mk<footnt-beg\n')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
write_obj.write(line)
|
||||
write_obj.write(
|
||||
'mi<mk<footnt-end\n')
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
with open(self.__file) as read_obj:
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
with open(self.__footnote_holder, 'w') as self.__write_to_foot_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
# keep track of opening and closing brackets
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
# In the middle of footnote text
|
||||
if self.__in_footnote:
|
||||
self.__in_footnote_func(line)
|
||||
# not in the middle of footnote text
|
||||
else:
|
||||
self.__default_sep(line)
|
||||
with open(self.__footnote_holder, 'r') as read_obj:
|
||||
with open(self.__write_to, 'a') as write_obj:
|
||||
write_obj.write(
|
||||
'mi<mk<sect-close\n'
|
||||
'mi<mk<body-close\n'
|
||||
'mi<tg<close_____<section\n'
|
||||
'mi<tg<close_____<body\n'
|
||||
'mi<tg<close_____<doc\n'
|
||||
'mi<mk<footnt-beg\n')
|
||||
for line in read_obj:
|
||||
write_obj.write(line)
|
||||
write_obj.write(
|
||||
'mi<mk<footnt-end\n')
|
||||
os.remove(self.__footnote_holder)
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "footnote_separate.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
|
||||
def update_info(self, file, copy):
    """
    Unused method

    Replace the stored input file path and the stored copy flag.
    Returns nothing.
    """
    self.__file = file
    self.__copy = copy
|
||||
|
||||
def __get_foot_body_func(self, line):
|
||||
"""
|
||||
Process lines in main body and look for beginning of footnotes.
|
||||
@ -172,6 +172,7 @@ class Footnote:
|
||||
self.__state = 'foot'
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __get_foot_foot_func(self, line):
|
||||
"""
|
||||
Copy footnotes from bottom of file to a separate, temporary file.
|
||||
@ -180,6 +181,7 @@ class Footnote:
|
||||
self.__state = 'body'
|
||||
else:
|
||||
self.__write_to_foot_obj.write(line)
|
||||
|
||||
def __get_footnotes(self):
|
||||
"""
|
||||
Private method to remove footnotes from main file. Read one line from
|
||||
@ -188,21 +190,16 @@ class Footnote:
|
||||
These two functions do the work of separating the footnotes form the
|
||||
body.
|
||||
"""
|
||||
read_obj = open(self.__file)
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
# self.__write_to = "footnote_info.data"
|
||||
self.__write_to_foot_obj = open(self.__footnote_holder, 'w')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
self.__token_info = line[:16]
|
||||
if self.__state == 'body':
|
||||
self.__get_foot_body_func(line)
|
||||
elif self.__state == 'foot':
|
||||
self.__get_foot_foot_func(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
self.__write_to_foot_obj.close()
|
||||
with open(self.__file) as read_obj:
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
with open(self.__footnote_holder, 'w') as self.__write_to_foot_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
if self.__state == 'body':
|
||||
self.__get_foot_body_func(line)
|
||||
elif self.__state == 'foot':
|
||||
self.__get_foot_foot_func(line)
|
||||
|
||||
def __get_foot_from_temp(self, num):
|
||||
"""
|
||||
Private method for joining footnotes to body. This method reads from
|
||||
@ -213,9 +210,7 @@ class Footnote:
|
||||
look_for = 'mi<mk<footnt-ope<' + num + '\n'
|
||||
found_foot = 0
|
||||
string_to_return = ''
|
||||
line = 1
|
||||
while line:
|
||||
line = self.__read_from_foot_obj.readline()
|
||||
for line in self.__read_from_foot_obj:
|
||||
if found_foot:
|
||||
if line == 'mi<mk<footnt-clo\n':
|
||||
return string_to_return
|
||||
@ -223,6 +218,7 @@ class Footnote:
|
||||
else:
|
||||
if line == look_for:
|
||||
found_foot = 1
|
||||
|
||||
def __join_from_temp(self):
|
||||
"""
|
||||
Private method for rejoining footnotes to body. Read from the
|
||||
@ -232,16 +228,14 @@ class Footnote:
|
||||
print out to the third file.
|
||||
If no footnote marker is found, simply print out the token (line).
|
||||
"""
|
||||
self.__read_from_foot_obj = open(self.__footnote_holder, 'r')
|
||||
read_obj = open(self.__write_to, 'r')
|
||||
self.__write_obj = open(self.__write_to2, 'w')
|
||||
line = 1
|
||||
while line:
|
||||
line = read_obj.readline()
|
||||
if line[:16] == 'mi<mk<footnt-ind':
|
||||
line = self.__get_foot_from_temp(line[17:-1])
|
||||
self.__write_obj.write(line)
|
||||
read_obj.close()
|
||||
with open(self.__footnote_holder, 'r') as self.__read_from_foot_obj:
|
||||
with open(self.__write_to, 'r') as read_obj:
|
||||
with open(self.__write_to2, 'w') as self.__write_obj:
|
||||
for line in read_obj:
|
||||
if line[:16] == 'mi<mk<footnt-ind':
|
||||
line = self.__get_foot_from_temp(line[17:-1])
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def join_footnotes(self):
|
||||
"""
|
||||
Join the footnotes from the bottom of the file and put them in their
|
||||
@ -258,8 +252,8 @@ class Footnote:
|
||||
self.__state = 'body'
|
||||
self.__get_footnotes()
|
||||
self.__join_from_temp()
|
||||
self.__write_obj.close()
|
||||
self.__read_from_foot_obj.close()
|
||||
# self.__write_obj.close()
|
||||
# self.__read_from_foot_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to2, "footnote_joined.data")
|
||||
|
@ -43,27 +43,28 @@ class GetCharMap:
|
||||
def get_char_map(self, map):
    """
    Read one named map out of the character file and return it as a dict.

    Required:
        'map' -- name of the map. 'ansicpg0' is read as 'ansicpg1250';
            'ansicpg10000' and '10000' are read as 'mac_roman'.
    Returns:
        a dictionary built from the lines between '<map>' and '</map>'.
        Each line is split on ':'; the second field (with '\\colon'
        turned into ':') is the key and the fourth field is the value.
    Raises:
        the bug handler, if no '<map>' line is found.
    """
    if map == 'ansicpg0':
        map = 'ansicpg1250'
    if map in ('ansicpg10000', '10000'):
        map = 'mac_roman'
    # The markers do not change per line, so build them once.
    begin_element = '<%s>' % map
    end_element = '</%s>' % map
    found_map = False
    map_dict = {}
    self.__char_file.seek(0)
    for line in self.__char_file:
        if not line.strip():
            continue
        if not found_map:
            if begin_element in line:
                found_map = True
        else:
            if end_element in line:
                break
            fields = line.split(':')
            # str.replace returns a new string; the result used to be
            # thrown away, so '\colon' keys were never unescaped.
            # NOTE(review): confirm no caller looks up the literal
            # '\colon' key.
            key = fields[1].replace('\\colon', ':')
            map_dict[key] = fields[3]
    if not found_map:
        msg = 'no map found\nmap is "%s"\n' % (map,)
        raise self.__bug_handler(msg)
    return map_dict
|
||||
|
||||
|
@ -54,10 +54,10 @@ class Hex2Utf8:
|
||||
'convert_to_caps'--wether to convert caps to utf-8
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__copy = copy
|
||||
if area_to_convert != 'preamble' and area_to_convert != 'body':
|
||||
if area_to_convert not in ('preamble', 'body'):
|
||||
msg = (
|
||||
'Developer error! Wrong flag.\n'
|
||||
'in module "hex_2_utf8.py\n'
|
||||
@ -79,7 +79,8 @@ class Hex2Utf8:
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__bug_handler = bug_handler
|
||||
self.__invalid_rtf_handler = invalid_rtf_handler
|
||||
def update_values( self,
|
||||
|
||||
def update_values(self,
|
||||
file,
|
||||
area_to_convert,
|
||||
char_file,
|
||||
@ -132,6 +133,7 @@ class Hex2Utf8:
|
||||
# self.__convert_symbol = 0
|
||||
# self.__convert_wingdings = 0
|
||||
# self.__convert_zapf = 0
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Required:
|
||||
@ -191,6 +193,7 @@ class Hex2Utf8:
|
||||
'body' : self.__body_func,
|
||||
'mi<mk<body-open_' : self.__found_body_func,
|
||||
'tx<hx<__________' : self.__hex_text_func,
|
||||
# 'tx<nu<__________' : self.__text_func,
|
||||
}
|
||||
self.__body_state_dict = {
|
||||
'preamble' : self.__preamble_for_body_func,
|
||||
@ -209,6 +212,7 @@ class Hex2Utf8:
|
||||
}
|
||||
self.__caps_list = ['false']
|
||||
self.__font_list = ['not-defined']
|
||||
|
||||
def __hex_text_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -218,12 +222,12 @@ class Hex2Utf8:
|
||||
token is in the dictionary, then check if the value starts with a
|
||||
"&". If it does, then tag the result as utf text. Otherwise, tag it
|
||||
as normal text.
|
||||
If the nex_num is not in the dictionary, then a mistake has been
|
||||
If the hex_num is not in the dictionary, then a mistake has been
|
||||
made.
|
||||
"""
|
||||
hex_num = line[17:-1]
|
||||
converted = self.__current_dict.get(hex_num)
|
||||
if converted != None:
|
||||
if converted is not None:
|
||||
# tag as utf-8
|
||||
if converted[0:1] == "&":
|
||||
font = self.__current_dict_name
|
||||
@ -263,42 +267,43 @@ class Hex2Utf8:
|
||||
# msg += 'dictionary is %s\n' % self.__current_dict_name
|
||||
msg = 'Character "&#x%s;" does not appear to be valid (or is a control character)\n' % token
|
||||
raise self.__bug_handler, msg
|
||||
|
||||
def __found_body_func(self, line):
    """
    Switch the state to 'body' and write the line out unchanged.
    """
    self.__state = 'body'
    self.__write_obj.write(line)
|
||||
|
||||
def __body_func(self, line):
    """
    When parsing preamble

    Write the line out unchanged.
    """
    self.__write_obj.write(line)
|
||||
|
||||
def __preamble_func(self, line):
    """
    Dispatch the line on the current token.

    If the preamble state dictionary has a handler for the current token
    info, call it with the line; otherwise write the line out unchanged.
    """
    action = self.__preamble_state_dict.get(self.__token_info)
    if action is not None:
        action(line)
    else:
        self.__write_obj.write(line)
|
||||
|
||||
def __convert_preamble(self):
    """
    Run every line of the input file through the handler registered for
    the current state in the preamble state dictionary, writing the
    result to the temporary file, then put the temporary file in place
    of the input file.

    Raises the bug handler if no handler is registered for the current
    state. (The old code called sys.stderr.write with two arguments and
    then called None, so it failed with a TypeError instead.)
    """
    self.__state = 'preamble'
    with open(self.__file, 'r') as read_obj:
        # 'with' closes the output file even when a handler raises.
        with open(self.__write_to, 'w') as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                action = self.__preamble_state_dict.get(self.__state)
                if action is None:
                    msg = _('error no state found in hex_2_utf8')
                    msg += ' "%s"\n' % self.__state
                    raise self.__bug_handler(msg)
                action(line)
    copy_obj = copy.Copy(bug_handler = self.__bug_handler)
    if self.__copy:
        # keep a copy of the intermediate result
        copy_obj.copy_file(self.__write_to, "preamble_utf_convert.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
|
||||
|
||||
def __preamble_for_body_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -311,6 +316,7 @@ class Hex2Utf8:
|
||||
if self.__token_info == 'mi<mk<body-open_':
|
||||
self.__found_body_func(line)
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __body_for_body_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -321,10 +327,11 @@ class Hex2Utf8:
|
||||
Used when parsing the body.
|
||||
"""
|
||||
action = self.__in_body_dict.get(self.__token_info)
|
||||
if action != None:
|
||||
if action is not None:
|
||||
action(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __start_font_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -348,6 +355,7 @@ class Hex2Utf8:
|
||||
else:
|
||||
self.__current_dict_name = 'default'
|
||||
self.__current_dict = self.__def_dict
|
||||
|
||||
def __end_font_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -376,6 +384,7 @@ class Hex2Utf8:
|
||||
else:
|
||||
self.__current_dict_name = 'default'
|
||||
self.__current_dict = self.__def_dict
|
||||
|
||||
def __start_special_font_func_old(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -398,6 +407,7 @@ class Hex2Utf8:
|
||||
self.__current_dict.append(self.__dingbats_dict)
|
||||
self.__special_fonts_found += 1
|
||||
self.__current_dict_name = 'Zapf Dingbats'
|
||||
|
||||
def __end_special_font_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -416,6 +426,7 @@ class Hex2Utf8:
|
||||
self.__current_dict.pop()
|
||||
self.__special_fonts_found -= 1
|
||||
self.__dict_name = 'default'
|
||||
|
||||
def __start_caps_func_old(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -427,6 +438,7 @@ class Hex2Utf8:
|
||||
self.__in_caps to 1
|
||||
"""
|
||||
self.__in_caps = 1
|
||||
|
||||
def __start_caps_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -440,6 +452,7 @@ class Hex2Utf8:
|
||||
self.__in_caps = 1
|
||||
value = line[17:-1]
|
||||
self.__caps_list.append(value)
|
||||
|
||||
def __end_caps_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -455,7 +468,8 @@ class Hex2Utf8:
|
||||
else:
|
||||
sys.stderr.write('Module is hex_2_utf8\n')
|
||||
sys.stderr.write('method is __end_caps_func\n')
|
||||
sys.stderr.write('caps list should be more than one?\n')
|
||||
sys.stderr.write('caps list should be more than one?\n') #self.__in_caps not set
|
||||
|
||||
def __text_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -466,9 +480,8 @@ class Hex2Utf8:
|
||||
if in caps, convert. Otherwise, print out.
|
||||
"""
|
||||
text = line[17:-1]
|
||||
if self.__current_dict_name == 'Symbol'\
|
||||
or self.__current_dict_name == 'Wingdings'\
|
||||
or self.__current_dict_name == 'Zapf Dingbats':
|
||||
# print line
|
||||
if self.__current_dict_name in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
|
||||
the_string = ''
|
||||
for letter in text:
|
||||
hex_num = hex(ord(letter))
|
||||
@ -477,21 +490,21 @@ class Hex2Utf8:
|
||||
hex_num = hex_num[2:]
|
||||
hex_num = '\'%s' % hex_num
|
||||
converted = self.__current_dict.get(hex_num)
|
||||
if converted == None:
|
||||
if converted is None:
|
||||
sys.stderr.write('module is hex_2_ut8\n')
|
||||
sys.stderr.write('method is __text_func\n')
|
||||
sys.stderr.write('no hex value for "%s"\n' % hex_num)
|
||||
else:
|
||||
the_string += converted
|
||||
self.__write_obj.write('tx<nu<__________<%s\n' % the_string)
|
||||
# print the_string
|
||||
else:
|
||||
if self.__caps_list[-1] == 'true' \
|
||||
and self.__convert_caps\
|
||||
and self.__current_dict_name != 'Symbol'\
|
||||
and self.__current_dict_name != 'Wingdings'\
|
||||
and self.__current_dict_name != 'Zapf Dingbats':
|
||||
and self.__current_dict_name not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
|
||||
text = text.upper()
|
||||
self.__write_obj.write('tx<nu<__________<%s\n' % text)
|
||||
|
||||
def __utf_to_caps_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -506,6 +519,7 @@ class Hex2Utf8:
|
||||
# utf_text = utf_text.upper()
|
||||
utf_text = self.__utf_token_to_caps_func(utf_text)
|
||||
self.__write_obj.write('tx<ut<__________<%s\n' % utf_text)
|
||||
|
||||
def __utf_token_to_caps_func(self, char_entity):
|
||||
"""
|
||||
Required:
|
||||
@ -530,28 +544,26 @@ class Hex2Utf8:
|
||||
return char_entity
|
||||
else:
|
||||
return converted
|
||||
|
||||
def __convert_body(self):
    """
    Run every line of the input file through the handler registered for
    the current state in the body state dictionary, writing the result
    to the temporary file, then put the temporary file in place of the
    input file.

    Raises the bug handler if no handler is registered for the current
    state. (The old code called sys.stderr.write with two arguments and
    then called None, so it failed with a TypeError instead.)
    """
    self.__state = 'body'
    with open(self.__file, 'r') as read_obj:
        # 'with' closes the output file even when a handler raises.
        with open(self.__write_to, 'w') as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                action = self.__body_state_dict.get(self.__state)
                if action is None:
                    msg = 'error no state found in hex_2_utf8'
                    msg += ' "%s"\n' % self.__state
                    raise self.__bug_handler(msg)
                action(line)
    copy_obj = copy.Copy(bug_handler = self.__bug_handler)
    if self.__copy:
        # keep a copy of the intermediate result
        copy_obj.copy_file(self.__write_to, "body_utf_convert.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
|
||||
|
||||
def convert_hex_2_utf8(self):
|
||||
self.__initiate_values()
|
||||
if self.__area_to_convert == 'preamble':
|
||||
|
@ -1,5 +1,7 @@
|
||||
import sys, os, tempfile
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
|
||||
"""
|
||||
States.
|
||||
1. default
|
||||
@ -36,6 +38,7 @@ class Inline:
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
@ -51,7 +54,6 @@ class Inline:
|
||||
'tx<ut<__________' : self.__found_text_func,
|
||||
'mi<mk<inline-fld' : self.__found_text_func,
|
||||
'text' : self.__found_text_func,
|
||||
'cw<nu<hard-lineb' : self.__found_text_func, #calibre
|
||||
'cb<nu<clos-brack' : self.__close_bracket_func,
|
||||
'mi<mk<par-end___' : self.__end_para_func,
|
||||
'mi<mk<footnt-ope' : self.__end_para_func,
|
||||
@ -63,7 +65,6 @@ class Inline:
|
||||
'tx<hx<__________' : self.__found_text_func,
|
||||
'tx<ut<__________' : self.__found_text_func,
|
||||
'text' : self.__found_text_func,
|
||||
'cw<nu<hard-lineb' : self.__found_text_func, #calibre
|
||||
'mi<mk<inline-fld' : self.__found_text_func,
|
||||
'ob<nu<open-brack': self.__found_open_bracket_func,
|
||||
'mi<mk<par-end___' : self.__end_para_func,
|
||||
@ -83,12 +84,12 @@ class Inline:
|
||||
self.__in_para = 0 # not in paragraph
|
||||
self.__char_dict = {
|
||||
# character info => ci
|
||||
'annotation' : 'annotation',
|
||||
'annotation' : 'annotation',
|
||||
'blue______' : 'blue',
|
||||
'bold______' : 'bold',
|
||||
'caps______' : 'caps',
|
||||
'char-style' : 'character-style',
|
||||
'dbl-strike' : 'double-strike-through',
|
||||
'caps______' : 'caps',
|
||||
'char-style' : 'character-style',
|
||||
'dbl-strike' : 'double-strike-through',
|
||||
'emboss____' : 'emboss',
|
||||
'engrave___' : 'engrave',
|
||||
'font-color' : 'font-color',
|
||||
@ -96,7 +97,7 @@ class Inline:
|
||||
'font-size_' : 'font-size',
|
||||
'font-style' : 'font-style',
|
||||
'font-up___' : 'superscript',
|
||||
'footnot-mk' : 'footnote-marker',
|
||||
'footnot-mk' : 'footnote-marker',
|
||||
'green_____' : 'green',
|
||||
'hidden____' : 'hidden',
|
||||
'italics___' : 'italics',
|
||||
@ -107,9 +108,10 @@ class Inline:
|
||||
'strike-thr' : 'strike-through',
|
||||
'subscript_' : 'subscript',
|
||||
'superscrip' : 'superscript',
|
||||
'underlined' : 'underlined',
|
||||
'underlined' : 'underlined',
|
||||
}
|
||||
self.__caps_list = ['false']
|
||||
|
||||
def __set_list_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -128,6 +130,7 @@ class Inline:
|
||||
self.__place = 'in_list'
|
||||
self.__inline_list = self.__list_inline_list
|
||||
self.__groups_in_waiting = self.__groups_in_waiting_list
|
||||
|
||||
def __default_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -140,8 +143,8 @@ class Inline:
|
||||
action = self.__default_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
if self.__token_info != 'cw<nu<hard-lineb': #calibre
|
||||
self.__write_obj.write(line)
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __found_open_bracket_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -156,6 +159,7 @@ class Inline:
|
||||
self.__groups_in_waiting[0] += 1
|
||||
self.__inline_list.append({})
|
||||
self.__inline_list[-1]['contains_inline'] = 0
|
||||
|
||||
def __after_open_bracket_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -176,6 +180,7 @@ class Inline:
|
||||
self.__state = 'default' # a non control word?
|
||||
action(line)
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __handle_control_word(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -206,6 +211,7 @@ class Inline:
|
||||
elif char_value == 'Zapf Dingbats':
|
||||
self.__write_obj.write('mi<mk<font-dingb\n')
|
||||
"""
|
||||
|
||||
def __close_bracket_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -244,6 +250,7 @@ class Inline:
|
||||
self.__inline_list.pop()
|
||||
if self.__groups_in_waiting[0] != 0:
|
||||
self.__groups_in_waiting[0] -= 1
|
||||
|
||||
def __found_text_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
@ -257,7 +264,6 @@ class Inline:
|
||||
Text can mark the start of a paragraph.
|
||||
If already in a paragraph, check to see if any groups are waiting
|
||||
to be added. If so, use another method to write these groups.
|
||||
3. If not check if hardline break, then write
|
||||
"""
|
||||
if self.__place == 'in_list':
|
||||
self.__write_inline()
|
||||
@ -265,12 +271,9 @@ class Inline:
|
||||
if not self.__in_para:
|
||||
self.__in_para = 1
|
||||
self.__start_para_func(line)
|
||||
else:
|
||||
if self.__token_info == 'cw<nu<hard-lineb': #calibre
|
||||
self.__write_obj.write('mi<tg<empty_____<hardline-break\n')
|
||||
if self.__groups_in_waiting[0] != 0:
|
||||
elif self.__groups_in_waiting[0] != 0:
|
||||
self.__write_inline()
|
||||
|
||||
|
||||
def __write_inline(self):
|
||||
"""
|
||||
Required:
|
||||
@ -314,6 +317,7 @@ class Inline:
|
||||
self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
|
||||
self.__write_obj.write('\n')
|
||||
self.__groups_in_waiting[0] = 0
|
||||
|
||||
def __end_para_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -342,6 +346,7 @@ class Inline:
|
||||
self.__write_obj.write('mi<mk<caps-end__\n')
|
||||
self.__write_obj.write('mi<tg<close_____<inline\n')
|
||||
self.__in_para = 0
|
||||
|
||||
def __start_para_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
@ -369,12 +374,14 @@ class Inline:
|
||||
self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
|
||||
self.__write_obj.write('\n')
|
||||
self.__groups_in_waiting[0] = 0
|
||||
|
||||
def __found_field_func(self, line):
|
||||
"""
|
||||
Just a default function to make sure I don't prematurely exit
|
||||
default state
|
||||
"""
|
||||
pass
|
||||
|
||||
def form_tags(self):
|
||||
"""
|
||||
Requires:
|
||||
@ -386,32 +393,27 @@ class Inline:
|
||||
the state.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open(self.__file, 'r')
|
||||
self.__write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
token = line[0:-1]
|
||||
self.__token_info = ''
|
||||
if token == 'tx<mc<__________<rdblquote'\
|
||||
or token == 'tx<mc<__________<ldblquote'\
|
||||
or token == 'tx<mc<__________<lquote'\
|
||||
or token == 'tx<mc<__________<rquote'\
|
||||
or token == 'tx<mc<__________<emdash'\
|
||||
or token == 'tx<mc<__________<endash'\
|
||||
or token == 'tx<mc<__________<bullet':
|
||||
self.__token_info = 'text'
|
||||
else:
|
||||
self.__token_info = line[:16]
|
||||
self.__set_list_func(line)
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action == None:
|
||||
sys.stderr.write('No matching state in module inline_for_lists.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__write_to, 'w') as self.__write_obj:
|
||||
for line in read_obj:
|
||||
token = line[0:-1]
|
||||
self.__token_info = ''
|
||||
if token == 'tx<mc<__________<rdblquote'\
|
||||
or token == 'tx<mc<__________<ldblquote'\
|
||||
or token == 'tx<mc<__________<lquote'\
|
||||
or token == 'tx<mc<__________<rquote'\
|
||||
or token == 'tx<mc<__________<emdash'\
|
||||
or token == 'tx<mc<__________<endash'\
|
||||
or token == 'tx<mc<__________<bullet':
|
||||
self.__token_info = 'text'
|
||||
else:
|
||||
self.__token_info = line[:16]
|
||||
self.__set_list_func(line)
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action is None:
|
||||
sys.stderr.write('No matching state in module inline_for_lists.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "inline.data")
|
||||
|
@ -15,8 +15,11 @@
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, tempfile, re
|
||||
import os, tempfile
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
|
||||
class FixLineEndings:
|
||||
"""Fix line endings"""
|
||||
def __init__(self,
|
||||
@ -32,36 +35,23 @@ class FixLineEndings:
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
self.__replace_illegals = replace_illegals
|
||||
|
||||
def fix_endings(self):
|
||||
##tempFileName = tempfile.mktemp()
|
||||
illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
|
||||
#nums = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16, 17, 18, 19]
|
||||
"""
|
||||
read_obj = open(self.__file, 'r')
|
||||
line = read_obj.read(1000)
|
||||
regexp = re.compile(r"\r")
|
||||
macintosh = regexp.search(line)
|
||||
read_obj.close()
|
||||
"""
|
||||
# always check since I have to get rid of illegal characters
|
||||
macintosh = 1
|
||||
if macintosh:
|
||||
line = 1
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__write_to, 'w')
|
||||
while line:
|
||||
line = read_obj.read(1000)
|
||||
# line = re.sub(regexp,"\n",line)
|
||||
line = line.replace ('\r', '\n')
|
||||
if self.__replace_illegals:
|
||||
line = re.sub(illegal_regx, '', line)
|
||||
# for num in nums:
|
||||
# line = line.replace(chr(num), '')
|
||||
write_obj.write(line )
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "line_endings.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
#read
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
input_file = read_obj.read()
|
||||
#calibre go from win and mac to unix
|
||||
input_file = input_file.replace ('\r\n', '\n')
|
||||
input_file = input_file.replace ('\r', '\n')
|
||||
#remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
|
||||
if self.__replace_illegals:
|
||||
input_file = clean_ascii_chars(input_file)
|
||||
#write
|
||||
with open(self.__write_to, 'wb') as write_obj:
|
||||
write_obj.write(input_file)
|
||||
#copy
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "line_endings.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
|
@ -16,7 +16,9 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, tempfile
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
|
||||
class Pict:
|
||||
"""Process graphic information"""
|
||||
def __init__(self,
|
||||
@ -36,13 +38,11 @@ class Pict:
|
||||
self.__ob_count = 0
|
||||
self.__cb_count = 0
|
||||
self.__pict_count = 0
|
||||
self.__in_pict = 0
|
||||
self.__already_found_pict = 0
|
||||
self.__in_pict = False
|
||||
self.__already_found_pict = False
|
||||
self.__orig_file = orig_file
|
||||
self.__initiate_pict_dict()
|
||||
self.__out_file = out_file
|
||||
# this is left over
|
||||
self.__no_ask = 1
|
||||
|
||||
def __initiate_pict_dict(self):
|
||||
self.__pict_dict = {
|
||||
@ -71,57 +71,43 @@ class Pict:
|
||||
self.__out_file))
|
||||
else:
|
||||
dir_name = os.path.dirname(self.__orig_file)
|
||||
# self.__output_to_file_func()
|
||||
self.__dir_name = base_name + "_rtf_pict_dir/"
|
||||
self.__dir_name = os.path.join(dir_name, self.__dir_name)
|
||||
if not os.path.isdir(self.__dir_name):
|
||||
try:
|
||||
os.mkdir(self.__dir_name)
|
||||
except OSError, msg:
|
||||
msg = str(msg)
|
||||
msg += "Couldn't make directory '%s':\n" % (self.__dir_name)
|
||||
msg = "%sCouldn't make directory '%s':\n" % (str(msg), self.__dir_name)
|
||||
raise self.__bug_handler
|
||||
else:
|
||||
if self.__no_ask:
|
||||
user_response = 'r'
|
||||
else:
|
||||
msg = 'Do you want to remove all files in %s?\n' % self.__dir_name
|
||||
msg += 'Type "r" to remove.\n'
|
||||
msg += 'Type any other key to keep files in place.\n'
|
||||
sys.stderr.write(msg)
|
||||
user_response = raw_input()
|
||||
if user_response == 'r':
|
||||
if self.__run_level > 1:
|
||||
sys.stderr.write('Removing files from old pict directory...\n')
|
||||
all_files = os.listdir(self.__dir_name)
|
||||
for the_file in all_files:
|
||||
the_file = os.path.join(self.__dir_name, the_file)
|
||||
try:
|
||||
os.remove(the_file)
|
||||
except OSError:
|
||||
pass
|
||||
if self.__run_level > 1:
|
||||
sys.stderr.write('Files removed.\n')
|
||||
if self.__run_level > 1:
|
||||
sys.stderr.write('Removing files from old pict directory...\n')
|
||||
all_files = os.listdir(self.__dir_name)
|
||||
for the_file in all_files:
|
||||
the_file = os.path.join(self.__dir_name, the_file)
|
||||
try:
|
||||
os.remove(the_file)
|
||||
except OSError:
|
||||
pass
|
||||
if self.__run_level > 1:
|
||||
sys.stderr.write('Files removed.\n')
|
||||
|
||||
def __create_pict_file(self):
|
||||
"""Create a file for all the pict data to be written to.
|
||||
"""
|
||||
self.__pict_file = os.path.join(self.__dir_name, 'picts.rtf')
|
||||
write_pic_obj = open(self.__pict_file, 'w')
|
||||
write_pic_obj.close()
|
||||
self.__write_pic_obj = open(self.__pict_file, 'a')
|
||||
|
||||
def __in_pict_func(self, line):
|
||||
if self.__cb_count == self.__pict_br_count:
|
||||
self.__in_pict = 0
|
||||
self.__in_pict = False
|
||||
self.__write_pic_obj.write("}\n")
|
||||
return 1
|
||||
return True
|
||||
else:
|
||||
action = self.__pict_dict.get(self.__token_info)
|
||||
if action:
|
||||
line = action(line)
|
||||
self.__write_pic_obj.write(line)
|
||||
return 0
|
||||
self.__write_pic_obj.write(action(line))
|
||||
return False
|
||||
|
||||
def __default(self, line, write_obj):
|
||||
"""Determine if each token marks the beginning of pict data.
|
||||
@ -142,53 +128,50 @@ class Pict:
|
||||
write_obj.write('mi<mk<pict-end__\n')
|
||||
if not self.__already_found_pict:
|
||||
self.__create_pict_file()
|
||||
self.__already_found_pict=1;
|
||||
self.__already_found_pict=True;
|
||||
self.__print_rtf_header()
|
||||
self.__in_pict = 1
|
||||
self.__pict_br_count = self.__ob_count
|
||||
self.__cb_count = 0
|
||||
self.__write_pic_obj.write("{\\pict\n")
|
||||
return 0
|
||||
return 1
|
||||
return False
|
||||
return True
|
||||
|
||||
def __print_rtf_header(self):
|
||||
"""Print to pict file the necessary RTF data for the file to be
|
||||
recognized as an RTF file.
|
||||
"""
|
||||
self.__write_pic_obj.write("{\\rtf1 \n")
|
||||
self.__write_pic_obj.write("{\\fonttbl\\f0\\null;} \n")
|
||||
self.__write_pic_obj.write("{\\colortbl\\red255\\green255\\blue255;} \n")
|
||||
self.__write_pic_obj.write("\\pard \n")
|
||||
self.__write_pic_obj.write("{\\rtf1 \n{\\fonttbl\\f0\\null;} \n")
|
||||
self.__write_pic_obj.write("{\\colortbl\\red255\\green255\\blue255;} \n\\pard \n")
|
||||
|
||||
def process_pict(self):
|
||||
self.__make_dir()
|
||||
read_obj = open(self.__file)
|
||||
write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 'dummy'
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
if not self.__in_pict:
|
||||
to_print = self.__default(line, write_obj)
|
||||
if to_print :
|
||||
write_obj.write(line)
|
||||
else:
|
||||
to_print = self.__in_pict_func(line)
|
||||
if to_print :
|
||||
write_obj.write(line)
|
||||
if self.__already_found_pict:
|
||||
self.__write_pic_obj.write("}\n")
|
||||
self.__write_pic_obj.close()
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
with open(self.__file) as read_obj:
|
||||
with open(self.__write_to, 'w') as write_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
if not self.__in_pict:
|
||||
to_print = self.__default(line, write_obj)
|
||||
if to_print :
|
||||
write_obj.write(line)
|
||||
else:
|
||||
to_print = self.__in_pict_func(line)
|
||||
if to_print :
|
||||
write_obj.write(line)
|
||||
if self.__already_found_pict:
|
||||
self.__write_pic_obj.write("}\n")
|
||||
self.__write_pic_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "pict.data")
|
||||
try:
|
||||
copy_obj.copy_file(self.__pict_file, "pict.rtf")
|
||||
except:
|
||||
pass
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
if self.__pict_count == 0:
|
||||
|
@ -15,8 +15,10 @@
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, re, tempfile
|
||||
import os, re, tempfile
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy, check_brackets
|
||||
|
||||
class ProcessTokens:
|
||||
"""
|
||||
Process each token on a line and add information that will be useful for
|
||||
@ -41,14 +43,16 @@ class ProcessTokens:
|
||||
self.__bracket_count=0
|
||||
self.__exception_handler = exception_handler
|
||||
self.__bug_handler = bug_handler
|
||||
|
||||
def compile_expressions(self):
|
||||
self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
|
||||
self.__utf_exp = re.compile(r'(&.*?;)')
|
||||
|
||||
def initiate_token_dict(self):
|
||||
self.__return_code = 0
|
||||
self.dict_token={
|
||||
# unicode
|
||||
'mshex' : ('nu', '__________', self.__ms_hex_func),
|
||||
'mshex' : ('nu', '__________', self.__ms_hex_func),
|
||||
# brackets
|
||||
'{' : ('nu', '{', self.ob_func),
|
||||
'}' : ('nu', '}', self.cb_func),
|
||||
@ -66,6 +70,7 @@ class ProcessTokens:
|
||||
';' : ('mc', ';', self.ms_sub_func),
|
||||
# this must be wrong
|
||||
'-' : ('mc', '-', self.ms_sub_func),
|
||||
'line' : ('mi', 'hardline-break', self.hardline_func), #calibre
|
||||
# misc => ml
|
||||
'*' : ('ml', 'asterisk__', self.default_func),
|
||||
':' : ('ml', 'colon_____', self.default_func),
|
||||
@ -73,7 +78,6 @@ class ProcessTokens:
|
||||
'backslash' : ('nu', '\\', self.text_func),
|
||||
'ob' : ('nu', '{', self.text_func),
|
||||
'cb' : ('nu', '}', self.text_func),
|
||||
'line' : ('nu', 'hard-lineb', self.default_func), #calibre
|
||||
#'line' : ('nu', ' ', self.text_func), calibre
|
||||
# paragraph formatting => pf
|
||||
'page' : ('pf', 'page-break', self.default_func),
|
||||
@ -159,15 +163,17 @@ class ProcessTokens:
|
||||
'rtf' : ('ri', 'rtf_______', self.default_func),
|
||||
'deff' : ('ri', 'deflt-font', self.default_func),
|
||||
'mac' : ('ri', 'macintosh_', self.default_func),
|
||||
'pc' : ('ri', 'pc________', self.default_func),
|
||||
'pca' : ('ri', 'pca_______', self.default_func),
|
||||
'ansi' : ('ri', 'ansi______', self.default_func),
|
||||
'ansicpg' : ('ri', 'ansi-codpg', self.default_func),
|
||||
# notes => nt
|
||||
'footnote' : ('nt', 'footnote__', self.default_func),
|
||||
'ftnalt' : ('nt', 'type______<endnote', self.two_part_func),
|
||||
# anchor => an
|
||||
'tc' : ('an', 'toc_______', self.default_func),
|
||||
'tc' : ('an', 'toc_______', self.default_func),
|
||||
'bkmkstt' : ('an', 'book-mk-st', self.default_func),
|
||||
'bkmkstart' : ('an', 'book-mk-st', self.default_func),
|
||||
'bkmkstart' : ('an', 'book-mk-st', self.default_func),
|
||||
'bkmkend' : ('an', 'book-mk-en', self.default_func),
|
||||
'xe' : ('an', 'index-mark', self.default_func),
|
||||
'rxe' : ('an', 'place_____', self.default_func),
|
||||
@ -347,7 +353,7 @@ class ProcessTokens:
|
||||
10: 'Kanji numbering without the digit character',
|
||||
11: 'Kanji numbering with the digit character',
|
||||
1246: 'phonetic Katakana characters in aiueo order',
|
||||
1346: 'phonetic katakana characters in iroha order',
|
||||
1346: 'phonetic katakana characters in iroha order',
|
||||
14: 'double byte character',
|
||||
15: 'single byte character',
|
||||
16: 'Kanji numbering 3',
|
||||
@ -392,7 +398,7 @@ class ProcessTokens:
|
||||
5121 : 'Arabic Algeria',
|
||||
15361 : 'Arabic Bahrain',
|
||||
3073 : 'Arabic Egypt',
|
||||
1 : 'Arabic General',
|
||||
1 : 'Arabic General',
|
||||
2049 : 'Arabic Iraq',
|
||||
11265 : 'Arabic Jordan',
|
||||
13313 : 'Arabic Kuwait',
|
||||
@ -417,7 +423,7 @@ class ProcessTokens:
|
||||
1059 : 'Byelorussian',
|
||||
1027 : 'Catalan',
|
||||
2052 : 'Chinese China',
|
||||
4 : 'Chinese General',
|
||||
4 : 'Chinese General',
|
||||
3076 : 'Chinese Hong Kong',
|
||||
4100 : 'Chinese Singapore',
|
||||
1028 : 'Chinese Taiwan',
|
||||
@ -431,7 +437,7 @@ class ProcessTokens:
|
||||
2057 : 'English British',
|
||||
4105 : 'English Canada',
|
||||
9225 : 'English Caribbean',
|
||||
9 : 'English General',
|
||||
9 : 'English General',
|
||||
6153 : 'English Ireland',
|
||||
8201 : 'English Jamaica',
|
||||
5129 : 'English New Zealand',
|
||||
@ -595,30 +601,37 @@ class ProcessTokens:
|
||||
num = num[1:] # chop off leading 0, which I added
|
||||
num = num.upper() # the mappings store hex in caps
|
||||
return 'tx<hx<__________<\'%s\n' % num # add an ' for the mappings
|
||||
|
||||
def ms_sub_func(self, pre, token, num):
|
||||
return 'tx<mc<__________<%s\n' % token
|
||||
|
||||
def hardline_func(self, pre, token, num):
|
||||
return 'mi<tg<empty_____<%s\n' % token
|
||||
|
||||
def default_func(self, pre, token, num):
|
||||
if num == None:
|
||||
if num is None:
|
||||
num = 'true'
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
|
||||
def __list_type_func(self, pre, token, num):
|
||||
type = 'arabic'
|
||||
if num == None:
|
||||
if num is None:
|
||||
type = 'Arabic'
|
||||
else:
|
||||
try:
|
||||
num = int(num)
|
||||
except ValueError:
|
||||
if self.__run_level > 3:
|
||||
msg = 'number "%s" cannot be converted to integer\n' % num
|
||||
msg = 'Number "%s" cannot be converted to integer\n' % num
|
||||
raise self.__bug_handler, msg
|
||||
type = self.__number_type_dict.get(num)
|
||||
if type == None:
|
||||
if type is None:
|
||||
if self.__run_level > 3:
|
||||
msg = 'No type for "%s" in self.__number_type_dict\n'
|
||||
raise self.__bug_handler
|
||||
type = 'Arabic'
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, type)
|
||||
|
||||
def __language_func(self, pre, token, num):
|
||||
lang_name = self.__language_dict.get(int(re.search('[0-9]+', num).group()))
|
||||
if not lang_name:
|
||||
@ -627,31 +640,36 @@ class ProcessTokens:
|
||||
msg = 'No entry for number "%s"' % num
|
||||
raise self.__bug_handler, msg
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name)
|
||||
|
||||
def two_part_func(self, pre, token, num):
|
||||
list = token.split("<")
|
||||
token = list[0]
|
||||
num = list[1]
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
##return 'cw<nu<nu<nu<%s>num<%s\n' % (token, num)
|
||||
|
||||
def divide_by_2(self, pre, token, num):
|
||||
num = self.divide_num(num, 2)
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
|
||||
|
||||
def divide_by_20(self, pre, token, num):
|
||||
num = self.divide_num(num, 20)
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
|
||||
|
||||
def text_func(self, pre, token, num=None):
|
||||
return 'tx<nu<__________<%s\n' % token
|
||||
|
||||
def ob_func(self, pre, token, num=None):
|
||||
self.__bracket_count += 1
|
||||
##return 'ob<%04d\n' % self.__bracket_count
|
||||
return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
|
||||
|
||||
def cb_func(self, pre, token, num=None):
|
||||
##line = 'cb<%04d\n' % self.__bracket_count
|
||||
line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
|
||||
self.__bracket_count -= 1
|
||||
return line
|
||||
|
||||
def color_func(self, pre, token, num):
|
||||
third_field = 'nu'
|
||||
if num[-1] == ';':
|
||||
@ -662,6 +680,7 @@ class ProcessTokens:
|
||||
num = "0" + num
|
||||
return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, num)
|
||||
##return 'cw<cl<%s<nu<nu<%s>%s<%s\n' % (third_field, token, num, token)
|
||||
|
||||
def bool_st_func(self, pre, token, num):
|
||||
if num is None or num == '' or num == '1':
|
||||
return 'cw<%s<%s<nu<true\n' % (pre, token)
|
||||
@ -670,24 +689,23 @@ class ProcessTokens:
|
||||
return 'cw<%s<%s<nu<false\n' % (pre, token)
|
||||
##return 'cw<nu<nu<nu<%s>false<%s\n' % (token, token)
|
||||
else:
|
||||
msg = 'boolean should have some value module process tokens\n'
|
||||
msg += 'token is ' + token + "\n"
|
||||
msg += "'" + num + "'" + "\n"
|
||||
msg = "boolean should have some value module process tokens\ntoken is %s\n'%s'\n" % (token, num)
|
||||
raise self.__bug_handler, msg
|
||||
|
||||
def __no_sup_sub_func(self, pre, token, num):
|
||||
the_string = 'cw<ci<subscript_<nu<false\n'
|
||||
the_string += 'cw<ci<superscrip<nu<false\n'
|
||||
return the_string
|
||||
|
||||
def divide_num(self, numerator, denominator):
|
||||
try:
|
||||
numerator = float(re.search('[0-9.]+', numerator).group())
|
||||
#calibre why ignore negative number? Wrong in case of \fi
|
||||
numerator = float(re.search('[0-9.\-]+', numerator).group())
|
||||
except TypeError, msg:
|
||||
if self.__run_level > 3:
|
||||
msg = 'no number to process?\n'
|
||||
msg += 'this indicates that the token '
|
||||
msg += ' \(\\li\) should have a number and does not\n'
|
||||
msg += 'numerator is "%s"\n' % numerator
|
||||
msg += 'denominator is "%s"\n' % denominator
|
||||
msg = ('No number to process?\nthis indicates that the token \(\\li\) \
|
||||
should have a number and does not\nnumerator is \
|
||||
"%s"\ndenominator is "%s"\n') % (numerator, denominator)
|
||||
raise self.__bug_handler, msg
|
||||
if 5 > self.__return_code:
|
||||
self.__return_code = 5
|
||||
@ -698,9 +716,10 @@ class ProcessTokens:
|
||||
if string_num[-2:] == ".0":
|
||||
string_num = string_num[:-2]
|
||||
return string_num
|
||||
|
||||
def split_let_num(self, token):
|
||||
match_obj = re.search(self.__num_exp,token)
|
||||
if match_obj != None:
|
||||
if match_obj is not None:
|
||||
first = match_obj.group(1)
|
||||
second = match_obj.group(2)
|
||||
if not second:
|
||||
@ -714,6 +733,7 @@ class ProcessTokens:
|
||||
raise self.__bug_handler
|
||||
return token, 0
|
||||
return first, second
|
||||
|
||||
def convert_to_hex(self,number):
|
||||
"""Convert a string to uppercase hexidecimal"""
|
||||
num = int(number)
|
||||
@ -722,6 +742,7 @@ class ProcessTokens:
|
||||
return hex_num
|
||||
except:
|
||||
raise self.__bug_handler
|
||||
|
||||
def process_cw(self, token):
|
||||
"""Change the value of the control word by determining what dictionary
|
||||
it belongs to"""
|
||||
@ -737,89 +758,62 @@ class ProcessTokens:
|
||||
pre, token, action = self.dict_token.get(token, (None, None, None))
|
||||
if action:
|
||||
return action(pre, token, num)
|
||||
# unused function
|
||||
def initiate_token_actions(self):
|
||||
self.action_for_token={
|
||||
'{' : self.ob_func,
|
||||
'}' : self.cb_func,
|
||||
'\\' : self.process_cw,
|
||||
}
|
||||
# unused function
|
||||
def evaluate_token(self,token):
|
||||
"""Evaluate tokens. Return a value if the token is not a
|
||||
control word. Otherwise, pass token onto another method
|
||||
for further evaluation."""
|
||||
token, action = self.dict_token.get(token[0:1])
|
||||
if action:
|
||||
line = action(token)
|
||||
return line
|
||||
else :
|
||||
return 'tx<nu<nu<nu<nu<%s\n' % token
|
||||
|
||||
def __check_brackets(self, in_file):
|
||||
self.__check_brack_obj = check_brackets.CheckBrackets\
|
||||
(file = in_file)
|
||||
good_br = self.__check_brack_obj.check_brackets()[0]
|
||||
if not good_br:
|
||||
return 1
|
||||
|
||||
def process_tokens(self):
|
||||
"""Main method for handling other methods. """
|
||||
first_token = 0
|
||||
second_token = 0
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = "dummy"
|
||||
line_count = 0
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
token = line_to_read
|
||||
token = token.replace("\n","")
|
||||
if not token:
|
||||
continue
|
||||
line_count += 1
|
||||
try:
|
||||
token.decode('us-ascii')
|
||||
except UnicodeError, msg:
|
||||
msg = str(msg)
|
||||
msg += 'Invalid RTF: File not ascii encoded.\n'
|
||||
raise self.__exception_handler, msg
|
||||
if not first_token:
|
||||
if token != '\\{':
|
||||
msg = 'Invalid RTF: document doesn\'t start with {\n'
|
||||
raise self.__exception_handler, msg
|
||||
first_token = 1
|
||||
elif first_token and not second_token:
|
||||
if token[0:4] != '\\rtf':
|
||||
msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
|
||||
raise self.__exception_handler, msg
|
||||
second_token = 1
|
||||
##token = self.evaluate_token(token)
|
||||
the_index = token.find('\\ ')
|
||||
if token != None and the_index > -1:
|
||||
msg ='Invalid RTF: token "\\ " not valid. \n'
|
||||
raise self.__exception_handler, msg
|
||||
elif token[0:1] == "\\":
|
||||
line = self.process_cw(token)
|
||||
if line != None:
|
||||
write_obj.write(line)
|
||||
else:
|
||||
fields = re.split(self.__utf_exp, token)
|
||||
for field in fields:
|
||||
if not field:
|
||||
continue
|
||||
if field[0:1] == '&':
|
||||
write_obj.write('tx<ut<__________<%s\n' % field)
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__write_to, 'wb') as write_obj:
|
||||
for line in read_obj:
|
||||
token = line.replace("\n","")
|
||||
line_count += 1
|
||||
if line_count == 1 and token != '\\{':
|
||||
msg = 'Invalid RTF: document doesn\'t start with {\n'
|
||||
raise self.__exception_handler, msg
|
||||
elif line_count == 2 and token[0:4] != '\\rtf':
|
||||
msg = 'Invalid RTF: document doesn\'t start with \\rtf \n'
|
||||
raise self.__exception_handler, msg
|
||||
|
||||
the_index = token.find('\\ ')
|
||||
if token is not None and the_index > -1:
|
||||
msg = 'Invalid RTF: token "\\ " not valid.\n'
|
||||
raise self.__exception_handler, msg
|
||||
elif token[:1] == "\\":
|
||||
try:
|
||||
token.decode('us-ascii')
|
||||
except UnicodeError, msg:
|
||||
msg = 'Invalid RTF: Tokens not ascii encoded.\n%s' % str(msg)
|
||||
raise self.__exception_handler, msg
|
||||
line = self.process_cw(token)
|
||||
if line is not None:
|
||||
write_obj.write(line)
|
||||
else:
|
||||
write_obj.write('tx<nu<__________<%s\n' % field)
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
fields = re.split(self.__utf_exp, token)
|
||||
for field in fields:
|
||||
if not field:
|
||||
continue
|
||||
if field[0:1] == '&':
|
||||
write_obj.write('tx<ut<__________<%s\n' % field)
|
||||
else:
|
||||
write_obj.write('tx<nu<__________<%s\n' % field)
|
||||
|
||||
if not line_count:
|
||||
msg ='Invalid RTF: file appears to be empty. \n'
|
||||
msg = 'Invalid RTF: file appears to be empty.\n'
|
||||
raise self.__exception_handler, msg
|
||||
|
||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "processed_tokens.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
|
||||
bad_brackets = self.__check_brackets(self.__file)
|
||||
if bad_brackets:
|
||||
msg = 'Invalid RTF: document does not have matching brackets.\n'
|
||||
|
@ -16,7 +16,10 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import os, tempfile
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
|
||||
class ReplaceIllegals:
|
||||
"""
|
||||
reaplace illegal lower ascii characters
|
||||
@ -30,21 +33,14 @@ class ReplaceIllegals:
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = tempfile.mktemp()
|
||||
|
||||
def replace_illegals(self):
|
||||
"""
|
||||
"""
|
||||
nums = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 18, 19]
|
||||
read_obj = open(self.__file, 'r')
|
||||
write_obj = open(self.__write_to, 'w')
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
for num in nums:
|
||||
line = line.replace(chr(num), '')
|
||||
write_obj.write(line)
|
||||
read_obj.close()
|
||||
write_obj.close()
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__write_to, 'w') as write_obj:
|
||||
for line in read_obj:
|
||||
write_obj.write(clean_ascii_chars(line))
|
||||
copy_obj = copy.Copy()
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "replace_illegals.data")
|
||||
|
@ -16,7 +16,10 @@
|
||||
# #
|
||||
#########################################################################
|
||||
import os, re, tempfile
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.utils.mreplace import MReplace
|
||||
|
||||
class Tokenize:
|
||||
"""Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
|
||||
def __init__(self,
|
||||
@ -28,89 +31,175 @@ class Tokenize:
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__special_tokens = [ '_', '~', "'", '{', '}' ]
|
||||
self.__write_to = tempfile.mktemp()
|
||||
def __from_ms_to_utf8(self,match_obj):
|
||||
uni_char = int(match_obj.group(1))
|
||||
if uni_char < 0:
|
||||
uni_char += 65536
|
||||
return '&#x' + str('%X' % uni_char) + ';'
|
||||
def __neg_unicode_func(self, match_obj):
|
||||
neg_uni_char = int(match_obj.group(1)) * -1
|
||||
# sys.stderr.write(str( neg_uni_char))
|
||||
uni_char = neg_uni_char + 65536
|
||||
return '&#x' + str('%X' % uni_char) + ';'
|
||||
def __sub_line_reg(self,line):
|
||||
line = line.replace("\\\\", "\\backslash ")
|
||||
line = line.replace("\\~", "\\~ ")
|
||||
line = line.replace("\\;", "\\; ")
|
||||
line = line.replace("&", "&")
|
||||
line = line.replace("<", "<")
|
||||
line = line.replace(">", ">")
|
||||
line = line.replace("\\~", "\\~ ")
|
||||
line = line.replace("\\_", "\\_ ")
|
||||
line = line.replace("\\:", "\\: ")
|
||||
line = line.replace("\\-", "\\- ")
|
||||
# turn into a generic token to eliminate special
|
||||
# cases and make processing easier
|
||||
line = line.replace("\\{", "\\ob ")
|
||||
# turn into a generic token to eliminate special
|
||||
# cases and make processing easier
|
||||
line = line.replace("\\}", "\\cb ")
|
||||
# put a backslash in front of to eliminate special cases and
|
||||
# make processing easier
|
||||
line = line.replace("{", "\\{")
|
||||
# put a backslash in front of to eliminate special cases and
|
||||
# make processing easier
|
||||
line = line.replace("}", "\\}")
|
||||
line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
|
||||
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
|
||||
line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line)
|
||||
##line = line.replace("\\backslash", "\\\\")
|
||||
# this is for older RTF
|
||||
line = re.sub(self.__par_exp, '\\par ', line)
|
||||
return line
|
||||
def __compile_expressions(self):
|
||||
self.__ms_hex_exp = re.compile(r"\\\'(..)")
|
||||
self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}")
|
||||
self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
|
||||
self.__par_exp = re.compile(r'\\$')
|
||||
self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
|
||||
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
|
||||
def __create_tokens(self):
    """Tokenize the input file line by line into the temporary token file.

    Each line is stripped of its newline, run through the basic
    substitutions and split into tokens; every non-empty token is written
    on its own line to the temporary file.
    """
    self.__compile_expressions()
    # Context managers close both files even if a substitution raises.
    with open(self.__file, 'r') as read_obj:
        with open(self.__write_to, 'w') as write_obj:
            for raw_line in read_obj:
                line = self.__sub_line_reg(raw_line.replace("\n", ""))
                for token in re.split(self.__splitexp, line):
                    if token != "":
                        write_obj.write(token + "\n")
|
||||
#variables
|
||||
self.__uc_char = 0
|
||||
self.__uc_bin = False
|
||||
self.__uc_value = [1]
|
||||
|
||||
def __reini_utf8_counters(self):
|
||||
self.__uc_char = 0
|
||||
self.__uc_bin = False
|
||||
|
||||
def __remove_uc_chars(self, startchar, token):
|
||||
for i in xrange(startchar, len(token)):
|
||||
if token[i] == " ":
|
||||
continue
|
||||
elif self.__uc_char:
|
||||
self.__uc_char -= 1
|
||||
else:
|
||||
return token[i:]
|
||||
#if only " " and char to skip
|
||||
return ''
|
||||
|
||||
def __unicode_process(self, token):
|
||||
#change scope in
|
||||
if token == '\{':
|
||||
self.__uc_value.append(self.__uc_value[-1])
|
||||
#basic error handling
|
||||
self.__reini_utf8_counters()
|
||||
return token
|
||||
#change scope out
|
||||
elif token == '\}':
|
||||
self.__uc_value.pop()
|
||||
self.__reini_utf8_counters()
|
||||
return token
|
||||
#add a uc control
|
||||
elif token[:3] == '\uc':
|
||||
self.__uc_value[-1] = int(token[3:])
|
||||
self.__reini_utf8_counters()
|
||||
return token
|
||||
#bin data to slip
|
||||
elif self.__uc_bin:
|
||||
self.__uc_bin = False
|
||||
return ''
|
||||
#uc char to remove
|
||||
elif self.__uc_char:
|
||||
#handle \bin tag in case of uc char to skip
|
||||
if token[:4] == '\bin':
|
||||
self.__uc_char -=1
|
||||
self.__uc_bin = True
|
||||
return ''
|
||||
elif token[:1] == "\\" :
|
||||
self.__uc_char -=1
|
||||
return ''
|
||||
else:
|
||||
return self.__remove_uc_chars(0, token)
|
||||
#go for real \u token
|
||||
match_obj = self.__utf_exp.match(token)
|
||||
if match_obj is not None:
|
||||
self.__reini_utf8_counters()
|
||||
#get value and handle negative case
|
||||
uni_char = int(match_obj.group(1))
|
||||
uni_len = len(match_obj.group(1)) + 2
|
||||
if uni_char < 0:
|
||||
uni_char += 65536
|
||||
uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
|
||||
self.__uc_char = self.__uc_value[-1]
|
||||
#there is only an unicode char
|
||||
if len(token)<= uni_len:
|
||||
return uni_char
|
||||
#an unicode char and something else
|
||||
#must be after as it is splited on \
|
||||
#necessary? maybe for \bin?
|
||||
elif not self.__uc_char:
|
||||
return uni_char + token[uni_len:]
|
||||
#if not uc0 and chars
|
||||
else:
|
||||
return uni_char + self.__remove_uc_chars(uni_len, token)
|
||||
#default
|
||||
return token
|
||||
|
||||
def __sub_reg_split(self,input_file):
|
||||
input_file = self.__replace_spchar.mreplace(input_file)
|
||||
input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
|
||||
input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
|
||||
#remove \n in bin data
|
||||
input_file = self.__bin_exp.sub(lambda x: \
|
||||
x.group().replace('\n', '') + '\n', input_file)
|
||||
#split
|
||||
tokens = re.split(self.__splitexp, input_file)
|
||||
#remove empty tokens and \n
|
||||
return filter(lambda x: len(x) > 0 and x != '\n', tokens)
|
||||
#input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
|
||||
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
|
||||
# this is for older RTF
|
||||
#line = re.sub(self.__par_exp, '\\par ', line)
|
||||
#return filter(lambda x: len(x) > 0, \
|
||||
#(self.__remove_line.sub('', x) for x in tokens))
|
||||
|
||||
def __compile_expressions(self):
    """Build the replacement table and compile the regexes of the tokenizer.

    Sets the replacer for simple substitutions and the compiled patterns
    for hex escapes, unicode escapes, binary data, upr/ud groups and the
    token split.
    """
    SIMPLE_RPL = {
        "\\\\": "\\backslash ",
        "\\~": "\\~ ",
        "\\;": "\\; ",
        # escape the XML special characters
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        "\\_": "\\_ ",
        "\\:": "\\: ",
        "\\-": "\\- ",
        # turn into a generic token to eliminate special
        # cases and make processing easier
        "\\{": "\\ob ",
        "\\}": "\\cb ",
        # put a backslash in front of to eliminate special cases and
        # make processing easier
        "{": "\\{",
        "}": "\\}",
        # this is for older RTF
        # NOTE(review): this key looks like a regex (backslash at end of
        # line); if MReplace treats its keys as literal text it will never
        # match a trailing backslash -- confirm against MReplace.
        r'\\$': '\\par ',
    }
    self.__replace_spchar = MReplace(SIMPLE_RPL)
    # hex escape: backslash, apostrophe and two hexadecimal digits
    self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
    # unicode escape: optional minus, 3 to 6 digits, optional space
    self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
    # \bin control word followed by its data
    self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
    # manage upr/ud situations; "\\\*" matches the literal backslash-star
    # destination marker (a bare "\\*" would mean "zero or more backslashes")
    self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" + \
        r"\\{[\n ]?\\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
    # add \n in split for whole file reading
    # remove \n from endline char
    self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
|
||||
|
||||
def tokenize(self):
    """Main method: tokenize the whole RTF file in a single pass.

    Reads the file, makes the basic substitutions and splits it into
    tokens, resolves the unicode escapes token by token, writes one token
    per line to the temporary file and finally copies that file over the
    input file.
    """
    # Compile the patterns and start from a clean skip state so the method
    # does not depend on anything a previous pass left on the instance.
    self.__compile_expressions()
    self.__uc_char = 0
    self.__uc_bin = False
    self.__uc_value = [1]
    # read
    with open(self.__file, 'r') as read_obj:
        input_file = read_obj.read()

    # process simple replacements and split giving us a correct list
    # remove '' and \n in the process
    tokens = self.__sub_reg_split(input_file)
    # correct unicode; order matters because the processing is stateful
    tokens = [self.__unicode_process(token) for token in tokens]
    # remove empty items created by removing \uc
    tokens = [token for token in tokens if len(token) > 0]

    # write
    with open(self.__write_to, 'wb') as write_obj:
        write_obj.write('\n'.join(tokens))
    # Move and copy
    copy_obj = copy.Copy(bug_handler = self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "tokenize.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
|
||||
|
||||
#self.__special_tokens = [ '_', '~', "'", '{', '}' ]
|
Loading…
x
Reference in New Issue
Block a user