commit 4c5e9a20a4 by Kovid Goyal, 2019-05-20 12:20:25 +05:30
GPG Key ID: 06BC317B515ACE7C (no known key found for this signature in database)
18 changed files with 82 additions and 84 deletions

View File

@@ -14,16 +14,16 @@ ${head_content}$
 ${for title in meta.titles():}$
 ${if pos1:}$
 <h1>
-    <a href="${tocUrl}$">${print title}$</a>
+    <a href="${tocUrl}$">${print(title)}$</a>
 </h1>
 ${:else:}$
-<div class="calibreMetaSubtitle">${print title}$</div>
+<div class="calibreMetaSubtitle">${print(title)}$</div>
 ${:endif}$
 ${pos1=0}$
 ${:endfor}$
 </div>
 <div class="calibreMetaAuthor">
-    ${print ', '.join(meta.creators())}$
+    ${print(', '.join(meta.creators()))}$
 </div>
 </div>
@@ -33,13 +33,13 @@ ${head_content}$
 ${if prevLink or nextLink:}$
 <div class="calibreEbNavTop">
 ${if prevLink:}$
-    <a href="${prevLink}$" class="calibreAPrev">${print _('previous page'),}$</a>
+    <a href="${prevLink}$" class="calibreAPrev">${print(_('previous page'))}$</a>
 ${:else:}$
-    <a href="${tocUrl}$" class="calibreAPrev">${print _('previous page'),}$</a>
+    <a href="${tocUrl}$" class="calibreAPrev">${print(_('previous page'))}$</a>
 ${:endif}$
 ${if nextLink:}$
-    <a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
+    <a href="${nextLink}$" class="calibreANext">${print(_('next page'))}$</a>
 ${:endif}$
 </div>
 ${:endif}$
@@ -49,22 +49,22 @@ ${head_content}$
 ${if has_toc:}$
 <div class="calibreToc">
-    <h2><a href="${tocUrl}$">${print _('Table of contents'),}$</a></h2>
-    ${print toc()}$
+    <h2><a href="${tocUrl}$">${print(_('Table of contents'))}$</a></h2>
+    ${print(toc())}$
 </div>
 ${:endif}$
 <div class="calibreEbNav">
 ${if prevLink:}$
-    <a href="${prevLink}$" class="calibreAPrev">${print _('previous page'),}$</a>
+    <a href="${prevLink}$" class="calibreAPrev">${print(_('previous page'))}$</a>
 ${:else:}$
-    <a href="${tocUrl}$" class="calibreAPrev">${print _('previous page'),}$</a>
+    <a href="${tocUrl}$" class="calibreAPrev">${print(_('previous page'))}$</a>
 ${:endif}$
-    <a href="${tocUrl}$" class="calibreAHome">${print _('start'),}$</a>
+    <a href="${tocUrl}$" class="calibreAHome">${print(_('start'))}$</a>
 ${if nextLink:}$
-    <a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
+    <a href="${nextLink}$" class="calibreANext">${print(_('next page'))}$</a>
 ${:endif}$
 </div>

View File

@@ -6,10 +6,10 @@
 <link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
 <link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" />
-<title>${print ', '.join(meta.creators()),}$ - ${print meta.titles().next(); meta.titles().close()}$</title>
+<title>${print(', '.join(meta.creators()))}$ - ${print(next(meta.titles())); print(meta.titles().close())}$</title>
 ${for item in meta:}$
-<meta ${print 'name="DC.'+item['name']+'"',}$ ${print 'content="'+item['value']+'"',}$ />
+<meta ${print('name="DC.'+item['name']+'"')}$ ${print('content="'+item['value']+'"')}$ />
 ${:endfor}$
 <link href="${cssLink}$" type="text/css" rel="stylesheet" />
@@ -22,16 +22,16 @@ ${:endfor}$
 ${for title in meta.titles():}$
 ${if pos1:}$
 <h1>
-    <a href="${tocUrl}$">${print title}$</a>
+    <a href="${tocUrl}$">${print(title)}$</a>
 </h1>
 ${:else:}$
-<div class="calibreMetaSubtitle">${print title}$</div>
+<div class="calibreMetaSubtitle">${print(title)}$</div>
 ${:endif}$
 ${pos1=0}$
 ${:endfor}$
 </div>
 <div class="calibreMetaAuthor">
-    ${print ', '.join(meta.creators()),}$
+    ${print(', '.join(meta.creators()))}$
 </div>
 </div>
@@ -40,19 +40,19 @@ ${:endfor}$
 ${if has_toc:}$
 <div class="calibreTocIndex">
-    <h2>${print _('Table of contents'),}$</h2>
+    <h2>${print(_('Table of contents'))}$</h2>
 ${toc}$
 </div>
 ${:else:}$
-<h2>${print _('No table of contents present'),}$</h2>
-<div><strong><a href="${nextLink}$">${print _('begin to read'),}$</a></strong></div>
+<h2>${print(_('No table of contents present'))}$</h2>
+<div><strong><a href="${nextLink}$">${print(_('begin to read'))}$</a></strong></div>
 ${:endif}$
 </div>
 <div class="calibreEbNav">
 ${if nextLink:}$
-    <a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
+    <a href="${nextLink}$" class="calibreANext">${print(_('next page'))}$</a>
 ${:endif}$
 </div>
 </div>
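Both template diffs apply the same two Python 3 fixes: the Python 2 print statement (whose trailing comma suppressed the newline) becomes the print() function, and the iterator method meta.titles().next() becomes the builtin next(), the only spelling Python 3 supports. A minimal sketch of the print change outside the template syntax; the _() helper is an illustrative stand-in for the templates' gettext hook:

    # Python 2 statement form, as on the old side of these hunks:
    #     print _('previous page'),          # trailing comma suppressed the newline
    # Function form, required by Python 3 and available on 2 via the future import:
    from __future__ import print_function

    def _(s):   # illustrative stand-in for the gettext translation function
        return s

    print(_('previous page'), end='')   # end='' replaces the trailing comma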

View File

@@ -90,7 +90,8 @@ class FB2Input(InputFormatPlugin):
         css = re.sub(r'name\s*=\s*', 'class=', css)
         self.extract_embedded_content(doc)
         log.debug('Converting XML to HTML...')
-        ss = open(P('templates/fb2.xsl'), 'rb').read()
+        with open(P('templates/fb2.xsl'), 'rb') as f:
+            ss = f.read().decode('utf-8')
         ss = ss.replace("__FB_NS__", fb_ns)
         if options.no_inline_fb2_toc:
             log('Disabling generation of inline FB2 TOC')
@@ -124,8 +125,10 @@ class FB2Input(InputFormatPlugin):
             src = img.get('src')
             img.set('src', self.binary_map.get(src, src))
         index = transform.tostring(result)
-        open(u'index.xhtml', 'wb').write(index)
-        open(u'inline-styles.css', 'wb').write(css)
+        with open(u'index.xhtml', 'wb') as f:
+            f.write(index.encode('utf-8'))
+        with open(u'inline-styles.css', 'wb') as f:
+            f.write(css.encode('utf-8'))
         stream.seek(0)
         mi = get_metadata(stream, 'fb2')
         if not mi.title:
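Both hunks follow the same decode-at-the-boundary rule: open files in binary mode under a with block, decode immediately after reading, and encode immediately before writing, so everything in between handles only str. A minimal sketch, with read_text/write_text as illustrative helpers rather than calibre API:

    def read_text(path, encoding='utf-8'):
        with open(path, 'rb') as f:            # closed promptly, unlike open(...).read()
            return f.read().decode(encoding)   # bytes -> str on the way in

    def write_text(path, text, encoding='utf-8'):
        with open(path, 'wb') as f:
            f.write(text.encode(encoding))     # str -> bytes on the way out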

View File

@@ -79,7 +79,7 @@ class HTMLOutput(OutputFormatPlugin):
         from lxml import etree
         root = self.generate_toc(oeb_book, ref_url, output_dir)
-        return etree.tostring(root, pretty_print=True, encoding='utf-8',
+        return etree.tostring(root, pretty_print=True, encoding='unicode',
                 xml_declaration=False)

     def convert(self, oeb_book, output_path, input_plugin, opts, log):
@@ -161,14 +161,14 @@ class HTMLOutput(OutputFormatPlugin):
             # get & clean HTML <HEAD>-data
             head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
-            head_content = etree.tostring(head, pretty_print=True, encoding='utf-8')
+            head_content = etree.tostring(head, pretty_print=True, encoding='unicode')
             head_content = re.sub(r'\<\/?head.*\>', '', head_content)
             head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)
             head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>', head_content)

             # get & clean HTML <BODY>-data
             body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
-            ebook_content = etree.tostring(body, pretty_print=True, encoding='utf-8')
+            ebook_content = etree.tostring(body, pretty_print=True, encoding='unicode')
             ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
             ebook_content = re.sub(r'<(div|a|span)([^>]*)/>', r'<\1\2></\1>', ebook_content)
@@ -202,7 +202,7 @@ class HTMLOutput(OutputFormatPlugin):
             # write html to file
             with open(path, 'wb') as f:
-                f.write(t)
+                f.write(t.encode('utf-8'))
             item.unload_data_from_memory(memory=path)

         zfile = zipfile.ZipFile(output_path, "w")
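The key change here is lxml's tostring contract: encoding='utf-8' returns encoded bytes, while encoding='unicode' returns str, which is what the re.sub cleanup and the templating downstream expect on Python 3. A minimal sketch:

    from lxml import etree

    root = etree.fromstring('<p>hello</p>')
    as_bytes = etree.tostring(root, encoding='utf-8')   # serialized bytes
    as_text = etree.tostring(root, encoding='unicode')  # a str
    assert isinstance(as_bytes, bytes) and isinstance(as_text, str)
    # str patterns like r'\<\/?head.*\>' only match str input on Python 3,
    # so the markup must be text before the re.sub() cleanup runs.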

View File

@@ -41,7 +41,9 @@ class PMLInput(InputFormatPlugin):
         else:
             html_stream = html_path

-        ienc = pml_stream.encoding if pml_stream.encoding else 'cp1252'
+        ienc = getattr(pml_stream, 'encoding', None)
+        if ienc is None:
+            ienc = 'cp1252'
         if self.options.input_encoding:
             ienc = self.options.input_encoding
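getattr with a default tolerates streams that have no encoding attribute at all (plain binary file objects), where the old attribute access would raise AttributeError. A small sketch of the pattern, with guess_encoding as an illustrative helper:

    import io

    def guess_encoding(stream, default='cp1252'):
        # Binary file objects have no .encoding; text wrappers may carry one.
        # Missing or None both fall back to the default.
        enc = getattr(stream, 'encoding', None)
        return enc if enc else default

    assert guess_encoding(io.BytesIO(b'raw pml bytes')) == 'cp1252'
    text = io.TextIOWrapper(io.BytesIO(b'x'), encoding='utf-8')
    assert guess_encoding(text) == 'utf-8'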

View File

@@ -142,7 +142,7 @@ class SNBOutput(OutputFormatPlugin):
         for tocitem in oeb_book.toc:
             if tocitem.href.find('#') != -1:
-                item = string.split(tocitem.href, '#')
+                item = tocitem.href.split('#')
                 if len(item) != 2:
                     log.error('Error in TOC item: %s' % tocitem)
                 else:
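The string module's function versions of str methods were removed in Python 3; the method spelling works on both. For example:

    href = 'chapter1.html#section2'
    item = href.split('#')          # ['chapter1.html', 'section2'] on 2 and 3
    assert len(item) == 2
    # import string; string.split(href, '#')   # AttributeError on Python 3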

View File

@@ -138,7 +138,7 @@ class TXTInput(InputFormatPlugin):
                 block_to_single_line, separate_hard_scene_breaks)

         self.log = log
-        txt = ''
+        txt = b''
         log.debug('Reading text from file...')
         length = 0
         base_dir = getcwd()
@@ -151,7 +151,7 @@ class TXTInput(InputFormatPlugin):
             for x in walk('.'):
                 if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                     with open(x, 'rb') as tf:
-                        txt += tf.read() + '\n\n'
+                        txt += tf.read() + b'\n\n'
         else:
             if getattr(stream, 'name', None):
                 base_dir = os.path.dirname(stream.name)
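Because the files are opened in binary mode, the accumulator and separator must be bytes as well; mixing str and bytes raises TypeError on Python 3. A minimal sketch of the pattern:

    import os

    txt = b''                      # bytes accumulator to match the 'rb' reads
    for name in sorted(os.listdir('.')):
        if os.path.splitext(name)[1].lower() in ('.txt', '.text'):
            with open(name, 'rb') as tf:
                txt += tf.read() + b'\n\n'   # b'' + b'' is fine; '' + b'' is not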

View File

@@ -584,7 +584,7 @@ class HTMLPreProcessor(object):
             end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
             end_rules.append(
                 # Un wrap using punctuation
-                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),  # noqa
+                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\\\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),  # noqa
             )

         for rule in self.PREPROCESS + start_rules:
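The only change is \I becoming \\\\I inside the character class: in a non-raw string literal, \I is an invalid string escape that newer Pythons deprecate, while \\\\ unambiguously puts one escaped backslash into the pattern. A reduced sketch:

    import re

    # '\\\\' in a plain literal is two backslashes to Python, which the regex
    # engine reads as one escaped (literal) backslash inside the class.
    pat = re.compile('[a-z,:)\\\\IA]')
    assert pat.match('\\')   # the literal backslash matches
    assert pat.match('I')    # so do the plain letters
    assert pat.match('q')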

View File

@@ -391,7 +391,7 @@ class Mobi8Reader(object):
                 fi = self.get_file_info(pos)
                 if fi.filename is None:
                     raise ValueError('Index entry has invalid pos: %d'%pos)
-                idtag = self.get_id_tag(pos).decode(self.header.codec)
+                idtag = self.get_id_tag(pos)
                 href = '%s/%s'%(fi.type, fi.filename)
             else:
                 try:
@@ -403,7 +403,7 @@ class Mobi8Reader(object):
                     continue

             entry['href'] = href
-            entry['idtag'] = idtag
+            entry['idtag'] = idtag.decode(self.header.codec)

         for e in remove:
             index_entries.remove(e)
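The decode moves from the branch that produced the tag to the single point where it is stored, so every code path hands entry['idtag'] a str decoded with the book's declared codec. A sketch with illustrative stand-ins for the reader internals:

    def store_entry(get_id_tag, pos, codec):
        idtag = get_id_tag(pos)                 # raw bytes from the MOBI index
        return {'idtag': idtag.decode(codec)}   # decode once, at the use site

    entry = store_entry(lambda pos: b'filepos:0042', 7, 'utf-8')
    assert entry['idtag'] == 'filepos:0042'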

View File

@@ -605,7 +605,7 @@ class DirContainer(object):
         for root, dirs, files in os.walk(base):
             for fname in files:
                 fname = os.path.join(root, fname)
-                fname = fname.replace('\\', '/')
+                fname = fname.replace(b'\\', b'/')
                 if not isinstance(fname, unicode_type):
                     try:
                         fname = fname.decode(filesystem_encoding)

View File

@@ -24,7 +24,7 @@ class PdbHeaderReader(object):
     def identity(self):
         self.stream.seek(60)
         ident = self.stream.read(8)
-        return ident
+        return ident.decode('utf-8')

     def section_count(self):
         self.stream.seek(76)
@@ -67,8 +67,8 @@ class PdbHeaderReader(object):
 class PdbHeaderBuilder(object):

     def __init__(self, identity, title):
-        self.identity = identity.ljust(3, '\x00')[:8]
-        self.title = '%s\x00' % re.sub('[^-A-Za-z0-9 ]+', '_', title).ljust(31, '\x00')[:31].encode('ascii', 'replace')
+        self.identity = identity.ljust(3, '\x00')[:8].encode('utf-8')
+        self.title = b'%s\x00' % re.sub('[^-A-Za-z0-9 ]+', '_', title).ljust(31, '\x00')[:31].encode('ascii', 'replace')

     def build_header(self, section_lengths, out_stream):
         '''
@@ -85,4 +85,4 @@ class PdbHeaderBuilder(object):
         for id, record in enumerate(section_lengths):
             out_stream.write(struct.pack('>LBBBB', long_type(offset), 0, 0, 0, 0))
             offset += record
-        out_stream.write('\x00\x00')
+        out_stream.write(b'\x00\x00')
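A PDB header is a run of fixed-width binary fields, so the builder must produce bytes throughout: identifiers get encoded, the terminator becomes a bytes literal, and bytes %-formatting (available on Python 2 and again on 3.5+) keeps the NUL padding. A reduced sketch of the framing; the field values are illustrative:

    import io
    import struct

    out = io.BytesIO()
    identity = 'BOOKMOBI'.ljust(8, '\x00')[:8].encode('utf-8')  # 8-byte type/creator field
    out.write(identity)
    out.write(struct.pack('>H', 1))    # struct.pack always returns bytes
    out.write(b'\x00\x00')             # terminator must be bytes, not '\x00\x00'
    assert len(out.getvalue()) == 12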

View File

@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import division

 '''
 Writer content to palmdoc pdb file.
@@ -57,13 +58,13 @@ class Writer(FormatWriter):
         txt_length = len(txt)

         txt_records = []
-        for i in range(0, (len(txt) / MAX_RECORD_SIZE) + 1):
+        for i in range(0, (len(txt) // MAX_RECORD_SIZE) + 1):
             txt_records.append(txt[i * MAX_RECORD_SIZE: (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE])

         return txt_records, txt_length

     def _header_record(self, txt_length, record_count):
-        record = ''
+        record = b''

         record += struct.pack('>H', 2)   # [0:2], PalmDoc compression. (1 = No compression).
         record += struct.pack('>H', 0)   # [2:4], Always 0.
@@ -73,4 +74,3 @@ class Writer(FormatWriter):
         record += struct.pack('>L', 0)   # [12-16], Current reading position, as an offset into the uncompressed text.

         return record
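With true division, len(txt) / MAX_RECORD_SIZE is a float and range() rejects it; // keeps the record count integral on both Pythons, and the future import makes Python 2's / behave like Python 3's. A runnable sketch of the chunking:

    from __future__ import division   # Python 2: make / mean true division too

    MAX_RECORD_SIZE = 4096
    txt = b'x' * 10000

    records = [txt[i * MAX_RECORD_SIZE:(i + 1) * MAX_RECORD_SIZE]
               for i in range(len(txt) // MAX_RECORD_SIZE + 1)]  # // stays an int
    assert len(records) == 3
    assert b''.join(records) == txt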

View File

@@ -174,8 +174,8 @@ class PMLMLizer(object):
         return text

     def prepare_text(self, text):
-        # Replace empty paragraphs with \c pml codes used to denote emtpy lines.
-        text = re.sub(unicode_type(r'(?<=</p>)\s*<p[^>]*>[\xc2\xa0\s]*</p>'), '\\c\n\\c', text)
+        # Replace empty paragraphs with \c pml codes used to denote empty lines.
+        text = re.sub(unicode_type(r'(?<=</p>)\s*<p[^>]*>[\xc2\xa0\s]*</p>'), r'\\c\n\\c', text)
         return text

     def clean_text(self, text):
@@ -207,7 +207,7 @@ class PMLMLizer(object):
         text = re.sub('[ ]{2,}', ' ', text)

         # Condense excessive \c empty line sequences.
-        text = re.sub('(\\c\\s*\\c\\s*){2,}', '\\c \n\\c\n', text)
+        text = re.sub(r'(\\c\\s*\\c\\s*){2,}', r'\\c \n\\c\n', text)

         # Remove excessive newlines.
         text = re.sub('\n[ ]+\n', '\n\n', text)
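In a replacement template, a lone backslash before a letter ('\c' after Python unescaping) is an error on newer Pythons; the raw-string form hands re an escaped backslash, producing a literal \c in the output. A reduced sketch (the character class is simplified here):

    import re

    text = '<p>one</p> <p>\xa0</p>'
    # r'\\c\n\\c' reaches re.sub as: escaped backslash + c, newline, and again.
    out = re.sub(r'(?<=</p>)\s*<p[^>]*>[\xa0\s]*</p>', r'\\c\n\\c', text)
    assert out == '<p>one</p>\\c\n\\c'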

View File

@@ -562,7 +562,7 @@ class ParseRtf:
     def __make_temp_file(self,file):
         """Make a temporary file to parse"""
         write_file="rtf_write_file"
-        read_obj = file if hasattr(file, 'read') else open(file,'r')
+        read_obj = file if hasattr(file, 'read') else open(file,'rb')
         with open(write_file, 'wb') as write_obj:
             for line in read_obj:
                 write_obj.write(line)

View File

@@ -36,11 +36,11 @@ class FixLineEndings:
     def fix_endings(self):
         # read
-        with open(self.__file, 'r') as read_obj:
+        with open(self.__file, 'rb') as read_obj:
             input_file = read_obj.read()
         # calibre go from win and mac to unix
-        input_file = input_file.replace('\r\n', '\n')
-        input_file = input_file.replace('\r', '\n')
+        input_file = input_file.replace(b'\r\n', b'\n')
+        input_file = input_file.replace(b'\r', b'\n')
         # remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
         if self.__replace_illegals:
             input_file = clean_ascii_chars(input_file)
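As in the ParseRtf hunk above, the file is opened in 'rb' so Python 3 neither guesses a text encoding nor translates newlines; the RTF bytes arrive untouched and the replacements operate on bytes. A minimal sketch:

    def to_unix_endings(data):
        # bytes in, bytes out: Windows CRLF first, then bare old-Mac CR.
        data = data.replace(b'\r\n', b'\n')
        return data.replace(b'\r', b'\n')

    assert to_unix_endings(b'a\r\nb\rc\n') == b'a\nb\nc\n'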

View File

@@ -608,12 +608,10 @@ if another paragraph_def is found, the state changes to collect_tokens.
         # when determining uniqueness for a style, ignore these values, since
         # they don't tell us if the style is unique
         ignore_values = ['style-num', 'nest-level', 'in-table']
-        keys = self.__att_val_dict.keys()
-        keys.sort()
-        for key in keys:
-            if key in ignore_values:
+        for k, v in self.__att_val_dict.items():
+            if k in ignore_values:
                 continue
-            my_string += '%s:%s' % (key, self.__att_val_dict[key])
+            my_string += '%s:%s' % (k, v)
         if my_string in self.__style_num_strings:
             num = self.__style_num_strings.index(my_string)
             num += 1  # since indexing starts at zero, rather than 1
@@ -637,12 +635,9 @@ if another paragraph_def is found, the state changes to collect_tokens.
             the_value = self.__att_val_dict['tabs']
             # the_value = the_value[:-1]
             style_string += ('<%s>%s' % ('tabs', the_value))
-        keys = self.__att_val_dict.keys()
-        keys.sort()
-        for key in keys:
-            if key != 'name' and key !='style-num' and key != 'in-table'\
-                and key not in tabs_list:
-                style_string += ('<%s>%s' % (key, self.__att_val_dict[key]))
+        for k, v in self.__att_val_dict.items():
+            if k not in ['name', 'style-num', 'in-table'] + tabs_list:
+                style_string += ('<%s>%s' % (k, v))
         style_string += '\n'
         self.__body_style_strings.append(style_string)
@@ -690,11 +685,9 @@ if another paragraph_def is found, the state changes to collect_tokens.
             the_value = self.__att_val_dict['tabs']
             # the_value = the_value[:-1]
             self.__write_obj.write('<%s>%s' % ('tabs', the_value))
-        keys = self.__att_val_dict.keys()
-        keys.sort()
+        keys = sorted(self.__att_val_dict.keys())
         for key in keys:
-            if key != 'name' and key !='style-num' and key != 'in-table'\
-                and key not in tabs_list:
+            if key not in ['name', 'style-num', 'in-table'] + tabs_list:
                 self.__write_obj.write('<%s>%s' % (key, self.__att_val_dict[key]))
         self.__write_obj.write('\n')
         self.__write_obj.write(self.__start2_marker)
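On Python 3, dict.keys() returns a view, so the Python 2 idiom keys = d.keys(); keys.sort() fails with AttributeError; these hunks substitute sorted() where ordering matters and items() where only membership does. A Python 3 sketch of both replacements:

    d = {'b': 2, 'a': 1, 'style-num': 9}

    # keys = d.keys(); keys.sort()   # AttributeError: view has no .sort()
    keys = sorted(d.keys())          # explicit, deterministic ordering
    assert keys == ['a', 'b', 'style-num']

    # Where only membership matters, iterate the pairs directly:
    s = ''
    for k, v in d.items():
        if k not in ['style-num']:
            s += '%s:%s' % (k, v)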

View File

@@ -43,8 +43,8 @@ class ProcessTokens:
         self.__bug_handler = bug_handler

     def compile_expressions(self):
-        self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
-        self.__utf_exp = re.compile(r'(&.*?;)')
+        self.__num_exp = re.compile(br"([a-zA-Z]+)(.*)")
+        self.__utf_exp = re.compile(br'(&.*?;)')

     def initiate_token_dict(self):
         self.__return_code = 0
@@ -762,10 +762,10 @@ class ProcessTokens:
     def process_cw(self, token):
         """Change the value of the control word by determining what dictionary
         it belongs to"""
-        special = ['*', ':', '}', '{', '~', '_', '-', ';']
+        special = [b'*', b':', b'}', b'{', b'~', b'_', b'-', b';']
         # if token != "{" or token != "}":
         token = token[1:]  # strip off leading \
-        token = token.replace(" ", "")
+        token = token.replace(b" ", b"")
         # if not token: return
         only_alpha = token.isalpha()
         num = None
@@ -784,24 +784,24 @@ class ProcessTokens:
     def process_tokens(self):
         """Main method for handling other methods. """
         line_count = 0
-        with open(self.__file, 'r') as read_obj:
+        with open(self.__file, 'rb') as read_obj:
             with open(self.__write_to, 'wb') as write_obj:
                 for line in read_obj:
-                    token = line.replace("\n","")
+                    token = line.replace(b"\n",b"")
                     line_count += 1
-                    if line_count == 1 and token != '\\{':
+                    if line_count == 1 and token != b'\\{':
                         msg = '\nInvalid RTF: document doesn\'t start with {\n'
                         raise self.__exception_handler(msg)
-                    elif line_count == 2 and token[0:4] != '\\rtf':
+                    elif line_count == 2 and token[0:4] != b'\\rtf':
                         msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
                         raise self.__exception_handler(msg)

-                    the_index = token.find('\\ ')
+                    the_index = token.find(b'\\ ')
                     if token is not None and the_index > -1:
                         msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
                             % line_count
                         raise self.__exception_handler(msg)
-                    elif token[:1] == "\\":
+                    elif token[:1] == b"\\":
                         try:
                             token.decode('us-ascii')
                         except UnicodeError as msg:
@@ -816,10 +816,10 @@ class ProcessTokens:
                     for field in fields:
                         if not field:
                             continue
-                        if field[0:1] == '&':
-                            write_obj.write('tx<ut<__________<%s\n' % field)
+                        if field[0:1] == b'&':
+                            write_obj.write(b'tx<ut<__________<%s\n' % field)
                         else:
-                            write_obj.write('tx<nu<__________<%s\n' % field)
+                            write_obj.write(b'tx<nu<__________<%s\n' % field)

         if not line_count:
             msg = '\nInvalid RTF: file appears to be empty.\n'
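Since the RTF source is now read in binary mode, every literal compared against or substituted into it must be bytes, including the regex patterns (br'...'); on Python 3 a bytes pattern only matches bytes input. A minimal sketch:

    import re

    num_exp = re.compile(br'([a-zA-Z]+)(.*)')   # bytes pattern for bytes lines
    line = b'\\rtf1'
    m = num_exp.match(line[1:])                 # strip the leading backslash
    assert m.groups() == (b'rtf', b'1')
    # num_exp.match('\\rtf1')                   # TypeError: str on a bytes pattern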

View File

@@ -94,7 +94,7 @@ class Tokenize:
         uni_len = len(match_obj.group(0))
         if uni_char < 0:
             uni_char += 65536
-        uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace')
+        uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace').decode('ascii')
         self.__uc_char = self.__uc_value[-1]
         # there is only an unicode char
         if len(token)<= uni_len:
@@ -113,11 +113,11 @@ class Tokenize:
     def __sub_reg_split(self,input_file):
         input_file = self.__replace_spchar.mreplace(input_file)
         # this is for older RTF
-        input_file = self.__par_exp.sub('\n\\par \n', input_file)
-        input_file = self.__cwdigit_exp.sub("\\g<1>\n\\g<2>", input_file)
+        input_file = self.__par_exp.sub(r'\n\\par \n', input_file)
+        input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file)
         input_file = self.__cs_ast.sub(r"\g<1>", input_file)
-        input_file = self.__ms_hex_exp.sub("\\mshex0\\g<1> ", input_file)
-        input_file = self.__utf_ud.sub("\\{\\uc0 \\g<1>\\}", input_file)
+        input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
+        input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
         # remove \n in bin data
         input_file = self.__bin_exp.sub(lambda x:
             x.group().replace('\n', '') + '\n', input_file)
@@ -188,7 +188,7 @@ class Tokenize:
         # write
         with open(self.__write_to, 'wb') as write_obj:
-            write_obj.write('\n'.join(tokens))
+            write_obj.write('\n'.join(tokens).encode('utf-8'))
         # Move and copy
         copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
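Two patterns close out the port: raw strings make the group references (\g<1>) and literal backslashes in the replacement templates explicit, and the token stream is encoded exactly once, at the binary write. A minimal sketch; the pattern and token values are illustrative:

    import re

    # r"\g<1>\n\g<2>": group 1, a real newline, group 2, with no double-escaping.
    assert re.sub(r'(\\pard)(\d)', r"\g<1>\n\g<2>", '\\pard4') == '\\pard\n4'

    tokens = ['\\rtf1', '\\ansi', 'Hello']
    with open('tokens.txt', 'wb') as write_obj:             # binary sink
        write_obj.write('\n'.join(tokens).encode('utf-8'))  # encode at the boundary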