Merge branch 'py3' of https://github.com/eli-schwartz/calibre
This commit is contained in: commit 4c5e9a20a4
@@ -14,16 +14,16 @@ ${head_content}$
 ${for title in meta.titles():}$
 ${if pos1:}$
 <h1>
-<a href="${tocUrl}$">${print title}$</a>
+<a href="${tocUrl}$">${print(title)}$</a>
 </h1>
 ${:else:}$
-<div class="calibreMetaSubtitle">${print title}$</div>
+<div class="calibreMetaSubtitle">${print(title)}$</div>
 ${:endif}$
 ${pos1=0}$
 ${:endfor}$
 </div>
 <div class="calibreMetaAuthor">
-${print ', '.join(meta.creators())}$
+${print(', '.join(meta.creators()))}$
 </div>
 </div>
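These template hunks all make the same change: the `${...}$` markup embeds Python, and under Python 3 `print` is a builtin function rather than a statement, so every `${print x}$` must become `${print(x)}$`. A minimal sketch of the underlying language change, in plain Python outside calibre's template syntax:

```python
# On Python 2 this __future__ import turns print into a function,
# matching the Python 3 builtin; on Python 3 it is a no-op.
from __future__ import print_function

# Python 2 statement form (a SyntaxError on Python 3):
#   print 'hello',
# Function form, valid on both; end='' replaces the trailing comma:
print('hello', end='')
print()  # just the newline
```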
@@ -33,13 +33,13 @@ ${head_content}$
 ${if prevLink or nextLink:}$
 <div class="calibreEbNavTop">
 ${if prevLink:}$
-<a href="${prevLink}$" class="calibreAPrev">${print _('previous page'),}$</a>
+<a href="${prevLink}$" class="calibreAPrev">${print(_('previous page'))}$</a>
 ${:else:}$
-<a href="${tocUrl}$" class="calibreAPrev">${print _('previous page'),}$</a>
+<a href="${tocUrl}$" class="calibreAPrev">${print(_('previous page'))}$</a>
 ${:endif}$

 ${if nextLink:}$
-<a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
+<a href="${nextLink}$" class="calibreANext">${print(_('next page'))}$</a>
 ${:endif}$
 </div>
 ${:endif}$
@@ -49,22 +49,22 @@ ${head_content}$

 ${if has_toc:}$
 <div class="calibreToc">
-<h2><a href="${tocUrl}$">${print _('Table of contents'),}$</a></h2>
-${print toc()}$
+<h2><a href="${tocUrl}$">${print( _('Table of contents'))}$</a></h2>
+${print(toc())}$
 </div>
 ${:endif}$

 <div class="calibreEbNav">
 ${if prevLink:}$
-<a href="${prevLink}$" class="calibreAPrev">${print _('previous page'),}$</a>
+<a href="${prevLink}$" class="calibreAPrev">${print(_('previous page'))}$</a>
 ${:else:}$
-<a href="${tocUrl}$" class="calibreAPrev">${print _('previous page'),}$</a>
+<a href="${tocUrl}$" class="calibreAPrev">${print(_('previous page'))}$</a>
 ${:endif}$

-<a href="${tocUrl}$" class="calibreAHome">${print _('start'),}$</a>
+<a href="${tocUrl}$" class="calibreAHome">${print(_('start'))}$</a>

 ${if nextLink:}$
-<a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
+<a href="${nextLink}$" class="calibreANext">${print(_('next page'))}$</a>
 ${:endif}$
 </div>
@@ -6,10 +6,10 @@
 <link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
 <link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" />

-<title>${print ', '.join(meta.creators()),}$ - ${print meta.titles().next(); meta.titles().close()}$</title>
+<title>${print(', '.join(meta.creators()))}$ - ${print(next(meta.titles())); print(meta.titles().close())}$</title>

 ${for item in meta:}$
-<meta ${print 'name="DC.'+item['name']+'"',}$ ${print 'content="'+item['value']+'"',}$ />
+<meta ${print('name="DC.'+item['name']+'"')}$ ${print('content="'+item['value']+'"')}$ />
 ${:endfor}$

 <link href="${cssLink}$" type="text/css" rel="stylesheet" />
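The `<title>` line above also swaps `meta.titles().next()` for `next(meta.titles())`: generator objects lost their `.next()` method in Python 3, while the `next()` builtin (available since Python 2.6) works on both. A small sketch with a stand-in generator:

```python
def titles():
    # stand-in for meta.titles(): a generator of title strings
    yield 'Main Title'
    yield 'Subtitle'

gen = titles()
first = next(gen)     # works on Python 2.6+ and Python 3
# first = gen.next()  # Python 2 only: AttributeError on Python 3
print(first)          # -> Main Title
```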
@@ -22,16 +22,16 @@ ${:endfor}$
 ${for title in meta.titles():}$
 ${if pos1:}$
 <h1>
-<a href="${tocUrl}$">${print title}$</a>
+<a href="${tocUrl}$">${print(title)}$</a>
 </h1>
 ${:else:}$
-<div class="calibreMetaSubtitle">${print title}$</div>
+<div class="calibreMetaSubtitle">${print(title)}$</div>
 ${:endif}$
 ${pos1=0}$
 ${:endfor}$
 </div>
 <div class="calibreMetaAuthor">
-${print ', '.join(meta.creators()),}$
+${print(', '.join(meta.creators()))}$
 </div>
 </div>
@@ -40,19 +40,19 @@ ${:endfor}$

 ${if has_toc:}$
 <div class="calibreTocIndex">
-<h2>${print _('Table of contents'),}$</h2>
+<h2>${print(_('Table of contents'))}$</h2>
 ${toc}$
 </div>
 ${:else:}$
-<h2>${print _('No table of contents present'),}$</h2>
-<div><strong><a href="${nextLink}$">${print _('begin to read'),}$</a></strong></div>
+<h2>${print(_('No table of contents present'))}$</h2>
+<div><strong><a href="${nextLink}$">${print(_('begin to read'))}$</a></strong></div>
 ${:endif}$

 </div>

 <div class="calibreEbNav">
 ${if nextLink:}$
-<a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
+<a href="${nextLink}$" class="calibreANext">${print(_('next page'))}$</a>
 ${:endif}$
 </div>
 </div>
@@ -90,7 +90,8 @@ class FB2Input(InputFormatPlugin):
         css = re.sub(r'name\s*=\s*', 'class=', css)
         self.extract_embedded_content(doc)
         log.debug('Converting XML to HTML...')
-        ss = open(P('templates/fb2.xsl'), 'rb').read()
+        with open(P('templates/fb2.xsl'), 'rb') as f:
+            ss = f.read().decode('utf-8')
         ss = ss.replace("__FB_NS__", fb_ns)
         if options.no_inline_fb2_toc:
             log('Disabling generation of inline FB2 TOC')
@@ -124,8 +125,10 @@ class FB2Input(InputFormatPlugin):
             src = img.get('src')
             img.set('src', self.binary_map.get(src, src))
         index = transform.tostring(result)
-        open(u'index.xhtml', 'wb').write(index)
-        open(u'inline-styles.css', 'wb').write(css)
+        with open(u'index.xhtml', 'wb') as f:
+            f.write(index.encode('utf-8'))
+        with open(u'inline-styles.css', 'wb') as f:
+            f.write(css.encode('utf-8'))
         stream.seek(0)
         mi = get_metadata(stream, 'fb2')
         if not mi.title:
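The FB2 hunks replace bare `open(...).read()` / `.write()` calls with `with` blocks and make the str/bytes boundary explicit: files opened in `'rb'`/`'wb'` traffic in bytes, so text is decoded on the way in and encoded on the way out. A self-contained sketch of the pattern, with made-up file names and content:

```python
# write a small input file so the sketch is self-contained
with open('template.xsl', 'wb') as f:
    f.write(u'value: __PLACEHOLDER__'.encode('utf-8'))

# read bytes, decode to text for processing (UTF-8 assumed here)
with open('template.xsl', 'rb') as f:
    text = f.read().decode('utf-8')

text = text.replace('__PLACEHOLDER__', 'filled in')

# encode text back to bytes before writing through a binary-mode handle
with open('output.txt', 'wb') as f:
    f.write(text.encode('utf-8'))
```

The `with` form also guarantees the handle is closed promptly, which the old `open(...).read()` style left to the garbage collector.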
@@ -79,7 +79,7 @@ class HTMLOutput(OutputFormatPlugin):
         from lxml import etree

         root = self.generate_toc(oeb_book, ref_url, output_dir)
-        return etree.tostring(root, pretty_print=True, encoding='utf-8',
+        return etree.tostring(root, pretty_print=True, encoding='unicode',
                               xml_declaration=False)

     def convert(self, oeb_book, output_path, input_plugin, opts, log):
@@ -161,14 +161,14 @@ class HTMLOutput(OutputFormatPlugin):

             # get & clean HTML <HEAD>-data
             head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
-            head_content = etree.tostring(head, pretty_print=True, encoding='utf-8')
+            head_content = etree.tostring(head, pretty_print=True, encoding='unicode')
             head_content = re.sub(r'\<\/?head.*\>', '', head_content)
             head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)
             head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>', head_content)

             # get & clean HTML <BODY>-data
             body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
-            ebook_content = etree.tostring(body, pretty_print=True, encoding='utf-8')
+            ebook_content = etree.tostring(body, pretty_print=True, encoding='unicode')
             ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
             ebook_content = re.sub(r'<(div|a|span)([^>]*)/>', r'<\1\2></\1>', ebook_content)

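`etree.tostring()` returns bytes by default and with `encoding='utf-8'`; passing `encoding='unicode'` makes it return a str, which is what the subsequent `re.sub` calls on `head_content` and `ebook_content` need under Python 3. A small runnable demonstration:

```python
import re
from lxml import etree

root = etree.fromstring('<root><title/></root>')

as_bytes = etree.tostring(root, encoding='utf-8')   # -> bytes (UTF-8 encoded)
as_text = etree.tostring(root, encoding='unicode')  # -> str

print(type(as_bytes), type(as_text))
# <class 'bytes'> <class 'str'>

# Applying a str-pattern re.sub to bytes raises TypeError on Python 3,
# so the tree must be serialized to str before regex post-processing:
cleaned = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>', as_text)
print(cleaned)  # -> <root><title></title></root>
```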
@@ -202,7 +202,7 @@ class HTMLOutput(OutputFormatPlugin):

             # write html to file
             with open(path, 'wb') as f:
-                f.write(t)
+                f.write(t.encode('utf-8'))
             item.unload_data_from_memory(memory=path)

         zfile = zipfile.ZipFile(output_path, "w")
@@ -41,7 +41,9 @@ class PMLInput(InputFormatPlugin):
         else:
             html_stream = html_path

-        ienc = pml_stream.encoding if pml_stream.encoding else 'cp1252'
+        ienc = getattr(pml_stream, 'encoding', None)
+        if ienc is None:
+            ienc = 'cp1252'
         if self.options.input_encoding:
             ienc = self.options.input_encoding

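The rewrite matters because `pml_stream` may be a plain binary file object with no `encoding` attribute at all, in which case the old attribute access would raise `AttributeError`; `getattr` with a default handles both the missing and the `None`-valued cases. A sketch:

```python
import io

stream = io.BytesIO(b'raw pml data')  # binary streams have no .encoding

ienc = getattr(stream, 'encoding', None)  # None instead of AttributeError
if ienc is None:
    ienc = 'cp1252'
print(ienc)  # -> cp1252
```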
@@ -142,7 +142,7 @@ class SNBOutput(OutputFormatPlugin):

         for tocitem in oeb_book.toc:
             if tocitem.href.find('#') != -1:
-                item = string.split(tocitem.href, '#')
+                item = tocitem.href.split('#')
                 if len(item) != 2:
                     log.error('Error in TOC item: %s' % tocitem)
                 else:
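Python 3 dropped the function versions of string methods from the `string` module, so `string.split(s, sep)` and friends are gone; only the method form remains. Sketch:

```python
href = 'chapter1.html#section2'

# Python 2 only:
#   import string
#   item = string.split(href, '#')    # AttributeError on Python 3
item = href.split('#')                # works on both versions
print(item)  # -> ['chapter1.html', 'section2']
```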
@@ -138,7 +138,7 @@ class TXTInput(InputFormatPlugin):
             block_to_single_line, separate_hard_scene_breaks)

         self.log = log
-        txt = ''
+        txt = b''
         log.debug('Reading text from file...')
         length = 0
         base_dir = getcwd()
@@ -151,7 +151,7 @@ class TXTInput(InputFormatPlugin):
             for x in walk('.'):
                 if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                     with open(x, 'rb') as tf:
-                        txt += tf.read() + '\n\n'
+                        txt += tf.read() + b'\n\n'
         else:
             if getattr(stream, 'name', None):
                 base_dir = os.path.dirname(stream.name)
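These two hunks go together: `TXTInput` accumulates file contents as bytes, so both the seed value (`txt = b''`) and everything concatenated onto it must be bytes; on Python 3 mixing the two types is a `TypeError` rather than a silent coercion. A sketch with stand-in data instead of real files:

```python
chunks = [b'first file text', b'second file text']  # stand-ins for tf.read()

txt = b''  # seed must be bytes to match the binary-mode reads
for chunk in chunks:
    txt += chunk + b'\n\n'  # bytes + bytes; adding '\n\n' (str) would raise TypeError

print(txt)
```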
@@ -584,7 +584,7 @@ class HTMLPreProcessor(object):
             end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
             end_rules.append(
                 # Un wrap using punctuation
-                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), # noqa
+                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\\\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), # noqa
             )

         for rule in self.PREPROCESS + start_rules:
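The only change in this hunk is doubling the backslashes before `IA`: both the string-literal layer and the regex layer need explicit escaping, because `\I` is an unrecognized escape in a normal string literal (deprecated since Python 3.6) and an unknown escape inside a regex character class (rejected by modern Python 3). A tiny illustration of the escaping levels:

```python
import re

# '\\\\' in a normal string literal is two characters, \\ ,
# which the regex engine reads as one literal backslash.
pat = re.compile(u'[A\\\\I]')     # character class: 'A', '\', or 'I'
print(bool(pat.search('x\\y')))   # -> True (the string contains a backslash)

# re.compile('[\I]') relies on the unknown escape \I and is rejected
# by modern Python 3 ("bad escape \I")
```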
@@ -391,7 +391,7 @@ class Mobi8Reader(object):
                 fi = self.get_file_info(pos)
                 if fi.filename is None:
                     raise ValueError('Index entry has invalid pos: %d'%pos)
-                idtag = self.get_id_tag(pos).decode(self.header.codec)
+                idtag = self.get_id_tag(pos)
                 href = '%s/%s'%(fi.type, fi.filename)
             else:
                 try:
@@ -403,7 +403,7 @@ class Mobi8Reader(object):
                 continue

             entry['href'] = href
-            entry['idtag'] = idtag
+            entry['idtag'] = idtag.decode(self.header.codec)

         for e in remove:
             index_entries.remove(e)
@@ -605,7 +605,7 @@ class DirContainer(object):
         for root, dirs, files in os.walk(base):
             for fname in files:
                 fname = os.path.join(root, fname)
-                fname = fname.replace('\\', '/')
+                fname = fname.replace(b'\\', b'/')
                 if not isinstance(fname, unicode_type):
                     try:
                         fname = fname.decode(filesystem_encoding)
@@ -24,7 +24,7 @@ class PdbHeaderReader(object):
     def identity(self):
         self.stream.seek(60)
         ident = self.stream.read(8)
-        return ident
+        return ident.decode('utf-8')

     def section_count(self):
         self.stream.seek(76)
@@ -67,8 +67,8 @@ class PdbHeaderReader(object):
 class PdbHeaderBuilder(object):

     def __init__(self, identity, title):
-        self.identity = identity.ljust(3, '\x00')[:8]
-        self.title = '%s\x00' % re.sub('[^-A-Za-z0-9 ]+', '_', title).ljust(31, '\x00')[:31].encode('ascii', 'replace')
+        self.identity = identity.ljust(3, '\x00')[:8].encode('utf-8')
+        self.title = b'%s\x00' % re.sub('[^-A-Za-z0-9 ]+', '_', title).ljust(31, '\x00')[:31].encode('ascii', 'replace')

     def build_header(self, section_lengths, out_stream):
         '''
@@ -85,4 +85,4 @@ class PdbHeaderBuilder(object):
         for id, record in enumerate(section_lengths):
             out_stream.write(struct.pack('>LBBBB', long_type(offset), 0, 0, 0, 0))
             offset += record
-        out_stream.write('\x00\x00')
+        out_stream.write(b'\x00\x00')
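The header builder now produces bytes throughout. Note that `b'%s\x00' % ...` relies on `%`-interpolation for bytes, which Python 3 only regained in 3.5 (PEP 461), and that the interpolated value must itself already be bytes. A sketch of the title-packing step with a made-up title:

```python
import re

title = 'My Book: Part 1'
safe = re.sub('[^-A-Za-z0-9 ]+', '_', title).ljust(31, '\x00')[:31]
packed = b'%s\x00' % safe.encode('ascii', 'replace')  # bytes %-formatting, Python 3.5+
print(len(packed), packed[:12])  # -> 32 b'My Book_ Par'
```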
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import division

 '''
 Writer content to palmdoc pdb file.
@@ -57,13 +58,13 @@ class Writer(FormatWriter):
         txt_length = len(txt)

         txt_records = []
-        for i in range(0, (len(txt) / MAX_RECORD_SIZE) + 1):
+        for i in range(0, (len(txt) // MAX_RECORD_SIZE) + 1):
             txt_records.append(txt[i * MAX_RECORD_SIZE: (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE])

         return txt_records, txt_length

     def _header_record(self, txt_length, record_count):
-        record = ''
+        record = b''

         record += struct.pack('>H', 2) # [0:2], PalmDoc compression. (1 = No compression).
         record += struct.pack('>H', 0) # [2:4], Always 0.
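`len(txt) / MAX_RECORD_SIZE` changes meaning on Python 3, where `/` is always true division and would hand `range()` a float; the record count needs floor division `//`, which behaves the same on both versions. The `from __future__ import division` added at the top of the file makes Python 2's `/` match Python 3's, so any remaining true divisions also behave consistently. A sketch of the record split:

```python
MAX_RECORD_SIZE = 4096
txt = b'x' * 10000

# Python 3: 10000 / 4096 == 2.44... (float); range() would raise TypeError
n_records = (len(txt) // MAX_RECORD_SIZE) + 1   # floor division: 3 records
records = [txt[i * MAX_RECORD_SIZE:(i + 1) * MAX_RECORD_SIZE]
           for i in range(n_records)]
print([len(r) for r in records])  # -> [4096, 4096, 1808]
```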
@@ -73,4 +74,3 @@ class Writer(FormatWriter):
         record += struct.pack('>L', 0) # [12-16], Current reading position, as an offset into the uncompressed text.

         return record
-
@@ -174,8 +174,8 @@ class PMLMLizer(object):
         return text

     def prepare_text(self, text):
-        # Replace empty paragraphs with \c pml codes used to denote emtpy lines.
-        text = re.sub(unicode_type(r'(?<=</p>)\s*<p[^>]*>[\xc2\xa0\s]*</p>'), '\\c\n\\c', text)
+        # Replace empty paragraphs with \c pml codes used to denote empty lines.
+        text = re.sub(unicode_type(r'(?<=</p>)\s*<p[^>]*>[\xc2\xa0\s]*</p>'), r'\\c\n\\c', text)
         return text

     def clean_text(self, text):
@@ -207,7 +207,7 @@ class PMLMLizer(object):
         text = re.sub('[ ]{2,}', ' ', text)

         # Condense excessive \c empty line sequences.
-        text = re.sub('(\\c\\s*\\c\\s*){2,}', '\\c \n\\c\n', text)
+        text = re.sub(r'(\\c\\s*\\c\\s*){2,}', r'\\c \n\\c\n', text)

         # Remove excessive newlines.
         text = re.sub('\n[ ]+\n', '\n\n', text)
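These substitutions move to raw strings because `re.sub` interprets backslash escapes in the replacement string too: `\c` is not a recognized replacement escape and modern Python 3 rejects it outright, while a raw `r'\\c'` unambiguously yields a literal backslash followed by `c` (the PML `\c` code). A sketch with made-up input:

```python
import re

text = 'para\n\n\n\nmore'
out = re.sub(r'\n{3,}', r'\\c\n\\c\n', text)  # r'\\c' -> literal \c in the output
print(out)
# para\c
# \c
# more

# re.sub(r'\n{3,}', '\\c\n\\c\n', text) leaves '\c' to the replacement-template
# parser, which modern Python 3 rejects ("bad escape \c")
```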
@@ -562,7 +562,7 @@ class ParseRtf:
     def __make_temp_file(self,file):
         """Make a temporary file to parse"""
         write_file="rtf_write_file"
-        read_obj = file if hasattr(file, 'read') else open(file,'r')
+        read_obj = file if hasattr(file, 'read') else open(file,'rb')
         with open(write_file, 'wb') as write_obj:
             for line in read_obj:
                 write_obj.write(line)
@@ -36,11 +36,11 @@ class FixLineEndings:

     def fix_endings(self):
         # read
-        with open(self.__file, 'r') as read_obj:
+        with open(self.__file, 'rb') as read_obj:
             input_file = read_obj.read()
         # calibre go from win and mac to unix
-        input_file = input_file.replace('\r\n', '\n')
-        input_file = input_file.replace('\r', '\n')
+        input_file = input_file.replace(b'\r\n', b'\n')
+        input_file = input_file.replace(b'\r', b'\n')
         # remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
         if self.__replace_illegals:
             input_file = clean_ascii_chars(input_file)
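Opening the file with `'rb'` keeps the data as raw bytes: on Python 3, text mode would try to decode with a platform-dependent default encoding and translate newlines, which is wrong for RTF whose encoding is not yet known at this point. The replacements then need bytes arguments to match. A runnable sketch:

```python
data = b'line one\r\nline two\rline three\n'

# normalize Windows and old-Mac line endings to Unix, entirely in bytes
data = data.replace(b'\r\n', b'\n')
data = data.replace(b'\r', b'\n')
print(data)  # -> b'line one\nline two\nline three\n'

# data.replace('\r\n', '\n') would raise TypeError: bytes args required
```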
@@ -608,12 +608,10 @@ if another paragraph_def is found, the state changes to collect_tokens.
         # when determining uniqueness for a style, ingorne these values, since
         # they don't tell us if the style is unique
         ignore_values = ['style-num', 'nest-level', 'in-table']
-        keys = self.__att_val_dict.keys()
-        keys.sort()
-        for key in keys:
-            if key in ignore_values:
+        for k, v in self.__att_val_dict.items():
+            if k in ignore_values:
                 continue
-            my_string += '%s:%s' % (key, self.__att_val_dict[key])
+            my_string += '%s:%s' % (k, v)
         if my_string in self.__style_num_strings:
             num = self.__style_num_strings.index(my_string)
             num += 1 # since indexing starts at zero, rather than 1
@@ -637,12 +635,9 @@ if another paragraph_def is found, the state changes to collect_tokens.
             the_value = self.__att_val_dict['tabs']
             # the_value = the_value[:-1]
             style_string += ('<%s>%s' % ('tabs', the_value))
-        keys = self.__att_val_dict.keys()
-        keys.sort()
-        for key in keys:
-            if key != 'name' and key !='style-num' and key != 'in-table'\
-                and key not in tabs_list:
-                style_string += ('<%s>%s' % (key, self.__att_val_dict[key]))
+        for k, v in self.__att_val_dict.items():
+            if k not in ['name', 'style-num', 'in-table'] + tabs_list:
+                style_string += ('<%s>%s' % (k, v))
         style_string += '\n'
         self.__body_style_strings.append(style_string)

@@ -690,11 +685,9 @@ if another paragraph_def is found, the state changes to collect_tokens.
             the_value = self.__att_val_dict['tabs']
             # the_value = the_value[:-1]
             self.__write_obj.write('<%s>%s' % ('tabs', the_value))
-        keys = self.__att_val_dict.keys()
-        keys.sort()
+        keys = sorted(self.__att_val_dict.keys())
         for key in keys:
-            if key != 'name' and key !='style-num' and key != 'in-table'\
-                and key not in tabs_list:
+            if key not in ['name', 'style-num', 'in-table'] + tabs_list:
                 self.__write_obj.write('<%s>%s' % (key, self.__att_val_dict[key]))
         self.__write_obj.write('\n')
         self.__write_obj.write(self.__start2_marker)
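The three rewritten loops above work around the same change: on Python 3, `dict.keys()` returns a view object with no `.sort()` method, so the old sort-in-place idiom fails; iterate `items()` directly where order is irrelevant, or use `sorted()` where it matters. Sketch:

```python
d = {'name': 'body', 'style-num': 3, 'bold': 'true'}

# keys = d.keys(); keys.sort()   # AttributeError on Python 3 (dict view)
for key in sorted(d):            # sorted() works on both versions
    print(key, d[key])

for k, v in d.items():           # order-insensitive iteration
    print(k, v)
```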
@@ -43,8 +43,8 @@ class ProcessTokens:
         self.__bug_handler = bug_handler

     def compile_expressions(self):
-        self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
-        self.__utf_exp = re.compile(r'(&.*?;)')
+        self.__num_exp = re.compile(br"([a-zA-Z]+)(.*)")
+        self.__utf_exp = re.compile(br'(&.*?;)')

     def initiate_token_dict(self):
         self.__return_code = 0
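Since the tokenizer now reads its input in binary mode, the compiled patterns must be bytes patterns too: Python 3's `re` refuses to apply a str pattern to bytes data, and vice versa. Sketch:

```python
import re

num_exp = re.compile(br'([a-zA-Z]+)(.*)')   # bytes pattern for bytes input

m = num_exp.match(b'par123')
print(m.groups())  # -> (b'par', b'123')

# re.compile(r'([a-zA-Z]+)(.*)').match(b'par123') raises:
# TypeError: cannot use a string pattern on a bytes-like object
```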
@@ -762,10 +762,10 @@ class ProcessTokens:
     def process_cw(self, token):
         """Change the value of the control word by determining what dictionary
         it belongs to"""
-        special = ['*', ':', '}', '{', '~', '_', '-', ';']
+        special = [b'*', b':', b'}', b'{', b'~', b'_', b'-', b';']
         # if token != "{" or token != "}":
         token = token[1:] # strip off leading \
-        token = token.replace(" ", "")
+        token = token.replace(b" ", b"")
         # if not token: return
         only_alpha = token.isalpha()
         num = None
@@ -784,24 +784,24 @@ class ProcessTokens:
     def process_tokens(self):
         """Main method for handling other methods. """
         line_count = 0
-        with open(self.__file, 'r') as read_obj:
+        with open(self.__file, 'rb') as read_obj:
             with open(self.__write_to, 'wb') as write_obj:
                 for line in read_obj:
-                    token = line.replace("\n","")
+                    token = line.replace(b"\n",b"")
                     line_count += 1
-                    if line_count == 1 and token != '\\{':
+                    if line_count == 1 and token != b'\\{':
                         msg = '\nInvalid RTF: document doesn\'t start with {\n'
                         raise self.__exception_handler(msg)
-                    elif line_count == 2 and token[0:4] != '\\rtf':
+                    elif line_count == 2 and token[0:4] != b'\\rtf':
                         msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
                         raise self.__exception_handler(msg)

-                    the_index = token.find('\\ ')
+                    the_index = token.find(b'\\ ')
                     if token is not None and the_index > -1:
                         msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
                             % line_count
                         raise self.__exception_handler(msg)
-                    elif token[:1] == "\\":
+                    elif token[:1] == b"\\":
                         try:
                             token.decode('us-ascii')
                         except UnicodeError as msg:
@@ -816,10 +816,10 @@ class ProcessTokens:
                     for field in fields:
                         if not field:
                             continue
-                        if field[0:1] == '&':
-                            write_obj.write('tx<ut<__________<%s\n' % field)
+                        if field[0:1] == b'&':
+                            write_obj.write(b'tx<ut<__________<%s\n' % field)
                         else:
-                            write_obj.write('tx<nu<__________<%s\n' % field)
+                            write_obj.write(b'tx<nu<__________<%s\n' % field)

         if not line_count:
             msg = '\nInvalid RTF: file appears to be empty.\n'
@@ -94,7 +94,7 @@ class Tokenize:
         uni_len = len(match_obj.group(0))
         if uni_char < 0:
             uni_char += 65536
-        uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace')
+        uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace').decode('ascii')
         self.__uc_char = self.__uc_value[-1]
         # there is only an unicode char
         if len(token)<= uni_len:
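`str.encode('ascii', 'xmlcharrefreplace')` turns non-ASCII characters into numeric XML character references, but on Python 3 the result is bytes; the added `.decode('ascii')` brings it back to str so it can be joined into the rest of the textual token stream. Sketch:

```python
uni_char = chr(0x2014)  # em dash, a stand-in for codepoint_to_chr(uni_char)

ref = uni_char.encode('ascii', 'xmlcharrefreplace')  # -> b'&#8212;' (bytes)
ref = ref.decode('ascii')                            # -> '&#8212;' (str)
print(ref)
```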
@@ -113,11 +113,11 @@ class Tokenize:
     def __sub_reg_split(self,input_file):
         input_file = self.__replace_spchar.mreplace(input_file)
         # this is for older RTF
-        input_file = self.__par_exp.sub('\n\\par \n', input_file)
-        input_file = self.__cwdigit_exp.sub("\\g<1>\n\\g<2>", input_file)
+        input_file = self.__par_exp.sub(r'\n\\par \n', input_file)
+        input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file)
         input_file = self.__cs_ast.sub(r"\g<1>", input_file)
-        input_file = self.__ms_hex_exp.sub("\\mshex0\\g<1> ", input_file)
-        input_file = self.__utf_ud.sub("\\{\\uc0 \\g<1>\\}", input_file)
+        input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
+        input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
         # remove \n in bin data
         input_file = self.__bin_exp.sub(lambda x:
             x.group().replace('\n', '') + '\n', input_file)
@@ -188,7 +188,7 @@ class Tokenize:

         # write
         with open(self.__write_to, 'wb') as write_obj:
-            write_obj.write('\n'.join(tokens))
+            write_obj.write('\n'.join(tokens).encode('utf-8'))
         # Move and copy
         copy_obj = copy.Copy(bug_handler=self.__bug_handler)
         if self.__copy: