Kovid Goyal 2019-05-20 12:20:25 +05:30
commit 4c5e9a20a4
18 changed files with 82 additions and 84 deletions

View File

@@ -14,16 +14,16 @@ ${head_content}$
${for title in meta.titles():}$
${if pos1:}$
<h1>
<a href="${tocUrl}$">${print title}$</a>
<a href="${tocUrl}$">${print(title)}$</a>
</h1>
${:else:}$
<div class="calibreMetaSubtitle">${print title}$</div>
<div class="calibreMetaSubtitle">${print(title)}$</div>
${:endif}$
${pos1=0}$
${:endfor}$
</div>
<div class="calibreMetaAuthor">
${print ', '.join(meta.creators())}$
${print(', '.join(meta.creators()))}$
</div>
</div>
@@ -33,13 +33,13 @@ ${head_content}$
${if prevLink or nextLink:}$
<div class="calibreEbNavTop">
${if prevLink:}$
<a href="${prevLink}$" class="calibreAPrev">${print _('previous page'),}$</a>
<a href="${prevLink}$" class="calibreAPrev">${print(_('previous page'))}$</a>
${:else:}$
<a href="${tocUrl}$" class="calibreAPrev">${print _('previous page'),}$</a>
<a href="${tocUrl}$" class="calibreAPrev">${print(_('previous page'))}$</a>
${:endif}$
${if nextLink:}$
<a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
<a href="${nextLink}$" class="calibreANext">${print(_('next page'))}$</a>
${:endif}$
</div>
${:endif}$
@@ -49,22 +49,22 @@ ${head_content}$
${if has_toc:}$
<div class="calibreToc">
<h2><a href="${tocUrl}$">${print _('Table of contents'),}$</a></h2>
${print toc()}$
<h2><a href="${tocUrl}$">${print( _('Table of contents'))}$</a></h2>
${print(toc())}$
</div>
${:endif}$
<div class="calibreEbNav">
${if prevLink:}$
<a href="${prevLink}$" class="calibreAPrev">${print _('previous page'),}$</a>
<a href="${prevLink}$" class="calibreAPrev">${print(_('previous page'))}$</a>
${:else:}$
<a href="${tocUrl}$" class="calibreAPrev">${print _('previous page'),}$</a>
<a href="${tocUrl}$" class="calibreAPrev">${print(_('previous page'))}$</a>
${:endif}$
<a href="${tocUrl}$" class="calibreAHome">${print _('start'),}$</a>
<a href="${tocUrl}$" class="calibreAHome">${print(_('start'))}$</a>
${if nextLink:}$
<a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
<a href="${nextLink}$" class="calibreANext">${print(_('next page'))}$</a>
${:endif}$
</div>
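The change running through this template is the one that recurs in every file below: Python 3 removed the print statement, so each ${print ...}$ expression must use the function form. A minimal sketch of the difference, outside the template syntax (demo values are illustrative):

    title = 'Demo Title'
    creators = ['First Author', 'Second Author']
    print(title)                    # py3 function form; `print title` is a SyntaxError
    print(', '.join(creators))      # the whole expression becomes the argument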

View File

@@ -6,10 +6,10 @@
<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
<link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" />
<title>${print ', '.join(meta.creators()),}$ - ${print meta.titles().next(); meta.titles().close()}$</title>
<title>${print(', '.join(meta.creators()))}$ - ${print(next(meta.titles())); print(meta.titles().close())}$</title>
${for item in meta:}$
<meta ${print 'name="DC.'+item['name']+'"',}$ ${print 'content="'+item['value']+'"',}$ />
<meta ${print('name="DC.'+item['name']+'"')}$ ${print('content="'+item['value']+'"')}$ />
${:endfor}$
<link href="${cssLink}$" type="text/css" rel="stylesheet" />
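The title line above also swaps meta.titles().next() for next(meta.titles()): Python 3 renamed the iterator method to __next__, and the next() builtin (available since Python 2.6) is the spelling that works on both. A sketch with a stand-in iterator:

    titles = iter(['Main Title', 'Subtitle'])   # stand-in for meta.titles()
    first = next(titles)       # portable: py2.6+ and py3
    # first = titles.next()    # py2-only; AttributeError on py3
    print(first)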
@@ -22,16 +22,16 @@ ${:endfor}$
${for title in meta.titles():}$
${if pos1:}$
<h1>
<a href="${tocUrl}$">${print title}$</a>
<a href="${tocUrl}$">${print(title)}$</a>
</h1>
${:else:}$
<div class="calibreMetaSubtitle">${print title}$</div>
<div class="calibreMetaSubtitle">${print(title)}$</div>
${:endif}$
${pos1=0}$
${:endfor}$
</div>
<div class="calibreMetaAuthor">
${print ', '.join(meta.creators()),}$
${print(', '.join(meta.creators()))}$
</div>
</div>
@@ -40,19 +40,19 @@ ${:endfor}$
${if has_toc:}$
<div class="calibreTocIndex">
<h2>${print _('Table of contents'),}$</h2>
<h2>${print(_('Table of contents'))}$</h2>
${toc}$
</div>
${:else:}$
<h2>${print _('No table of contents present'),}$</h2>
<div><strong><a href="${nextLink}$">${print _('begin to read'),}$</a></strong></div>
<h2>${print(_('No table of contents present'))}$</h2>
<div><strong><a href="${nextLink}$">${print(_('begin to read'))}$</a></strong></div>
${:endif}$
</div>
<div class="calibreEbNav">
${if nextLink:}$
<a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
<a href="${nextLink}$" class="calibreANext">${print(_('next page'))}$</a>
${:endif}$
</div>
</div>

View File

@@ -90,7 +90,8 @@ class FB2Input(InputFormatPlugin):
css = re.sub(r'name\s*=\s*', 'class=', css)
self.extract_embedded_content(doc)
log.debug('Converting XML to HTML...')
ss = open(P('templates/fb2.xsl'), 'rb').read()
with open(P('templates/fb2.xsl'), 'rb') as f:
ss = f.read().decode('utf-8')
ss = ss.replace("__FB_NS__", fb_ns)
if options.no_inline_fb2_toc:
log('Disabling generation of inline FB2 TOC')
@@ -124,8 +125,10 @@ class FB2Input(InputFormatPlugin):
src = img.get('src')
img.set('src', self.binary_map.get(src, src))
index = transform.tostring(result)
open(u'index.xhtml', 'wb').write(index)
open(u'inline-styles.css', 'wb').write(css)
with open(u'index.xhtml', 'wb') as f:
f.write(index.encode('utf-8'))
with open(u'inline-styles.css', 'wb') as f:
f.write(css.encode('utf-8'))
stream.seek(0)
mi = get_metadata(stream, 'fb2')
if not mi.title:
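The FB2 changes above follow one pattern: open files in binary mode inside a with block, and make the bytes/text boundary explicit with .decode()/.encode(). A minimal sketch of the same shape (file names are placeholders):

    with open('template.xsl', 'rb') as f:
        ss = f.read().decode('utf-8')    # bytes from disk -> str for .replace()

    with open('index.xhtml', 'wb') as f:
        f.write(ss.encode('utf-8'))      # str -> bytes before the binary write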

View File

@@ -79,7 +79,7 @@ class HTMLOutput(OutputFormatPlugin):
from lxml import etree
root = self.generate_toc(oeb_book, ref_url, output_dir)
return etree.tostring(root, pretty_print=True, encoding='utf-8',
return etree.tostring(root, pretty_print=True, encoding='unicode',
xml_declaration=False)
def convert(self, oeb_book, output_path, input_plugin, opts, log):
@@ -161,14 +161,14 @@ class HTMLOutput(OutputFormatPlugin):
# get & clean HTML <HEAD>-data
head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
head_content = etree.tostring(head, pretty_print=True, encoding='utf-8')
head_content = etree.tostring(head, pretty_print=True, encoding='unicode')
head_content = re.sub(r'\<\/?head.*\>', '', head_content)
head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)
head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>', head_content)
# get & clean HTML <BODY>-data
body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
ebook_content = etree.tostring(body, pretty_print=True, encoding='utf-8')
ebook_content = etree.tostring(body, pretty_print=True, encoding='unicode')
ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
ebook_content = re.sub(r'<(div|a|span)([^>]*)/>', r'<\1\2></\1>', ebook_content)
@@ -202,7 +202,7 @@ class HTMLOutput(OutputFormatPlugin):
# write html to file
with open(path, 'wb') as f:
f.write(t)
f.write(t.encode('utf-8'))
item.unload_data_from_memory(memory=path)
zfile = zipfile.ZipFile(output_path, "w")
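The etree.tostring() changes matter because the return type depends on the encoding argument: a byte encoding such as 'utf-8' yields bytes, while encoding='unicode' yields str, which is what the re.sub() cleanup downstream expects. Roughly:

    from lxml import etree

    root = etree.fromstring('<html><head/><body><p>hi</p></body></html>')
    as_bytes = etree.tostring(root, encoding='utf-8')    # bytes
    as_text = etree.tostring(root, encoding='unicode')   # str, safe for re.sub
    print(type(as_bytes), type(as_text))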

View File

@@ -41,7 +41,9 @@ class PMLInput(InputFormatPlugin):
else:
html_stream = html_path
ienc = pml_stream.encoding if pml_stream.encoding else 'cp1252'
ienc = getattr(pml_stream, 'encoding', None)
if ienc is None:
ienc = 'cp1252'
if self.options.input_encoding:
ienc = self.options.input_encoding
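The direct pml_stream.encoding access was an AttributeError waiting to happen: py3 binary streams don't expose .encoding at all, so getattr() with a None default probes safely before falling back. The same shape, reduced:

    import io

    pml_stream = io.BytesIO(b'...')    # binary stream: no .encoding attribute
    ienc = getattr(pml_stream, 'encoding', None)
    if ienc is None:
        ienc = 'cp1252'
    print(ienc)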

View File

@@ -142,7 +142,7 @@ class SNBOutput(OutputFormatPlugin):
for tocitem in oeb_book.toc:
if tocitem.href.find('#') != -1:
item = string.split(tocitem.href, '#')
item = tocitem.href.split('#')
if len(item) != 2:
log.error('Error in TOC item: %s' % tocitem)
else:
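string.split() is one of the function-style helpers the string module lost in Python 3; only the str method remains. For instance:

    href = 'chapter.html#section-2'
    item = href.split('#')    # replaces string.split(href, '#')
    print(item)               # ['chapter.html', 'section-2']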

View File

@@ -138,7 +138,7 @@ class TXTInput(InputFormatPlugin):
block_to_single_line, separate_hard_scene_breaks)
self.log = log
txt = ''
txt = b''
log.debug('Reading text from file...')
length = 0
base_dir = getcwd()
@@ -151,7 +151,7 @@ class TXTInput(InputFormatPlugin):
for x in walk('.'):
if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
with open(x, 'rb') as tf:
txt += tf.read() + '\n\n'
txt += tf.read() + b'\n\n'
else:
if getattr(stream, 'name', None):
base_dir = os.path.dirname(stream.name)
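Because the loop reads each file in 'rb' mode, the accumulator and the separator both have to be bytes; py3 raises TypeError on bytes + str. The shape of the fix:

    txt = b''                                        # was '' on py2
    for chunk in (b'first file', b'second file'):    # stand-ins for tf.read()
        txt += chunk + b'\n\n'                       # bytes separator to match
    print(txt)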

View File

@@ -584,7 +584,7 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Un wrap using punctuation
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), # noqa
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\\\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), # noqa
)
for rule in self.PREPROCESS + start_rules:
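The tiny-looking change above is about escape handling: \I is not a recognized string escape, which py3.6+ flags as a DeprecationWarning (and newer versions of re reject unknown escapes in patterns outright), so the backslash is doubled at both the string and regex levels. A sketch of the distinction:

    import re

    # source '\\\\' -> string chars \\ -> regex \\ -> one literal backslash
    pat = '[\\\\IA]'
    print(re.findall(pat, r'mix \ of I and A'))   # ['\\', 'I', 'A']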

View File

@@ -391,7 +391,7 @@ class Mobi8Reader(object):
fi = self.get_file_info(pos)
if fi.filename is None:
raise ValueError('Index entry has invalid pos: %d'%pos)
idtag = self.get_id_tag(pos).decode(self.header.codec)
idtag = self.get_id_tag(pos)
href = '%s/%s'%(fi.type, fi.filename)
else:
try:
@@ -403,7 +403,7 @@ class Mobi8Reader(object):
continue
entry['href'] = href
entry['idtag'] = idtag
entry['idtag'] = idtag.decode(self.header.codec)
for e in remove:
index_entries.remove(e)
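The two Mobi8Reader hunks move the .decode() rather than delete it: the id tag stays bytes while records are being resolved, and is decoded once with the header's codec at the point it enters the entry dict. In miniature (values are stand-ins):

    codec = 'utf-8'              # self.header.codec in the real code
    idtag = b'filepos-0042'      # hypothetical raw bytes from get_id_tag()
    entry = {}
    entry['idtag'] = idtag.decode(codec)    # decode once, at the boundary
    print(entry)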

View File

@@ -605,7 +605,7 @@ class DirContainer(object):
for root, dirs, files in os.walk(base):
for fname in files:
fname = os.path.join(root, fname)
fname = fname.replace('\\', '/')
fname = fname.replace(b'\\', b'/')
if not isinstance(fname, unicode_type):
try:
fname = fname.decode(filesystem_encoding)
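str.replace() on py3 rejects bytes arguments (and vice versa), so the separator literals must match the type of the walked filename, which here can still be bytes before the decode(filesystem_encoding) fallback runs. Sketch:

    fname = b'root\\sub\\file.html'       # bytes path, pre-decode
    fname = fname.replace(b'\\', b'/')    # bytes pattern for bytes data
    print(fname)                          # b'root/sub/file.html'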

View File

@@ -24,7 +24,7 @@ class PdbHeaderReader(object):
def identity(self):
self.stream.seek(60)
ident = self.stream.read(8)
return ident
return ident.decode('utf-8')
def section_count(self):
self.stream.seek(76)
@@ -67,8 +67,8 @@ class PdbHeaderReader(object):
class PdbHeaderBuilder(object):
def __init__(self, identity, title):
self.identity = identity.ljust(3, '\x00')[:8]
self.title = '%s\x00' % re.sub('[^-A-Za-z0-9 ]+', '_', title).ljust(31, '\x00')[:31].encode('ascii', 'replace')
self.identity = identity.ljust(3, '\x00')[:8].encode('utf-8')
self.title = b'%s\x00' % re.sub('[^-A-Za-z0-9 ]+', '_', title).ljust(31, '\x00')[:31].encode('ascii', 'replace')
def build_header(self, section_lengths, out_stream):
'''
@@ -85,4 +85,4 @@ class PdbHeaderBuilder(object):
for id, record in enumerate(section_lengths):
out_stream.write(struct.pack('>LBBBB', long_type(offset), 0, 0, 0, 0))
offset += record
out_stream.write('\x00\x00')
out_stream.write(b'\x00\x00')
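PDB headers are raw binary, so everything written must already be bytes: struct.pack() returns bytes, string fields get an explicit .encode(), and padding literals grow a b prefix. A reduced sketch of the write path (the identity value is a placeholder):

    import io, struct

    out = io.BytesIO()    # stand-in for out_stream
    identity = 'TEXt'.ljust(8, '\x00')[:8].encode('utf-8')
    out.write(identity)
    out.write(struct.pack('>LBBBB', 0, 0, 0, 0, 0))
    out.write(b'\x00\x00')    # bytes literal, not '\x00\x00'
    print(out.getvalue())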

View File

@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
from __future__ import division
'''
Writer content to palmdoc pdb file.
@@ -57,13 +58,13 @@ class Writer(FormatWriter):
txt_length = len(txt)
txt_records = []
for i in range(0, (len(txt) / MAX_RECORD_SIZE) + 1):
for i in range(0, (len(txt) // MAX_RECORD_SIZE) + 1):
txt_records.append(txt[i * MAX_RECORD_SIZE: (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE])
return txt_records, txt_length
def _header_record(self, txt_length, record_count):
record = ''
record = b''
record += struct.pack('>H', 2) # [0:2], PalmDoc compression. (1 = No compression).
record += struct.pack('>H', 0) # [2:4], Always 0.
@@ -73,4 +74,3 @@ class Writer(FormatWriter):
record += struct.pack('>L', 0) # [12-16], Current reading position, as an offset into the uncompressed text.
return record
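With from __future__ import division now at the top of the file, / is true division even on py2, so the record count needs // to stay an int that range() will accept:

    MAX_RECORD_SIZE = 4096
    txt = b'x' * 10000
    count = (len(txt) // MAX_RECORD_SIZE) + 1    # 3 (int); '/' would give 3.44...
    records = [txt[i * MAX_RECORD_SIZE:(i + 1) * MAX_RECORD_SIZE] for i in range(count)]
    print(count, [len(r) for r in records])      # 3 [4096, 4096, 1808]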

View File

@@ -174,8 +174,8 @@ class PMLMLizer(object):
return text
def prepare_text(self, text):
# Replace empty paragraphs with \c pml codes used to denote emtpy lines.
text = re.sub(unicode_type(r'(?<=</p>)\s*<p[^>]*>[\xc2\xa0\s]*</p>'), '\\c\n\\c', text)
# Replace empty paragraphs with \c pml codes used to denote empty lines.
text = re.sub(unicode_type(r'(?<=</p>)\s*<p[^>]*>[\xc2\xa0\s]*</p>'), r'\\c\n\\c', text)
return text
def clean_text(self, text):
@@ -207,7 +207,7 @@ class PMLMLizer(object):
text = re.sub('[ ]{2,}', ' ', text)
# Condense excessive \c empty line sequences.
text = re.sub('(\\c\\s*\\c\\s*){2,}', '\\c \n\\c\n', text)
text = re.sub(r'(\\c\\s*\\c\\s*){2,}', r'\\c \n\\c\n', text)
# Remove excessive newlines.
text = re.sub('\n[ ]+\n', '\n\n', text)
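Both PMLMLizer fixes concern re.sub() replacement strings, which get their own escape pass: \c is an unknown escape that py3.7+ rejects, so the literal PML \c code has to be written \\c in a raw string. For example:

    import re

    text = '<p>one</p>  <p>\xa0</p>'
    # r'\\c' -> a literal backslash + 'c' after re.sub's replacement escape pass
    text = re.sub(r'(?<=</p>)\s*<p[^>]*>[\xc2\xa0\s]*</p>', r'\\c\n\\c', text)
    print(text)    # '<p>one</p>\c' then '\c' on the next line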

View File

@@ -562,7 +562,7 @@ class ParseRtf:
def __make_temp_file(self,file):
"""Make a temporary file to parse"""
write_file="rtf_write_file"
read_obj = file if hasattr(file, 'read') else open(file,'r')
read_obj = file if hasattr(file, 'read') else open(file,'rb')
with open(write_file, 'wb') as write_obj:
for line in read_obj:
write_obj.write(line)
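Since the temp-file copy loop writes through a 'wb' handle, the source has to be opened 'rb' too; a text-mode reader would hand it str lines and py3 would raise TypeError. The pairing, reduced (file names are placeholders):

    with open('input.rtf', 'rb') as read_obj, open('rtf_write_file', 'wb') as write_obj:
        for line in read_obj:        # bytes lines from the binary reader
            write_obj.write(line)    # bytes into the binary writer: types agree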

View File

@@ -36,11 +36,11 @@ class FixLineEndings:
def fix_endings(self):
# read
with open(self.__file, 'r') as read_obj:
with open(self.__file, 'rb') as read_obj:
input_file = read_obj.read()
# calibre go from win and mac to unix
input_file = input_file.replace('\r\n', '\n')
input_file = input_file.replace('\r', '\n')
input_file = input_file.replace(b'\r\n', b'\n')
input_file = input_file.replace(b'\r', b'\n')
# remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
if self.__replace_illegals:
input_file = clean_ascii_chars(input_file)
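Reading in 'rb' here is deliberate: py3 text mode applies universal-newline translation, which would silently hide the \r\n and \r endings this class exists to normalize. Working on bytes keeps them visible:

    data = b'one\r\ntwo\rthree\n'    # raw endings survive an 'rb' read
    data = data.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
    print(data)                      # b'one\ntwo\nthree\n'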

View File

@@ -608,12 +608,10 @@ if another paragraph_def is found, the state changes to collect_tokens.
# when determining uniqueness for a style, ignore these values, since
# they don't tell us if the style is unique
ignore_values = ['style-num', 'nest-level', 'in-table']
keys = self.__att_val_dict.keys()
keys.sort()
for key in keys:
if key in ignore_values:
for k, v in self.__att_val_dict.items():
if k in ignore_values:
continue
my_string += '%s:%s' % (key, self.__att_val_dict[key])
my_string += '%s:%s' % (k, v)
if my_string in self.__style_num_strings:
num = self.__style_num_strings.index(my_string)
num += 1 # since indexing starts at zero, rather than 1
@@ -637,12 +635,9 @@
the_value = self.__att_val_dict['tabs']
# the_value = the_value[:-1]
style_string += ('<%s>%s' % ('tabs', the_value))
keys = self.__att_val_dict.keys()
keys.sort()
for key in keys:
if key != 'name' and key !='style-num' and key != 'in-table'\
and key not in tabs_list:
style_string += ('<%s>%s' % (key, self.__att_val_dict[key]))
for k, v in self.__att_val_dict.items():
if k not in ['name', 'style-num', 'in-table'] + tabs_list:
style_string += ('<%s>%s' % (k, v))
style_string += '\n'
self.__body_style_strings.append(style_string)
@@ -690,11 +685,9 @@
the_value = self.__att_val_dict['tabs']
# the_value = the_value[:-1]
self.__write_obj.write('<%s>%s' % ('tabs', the_value))
keys = self.__att_val_dict.keys()
keys.sort()
keys = sorted(self.__att_val_dict.keys())
for key in keys:
if key != 'name' and key !='style-num' and key != 'in-table'\
and key not in tabs_list:
if key not in ['name', 'style-num', 'in-table'] + tabs_list:
self.__write_obj.write('<%s>%s' % (key, self.__att_val_dict[key]))
self.__write_obj.write('\n')
self.__write_obj.write(self.__start2_marker)
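All three hunks above hit the same py3 change: dict.keys() returns a view, which has no .sort() method. Two of the loops switch to iterating items() directly; the third keeps a deterministic order via sorted(). The portable idiom:

    att_val_dict = {'name': 'Body', 'bold': '1', 'style-num': '9'}
    ignore_values = ['style-num']

    # keys = att_val_dict.keys(); keys.sort()   # py2-only: views can't sort
    for key in sorted(att_val_dict):            # works on py2 and py3
        if key in ignore_values:
            continue
        print('%s:%s' % (key, att_val_dict[key]))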

View File

@@ -43,8 +43,8 @@ class ProcessTokens:
self.__bug_handler = bug_handler
def compile_expressions(self):
self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
self.__utf_exp = re.compile(r'(&.*?;)')
self.__num_exp = re.compile(br"([a-zA-Z]+)(.*)")
self.__utf_exp = re.compile(br'(&.*?;)')
def initiate_token_dict(self):
self.__return_code = 0
@@ -762,10 +762,10 @@ class ProcessTokens:
def process_cw(self, token):
"""Change the value of the control word by determining what dictionary
it belongs to"""
special = ['*', ':', '}', '{', '~', '_', '-', ';']
special = [b'*', b':', b'}', b'{', b'~', b'_', b'-', b';']
# if token != "{" or token != "}":
token = token[1:] # strip off leading \
token = token.replace(" ", "")
token = token.replace(b" ", b"")
# if not token: return
only_alpha = token.isalpha()
num = None
@@ -784,24 +784,24 @@ class ProcessTokens:
def process_tokens(self):
"""Main method for handling other methods. """
line_count = 0
with open(self.__file, 'r') as read_obj:
with open(self.__file, 'rb') as read_obj:
with open(self.__write_to, 'wb') as write_obj:
for line in read_obj:
token = line.replace("\n","")
token = line.replace(b"\n",b"")
line_count += 1
if line_count == 1 and token != '\\{':
if line_count == 1 and token != b'\\{':
msg = '\nInvalid RTF: document doesn\'t start with {\n'
raise self.__exception_handler(msg)
elif line_count == 2 and token[0:4] != '\\rtf':
elif line_count == 2 and token[0:4] != b'\\rtf':
msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
raise self.__exception_handler(msg)
the_index = token.find('\\ ')
the_index = token.find(b'\\ ')
if token is not None and the_index > -1:
msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
% line_count
raise self.__exception_handler(msg)
elif token[:1] == "\\":
elif token[:1] == b"\\":
try:
token.decode('us-ascii')
except UnicodeError as msg:
@@ -816,10 +816,10 @@ class ProcessTokens:
for field in fields:
if not field:
continue
if field[0:1] == '&':
write_obj.write('tx<ut<__________<%s\n' % field)
if field[0:1] == b'&':
write_obj.write(b'tx<ut<__________<%s\n' % field)
else:
write_obj.write('tx<nu<__________<%s\n' % field)
write_obj.write(b'tx<nu<__________<%s\n' % field)
if not line_count:
msg = '\nInvalid RTF: file appears to be empty.\n'
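Once process_tokens() reads its file in 'rb' mode, every literal and pattern it touches has to follow: br'...' compiles a bytes regex, and comparisons like token[0:4] != b'\\rtf' stay bytes-to-bytes. A self-contained fragment of the same pattern:

    import re

    num_exp = re.compile(br'([a-zA-Z]+)(.*)')    # bytes pattern for bytes tokens
    token = b'\\par123'.replace(b' ', b'')
    print(num_exp.match(token[1:]).groups())     # (b'par', b'123')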

View File

@@ -94,7 +94,7 @@ class Tokenize:
uni_len = len(match_obj.group(0))
if uni_char < 0:
uni_char += 65536
uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace')
uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace').decode('ascii')
self.__uc_char = self.__uc_value[-1]
# there is only a unicode char
if len(token)<= uni_len:
@@ -113,11 +113,11 @@ class Tokenize:
def __sub_reg_split(self,input_file):
input_file = self.__replace_spchar.mreplace(input_file)
# this is for older RTF
input_file = self.__par_exp.sub('\n\\par \n', input_file)
input_file = self.__cwdigit_exp.sub("\\g<1>\n\\g<2>", input_file)
input_file = self.__par_exp.sub(r'\n\\par \n', input_file)
input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file)
input_file = self.__cs_ast.sub(r"\g<1>", input_file)
input_file = self.__ms_hex_exp.sub("\\mshex0\\g<1> ", input_file)
input_file = self.__utf_ud.sub("\\{\\uc0 \\g<1>\\}", input_file)
input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
# remove \n in bin data
input_file = self.__bin_exp.sub(lambda x:
x.group().replace('\n', '') + '\n', input_file)
@@ -188,7 +188,7 @@ class Tokenize:
# write
with open(self.__write_to, 'wb') as write_obj:
write_obj.write('\n'.join(tokens))
write_obj.write('\n'.join(tokens).encode('utf-8'))
# Move and copy
copy_obj = copy.Copy(bug_handler=self.__bug_handler)
if self.__copy:
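The Tokenize fix at the top of this file mirrors the output change at the bottom: encode('ascii', 'xmlcharrefreplace') produces bytes, so it is decoded straight back to keep the token stream str until the final '\n'.join(tokens).encode('utf-8') write. For one character:

    uni_char = chr(0x2014)    # an em dash, as codepoint_to_chr() would return
    as_ref = uni_char.encode('ascii', 'xmlcharrefreplace').decode('ascii')
    print(as_ref)             # &#8212;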