py3: Use unicode_literals and migrate str() in a few more files

This commit is contained in:
Kovid Goyal 2019-05-19 13:20:26 +05:30
parent 7f7c83a709
commit 052cb43ae1
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 29 additions and 30 deletions

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
from __future__ import print_function from __future__ import print_function, unicode_literals
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
''' '''
@ -48,7 +48,7 @@ class Article(object):
print('Failed to process article summary, deleting:') print('Failed to process article summary, deleting:')
print(summary.encode('utf-8')) print(summary.encode('utf-8'))
traceback.print_exc() traceback.print_exc()
summary = u'' summary = ''
self.text_summary = clean_ascii_chars(summary) self.text_summary = clean_ascii_chars(summary)
self.author = author self.author = author
self.content = content self.content = content
@ -83,7 +83,7 @@ class Article(object):
def __repr__(self): def __repr__(self):
return \ return \
(u'''\ ('''\
Title : %s Title : %s
URL : %s URL : %s
Author : %s Author : %s
@ -93,7 +93,7 @@ TOC thumb : %s
Has content : %s Has content : %s
'''%(self.title, self.url, self.author, self.summary[:20]+'...', '''%(self.title, self.url, self.author, self.summary[:20]+'...',
self.localtime.strftime('%a, %d %b, %Y %H:%M'), self.toc_thumbnail, self.localtime.strftime('%a, %d %b, %Y %H:%M'), self.toc_thumbnail,
bool(self.content))).encode('utf-8') bool(self.content)))
def __str__(self): def __str__(self):
return repr(self) return repr(self)
@ -208,7 +208,7 @@ class Feed(object):
content = [i.value for i in item.get('content', []) if i.value] content = [i.value for i in item.get('content', []) if i.value]
content = [i if isinstance(i, unicode_type) else i.decode('utf-8', 'replace') content = [i if isinstance(i, unicode_type) else i.decode('utf-8', 'replace')
for i in content] for i in content]
content = u'\n'.join(content) content = '\n'.join(content)
if not content.strip(): if not content.strip():
content = None content = None
if not link and not content: if not link and not content:
@ -286,8 +286,8 @@ class FeedCollection(list):
def __init__(self, feeds): def __init__(self, feeds):
list.__init__(self, [f for f in feeds if len(f.articles) > 0]) list.__init__(self, [f for f in feeds if len(f.articles) > 0])
found_articles = set([]) found_articles = set()
duplicates = set([]) duplicates = set()
def in_set(s, a): def in_set(s, a):
for x in s: for x in s:

View File

@ -1,4 +1,4 @@
from __future__ import with_statement from __future__ import with_statement, unicode_literals
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
''' '''
@ -54,7 +54,7 @@ class BasicNewsRecipe(Recipe):
#: A couple of lines that describe the content this recipe downloads. #: A couple of lines that describe the content this recipe downloads.
#: This will be used primarily in a GUI that presents a list of recipes. #: This will be used primarily in a GUI that presents a list of recipes.
description = u'' description = ''
#: The author of this recipe #: The author of this recipe
__author__ = __appname__ __author__ = __appname__
@ -288,7 +288,7 @@ class BasicNewsRecipe(Recipe):
#: The CSS that is used to style the templates, i.e., the navigation bars and #: The CSS that is used to style the templates, i.e., the navigation bars and
#: the Tables of Contents. Rather than overriding this variable, you should #: the Tables of Contents. Rather than overriding this variable, you should
#: use `extra_css` in your recipe to customize look and feel. #: use `extra_css` in your recipe to customize look and feel.
template_css = u''' template_css = '''
.article_date { .article_date {
color: gray; font-family: monospace; color: gray; font-family: monospace;
} }
@ -446,7 +446,7 @@ class BasicNewsRecipe(Recipe):
so, override in your subclass. so, override in your subclass.
''' '''
if not self.feeds: if not self.feeds:
raise NotImplementedError raise NotImplementedError()
if self.test: if self.test:
return self.feeds[:self.test[0]] return self.feeds[:self.test[0]]
return self.feeds return self.feeds
@ -462,7 +462,7 @@ class BasicNewsRecipe(Recipe):
return url + '?&pagewanted=print' return url + '?&pagewanted=print'
''' '''
raise NotImplementedError raise NotImplementedError()
@classmethod @classmethod
def image_url_processor(cls, baseurl, url): def image_url_processor(cls, baseurl, url):
@ -665,7 +665,7 @@ class BasicNewsRecipe(Recipe):
`url_or_raw`: Either a URL or the downloaded index page as a string `url_or_raw`: Either a URL or the downloaded index page as a string
''' '''
if re.match(r'\w+://', url_or_raw): if re.match((br'\w+://' if isinstance(url_or_raw, bytes) else r'\w+://'), url_or_raw):
# We may be called in a thread (in the skip_ad_pages method), so # We may be called in a thread (in the skip_ad_pages method), so
# clone the browser to be safe. We cannot use self.cloned_browser # clone the browser to be safe. We cannot use self.cloned_browser
# as it may or may not actually clone the browser, depending on if # as it may or may not actually clone the browser, depending on if
@ -698,9 +698,7 @@ class BasicNewsRecipe(Recipe):
if as_tree: if as_tree:
from html5_parser import parse from html5_parser import parse
return parse(_raw) return parse(_raw)
else:
return BeautifulSoup(_raw) return BeautifulSoup(_raw)
return parse(_raw, return_root=False)
def extract_readable_article(self, html, url): def extract_readable_article(self, html, url):
''' '''
@ -725,12 +723,12 @@ class BasicNewsRecipe(Recipe):
root = frag root = frag
elif frag.tag == 'body': elif frag.tag == 'body':
root = document_fromstring( root = document_fromstring(
u'<html><head><title>%s</title></head></html>' % '<html><head><title>%s</title></head></html>' %
extracted_title) extracted_title)
root.append(frag) root.append(frag)
else: else:
root = document_fromstring( root = document_fromstring(
u'<html><head><title>%s</title></head><body/></html>' % '<html><head><title>%s</title></head><body/></html>' %
extracted_title) extracted_title)
root.xpath('//body')[0].append(frag) root.xpath('//body')[0].append(frag)
@ -794,7 +792,7 @@ class BasicNewsRecipe(Recipe):
calibre show the user a simple message instead of an error, call calibre show the user a simple message instead of an error, call
:meth:`abort_recipe_processing`. :meth:`abort_recipe_processing`.
''' '''
raise NotImplementedError raise NotImplementedError()
def abort_recipe_processing(self, msg): def abort_recipe_processing(self, msg):
''' '''
@ -815,7 +813,7 @@ class BasicNewsRecipe(Recipe):
This method is typically useful for sites that try to make it difficult to This method is typically useful for sites that try to make it difficult to
access article content automatically. access article content automatically.
''' '''
raise NotImplementedError raise NotImplementedError()
def add_toc_thumbnail(self, article, src): def add_toc_thumbnail(self, article, src):
''' '''
@ -902,9 +900,9 @@ class BasicNewsRecipe(Recipe):
self.css_map = {} self.css_map = {}
web2disk_cmdline = ['web2disk', web2disk_cmdline = ['web2disk',
'--timeout', str(self.timeout), '--timeout', unicode_type(self.timeout),
'--max-recursions', str(self.recursions), '--max-recursions', unicode_type(self.recursions),
'--delay', str(self.delay), '--delay', unicode_type(self.delay),
] ]
if self.verbose: if self.verbose:
@ -1068,10 +1066,10 @@ class BasicNewsRecipe(Recipe):
src = force_unicode(src, 'utf-8') src = force_unicode(src, 'utf-8')
pos = cls.summary_length pos = cls.summary_length
fuzz = 50 fuzz = 50
si = src.find(u';', pos) si = src.find(';', pos)
if si > 0 and si-pos > fuzz: if si > 0 and si-pos > fuzz:
si = -1 si = -1
gi = src.find(u'>', pos) gi = src.find('>', pos)
if gi > 0 and gi-pos > fuzz: if gi > 0 and gi-pos > fuzz:
gi = -1 gi = -1
npos = max(si, gi) npos = max(si, gi)
@ -1081,7 +1079,7 @@ class BasicNewsRecipe(Recipe):
if len(ans) < len(src): if len(ans) < len(src):
from calibre.utils.cleantext import clean_xml_chars from calibre.utils.cleantext import clean_xml_chars
# Truncating the string could cause a dangling UTF-16 half-surrogate, which will cause lxml to barf, clean it # Truncating the string could cause a dangling UTF-16 half-surrogate, which will cause lxml to barf, clean it
ans = clean_xml_chars(ans) + u'\u2026' ans = clean_xml_chars(ans) + '\u2026'
return ans return ans
def feed2index(self, f, feeds): def feed2index(self, f, feeds):
@ -1590,7 +1588,7 @@ class BasicNewsRecipe(Recipe):
article.sub_pages = result[1][1:] article.sub_pages = result[1][1:]
self.jobs_done += 1 self.jobs_done += 1
self.report_progress(float(self.jobs_done)/len(self.jobs), self.report_progress(float(self.jobs_done)/len(self.jobs),
_(u'Article downloaded: %s')%force_unicode(article.title)) _('Article downloaded: %s')%force_unicode(article.title))
if result[2]: if result[2]:
self.partial_failures.append((request.feed.title, article.title, article.url, result[2])) self.partial_failures.append((request.feed.title, article.title, article.url, result[2]))
@ -1684,7 +1682,7 @@ class BasicNewsRecipe(Recipe):
strings.append(item['alt']) strings.append(item['alt'])
except KeyError: except KeyError:
pass pass
ans = u''.join(strings) ans = ''.join(strings)
if normalize_whitespace: if normalize_whitespace:
ans = re.sub(r'\s+', ' ', ans) ans = re.sub(r'\s+', ' ', ans)
return ans return ans

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
from __future__ import unicode_literals
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
@ -36,13 +37,13 @@ class Template(object):
if isbytestring(kwargs[key]): if isbytestring(kwargs[key]):
kwargs[key] = kwargs[key].decode('utf-8', 'replace') kwargs[key] = kwargs[key].decode('utf-8', 'replace')
if kwargs[key] is None: if kwargs[key] is None:
kwargs[key] = u'' kwargs[key] = ''
args = list(args) args = list(args)
for i in range(len(args)): for i in range(len(args)):
if isbytestring(args[i]): if isbytestring(args[i]):
args[i] = args[i].decode('utf-8', 'replace') args[i] = args[i].decode('utf-8', 'replace')
if args[i] is None: if args[i] is None:
args[i] = u'' args[i] = ''
self._generate(*args, **kwargs) self._generate(*args, **kwargs)