Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00

Get rid of has_key from all recipes

This commit is contained in:
parent 3c86c9be6f
commit 5755625d1e
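The change is mechanical: dict.has_key() (and the matching Tag.has_key() helper used with the bundled BeautifulSoup) no longer exists in Python 3, so every recipe is rewritten to use "key in dict", dict.get(key), or an attribute filter passed to find()/findAll(). A minimal sketch of the substitutions follows, using hypothetical variable names rather than code lifted from any one recipe.

# Illustrative sketch (not from any single recipe) of the has_key() replacements
# applied throughout this commit.

articles = {'News': []}
feed = 'News'

# Dictionary membership: has_key() was removed in Python 3.
# Old: if not articles.has_key(feed):
if feed not in articles:
    articles[feed] = []

# Optional lookups read more cleanly with dict.get().
item = {'title': 'Example', 'date': ''}
if not item.get('tags'):          # replaces item.has_key('tags')
    pass

# For BeautifulSoup tags, attribute filters in find()/findAll() replace
# per-tag has_key() checks (the soup variable here is hypothetical).
# Old: for a in soup('a'):
#          if a.has_key('href') and not a['href'].startswith('http'): ...
# New: for a in soup.findAll('a', href=True):
#          if not a['href'].startswith('http'): ...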
@@ -217,7 +217,7 @@ A reasonably complex real life example that exposes more of the :term:`API` of `
 description = self.tag_to_string(summary, use_alt=False)

 feed = key if key is not None else 'Uncategorized'
-if not articles.has_key(feed):
+if feed not in articles:
 articles[feed] = []
 if not 'podcasts' in url:
 articles[feed].append(
@@ -225,7 +225,7 @@ A reasonably complex real life example that exposes more of the :term:`API` of `
 description=description,
 content=''))
 ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
-ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+ans = [(key, articles[key]) for key in ans if key in articles]
 return ans

 def preprocess_html(self, soup):
@@ -60,7 +60,6 @@ class t20Minutos(BasicNewsRecipe):
 else:
 str = self.tag_to_string(item)
 item.replaceWith(str)
-for item in soup.findAll('img'):
-if not item.has_key('alt'): # noqa
-item['alt'] = 'image'
+for item in soup.findAll('img', alt=False):
+item['alt'] = 'image'
 return soup
@@ -112,7 +112,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
 for div in soup.findAll('div', attrs={'class': re.compile(self.author_reg_exp, re.IGNORECASE)}):
 div.extract()
 for auth in div.findAll('a'):
-if (auth.has_key('class') and auth['class'] == 'cm-source-image'): # noqa
+if auth.get('class') == 'cm-source-image':
 continue
 names = names + comma + auth.contents[0]
 comma = ', '
@@ -25,7 +25,7 @@ class AdvancedUserRecipe1290663986(BasicNewsRecipe):
 def generic_parse(self, soup):
 articles = []
 # soup.findAll('li', 'hentry'):
-for entry in soup.findAll(lambda tag: tag.name == 'li' and tag.has_key('class') and tag['class'].find('hentry') != -1): # noqa
+for entry in soup.findAll('li', attrs={'class': lambda x: x and 'hentry' in x}):
 article_url = entry.a['href'] + '?print=yes'
 article_title = entry.find('h3', 'entry-title')
 article_title = self.tag_to_string(article_title)
@@ -48,7 +48,7 @@ class AdvancedUserRecipe1290663986(BasicNewsRecipe):

 def plumaje_parse(self, soup):
 articles = []
-blogs_soup = soup.find(lambda tag: tag.name == 'ul' and tag.has_key('class') and tag['class'].find('bloglist-fecha') != -1) # noqa
+blogs_soup = soup.find('ul', attrs={'class': lambda x: x and 'bloglist-fecha' in x})
 for entry in blogs_soup.findAll('li'):
 article_title = entry.p
 article_url = article_title.a['href'] + '?print=yes'
@@ -69,7 +69,7 @@ class AdvancedUserRecipe1290663986(BasicNewsRecipe):
 def boca_parse(self, soup):
 articles = []
 # soup.findAll('li', 'hentry'):
-for entry in soup.findAll(lambda tag: tag.name == 'div' and tag.has_key('class') and tag['class'].find('hentry') != -1): # noqa
+for entry in soup.findAll('div', attrs={'class': lambda x: x and 'hentry' in x}):
 article_title = entry.find('h2', 'entry-title')
 article_url = article_title.a['href'] + '?print=yes'
 article_title = self.tag_to_string(article_title)
@@ -62,8 +62,8 @@ class BuenosAiresHerald(BasicNewsRecipe):
 soup = self.index_to_soup(feedurl)
 for item in soup.findAll('div', attrs={'class': 'nota_texto_seccion'}):
 description = self.tag_to_string(item.h2)
-atag = item.h2.find('a')
-if atag and atag.has_key('href'): # noqa
+atag = item.h2.find('a', href=True)
+if atag is not None:
 url = self.INDEX + atag['href']
 title = description
 date = strftime(self.timefmt)
@@ -50,8 +50,8 @@ class BenchmarkPl(BasicNewsRecipe):

 def preprocess_html(self, soup):
 self.append_page(soup, soup.body)
-for a in soup('a'):
-if a.has_key('href') and not a['href'].startswith('http'): # noqa
+for a in soup.findAll('a', href=True):
+if not a['href'].startswith('http'):
 a['href'] = self.INDEX + a['href']
 for r in soup.findAll(attrs={'class': ['comments', 'body']}):
 r.extract()
@@ -55,7 +55,6 @@ class BigHollywood(BasicNewsRecipe):
 else:
 str = self.tag_to_string(item)
 item.replaceWith(str)
-for item in soup.findAll('img'):
-if not item.has_key('alt'): # noqa
-item['alt'] = 'image'
+for item in soup.findAll('img', alt=False):
+item['alt'] = 'image'
 return soup
@@ -59,7 +59,6 @@ class Business_insider(BasicNewsRecipe):
 if item.string is not None:
 tstr = item.string
 item.replaceWith(tstr)
-for item in soup.findAll('img'):
-if not item.has_key('alt'): # noqa
-item['alt'] = 'image'
+for item in soup.findAll('img', alt=False):
+item['alt'] = 'image'
 return soup
@@ -286,7 +286,7 @@ class CanWestPaper(BasicNewsRecipe):
 else:
 description = self.tag_to_string(dtag, False)
 print("DESCRIPTION: " + description)
-if not articles.has_key(key): # noqa
+if key not in articles:
 articles[key] = []
 articles[key].append(dict(
 title=title, url=url, date='', description=description, author='', content=''))
@@ -310,5 +310,5 @@ class CanWestPaper(BasicNewsRecipe):

 for (k, url) in self.postmedia_index_pages:
 parse_web_index(k, url)
-ans = [(key, articles[key]) for key in ans if articles.has_key(key)] # noqa
+ans = [(key, articles[key]) for key in ans if key in articles]
 return ans
@@ -22,7 +22,7 @@ class CD_Action(BasicNewsRecipe):
 return getattr(self, 'cover_url', self.cover_url)

 def preprocess_html(self, soup):
-for a in soup('a'):
-if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: # noqa
+for a in soup.findAll('a', href=True):
+if 'http://' not in a['href'] and 'https://' not in a['href']:
 a['href'] = self.index + a['href']
 return soup
@@ -101,10 +101,7 @@ class CSMonitor(BasicNewsRecipe):
 else:
 str = self.tag_to_string(item)
 item.replaceWith(str)
-for item in soup.findAll('img'):
+for item in soup.findAll('img', src=True):
 if 'scorecardresearch' in item['src']:
 item.extract()
-else:
-if not item.has_key('alt'): # noqa
-item['alt'] = 'image'
 return soup
@@ -36,10 +36,9 @@ class Cinebel(BasicNewsRecipe):
 ]

 def preprocess_html(self, soup):
-for alink in soup.findAll('a'):
-if alink.has_key('href'): # noqa
+for alink in soup.findAll('a', href=True):
 tstr = "Site officiel: " + alink['href']
 alink.replaceWith(tstr)
 return soup

 def get_cover_url(self):
@@ -131,12 +131,12 @@ class CIO_Magazine(BasicNewsRecipe):

 # Esto esta copiado del NY times
 feed = key if key is not None else 'Uncategorized'
-if not articles.has_key(feed): # noqa
+if feed not in articles:
 articles[feed] = []
 if 'podcasts' not in url:
 articles[feed].append(
 dict(title=title, url=url, date=pubdate,
 description=description,
 content=''))
-feeds = [(k, articles[k]) for k in feeds if articles.has_key(k)] # noqa
+feeds = [(k, articles[k]) for k in feeds if k in articles]
 return feeds
@@ -54,7 +54,7 @@ class TheCND(BasicNewsRecipe):
 if re.search('cm', date):
 continue
 if (date is not None) and len(date) > 2:
-if not articles.has_key(date): # noqa
+if date not in articles:
 articles[date] = []
 articles[date].append(
 {'title': title, 'url': url, 'description': '', 'date': ''})
@@ -54,7 +54,7 @@ class TheCND(BasicNewsRecipe):
 continue
 self.log('\tFound article: ', title, 'at', url, '@', date)
 if (date is not None) and len(date) > 2:
-if not articles.has_key(date): # noqa
+if date not in articles:
 articles[date] = []
 articles[date].append(
 {'title': title, 'url': url, 'description': '', 'date': ''})
@@ -53,8 +53,8 @@ class General(BasicNewsRecipe):
 for item in soup.body.findAll(name=['table', 'td', 'tr', 'th', 'caption', 'thead', 'tfoot', 'tbody', 'colgroup', 'col']):
 item.name = 'div'
 for attrib in attribs:
-if item.has_key(attrib): # noqa
+item[attrib] = ''
 del item[attrib]
 return soup

 def get_cover_url(self):
@@ -46,7 +46,6 @@ class CubaDebate(BasicNewsRecipe):
 def preprocess_html(self, soup):
 for item in soup.findAll(style=True):
 del item['style']
-for item in soup.findAll('img'):
-if not item.has_key('alt'): # noqa
-item['alt'] = 'image'
+for item in soup.findAll('img', alt=False):
+item['alt'] = 'image'
 return soup
@@ -57,7 +57,6 @@ class Pagina12(BasicNewsRecipe):
 else:
 str = self.tag_to_string(item)
 item.replaceWith(str)
-for item in soup.findAll('img'):
-if not item.has_key('alt'): # noqa
-item['alt'] = 'image'
+for item in soup.findAll('img', alt=False):
+item['alt'] = 'image'
 return soup
@@ -64,7 +64,7 @@ class DeGentenaarOnline(BasicNewsRecipe):
 del item['style']
 for item in soup.findAll('span'):
 item.name = 'div'
-if item.has_key('id') and item['id'] == 'lblArticleTitle': # noqa
+if item.get('id') == 'lblArticleTitle':
 item.name = 'h3'

 soup.html['lang'] = self.lang
@@ -65,8 +65,8 @@ class DeutscheWelle_bs(BasicNewsRecipe):
 if limg:
 item.name = 'div'
 del item['href']
-if item.has_key('target'): # noqa
+item['target'] = ''
 del item['target']
 else:
 str = self.tag_to_string(item)
 item.replaceWith(str)
@@ -63,8 +63,8 @@ class DeutscheWelle_hr(BasicNewsRecipe):
 if limg:
 item.name = 'div'
 del item['href']
-if item.has_key('target'): # noqa
+item['target'] = ''
 del item['target']
 else:
 str = self.tag_to_string(item)
 item.replaceWith(str)
@@ -54,8 +54,8 @@ class DeutscheWelle_pt(BasicNewsRecipe):
 if limg:
 item.name = 'div'
 del item['href']
-if item.has_key('target'): # noqa
+item['target'] = ''
 del item['target']
 else:
 str = self.tag_to_string(item)
 item.replaceWith(str)
@@ -68,8 +68,8 @@ class DeutscheWelle_sr(BasicNewsRecipe):
 if limg:
 item.name = 'div'
 del item['href']
-if item.has_key('target'): # noqa
+item['target'] = ''
 del item['target']
 else:
 str = self.tag_to_string(item)
 item.replaceWith(str)
@@ -55,8 +55,8 @@ class DnevnikCro(BasicNewsRecipe):
 for item in soup.body.findAll(name=['table', 'td', 'tr', 'th', 'caption', 'thead', 'tfoot', 'tbody', 'colgroup', 'col']):
 item.name = 'div'
 for attrib in attribs:
-if item.has_key(attrib): # noqa
+item[attrib] = ''
 del item[attrib]

 mlang = Tag(soup, 'meta', [
 ("http-equiv", "Content-Language"), ("content", self.lang)])
@@ -37,7 +37,6 @@ class DobaNevinosti(BasicNewsRecipe):
 def preprocess_html(self, soup):
 for item in soup.findAll(style=True):
 del item['style']
-for item in soup.findAll('img'):
-if not item.has_key('alt'): # noqa
-item['alt'] = 'image'
+for item in soup.findAll('img', alt=False):
+item['alt'] = 'image'
 return soup
@@ -28,8 +28,8 @@ class Dobreprogramy_pl(BasicNewsRecipe):
 ('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]

 def preprocess_html(self, soup):
-for a in soup('a'):
-if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: # noqa
+for a in soup('a', href=True):
+if 'http://' not in a['href'] and 'https://' not in a['href']:
 a['href'] = self.index + a['href']
 for r in soup.findAll('iframe'):
 r.parent.extract()
@@ -81,8 +81,8 @@ class Dzieje(BasicNewsRecipe):
 return feeds

 def preprocess_html(self, soup):
-for a in soup('a'):
-if a.has_key('href') and not a['href'].startswith('http'): # noqa
+for a in soup('a', href=True):
+if not a['href'].startswith('http'):
 a['href'] = self.index + a['href']
 self.append_page(soup, soup.body)
 return soup
@@ -286,7 +286,7 @@ class CanWestPaper(BasicNewsRecipe):
 else:
 description = self.tag_to_string(dtag, False)
 print("DESCRIPTION: " + description)
-if not articles.has_key(key): # noqa
+if key not in articles:
 articles[key] = []
 articles[key].append(dict(
 title=title, url=url, date='', description=description, author='', content=''))
@@ -310,5 +310,5 @@ class CanWestPaper(BasicNewsRecipe):

 for (k, url) in self.postmedia_index_pages:
 parse_web_index(k, url)
-ans = [(key, articles[key]) for key in ans if articles.has_key(key)] # noqa
+ans = [(key, articles[key]) for key in ans if key in articles]
 return ans
@@ -113,11 +113,11 @@ class ElDiplo_Recipe(BasicNewsRecipe):
 if aut:
 auth = self.tag_to_string(aut, use_alt=False).strip()

-if not articles.has_key(section): # noqa
+if section not in articles: # noqa
 articles[section] = []

 articles[section].append(dict(
 title=title, author=auth, url=url, date=None, description=description, content=''))

-ans = [(s, articles[s]) for s in ans if articles.has_key(s)] # noqa
+ans = [(s, articles[s]) for s in ans if s in articles]
 return ans
@@ -53,7 +53,6 @@ class ElClubDelEbook(BasicNewsRecipe):
 else:
 str = self.tag_to_string(item)
 item.replaceWith(str)
-for item in soup.findAll('img'):
-if not item.has_key('alt'): # noqa
-item['alt'] = 'image'
+for item in soup.findAll('img', alt=False):
+item['alt'] = 'image'
 return soup
@@ -35,17 +35,16 @@ class ElPaisSemanal(BasicNewsRecipe):
 def parse_index(self):
 articles = []
 soup = self.index_to_soup(self.index)
-for item in soup.findAll('a', attrs={'class': ['g19i003', 'g17r003', 'g17i003']}):
+for item in soup.findAll('a', attrs={'class': ['g19i003', 'g17r003', 'g17i003']}, href=True):
 description = ''
 title_prefix = ''
 feed_link = item
-if item.has_key('href'): # noqa
 url = 'http://www.elpais.com' + item['href'].rpartition('/')[0]
 title = title_prefix + self.tag_to_string(feed_link)
 date = strftime(self.timefmt)
 articles.append({
 'title': title, 'date': date, 'url': url, 'description': description
 })
 return [(soup.head.title.string, articles)]

 def print_version(self, url):
@@ -31,7 +31,7 @@ class ElUniversalImpresaRecipe(BasicNewsRecipe):

 table = soup.find('table', attrs={'width': '500'})
 articles = []
-for td in table.findAll(lambda tag: tag.name == 'td' and tag.has_key('class') and tag['class'] == 'arnegro12'): # noqa
+for td in table.findAll('td', attrs={'class': 'arnegro12'}):
 a = td.a
 a.extract()
 title = self.tag_to_string(a)
@@ -79,8 +79,8 @@ class ElUniversalImpresaRecipe(BasicNewsRecipe):
 tag = soup.find('font', attrs={'color': '#0F046A'})
 if tag:
 for attr in ['color', 'face', 'helvetica,', 'sans-serif', 'size']:
-if tag.has_key(attr): # noqa
+tag[attr] = ''
 del tag[attr]
 tag.name = 'h1'

 return soup
@@ -62,8 +62,8 @@ class ESPN(BasicNewsRecipe):
 ]

 def preprocess_html(self, soup):
-for div in soup.findAll('div'):
-if div.has_key('style') and 'px' in div['style']: # noqa
+for div in soup.findAll('div', style=True):
+if 'px' in div['style']:
 div['style'] = ''

 return soup
@@ -88,7 +88,7 @@ class Estadao(BasicNewsRecipe):
 def postprocess_html(self, soup, first):
 # process all the images. assumes that the new html has the correct
 # path
-for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')): # noqa
+for tag in soup.findAll('img', src=True):
 iurl = tag['src']
 img = Image()
 img.open(iurl)
@@ -57,7 +57,6 @@ class FinancialSense(BasicNewsRecipe):
 else:
 str = self.tag_to_string(item)
 item.replaceWith(str)
-for item in soup.findAll('img'):
-if not item.has_key('alt'): # noqa
-item['alt'] = 'image'
+for item in soup.findAll('img', alt=False):
+item['alt'] = 'image'
 return soup
@@ -84,8 +84,8 @@ class General(BasicNewsRecipe):
 for item in soup.body.findAll(name=['table', 'td', 'tr', 'th', 'caption', 'thead', 'tfoot', 'tbody', 'colgroup', 'col']):
 item.name = 'div'
 for attrib in attribs:
-if item.has_key(attrib): # noqa
+item[attrib] = ''
 del item[attrib]
 return soup

 def get_cover_url(self):
@@ -27,7 +27,7 @@ class Gameplay_pl(BasicNewsRecipe):
 return url

 def preprocess_html(self, soup):
-for a in soup('a'):
-if a.has_key('href') and '../' in a['href']: # noqa
+for a in soup('a', href=True):
+if '../' in a['href']:
 a['href'] = self.index + a['href'][2:]
 return soup
@@ -56,7 +56,7 @@ class AdvancedUserRecipe1307556816(BasicNewsRecipe):
 extra_css = 'body, h3, p, div, span{margin:0px; padding:0px} h3.entry-header{font-size: 0.8em} div.entry-body{font-size: 0.7em}'

 def postprocess_html(self, soup, first):
-for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')): # noqa
+for tag in soup.findAll('img', src=True):
 iurl = tag['src']
 img = Image()
 img.open(iurl)
@@ -58,8 +58,8 @@ class Gildia(BasicNewsRecipe):

 def preprocess_html(self, soup):
 title = soup.title.renderContents().lower()
-for a in soup('a'):
-if a.has_key('href') and not a['href'].startswith('http'): # noqa
+for a in soup('a', href=True):
+if not a['href'].startswith('http'):
 if '/gry/' in a['href']:
 a['href'] = 'http://www.gry.gildia.pl' + a['href']
 elif u'książk' in title or u'komiks' in title:
@@ -43,8 +43,8 @@ class Gram_pl(BasicNewsRecipe):
 tag.p.img.extract()
 tag.p.insert(len(tag.p.contents) - 2,
 BeautifulSoup('<h2>Ocena: {0}</h2>'.format(rate)).h2)
-for a in soup('a'):
-if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: # noqa
+for a in soup.findAll('a', href=True):
+if 'http://' not in a['href'] and 'https://' not in a['href']: # noqa
 a['href'] = self.index + a['href']
 tag = soup.find(name='span', attrs={'class': 'platforma'})
 if tag:
@@ -47,7 +47,7 @@ class in4(BasicNewsRecipe):

 def preprocess_html(self, soup):
 self.append_page(soup, soup.body)
-for a in soup('a'):
-if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: # noqa
+for a in soup.findAll('a', href=True):
+if 'http://' not in a['href'] and 'https://' not in a['href']:
 a['href'] = self.index + a['href']
 return soup
@@ -21,7 +21,7 @@ class INFRA(BasicNewsRecipe):
 def preprocess_html(self, soup):
 for item in soup.findAll(style=True):
 del item['style']
-for a in soup('a'):
-if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: # noqa
+for a in soup.findAll('a', href=True):
+if 'http://' not in a['href'] and 'https://' not in a['href']:
 a['href'] = self.index + a['href']
 return soup
@@ -65,8 +65,8 @@ class Jutarnji(BasicNewsRecipe):
 for item in soup.body.findAll(name=['table', 'td', 'tr', 'th', 'caption', 'thead', 'tfoot', 'tbody', 'colgroup', 'col']):
 item.name = 'div'
 for attrib in attribs:
-if item.has_key(attrib): # noqa
+item[attrib] = ''
 del item[attrib]

 mlang = Tag(soup, 'meta', [
 ("http-equiv", "Content-Language"), ("content", self.lang)])
@@ -47,7 +47,7 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
 soup = self.index_to_soup(feedurl)
 for item in soup.findAll('table', attrs={'class': ['item', 'item new']}):
 atag = item.a
-if atag and atag.has_key('href'): # noqa
+if atag and atag.get('href') is not None:
 url = atag['href']
 articles.append({
 'url': url
@@ -61,17 +61,18 @@ class LeMondeDiplomatiqueEn(BasicNewsRecipe):
 cnt = soup.find('div', attrs={'class': 'som_num'})
 for item in cnt.findAll('li'):
 description = ''
-feed_link = item.find('a')
+feed_link = item.find('a', href=True)
+if feed_link is None:
+continue
 desc = item.find('div', attrs={'class': 'chapo'})
 if desc:
 description = desc.string
-if feed_link and feed_link.has_key('href'): # noqa
 url = self.PREFIX + feed_link['href'].partition('/../')[2]
 title = self.tag_to_string(feed_link)
 date = strftime(self.timefmt)
 articles.append({
 'title': title, 'date': date, 'url': url, 'description': description
 })
 return [(self.title, articles)]

 def get_cover_url(self):
@@ -60,14 +60,9 @@ class LentaRURecipe(BasicNewsRecipe):
 if not feedData:
 raise NotImplementedError
 self.log("parse_index: Feed loaded successfully.")
-if feedData.feed.has_key('title'): # noqa
-self.title = feedData.feed.title
-self.log("parse_index: Title updated to: ", self.title)
-if feedData.feed.has_key('image'): # noqa
-self.log("HAS IMAGE!!!!")

 def get_virtual_feed_articles(feed):
-if feeds.has_key(feed): # noqa
+if feed in feeds:
 return feeds[feed][1]
 self.log("Adding new feed: ", feed)
 articles = []
@@ -84,7 +79,7 @@ class LentaRURecipe(BasicNewsRecipe):
 continue
 article = {'title': title, 'url': link, 'description': item.get(
 'description', ''), 'date': item.get('date', ''), 'content': ''}
-if not item.has_key('tags'): # noqa
+if not item.get('tags'):
 get_virtual_feed_articles('_default').append(article)
 continue
 for tag in item.tags:
@@ -101,7 +96,7 @@ class LentaRURecipe(BasicNewsRecipe):
 # Select sorted feeds first of all
 result = []
 for feedName in self.sortOrder:
-if (not feeds.has_key(feedName)): # noqa
+if (not feeds.get(feedName)):
 continue
 result.append(feeds[feedName])
 del feeds[feedName]
@@ -45,7 +45,7 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
 soup = self.index_to_soup(self.INDEX)
 cover_item = soup.find('p', attrs={'class': 'cover'})
 dates = str(soup.find('span', attrs={'class': 'coverdate'}))
-newdates = re.sub('\<.*\>', '', re.split('<br />', dates)[1])
+newdates = re.sub(r'\<.*\>', '', re.split('<br />', dates)[1])
 self.timefmt = ' [%s]' % newdates
 lrbtitle = self.title
 if cover_item:
@@ -58,13 +58,13 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
 description = u''
 title_prefix = u''
 feed_link = item
-if feed_link.has_key('href'): # noqa
+if feed_link.get('href'):
 url = self.INDEX + feed_link['href']
 title_link = re.split('<br />', str(feed_link))
 if len(title_link) > 1:
 title = title_prefix + \
 re.sub(
-'\<.*\>', '', title_link[0]) + ' - ' + re.sub('\<.*\>', '', title_link[1])
+r'\<.*\>', '', title_link[0]) + ' - ' + re.sub(r'\<.*\>', '', title_link[1])
 else:
 title = title_prefix + self.tag_to_string(feed_link)
 desc = item.findNext('li')
@@ -133,7 +133,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
 class MerryPreProcess():

 def optimizePicture(self, soup):
-for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')): # noqa
+for tag in soup.findAll('img', src=True):
 try:
 iurl = tag['src']
 img = Image()
@@ -42,11 +42,10 @@ class MoneyControlRecipe(BasicNewsRecipe):
 h1.append(self.tag_to_string(headline))
 freshSoup.body.append(h1)

-for p in soup.findAll('p'):
-if p.has_key('class'):
+for p in soup.findAll('p', attrs={'class': true}):
 if p['class'] == 'MsoNormal':
 # We have some weird pagebreak marker here; it will not find all of them however
 continue

 para = Tag(freshSoup, 'p')
 # Convert to string; this will loose all formatting but also all illegal markup
@@ -286,7 +286,7 @@ class CanWestPaper(BasicNewsRecipe):
 else:
 description = self.tag_to_string(dtag, False)
 print("DESCRIPTION: " + description)
-if not articles.has_key(key): # noqa
+if key not in articles:
 articles[key] = []
 articles[key].append(dict(
 title=title, url=url, date='', description=description, author='', content=''))
@@ -310,5 +310,5 @@ class CanWestPaper(BasicNewsRecipe):

 for (k, url) in self.postmedia_index_pages:
 parse_web_index(k, url)
-ans = [(key, articles[key]) for key in ans if articles.has_key(key)] # noqa
+ans = [(key, articles[key]) for key in ans if key in articles]
 return ans
@@ -38,7 +38,7 @@ class naszdziennik(BasicNewsRecipe):
 section = self.tag_to_string(section)
 # sprawdzamy czy w słowniku artykułów istnieje klucz dotyczący sekcji
 # jeśli nie istnieje to :
-if not articles.has_key(section): # noqa
+if section not in articles:
 # do listy sekcji dodajemy nową sekcje
 sections.append(section)
 # deklarujemy nową sekcje w słowniku artykułów przypisując jej
@@ -77,7 +77,7 @@ class NrcNextRecipe(BasicNewsRecipe):
 # Add the article to a temporary list
 article = {'title': completeTitle, 'date': u'',
 'url': href, 'description': '<p> </p>'}
-if not articles.has_key(index): # noqa
+if index not in articles:
 articles[index] = []
 articles[index].append(article)

@@ -90,7 +90,7 @@ class NrcNextRecipe(BasicNewsRecipe):
 indices, {u'columnisten': 1, u'koken': 3, u'geld & werk': 2, u'vandaag': 0})
 # Apply this sort order to the actual list of feeds and articles
 answer = [(key, articles[key])
-for key in indices if articles.has_key(key)] # noqa
+for key in indices if key in articles]

 return answer

@@ -201,8 +201,7 @@ class Newsweek(BasicNewsRecipe):
 self.DATE = matches.group(0)

 # cover
-img = main_section.find(lambda tag: tag.name == 'img' and tag.has_key( # noqa
-'alt') and tag.has_key('title'))
+img = main_section.find('img', src=True, alt=True, title=True)
 self.cover_url = img['src']
 feeds = []
 articles = {}
@@ -233,7 +232,7 @@ class Newsweek(BasicNewsRecipe):
 if article is None:
 continue

-if articles.has_key(section): # noqa
+if section in articles:
 articles[section].append(article)
 else:
 articles[section] = [article]
@@ -82,7 +82,6 @@ class Novosti(BasicNewsRecipe):
 else:
 str = self.tag_to_string(item)
 item.replaceWith(str)
-for item in soup.findAll('img'):
-if not item.has_key('alt'): # noqa
-item['alt'] = 'image'
+for item in soup.findAll('img', alt=False):
+item['alt'] = 'image'
 return soup
@@ -55,8 +55,8 @@ class NotSafeForWork(BasicNewsRecipe):
 def get_feeds(self):
 self.feeds = []
 soup = self.index_to_soup(self.SETTINGS)
-for item in soup.findAll('input', attrs={'type': 'text'}):
-if item.has_key('value') and item['value'].startswith('https://www.nsfwcorp.com/feed/'): # noqa
+for item in soup.findAll('input', value=True, attrs={'type': 'text'}):
+if item['value'].startswith('https://www.nsfwcorp.com/feed/'):
 self.feeds.append(item['value'])
 return self.feeds
 return self.feeds
@@ -109,7 +109,6 @@ class Nspm(BasicNewsRecipe):
 else:
 str = self.tag_to_string(item)
 item.replaceWith(str)
-for item in soup.findAll('img'):
-if not item.has_key('alt'): # noqa
-item['alt'] = 'image'
+for item in soup.findAll('img', alt=False):
+item['alt'] = 'image'
 return soup
@@ -286,7 +286,7 @@ class CanWestPaper(BasicNewsRecipe):
 else:
 description = self.tag_to_string(dtag, False)
 print("DESCRIPTION: " + description)
-if not articles.has_key(key): # noqa
+if key not in articles:
 articles[key] = []
 articles[key].append(dict(
 title=title, url=url, date='', description=description, author='', content=''))
@@ -310,5 +310,5 @@ class CanWestPaper(BasicNewsRecipe):

 for (k, url) in self.postmedia_index_pages:
 parse_web_index(k, url)
-ans = [(key, articles[key]) for key in ans if articles.has_key(key)] # noqa
+ans = [(key, articles[key]) for key in ans if key in articles]
 return ans
@@ -86,8 +86,8 @@ class Pobjeda(BasicNewsRecipe):
 soup = self.index_to_soup(feedurl)
 for item in soup.findAll('div', attrs={'class': 'vijest'}):
 description = self.tag_to_string(item.h2)
-atag = item.h1.find('a')
-if atag and atag.has_key('href'): # noqa
+atag = item.h1.find('a', href=True)
+if atag is not None:
 url = self.INDEX + '/' + atag['href']
 title = self.tag_to_string(atag)
 date = strftime(self.timefmt)
@@ -58,10 +58,9 @@ class Politika(BasicNewsRecipe):
 del item['style']
 for item in soup.findAll('a', attrs={'class': 'category'}):
 item.name = 'span'
-if item.has_key('href'): # noqa
-del item['href']
-if item.has_key('title'): # noqa
-del item['title']
+item['href'] = item['title'] = ''
+del item['href']
+del item['title']
 return soup

 def get_cover_url(self):
@@ -51,7 +51,7 @@ class Polityka(BasicNewsRecipe):
 'http://archiwum.polityka.pl' + div.a['href'],)
 section = self.tag_to_string(article_page.find(
 'h2', attrs={'class': 'box_nag'})).split('/')[0].lstrip().rstrip()
-if not articles.has_key(section): # noqa
+if section not in articles:
 articles[section] = []
 articles[section].append({
 'title': self.tag_to_string(div.a),
@@ -190,7 +190,7 @@ class CanWestPaper(BasicNewsRecipe):
 # Find each instance of class="sectiontitle", class="featurecontent"
 for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
 if divtag['class'].startswith('section_title'):
 # div contains section title
 if not divtag.h3:
 continue
 key = self.tag_to_string(divtag.h3, False)
@@ -215,11 +215,11 @@ class CanWestPaper(BasicNewsRecipe):
 autag = divtag.find('h4')
 if autag:
 author = self.tag_to_string(autag, False)
-if not articles.has_key(key): # noqa
+if key not in articles:
 articles[key] = []
 articles[key].append(dict(title=title, url=url, date=pubdate,
 description=description, author=author, content=''))

 ans = [(keyl, articles[keyl])
-for keyl in ans if articles.has_key(keyl)] # noqa
+for keyl in ans if keyl in articles]
 return ans
@@ -46,8 +46,8 @@ class Republika(BasicNewsRecipe):
 for item in soup.body.findAll(name=['table', 'td', 'tr', 'th', 'caption', 'thead', 'tfoot', 'tbody', 'colgroup', 'col']):
 item.name = 'div'
 for attrib in attribs:
-if item.has_key(attrib): # noqa
+item[attrib] = ''
 del item[attrib]
 return soup

 def parse_index(self):
@@ -190,7 +190,7 @@ class CanWestPaper(BasicNewsRecipe):
 # Find each instance of class="sectiontitle", class="featurecontent"
 for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
 if divtag['class'].startswith('section_title'):
 # div contains section title
 if not divtag.h3:
 continue
 key = self.tag_to_string(divtag.h3, False)
@@ -215,10 +215,10 @@ class CanWestPaper(BasicNewsRecipe):
 autag = divtag.find('h4')
 if autag:
 author = self.tag_to_string(autag, False)
-if not articles.has_key(key): # noqa
+if key not in articles:
 articles[key] = []
 articles[key].append(dict(title=title, url=url, date=pubdate,
 description=description, author=author, content=''))

-ans = [(k, articles[k]) for k in ans if articles.has_key(k)] # noqa
+ans = [(k, articles[k]) for k in ans if k in articles]
 return ans
@@ -45,11 +45,11 @@ class SCPrintMagazine(BasicNewsRecipe):
 if arttitlet is not None:
 mylink = arttitlet.find('a')
 if mylink is not None:
-if mylink.has_key('title'): # noqa
+if mylink.get('title'):
 arttitle = mylink['title']
 else:
 arttitle = 'unknown'
-if mylink.has_key('href'): # noqa
+if mylink.get('href'):
 artlink = mylink['href']
 artlink = artlink.replace(
 "/article", "/printarticle")
@@ -81,7 +81,7 @@ class SZmobil(BasicNewsRecipe):
 if itt['href'].startswith('article.php?id='):
 article_url = itt['href']
 article_id = int(
-re.search("id=(\d*)&etag=", itt['href']).group(1))
+re.search(r"id=(\d*)&etag=", itt['href']).group(1))

 # first check if link is a special article in section
 # "Meinungsseite"
@@ -104,7 +104,7 @@ class SZmobil(BasicNewsRecipe):
 # just another link ("mehr") to an article
 continue

-if itt.has_key('id'): # noqa
+if itt.get('id') is not None:
 shorttitles[article_id] = article_name
 else:
 articles.append(
@@ -118,7 +118,7 @@ class SZmobil(BasicNewsRecipe):
 # pubdate = strftime('')
 pubdate = strftime('[%a, %d %b]')
 description = ''
-if shorttitles.has_key(article_id): # noqa
+if shorttitles.get(article_id) is not None:
 description = shorttitles[article_id]
 # we do not want the flag ("Impressum")
 if "HERAUSGEGEBEN VOM" in description:
@ -55,7 +55,6 @@ class TechCrunch(BasicNewsRecipe):
|
|||||||
else:
|
else:
|
||||||
str = self.tag_to_string(item)
|
str = self.tag_to_string(item)
|
||||||
item.replaceWith(str)
|
item.replaceWith(str)
|
||||||
for item in soup.findAll('img'):
|
for item in soup.findAll('img', alt=False):
|
||||||
if not item.has_key('alt'): # noqa
|
item['alt'] = 'image'
|
||||||
item['alt'] = 'image'
|
|
||||||
return soup
|
return soup
|
||||||
|
@ -45,7 +45,7 @@ class TheAge(BasicNewsRecipe):
|
|||||||
|
|
||||||
# Make sure to skip: <a href="/">TheAge</a>
|
# Make sure to skip: <a href="/">TheAge</a>
|
||||||
|
|
||||||
elif section and tag.has_key('href') and len(tag['href'].strip()) > 1: # noqa
|
elif section and tag.get('href'):
|
||||||
url = tag['href'].strip()
|
url = tag['href'].strip()
|
||||||
if url.startswith('/'):
|
if url.startswith('/'):
|
||||||
url = 'http://www.theage.com.au' + url
|
url = 'http://www.theage.com.au' + url
|
||||||
@ -105,7 +105,7 @@ class TheAge(BasicNewsRecipe):
|
|||||||
|
|
||||||
# Filter out what's left of the text-mode navigation stuff
|
# Filter out what's left of the text-mode navigation stuff
|
||||||
|
|
||||||
if re.match('((\s)|(\ \;))*\[[\|\s*]*\]((\s)|(\ \;))*$', contents):
|
if re.match(r'((\s)|(\ \;))*\[[\|\s*]*\]((\s)|(\ \;))*$', contents):
|
||||||
p.extract()
|
p.extract()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -47,6 +47,6 @@ class Edgesingapore(BasicNewsRecipe):
|
|||||||
for item in soup.body.findAll(name=['table', 'td', 'tr', 'th', 'caption', 'thead', 'tfoot', 'tbody', 'colgroup', 'col']):
|
for item in soup.body.findAll(name=['table', 'td', 'tr', 'th', 'caption', 'thead', 'tfoot', 'tbody', 'colgroup', 'col']):
|
||||||
item.name = 'div'
|
item.name = 'div'
|
||||||
for attrib in attribs:
|
for attrib in attribs:
|
||||||
if item.has_key(attrib): # noqa
|
item[attrib] = ''
|
||||||
del item[attrib]
|
del item[attrib]
|
||||||
return self.adeify_images(soup)
|
return self.adeify_images(soup)
|
||||||
|
@ -78,12 +78,12 @@ class TheOnion(BasicNewsRecipe):
|
|||||||
if limg:
|
if limg:
|
||||||
item.name = 'div'
|
item.name = 'div'
|
||||||
item.attrs = []
|
item.attrs = []
|
||||||
if not limg.has_key('alt'): # noqa
|
if not limg.get('alt'):
|
||||||
limg['alt'] = 'image'
|
limg['alt'] = 'image'
|
||||||
else:
|
else:
|
||||||
str = self.tag_to_string(item)
|
str = self.tag_to_string(item)
|
||||||
item.replaceWith(str)
|
item.replaceWith(str)
|
||||||
for item in soup.findAll('img'):
|
for item in soup.findAll('img'):
|
||||||
if item.has_key('data-src'): # noqa
|
if item.get('data-src'):
|
||||||
item['src'] = item['data-src']
|
item['src'] = item['data-src']
|
||||||
return soup
|
return soup
|
||||||
|
@ -57,8 +57,8 @@ class Tomshardware(BasicNewsRecipe):
|
|||||||
def cleanup_image_tags(self, soup):
|
def cleanup_image_tags(self, soup):
|
||||||
for item in soup.findAll('img'):
|
for item in soup.findAll('img'):
|
||||||
for attrib in ['height', 'width', 'border', 'align']:
|
for attrib in ['height', 'width', 'border', 'align']:
|
||||||
if item.has_key(attrib): # noqa
|
item[attrib] = ''
|
||||||
del item[attrib]
|
del item[attrib]
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
|
@ -45,7 +45,6 @@ class Twitchfilm(BasicNewsRecipe):
|
|||||||
else:
|
else:
|
||||||
str = self.tag_to_string(item)
|
str = self.tag_to_string(item)
|
||||||
item.replaceWith(str)
|
item.replaceWith(str)
|
||||||
for item in soup.findAll('img'):
|
for item in soup.findAll('img', alt=False):
|
||||||
if not item.has_key('alt'): # noqa
|
item['alt'] = 'image'
|
||||||
item['alt'] = 'image'
|
|
||||||
return soup
|
return soup
|
||||||
|
@ -299,7 +299,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
else:
|
else:
|
||||||
description = self.tag_to_string(dtag, False)
|
description = self.tag_to_string(dtag, False)
|
||||||
print("DESCRIPTION: " + description)
|
print("DESCRIPTION: " + description)
|
||||||
if not articles.has_key(key): # noqa
|
if key not in articles:
|
||||||
articles[key] = []
|
articles[key] = []
|
||||||
articles[key].append(dict(
|
articles[key].append(dict(
|
||||||
title=title, url=url, date='', description=description, author='', content=''))
|
title=title, url=url, date='', description=description, author='', content=''))
|
||||||
@ -323,5 +323,5 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
|
|
||||||
for (k, url) in self.postmedia_index_pages:
|
for (k, url) in self.postmedia_index_pages:
|
||||||
parse_web_index(k, url)
|
parse_web_index(k, url)
|
||||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)] # noqa
|
ans = [(key, articles[key]) for key in ans if key in articles]
|
||||||
return ans
|
return ans
|
||||||
|
@ -287,7 +287,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
else:
|
else:
|
||||||
description = self.tag_to_string(dtag, False)
|
description = self.tag_to_string(dtag, False)
|
||||||
print("DESCRIPTION: " + description)
|
print("DESCRIPTION: " + description)
|
||||||
if not articles.has_key(key): # noqa
|
if key not in articles:
|
||||||
articles[key] = []
|
articles[key] = []
|
||||||
articles[key].append(dict(
|
articles[key].append(dict(
|
||||||
title=title, url=url, date='', description=description, author='', content=''))
|
title=title, url=url, date='', description=description, author='', content=''))
|
||||||
@ -311,5 +311,5 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
|
|
||||||
for (k, url) in self.postmedia_index_pages:
|
for (k, url) in self.postmedia_index_pages:
|
||||||
parse_web_index(k, url)
|
parse_web_index(k, url)
|
||||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)] # noqa
|
ans = [(key, articles[key]) for key in ans if key in articles]
|
||||||
return ans
|
return ans
|
||||||
|
@ -62,7 +62,6 @@ class Variety(BasicNewsRecipe):
|
|||||||
else:
|
else:
|
||||||
str = self.tag_to_string(item)
|
str = self.tag_to_string(item)
|
||||||
item.replaceWith(str)
|
item.replaceWith(str)
|
||||||
for item in soup.findAll('img'):
|
for item in soup.findAll('img', alt=False):
|
||||||
if not item.has_key('alt'): # noqa
|
item['alt'] = 'image'
|
||||||
item['alt'] = 'image'
|
|
||||||
return soup
|
return soup
|
||||||
|
@ -62,16 +62,22 @@ class VedomostiRecipe(BasicNewsRecipe):
|
|||||||
if not feedData:
|
if not feedData:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
self.log("parse_index: Feed loaded successfully.")
|
self.log("parse_index: Feed loaded successfully.")
|
||||||
if feedData.feed.has_key('title'): # noqa
|
try:
|
||||||
self.title = feedData.feed.title
|
if feedData.feed.title:
|
||||||
self.log("parse_index: Title updated to: ", self.title)
|
self.title = feedData.feed.title
|
||||||
if feedData.feed.has_key('description'): # noqa
|
self.log("parse_index: Title updated to: ", self.title)
|
||||||
self.description = feedData.feed.description
|
except Exception:
|
||||||
self.log("parse_index: Description updated to: ",
|
pass
|
||||||
self.description)
|
try:
|
||||||
|
if feedData.feed.description:
|
||||||
|
self.description = feedData.feed.description
|
||||||
|
self.log("parse_index: Description updated to: ",
|
||||||
|
self.description)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
def get_virtual_feed_articles(feed):
|
def get_virtual_feed_articles(feed):
|
||||||
if feeds.has_key(feed): # noqa
|
if feed in feeds:
|
||||||
return feeds[feed][1]
|
return feeds[feed][1]
|
||||||
self.log("Adding new feed: ", feed)
|
self.log("Adding new feed: ", feed)
|
||||||
articles = []
|
articles = []
|
||||||
@ -88,7 +94,7 @@ class VedomostiRecipe(BasicNewsRecipe):
|
|||||||
continue
|
continue
|
||||||
article = {'title': title, 'url': link, 'description': item.get(
|
article = {'title': title, 'url': link, 'description': item.get(
|
||||||
'description', ''), 'date': item.get('date', ''), 'content': ''}
|
'description', ''), 'date': item.get('date', ''), 'content': ''}
|
||||||
if not item.has_key('tags'): # noqa
|
if not item.get('tags'):
|
||||||
get_virtual_feed_articles('_default').append(article)
|
get_virtual_feed_articles('_default').append(article)
|
||||||
continue
|
continue
|
||||||
for tag in item.tags:
|
for tag in item.tags:
|
||||||
@ -105,7 +111,7 @@ class VedomostiRecipe(BasicNewsRecipe):
|
|||||||
# Select sorted feeds first of all
|
# Select sorted feeds first of all
|
||||||
result = []
|
result = []
|
||||||
for feedName in self.sortOrder:
|
for feedName in self.sortOrder:
|
||||||
if (not feeds.has_key(feedName)): # noqa
|
if feedName not in feeds:
|
||||||
continue
|
continue
|
||||||
result.append(feeds[feedName])
|
result.append(feeds[feedName])
|
||||||
del feeds[feedName]
|
del feeds[feedName]
|
||||||
@ -142,9 +148,9 @@ class VedomostiRecipe(BasicNewsRecipe):
|
|||||||
imgDiv = Tag(soup, 'div')
|
imgDiv = Tag(soup, 'div')
|
||||||
imgDiv['class'] = 'article_img'
|
imgDiv['class'] = 'article_img'
|
||||||
|
|
||||||
if img.has_key('width'): # noqa
|
if img.get('width'):
|
||||||
del(img['width'])
|
del(img['width'])
|
||||||
if img.has_key('height'): # noqa
|
if img.get('height'):
|
||||||
del(img['height'])
|
del(img['height'])
|
||||||
|
|
||||||
# find description
|
# find description
|
||||||
@ -180,11 +186,9 @@ class VedomostiRecipe(BasicNewsRecipe):
|
|||||||
contents.insert(len(contents.contents), authorsP)
|
contents.insert(len(contents.contents), authorsP)
|
||||||
|
|
||||||
# Fix urls that use relative path
|
# Fix urls that use relative path
|
||||||
urls = contents.findAll('a')
|
urls = contents.findAll('a', href=True)
|
||||||
if urls:
|
if urls:
|
||||||
for url in urls:
|
for url in urls:
|
||||||
if not url.has_key('href'): # noqa
|
|
||||||
continue
|
|
||||||
if '/' == url['href'][0]:
|
if '/' == url['href'][0]:
|
||||||
url['href'] = self.base_url + url['href']
|
url['href'] = self.base_url + url['href']
|
||||||
|
|
||||||
|
@ -94,8 +94,8 @@ class WaPoCartoonsRecipe(BasicNewsRecipe):
|
|||||||
img = soup.find('img', attrs={'class': 'pic_big'})
|
img = soup.find('img', attrs={'class': 'pic_big'})
|
||||||
if img:
|
if img:
|
||||||
td = img.parent
|
td = img.parent
|
||||||
if td.has_key('style'): # noqa
|
td['style'] = ''
|
||||||
del td['style']
|
del td['style']
|
||||||
td.name = 'div'
|
td.name = 'div'
|
||||||
td['id'] = 'comic_full'
|
td['id'] = 'comic_full'
|
||||||
freshSoup.body.append(td)
|
freshSoup.body.append(td)
|
||||||
@ -134,11 +134,8 @@ class WaPoCartoonsRecipe(BasicNewsRecipe):
|
|||||||
'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10',
|
'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10',
|
||||||
'November': '11', 'December': '12'}
|
'November': '11', 'December': '12'}
|
||||||
|
|
||||||
opts = select.findAll('option')
|
opts = select.findAll('option', selected=False)
|
||||||
for i in range(1, len(opts)):
|
for i in range(1, len(opts)):
|
||||||
if opts[i].has_key('selected'): # noqa
|
|
||||||
continue
|
|
||||||
|
|
||||||
dateString = self.tag_to_string(opts[i])
|
dateString = self.tag_to_string(opts[i])
|
||||||
rest, sep, year = dateString.rpartition(', ')
|
rest, sep, year = dateString.rpartition(', ')
|
||||||
parts = rest.split(' ')
|
parts = rest.split(' ')
|
||||||
|
@ -42,11 +42,11 @@ class TheCND(BasicNewsRecipe):
|
|||||||
url = 'http://bbs.wenxuecity.com' + url
|
url = 'http://bbs.wenxuecity.com' + url
|
||||||
title = self.tag_to_string(a)
|
title = self.tag_to_string(a)
|
||||||
self.log('\tFound article: ', title, ' at:', url)
|
self.log('\tFound article: ', title, ' at:', url)
|
||||||
dateReg = re.search('(\d\d?)/(\d\d?)/(\d\d)',
|
dateReg = re.search(r'(\d\d?)/(\d\d?)/(\d\d)',
|
||||||
self.tag_to_string(a.parent))
|
self.tag_to_string(a.parent))
|
||||||
date = '%(y)s/%(m)02d/%(d)02d' % {'y': dateReg.group(3),
|
date = '%(y)s/%(m)02d/%(d)02d' % {'y': dateReg.group(3),
|
||||||
'm': int(dateReg.group(1)), 'd': int(dateReg.group(2))}
|
'm': int(dateReg.group(1)), 'd': int(dateReg.group(2))}
|
||||||
if not articles.has_key(date): # noqa
|
if date not in articles:
|
||||||
articles[date] = []
|
articles[date] = []
|
||||||
articles[date].append(
|
articles[date].append(
|
||||||
{'title': title, 'url': url, 'description': '', 'date': ''})
|
{'title': title, 'url': url, 'description': '', 'date': ''})
|
||||||
|
@ -96,10 +96,10 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
autag = divtag.find('h4')
|
autag = divtag.find('h4')
|
||||||
if autag:
|
if autag:
|
||||||
author = self.tag_to_string(autag, False)
|
author = self.tag_to_string(autag, False)
|
||||||
if not articles.has_key(key): # noqa
|
if key not in articles:
|
||||||
articles[key] = []
|
articles[key] = []
|
||||||
articles[key].append(dict(title=title, url=url, date=pubdate,
|
articles[key].append(dict(title=title, url=url, date=pubdate,
|
||||||
description=description, author=author, content=''))
|
description=description, author=author, content=''))
|
||||||
|
|
||||||
ans = [(keyl, articles[key]) for keyl in ans if articles.has_key(keyl)] # noqa
|
ans = [(keyl, articles[keyl]) for keyl in ans if keyl in articles]
|
||||||
return ans
|
return ans
|
||||||
|
@ -189,9 +189,9 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
|
|
||||||
# Find each instance of class="sectiontitle", class="featurecontent"
|
# Find each instance of class="sectiontitle", class="featurecontent"
|
||||||
for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
|
for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
|
||||||
# self.log(" div class = %s" % divtag['class'])
|
# self.log(" div class = %s" % divtag['class'])
|
||||||
if divtag['class'].startswith('section_title'):
|
if divtag['class'].startswith('section_title'):
|
||||||
# div contains section title
|
# div contains section title
|
||||||
if not divtag.h3:
|
if not divtag.h3:
|
||||||
continue
|
continue
|
||||||
key = self.tag_to_string(divtag.h3, False)
|
key = self.tag_to_string(divtag.h3, False)
|
||||||
@ -221,11 +221,11 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
if autag:
|
if autag:
|
||||||
author = self.tag_to_string(autag, False)
|
author = self.tag_to_string(autag, False)
|
||||||
# self.log("author %s" % author)
|
# self.log("author %s" % author)
|
||||||
if not articles.has_key(key): # noqa
|
if key not in articles:
|
||||||
articles[key] = []
|
articles[key] = []
|
||||||
articles[key].append(dict(title=title, url=url, date=pubdate,
|
articles[key].append(dict(title=title, url=url, date=pubdate,
|
||||||
description=description, author=author, content=''))
|
description=description, author=author, content=''))
|
||||||
|
|
||||||
ans = [(keyl, articles[keyl])
|
ans = [(keyl, articles[keyl])
|
||||||
for keyl in ans if articles.has_key(keyl)] # noqa
|
for keyl in ans if keyl in articles]
|
||||||
return ans
|
return ans
|
||||||
|
@ -83,11 +83,10 @@ class ZAOBAO(BasicNewsRecipe):
|
|||||||
]
|
]
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for tag in soup.findAll(name='a'):
|
for tag in soup.findAll(name='a', href=True):
|
||||||
if tag.has_key('href'): # noqa
|
tag_url = tag['href']
|
||||||
tag_url = tag['href']
|
if tag_url.find('http://') != -1 and tag_url.find('zaobao.com') == -1:
|
||||||
if tag_url.find('http://') != -1 and tag_url.find('zaobao.com') == -1:
|
del tag['href']
|
||||||
del tag['href']
|
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
def postprocess_html(self, soup, first):
|
def postprocess_html(self, soup, first):
|
||||||
@ -107,8 +106,8 @@ class ZAOBAO(BasicNewsRecipe):
|
|||||||
for i, item in enumerate(soup.findAll('li')):
|
for i, item in enumerate(soup.findAll('li')):
|
||||||
if i >= self.MAX_ITEMS_IN_INDEX:
|
if i >= self.MAX_ITEMS_IN_INDEX:
|
||||||
break
|
break
|
||||||
a = item.find('a')
|
a = item.find('a', href=True)
|
||||||
if a and a.has_key('href'): # noqa
|
if a is not None:
|
||||||
a_url = a['href']
|
a_url = a['href']
|
||||||
a_title = self.tag_to_string(a)
|
a_title = self.tag_to_string(a)
|
||||||
date = ''
|
date = ''
|
||||||
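All of the hunks above apply the same migration: dict.has_key(key), which Python 3 removed, becomes "key in dict", and the deprecated Tag.has_key(attr) from BeautifulSoup becomes Tag.get(attr) or an attribute filter passed to findAll. The sketch below is only an illustration of that pattern under stated assumptions, not code from any recipe in this commit; the HTML snippet, example URL and variable names are invented, and it assumes BeautifulSoup 4 is installed.

# Minimal sketch of the has_key -> membership/get migration.
# Illustrative only: the HTML, URL and names below are invented.
from bs4 import BeautifulSoup

html = '<div><img data-src="a.png"><a href="/news/1">link</a></div>'
soup = BeautifulSoup(html, 'html.parser')

articles = {}
feed = 'Uncategorized'

# Plain dictionaries: d.has_key(k) becomes k in d
if feed not in articles:
    articles[feed] = []

# Tag attributes: tag.has_key(attr) becomes tag.get(attr)
for img in soup.findAll('img'):
    if img.get('data-src'):
        img['src'] = img['data-src']
    if not img.get('alt'):
        img['alt'] = 'image'

# An attribute filter can replace the membership test entirely:
# findAll('a') plus has_key('href') becomes findAll('a', href=True)
for a in soup.findAll('a', href=True):
    if a['href'].startswith('/'):
        a['href'] = 'http://www.example.com' + a['href']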