calibre/recipes/watchingamerica.recipe
Kovid Goyal 567040ee1e Perform PEP8 compliance checks on the entire codebase
Some bits of PEP 8 are turned off via setup.cfg
2016-07-29 21:25:17 +05:30

103 lines
3.6 KiB
Plaintext

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class WatchingAmericaRecipe(BasicNewsRecipe):
    """News recipe for watchingamerica.com.

    The feed list is scraped from the index page at ``INDEX`` by
    ``parse_index``; each article page is then reduced by
    ``preprocess_html`` to the element that contains the article text.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en'
    version = 1

    title = u'Watching America'
    publisher = u'watchingamerica.com'
    category = u'News'
    description = u'Global opinion about the United States'

    # Download/cleanup settings; presumably consumed by BasicNewsRecipe
    # (confirm exact semantics against the calibre recipe API).
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['style']

    # The selectors target the 'main_content' class assigned in
    # preprocess_html and the site's own 'insideitro' class.
    extra_css = '''
        body{font-family:verdana,arial,helvetica,geneva,sans-serif ;}
        .main_content em {font-size: x-small; font-style: italic; color: #696969;}
        .main_content span strong {font-size: x-large; font-weight: bold;}
        .insideitro {font-size: xx-small; font-style: italic; color: #666666;}
        span {padding: 0em; margin: 0em;}
        '''

    # Index page that parse_index scrapes for article links.
    INDEX = u'http://watchingamerica.com/News/'

    def parse_index(self):
        """Build the feed list from the index page at ``INDEX``.

        Returns:
            A list of ``(feed_title, articles)`` tuples: ``'Feature'``
            first, then one tuple per translation column. Each article is
            a dict with the keys ``title``, ``date`` (always empty),
            ``url`` and ``description``. A feed whose section is missing
            from the page is still returned, with an empty article list.
        """
        soup = self.index_to_soup(self.INDEX)
        answer = []

        articles = []
        feature = soup.find('div', attrs={'id': 'headzone'})
        if feature:
            link = feature.find('a', attrs={'class': 'feature'})
            # find() yields None when the anchor is absent; skip the
            # feature article rather than crash the whole index parse.
            if link is not None:
                description = self.tag_to_string(
                    feature.find('h1', attrs={'class': 'pull'}))
                articles.append(self._make_article(link, description))
        # NOTE(review): appended even when empty; confirm this matches the
        # intended nesting (the source's indentation was not available).
        answer.append(('Feature', articles))

        feed_titles = ['Translations from the West',
                       'Translations from the East']
        # The columns are marked up as div.newscol1 and div.newscol2.
        for column, feed_title in enumerate(feed_titles, 1):
            articles = []
            div = soup.find('div', attrs={'class': 'newscol' + str(column)})
            if div:
                for link in div.findAll('a', attrs={'class': 'headline'}):
                    # The summary, when present, is the next <h3> sibling.
                    h3 = link.findNextSibling('h3')
                    description = self.tag_to_string(h3) if h3 else None
                    articles.append(self._make_article(link, description))
            answer.append((feed_title, articles))

        return answer

    def _make_article(self, link, description):
        """Return the article dict for anchor tag *link*."""
        return {'title': self.tag_to_string(link), 'date': u'',
                'url': link.get('href', None), 'description': description}

    def preprocess_html(self, soup):
        """Reduce an article page to its main content.

        The parent of the first ``<p class="MsoNormal">`` is taken as the
        article container. It is renamed to ``div``, given the class
        ``main_content``, stripped of the "show original text" link and of
        ``<br>`` tags in the intro span, and moved into a fresh document.

        Args:
            soup: Parsed article page.

        Returns:
            A new soup holding only the article container, or with an
            empty body when no container could be located.
        """
        fresh_soup = self.get_fresh_soup(soup)

        # Look up the marker paragraph first: calling .parent directly on
        # the find() result fails when the paragraph is missing.
        marker = soup.find('p', attrs={'class': 'MsoNormal'})
        article = marker.parent if marker is not None else None
        if not article:
            return fresh_soup

        article.name = 'div'
        del article['width']
        article['class'] = 'main_content'

        org = article.find('a', attrs={'href': '?SHOW_ORIGINAL_TEXT'})
        if org:
            # Drop the whole element wrapping the link, not just the link.
            org.parent.extract()

        intro = article.find('span', attrs={'class': 'insideitro'})
        if intro:
            for el in intro.findAll(['strong', 'em', 'br']):
                if el.name == 'br':
                    el.extract()
                else:
                    # Turn inline emphasis into block-level elements.
                    el.name = 'div'

        fresh_soup.body.append(article)
        return fresh_soup

    def get_fresh_soup(self, oldSoup):
        """Return an empty HTML document that carries over *oldSoup*'s title.

        Args:
            oldSoup: Parsed page whose ``<title>`` text, if any, is copied.

        Returns:
            A new soup with ``<head><title>`` and an empty ``<body>``.
        """
        fresh_soup = BeautifulSoup(
            '<html><head><title></title></head><body></body></html>')
        # Guard against documents without a <head> before reading .title.
        old_head = oldSoup.head
        if old_head is not None and old_head.title:
            fresh_soup.head.title.append(self.tag_to_string(old_head.title))
        return fresh_soup