diff --git a/resources/recipes/watchingamerica.recipe b/resources/recipes/watchingamerica.recipe new file mode 100644 index 0000000000..9048e2550c --- /dev/null +++ b/resources/recipes/watchingamerica.recipe @@ -0,0 +1,96 @@ +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup + +class WatchingAmericaRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = 'kwetal' + language = 'en' + version = 1 + + title = u'Watching America' + publisher = u'watchingamerica.com' + category = u'News' + description = u'Global opinion about the United States' + + oldest_article = 7 + max_articles_per_feed = 100 + use_embedded_content = False + + no_stylesheets = True + remove_javascript = True + remove_attributes = ['style'] + + extra_css = ''' + body{font-family:verdana,arial,helvetica,geneva,sans-serif ;} + .main_content em {font-size: x-small; font-style: italic; color: #696969;} + .main_content span strong {font-size: x-large; font-weight: bold;} + .insideitro {font-size: xx-small; font-style: italic; color: #666666;} + span {padding: 0em; margin 0em;} + ''' + + INDEX = u'http://watchingamerica.com/News/' + + def parse_index(self): + answer = [] + + soup = self.index_to_soup(self.INDEX) + + articles = [] + feature = soup.find('div', attrs = {'id': 'headzone'}) + if feature: + link = feature.find('a', attrs = {'class': 'feature'}) + url = link.get('href', None) + title = self.tag_to_string(link) + description = self.tag_to_string(feature.find('h1', attrs = {'class': 'pull'})) + article = {'title': title, 'date': u'', 'url': url, 'description': description} + articles.append(article) + answer.append(('Feature', articles)) + + feed_titles = ['Translations from the West', 'Translations from the East'] + for i in range(1, 3): + articles = [] + div = soup.find('div', attrs = {'class': 'newscol' + str(i)}) + if div: + for link in div.findAll('a', attrs = {'class': 'headline'}): + url = link.get('href', None) + title = self.tag_to_string(link) + + description = None + h3 = link.findNextSibling('h3') + if h3: + description = self.tag_to_string(h3) + + article = {'title': title, 'date': u'', 'url': url, 'description': description} + articles.append(article) + answer.append((feed_titles[i - 1], articles)) + + return answer + + def preprocess_html(self, soup): + freshSoup = self.get_fresh_soup(soup) + article = soup.find('p', attrs = {'class': 'MsoNormal'}).parent + if article: + article.name = 'div' + del article['width'] + article['class'] = 'main_content' + org = article.find('a', attrs = {'href': '?SHOW_ORIGINAL_TEXT'}) + if org: + org.parent.extract() + + intro = article.find('span', attrs = {'class': 'insideitro'}) + if intro: + for el in intro.findAll(['strong', 'em', 'br']): + if el.name == 'br': + el.extract() + else: + el.name = 'div' + + freshSoup.body.append(article) + + return freshSoup + + def get_fresh_soup(self, oldSoup): + freshSoup = BeautifulSoup('