# Calibre news recipe (recipes/counterpunch.recipe)
import re

from lxml.html import parse

from calibre.web.feeds.news import BasicNewsRecipe

try:
    from urllib.parse import urljoin
except ImportError:  # Python 2 fallback
    from urlparse import urljoin


class Counterpunch(BasicNewsRecipe):
    '''
    Parses counterpunch.com for articles
    '''
    title = 'Counterpunch'
    description = 'Daily political opinion from www.Counterpunch.com'
    language = 'en'
    __author__ = 'O. Emmerson'
    keep_only_tags = [dict(name='td', attrs={'width': '522'})]
    max_articles_per_feed = 10

    # Base used to resolve the (usually relative) article links on the index.
    _base_url = 'http://www.counterpunch.com/'

    # Index paragraphs containing any of these phrases are skipped rather
    # than treated as articles. Raw string: the backslashes belong to the
    # regex, not to Python string escapes.
    _unwanted_text = re.compile(
        r'Website\ of\ the|I\ urge\ you|Subscribe\ now|DONATE|\@asis\.com'
        r'|donation\ button|click\ over\ to\ our'
    )

    def parse_index(self):
        '''
        Build the feed list for this recipe.

        Returns a list of ``(feed_title, articles)`` tuples. The list holds
        a single 'Counterpunch' feed, or is empty when no articles could be
        extracted from the front page.
        '''
        feeds = []
        title, url = 'Counterpunch', 'http://www.counterpunch.com'
        articles = self.parse_page(url)
        if articles:
            feeds.append((title, articles))
        return feeds

    def parse_page(self, url):
        '''
        Extract article entries from the index page at ``url``.

        Every ``p.style2`` paragraph matched by the selector below is
        inspected; paragraphs matching ``_unwanted_text`` are ignored. The
        paragraph's leading text is used as the author and its first link
        supplies the headline and the article address.

        Returns a list of dicts with ``'title'`` and ``'url'`` keys.
        Paragraphs without a usable link or headline are skipped.
        '''
        root = parse(url).getroot()
        articles = []
        if root is None:
            return articles

        selector = "html>body>table tr>td>p[class='style2']"
        for paragraph in root.cssselect(selector):
            if self._unwanted_text.search(paragraph.text_content()):
                continue

            links = paragraph.cssselect('a')
            if not links:
                continue
            link = links[0]

            href = link.get('href')
            # text_content() also covers headlines wrapped in nested markup,
            # where link.text alone would be None.
            headline = link.text_content().strip()
            if not href or not headline:
                continue

            author = (paragraph.text or '').strip()
            if author:
                article_title = headline + ' by {0}'.format(author)
            else:
                article_title = headline

            articles.append({
                'title': article_title,
                # urljoin keeps absolute links intact and resolves relative
                # ones against the site root.
                'url': urljoin(self._base_url, href),
            })
        return articles