# calibre/recipes/counterpunch.recipe

import re
from lxml.html import parse
from calibre.web.feeds.news import BasicNewsRecipe

class Counterpunch(BasicNewsRecipe):
    '''
    Parses counterpunch.com for articles

    The front page is scraped directly: every ``<p class="style2">`` entry
    that is not site boilerplate (donation/subscription blurbs) becomes one
    article in a single feed named 'Counterpunch'.
    '''
    title = 'Counterpunch'
    description = 'Daily political opinion from www.Counterpunch.com'
    language = 'en'
    __author__ = 'O. Emmerson'
    # Restrict downloaded article pages to the 522px-wide table cell.
    keep_only_tags = [dict(name='td', attrs={'width': '522'})]
    max_articles_per_feed = 10

    # Prefix used to turn the site's relative article links into full URLs.
    _base_url = 'http://www.counterpunch.com/'

    # Front-page paragraphs matching any of these phrases are boilerplate,
    # not articles. Raw string: the pattern contains backslash escapes that
    # are not valid Python string escapes. Compiled once for the class.
    _unwanted_text = re.compile(
        r'Website\ of\ the|I\ urge\ you|Subscribe\ now|DONATE|\@asis\.com|donation\ button|click\ over\ to\ our')

    def parse_index(self):
        '''
        Build the feed list from the Counterpunch front page.

        Returns a list of ``(feed_title, articles)`` tuples. It holds a
        single 'Counterpunch' feed, or is empty when no articles were found.
        '''
        feeds = []
        title, url = 'Counterpunch', 'http://www.counterpunch.com'
        articles = self.parse_page(url)
        if articles:
            feeds.append((title, articles))
        return feeds

    def parse_page(self, url):
        '''
        Extract article entries from the page at ``url``.

        Returns a list of dicts with the keys ``'title'`` and ``'url'``.
        Entries without a usable link are skipped (and logged) rather than
        aborting the whole page.
        '''
        parsed_page = parse(url).getroot()
        candidates = parsed_page.cssselect("html>body>table tr>td>p[class='style2']")
        articles = []
        for art in candidates:
            if self._unwanted_text.search(art.text_content()):
                continue

            links = art.cssselect('a')
            link = links[0] if links else None
            href = link.get('href') if link is not None else None
            link_text = (link.text or '').strip() if link is not None else ''
            if not href or not link_text:
                # Best-effort scrape: report the malformed entry and move on.
                self.log.warn('Counterpunch: skipping entry without a usable link: {0!r}'.format(
                    art.text_content()))
                continue

            # The text preceding the link inside the paragraph is used as
            # the author; it may be missing entirely.
            author = (art.text or '').strip()
            title = link_text + ' by {0}'.format(author) if author else link_text

            # Site links are normally relative; leave absolute ones alone
            # and avoid a doubled slash for root-relative ones.
            if href.startswith(('http://', 'https://')):
                art_url = href
            else:
                art_url = self._base_url + href.lstrip('/')

            articles.append({'title': title, 'url': art_url})
        return articles