# calibre/recipes/glennbeck.recipe

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, Comment

class GlennBeckRecipe(BasicNewsRecipe):
    """News recipe for the Glenn Beck article feed.

    The downloaded article pages are not used as-is: ``preprocess_html``
    copies the headline and the body text into a freshly created, empty
    document and returns that document instead of the original page.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en'
    version = 1

    title = u'Glenn Beck'
    publisher = u'Premiere Radio Networks'
    category = u'News, Opinion'
    description = u'The fusion of entertainment and enlightenment'

    oldest_article = 7
    max_articles_per_feed = 100

    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False

    feeds = [(u'Glenn Beck', u'http://feeds.feedburner.com/GlennBeckArticles')]

    def preprocess_html(self, soup):
        """Rebuild the article inside a fresh document and return it.

        Their html is horribly broken; if we search for the div that has the
        content, BeautifulSoup returns the div with only the headline and no
        content. This is due to illegal nesting of tags, so the content is
        collected the hard way:

        1. Drop the ``extraInfo`` div and every ``iframe``.
        2. Move the headline (``h1`` of the ``news-detail`` div) across.
        3. Move every ``<p>`` across, except those of class
           ``smalltextwhite``.
        4. If that produced at most two paragraphs, wrap every non-empty
           string that sits directly under ``<body>`` in a new ``<p>``.
        5. If there are still fewer than two paragraphs, append the whole
           ``news-detail`` div as a last resort (when the page has one).

        Note that ``soup`` is modified in place: elements are extracted from
        it or moved into the returned document.

        :param soup: parsed article page.
        :return: the newly built document.
        """
        # We can find this one, and we don't want it.
        extra_info = soup.find('div', attrs={'id': 'extraInfo'})
        if extra_info is not None:
            extra_info.extract()

        # Don't want these either.
        for iframe in soup.findAll('iframe'):
            iframe.extract()

        # Get empty document.
        fresh_soup = self.getFreshSoup()

        # This is the broken div; but we can find the headline.
        news_div = soup.find('div', attrs={'class': 'news-detail'})
        headline = news_div.h1 if news_div is not None else None
        if headline is not None:
            fresh_soup.body.append(headline)

        # The content is wrapped in <p></p> tags, most of the time anyway.
        counter = 0
        for p in soup.findAll('p'):
            # NOTE(review): this compares the class attribute to a plain
            # string; confirm the parser in use returns it in that form.
            if p.get('class') == 'smalltextwhite':
                # But we don't want this one.
                continue

            fresh_soup.body.append(p)
            counter += 1

        # In some articles the content is not wrapped in <p></p> tags. In that
        # case the counter is low. 2 is the magic number that seems to work.
        if counter <= 2:
            # So they are playing hard-to-get: first throw out all comments,
            # otherwise they would show up as text in the search below.
            comments = soup.findAll(text=lambda text: isinstance(text, Comment))
            for comment in comments:
                comment.extract()

            # Find all unwrapped strings.
            for txt in soup.findAll(text=True):
                raw = txt.strip()
                if txt.parent.name == 'body' and raw and raw != '&nbsp;':
                    # This is our content; ignore the rest.
                    para = Tag(fresh_soup, 'p')
                    para.append(raw)
                    fresh_soup.body.append(para)
                    counter += 1

            # Now if the counter is still 0 or 1 they did something completely
            # different and we still have an empty article. In a last attempt,
            # add the whole content div, just in case. The div may not exist
            # at all, in which case there is nothing left to add.
            if counter < 2 and news_div is not None:
                fresh_soup.body.append(news_div)

        return fresh_soup

    def getFreshSoup(self, title=None):
        """Return a new document with an empty ``<body>``.

        :param title: optional value placed in the ``<title>`` element after
            conversion with ``str()``; a falsy value gives an empty title.
        :return: the parsed skeleton document.
        """
        title_text = str(title) if title else ''
        return BeautifulSoup('<html><head><title>' + title_text + '</title></head><body></body></html>')