from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, Comment


def new_tag(soup, name, attrs=()):
    """Create a new, detached tag that belongs to *soup*.

    Uses ``soup.new_tag`` when the soup object provides it and falls back to
    constructing ``Tag`` directly otherwise, so the recipe works with either
    parser API.

    :param soup: the document the tag will be inserted into.
    :param name: tag name, e.g. ``'p'``.
    :param attrs: iterable of ``(key, value)`` pairs; empty by default.
    :return: the newly created tag (not yet attached to the tree).
    """
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)


def _has_css_class(tag, css_class):
    """Return True if *tag* carries exactly *css_class*.

    Depending on the parser, ``tag.get('class')`` is either a plain string or
    a list of class names; both representations are accepted here.
    """
    value = tag.get('class')
    if isinstance(value, (list, tuple)):
        return css_class in value
    return value == css_class


class GlennBeckRecipe(BasicNewsRecipe):
    """Recipe that downloads the Glenn Beck article feed."""

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en'
    version = 1

    title = u'Glenn Beck'
    publisher = u'Premiere Radio Networks'
    category = u'News, Opinion'
    description = u'The fusion of entertainment and enlightenment'

    oldest_article = 7
    max_articles_per_feed = 100

    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False

    feeds = [(u'Glenn Beck', u'http://feeds.feedburner.com/GlennBeckArticles')]

    def preprocess_html(self, soup):
        """Rebuild the article into a fresh, clean document.

        Their html is horribly broken; if we search for the div that has the
        content, BeautifulSoup returns the div with only the headline and no
        content. This is due to illegal nesting of tags. So we do it the hard
        way: collect the headline and the content pieces one by one and move
        them into a new, empty document.

        :param soup: the parsed article page; it is modified in place
            (elements are extracted from it / moved out of it).
        :return: a new soup containing only the headline and article content.
        """
        # We can find this one, and we don't want it.
        div = soup.find('div', attrs={'id': 'extraInfo'})
        if div is not None:
            div.extract()

        # Don't want these either.
        for iframe in soup.findAll('iframe'):
            iframe.extract()

        # Get empty document.
        freshSoup = self.getFreshSoup()

        # This is the broken div; but we can find the headline.
        newsDiv = soup.find('div', attrs={'class': 'news-detail'})
        if newsDiv is not None and newsDiv.h1:
            freshSoup.body.append(newsDiv.h1)

        # The content is wrapped in <p></p> tags, most of the time anyway.
        counter = 0
        for p in soup.findAll('p'):
            if _has_css_class(p, 'smalltextwhite'):
                # But we don't want this one.
                continue
            freshSoup.body.append(p)
            counter += 1

        # In some articles the content is not wrapped in <p></p> tags. In that
        # case the counter is low. 2 is the magic number that seems to work.
        if counter <= 2:
            # So they are playing hard-to-get: first throw out all comments.
            comments = soup.findAll(
                text=lambda text: isinstance(text, Comment))
            for comment in comments:
                comment.extract()

            # Find all unwrapped strings.
            for txt in soup.findAll(text=True):
                raw = txt.strip()
                if txt.parent.name != 'body' or not raw:
                    continue
                if raw == '&nbsp;':
                    # A lone non-breaking-space entity is just filler.
                    continue
                # This is our content; ignore the rest.
                para = new_tag(freshSoup, 'p')
                para.append(raw)
                freshSoup.body.append(para)
                counter += 1

            # Now if the counter is still 0 or 1 they did something completely
            # different and we still have an empty article. In a last attempt,
            # add the whole content div, just in case (if there is one at all).
            if counter < 2 and newsDiv is not None:
                freshSoup.body.append(newsDiv)

        return freshSoup

    def getFreshSoup(self, title=None):
        """Return an empty HTML document with a ``<body>`` ready to be filled.

        :param title: optional value placed in the document's ``<title>``;
            it is converted with ``str()``. A falsy value gives an empty title.
        :return: a new soup with html/head/title/body elements.
        """
        title_text = str(title) if title else ''
        # The skeleton is spelled out explicitly so that ``.body`` is always
        # present, independent of what the parser does with an empty input.
        return BeautifulSoup(
            '<html><head><title>' + title_text +
            '</title></head><body></body></html>')