#!/usr/bin/env python __license__ = 'GPL v3' ''' Hacker News ''' from calibre.web.feeds.news import BasicNewsRecipe from calibre.ptempfile import PersistentTemporaryFile from urlparse import urlparse import re class HNWithCommentsLink(BasicNewsRecipe): title = 'HN With Comments Link' __author__ = 'Tom Scholl & David Kerschner' description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.' publisher = 'Y Combinator' category = 'news, programming, it, technology' masthead_url = 'http://img585.imageshack.us/img585/5011/hnle.png' cover_url = 'http://img585.imageshack.us/img585/5011/hnle.png' delay = 1 max_articles_per_feed = 30 use_embedded_content = False no_stylesheets = True encoding = 'utf-8' language = 'en' requires_version = (0,8,16) feeds = [ (u'Hacker News', 'http://news.ycombinator.com/rss') ] temp_files = [] articles_are_obfuscated = True def get_readable_content(self, url): self.log('get_readable_content(' + url + ')') br = self.get_browser() f = br.open(url) html = f.read() f.close() return self.extract_readable_article(html, url) def get_hn_content(self, url): self.log('get_hn_content(' + url + ')') soup = self.index_to_soup(url) main = soup.find('tr').findNextSiblings('tr', limit=2)[1].td title = self.tag_to_string(main.find('td', 'title')) link = main.find('td', 'title').find('a')['href'] if link.startswith('item?'): link = 'http://news.ycombinator.com/' + link readable_link = link.rpartition('http://')[2].rpartition('https://')[2] subtext = self.tag_to_string(main.find('td', 'subtext')) title_content_td = main.find('td', 'title').findParent('tr').findNextSiblings('tr', limit=3)[2].findAll('td', limit=2)[1] title_content = u'' if not title_content_td.find('form'): title_content_td.name ='div' title_content = title_content_td.prettify() comments = u'' for td in main.findAll('td', 'default'): comhead = td.find('span', 'comhead') if comhead: com_title = u'
' + readable_link + u'
' + subtext + u'