#!/usr/bin/env python

__license__ = 'GPL v3'
'''
Hacker News
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile

try:
    # Python 3 location of urlparse.
    from urllib.parse import urlparse
except ImportError:
    # Python 2 fallback (the import this recipe originally used).
    from urlparse import urlparse


class HackerNews(BasicNewsRecipe):
    """Recipe that downloads the Hacker News RSS feed.

    Every article URL is routed through ``get_obfuscated_article``: Hacker
    News pages are saved as downloaded, any other page is passed through
    ``extract_readable_article`` first, and the result is written to a
    temporary HTML file whose path is returned.
    """

    title = 'Hacker News'
    __author__ = 'Tom Scholl'
    description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
    publisher = 'Y Combinator'
    category = 'news, programming, it, technology'
    masthead_url = 'http://i55.tinypic.com/2u6io76.png'
    cover_url = 'http://i55.tinypic.com/2u6io76.png'
    delay = 1
    max_articles_per_feed = 30
    use_embedded_content = False
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'en'
    requires_version = (0, 8, 16)

    feeds = [
        (u'Hacker News', 'http://news.ycombinator.com/rss')
    ]

    # Temporary files created by get_obfuscated_article are appended here.
    # NOTE(review): presumably kept so the files stay referenced for the
    # lifetime of the download -- confirm against calibre's recipe docs.
    temp_files = []
    articles_are_obfuscated = True

    # URL prefixes that identify pages hosted on Hacker News itself.
    _hn_prefixes = (
        'http://news.ycombinator.com',
        'https://news.ycombinator.com',
    )

    # Lower-case file extensions that mark a URL as pointing at an image.
    _image_extensions = (
        '.jpg', '.png', '.svg', '.gif', '.jpeg', '.tiff', '.bmp',
    )

    def _fetch(self, url):
        """Download ``url`` with the recipe browser and return the raw body.

        The response is closed even when reading it raises.
        """
        response = self.get_browser().open(url)
        try:
            return response.read()
        finally:
            response.close()

    def _is_image_url(self, url):
        """Return True when ``url`` ends in one of ``_image_extensions``.

        Both the complete URL and its path component are tested, ignoring
        case, so ``photo.JPG`` and ``photo.png?size=large`` are recognised
        in addition to everything a plain ``url.endswith(ext)`` matches.
        """
        # TODO: use content-type header instead of url
        candidates = (url.lower(), urlparse(url).path.lower())
        return any(c.endswith(self._image_extensions) for c in candidates)

    def get_readable_content(self, url):
        """Fetch ``url`` and return ``extract_readable_article``'s result."""
        self.log('get_readable_content(' + url + ')')
        html = self._fetch(url)
        return self.extract_readable_article(html, url)

    def get_hn_content(self, url):
        """Fetch a Hacker News page and return its body unmodified."""
        self.log('get_hn_content(' + url + ')')
        # this could be improved
        return self._fetch(url)

    def get_obfuscated_article(self, url):
        """Store the content for ``url`` in a temp file and return its path.

        Hacker News pages are stored as downloaded, image URLs get an empty
        document, and all other URLs are stored as their readable extract.
        """
        if url.startswith(self._hn_prefixes):
            content = self.get_hn_content(url)
        elif self._is_image_url(url):
            self.log('using image_content (' + url + ')')
            # NOTE(review): an empty document is written for images; this
            # may have been meant to embed the image -- confirm intent.
            content = u''
        else:
            content = self.get_readable_content(url)

        # The temp file is written as bytes: encode text explicitly so that
        # non-ASCII articles do not fail on an implicit ASCII conversion
        # (Python 2) or on a str-to-binary-file write (Python 3).
        if not isinstance(content, bytes):
            content = content.encode('utf-8')

        tmp = PersistentTemporaryFile('_fa.html')
        self.temp_files.append(tmp)
        try:
            tmp.write(content)
        finally:
            tmp.close()
        return tmp.name

    def is_link_wanted(self, url, tag):
        """Return False for links to PDF files, True for everything else."""
        return not url.lower().endswith('.pdf')

    def prettyify_url(self, url):
        """Return the hostname part of ``url`` (None when it has none)."""
        return urlparse(url).hostname

    def populate_article_metadata(self, article, soup, first):
        """Set both summary fields of ``article`` to its URL's hostname."""
        article.text_summary = self.prettyify_url(article.url)
        article.summary = article.text_summary