# calibre/recipes/hackernews.recipe
# 2011-08-25 06:57:58 -06:00
#
# 87 lines
# 2.8 KiB
# Python
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
Hacker News
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from urlparse import urlparse
class HackerNews(BasicNewsRecipe):
# Calibre recipe for Hacker News (news.ycombinator.com): downloads the
# front-page RSS feed and fetches the full linked article for each entry.
title = 'Hacker News'
__author__ = 'Tom Scholl'
description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
publisher = 'Y Combinator'
category = 'news, programming, it, technology'
# Same image serves as both masthead and cover.
masthead_url = 'http://i55.tinypic.com/2u6io76.png'
cover_url = 'http://i55.tinypic.com/2u6io76.png'
# Wait 1 second between fetches to be polite to linked sites.
delay = 1
max_articles_per_feed = 30
# Feed entries only carry links; full content is fetched separately.
use_embedded_content = False
no_stylesheets = True
encoding = 'utf-8'
language = 'en'
# Needs calibre >= 0.8.16 (for the recipe APIs used below).
requires_version = (0,8,16)
feeds = [
(u'Hacker News', 'http://news.ycombinator.com/rss')
]
# Holds PersistentTemporaryFile objects created by get_obfuscated_article
# so they stay alive for the duration of the download.
temp_files = []
# Articles live on arbitrary external sites, so each one is fetched via
# get_obfuscated_article rather than directly from the feed URL.
articles_are_obfuscated = True
def get_readable_content(self, url):
    '''
    Fetch *url* and distill it to readable article HTML.

    Returns the markup produced by extract_readable_article() for the
    downloaded page.
    '''
    self.log('get_readable_content(' + url + ')')
    br = self.get_browser()
    f = br.open(url)
    try:
        # read() can raise on network errors; close the response in all
        # cases so the connection is not leaked.
        html = f.read()
    finally:
        f.close()
    return self.extract_readable_article(html, url)
def get_hn_content(self, url):
    '''
    Fetch a Hacker News page (e.g. a comments page) and return its raw
    HTML bytes unmodified.
    '''
    self.log('get_hn_content(' + url + ')')
    # this could be improved
    br = self.get_browser()
    f = br.open(url)
    try:
        # read() can raise on network errors; close the response in all
        # cases so the connection is not leaked.
        html = f.read()
    finally:
        f.close()
    return html
def get_obfuscated_article(self, url):
    '''
    Download the linked article into a temporary file and return the
    file's path (the recipe framework then processes that file).

    Hacker News pages (e.g. Ask HN / comment threads) are fetched
    verbatim; direct links to images are wrapped in a minimal HTML
    page; everything else goes through the readability extractor.
    '''
    # Also accept https:// links so HN-hosted items are not sent
    # through the readability extractor by mistake.
    if url.startswith(('http://news.ycombinator.com',
                       'https://news.ycombinator.com')):
        content = self.get_hn_content(url)
    else:
        # TODO: use content-type header instead of url
        image_exts = ('.jpg', '.png', '.svg', '.gif', '.jpeg', '.tiff', '.bmp')
        if url.endswith(image_exts):
            self.log('using image_content (' + url + ')')
            content = u'<html><body><img src="' + url + u'"></body></html>'
        else:
            content = self.get_readable_content(url)
    # Keep a reference on self.temp_files so the file survives until
    # the download run finishes.
    self.temp_files.append(PersistentTemporaryFile('_fa.html'))
    self.temp_files[-1].write(content)
    self.temp_files[-1].close()
    return self.temp_files[-1].name
def is_link_wanted(self, url, tag):
    # Follow every link except PDFs, which this recipe cannot convert.
    return not url.endswith('.pdf')
def prettyify_url(self, url):
    # Reduce a full URL to just its host name for display in the index.
    parsed = urlparse(url)
    return parsed.hostname
def populate_article_metadata(self, article, soup, first):
# Replace the feed-provided summary with the article's host name so the
# generated index shows where each story links to.
article.text_summary = self.prettyify_url(article.url)
article.summary = article.text_summary