Updated recipe for Hacker News

This commit is contained in:
Tom Scholl 2011-08-29 17:49:50 +01:00
parent ecabc13152
commit 02f58a2509

View File

@ -5,8 +5,10 @@ __license__ = 'GPL v3'
Hacker News Hacker News
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
from urlparse import urlparse from urlparse import urlparse
import re
class HackerNews(BasicNewsRecipe): class HackerNews(BasicNewsRecipe):
title = 'Hacker News' title = 'Hacker News'
@ -14,8 +16,8 @@ class HackerNews(BasicNewsRecipe):
description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.' description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
publisher = 'Y Combinator' publisher = 'Y Combinator'
category = 'news, programming, it, technology' category = 'news, programming, it, technology'
masthead_url = 'http://i55.tinypic.com/2u6io76.png' masthead_url = 'http://img585.imageshack.us/img585/5011/hnle.png'
cover_url = 'http://i55.tinypic.com/2u6io76.png' cover_url = 'http://img585.imageshack.us/img585/5011/hnle.png'
delay = 1 delay = 1
max_articles_per_feed = 30 max_articles_per_feed = 30
use_embedded_content = False use_embedded_content = False
@ -42,13 +44,43 @@ class HackerNews(BasicNewsRecipe):
def get_hn_content(self, url):
    """Fetch a Hacker News item page and render it as a standalone HTML article.

    Parses the story title, link, subtext and the full comment tree out of
    the page at *url* and returns a complete unicode HTML document string.
    """
    self.log('get_hn_content(' + url + ')')
    soup = self.index_to_soup(url)
    # The story content lives in the second sibling <tr> of the first row.
    main = soup.find('tr').findNextSiblings('tr', limit=2)[1].td

    title = self.tag_to_string(main.find('td', 'title'))
    link = main.find('td', 'title').find('a')['href']
    # Self-posts ("Ask HN" etc.) use relative 'item?id=...' links; absolutize them.
    if link.startswith('item?'):
        link = 'http://news.ycombinator.com/' + link
    # Strip the scheme to produce a human-readable link label.
    readable_link = link.rpartition('http://')[2].rpartition('https://')[2]
    subtext = self.tag_to_string(main.find('td', 'subtext'))

    # Self-post body text, if present (skipped when the row only holds the
    # comment-entry <form>).
    title_content_td = main.find('td', 'title').findParent('tr').findNextSiblings('tr', limit=3)[2].findAll('td', limit=2)[1]
    title_content = u''
    if not title_content_td.find('form'):
        title_content_td.name = 'div'
        title_content = title_content_td.prettify()

    comments = u''
    for td in main.findAll('td', 'default'):
        comhead = td.find('span', 'comhead')
        if comhead:
            com_title = u'<h4>' + self.tag_to_string(comhead).replace(' | link', '') + u'</h4>'
            comhead.parent.extract()
            br = td.find('br')
            if br:
                br.extract()
            # BUG FIX: the original pattern '^reply?' made the trailing 'y'
            # optional (so it also matched e.g. 'repl...') instead of matching
            # the literal '?' in 'reply?id=...' hrefs; escape it.
            reply = td.find('a', attrs={'href': re.compile(r'^reply\?')})
            if reply:
                reply.parent.extract()
            td.name = 'div'
            # Indent nested replies proportionally to HN's spacer-image width.
            indent_width = (int(td.parent.find('td').img['width']) * 2) / 3
            td['style'] = 'padding-left: ' + str(indent_width) + 'px'
            comments = comments + com_title + td.prettify()

    body = u'<h3>' + title + u'</h3><p><a href="' + link + u'">' + readable_link + u'</a><br/><strong>' + subtext + u'</strong></p>' + title_content + u'<br/>'
    body = body + comments
    return u'<html><title>' + title + u'</title><body>' + body + '</body></html>'
def get_obfuscated_article(self, url): def get_obfuscated_article(self, url):
if url.startswith('http://news.ycombinator.com'): if url.startswith('http://news.ycombinator.com'):
content = self.get_hn_content(url) content = self.get_hn_content(url)
@ -65,6 +97,10 @@ class HackerNews(BasicNewsRecipe):
content = u'<html><body><img src="' + url + u'"></body></html>' content = u'<html><body><img src="' + url + u'"></body></html>'
else: else:
content = self.get_readable_content(url) content = self.get_readable_content(url)
f = open('/tmp/hn_content.html', 'w')
f.write(content)
f.close()
self.temp_files.append(PersistentTemporaryFile('_fa.html')) self.temp_files.append(PersistentTemporaryFile('_fa.html'))
self.temp_files[-1].write(content) self.temp_files[-1].write(content)
@ -82,5 +118,11 @@ class HackerNews(BasicNewsRecipe):
def populate_article_metadata(self, article, soup, first):
    # Use the prettified article URL as both the short and the full summary.
    summary = self.prettyify_url(article.url)
    article.text_summary = summary
    article.summary = summary
# def parse_index(self):
# feeds = []
# feeds.append((u'Hacker News',[{'title': 'Testing', 'url': 'http://news.ycombinator.com/item?id=2935944'}]))
# return feeds