diff --git a/recipes/hackernews.recipe b/recipes/hackernews.recipe index fa4b58864d..e57125465c 100644 --- a/recipes/hackernews.recipe +++ b/recipes/hackernews.recipe @@ -5,8 +5,10 @@ __license__ = 'GPL v3' Hacker News ''' from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag from calibre.ptempfile import PersistentTemporaryFile from urlparse import urlparse +import re class HackerNews(BasicNewsRecipe): title = 'Hacker News' @@ -14,8 +16,8 @@ class HackerNews(BasicNewsRecipe): description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.' publisher = 'Y Combinator' category = 'news, programming, it, technology' - masthead_url = 'http://i55.tinypic.com/2u6io76.png' - cover_url = 'http://i55.tinypic.com/2u6io76.png' + masthead_url = 'http://img585.imageshack.us/img585/5011/hnle.png' + cover_url = 'http://img585.imageshack.us/img585/5011/hnle.png' delay = 1 max_articles_per_feed = 30 use_embedded_content = False @@ -42,13 +44,43 @@ class HackerNews(BasicNewsRecipe): def get_hn_content(self, url): self.log('get_hn_content(' + url + ')') - # this could be improved - br = self.get_browser() - f = br.open(url) - html = f.read() - f.close() - return html - + soup = self.index_to_soup(url) + main = soup.find('tr').findNextSiblings('tr', limit=2)[1].td + + title = self.tag_to_string(main.find('td', 'title')) + link = main.find('td', 'title').find('a')['href'] + if link.startswith('item?'): + link = 'http://news.ycombinator.com/' + link + readable_link = link.rpartition('http://')[2].rpartition('https://')[2] + subtext = self.tag_to_string(main.find('td', 'subtext')) + + title_content_td = main.find('td', 'title').findParent('tr').findNextSiblings('tr', limit=3)[2].findAll('td', limit=2)[1] + title_content = u'' + if not title_content_td.find('form'): + title_content_td.name ='div' + title_content = title_content_td.prettify() + + comments = u'' + for td in main.findAll('td', 'default'): + comhead = td.find('span', 'comhead') + if comhead: + com_title = u'
' + readable_link + u'
' + subtext + u'