From 02f58a25092b6b0c039124c971ad9dfae0f6b0a8 Mon Sep 17 00:00:00 2001 From: Tom Scholl Date: Mon, 29 Aug 2011 17:49:50 +0100 Subject: [PATCH] Updated recipe for Hacker News --- recipes/hackernews.recipe | 60 +++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/recipes/hackernews.recipe b/recipes/hackernews.recipe index fa4b58864d..e57125465c 100644 --- a/recipes/hackernews.recipe +++ b/recipes/hackernews.recipe @@ -5,8 +5,10 @@ __license__ = 'GPL v3' Hacker News ''' from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag from calibre.ptempfile import PersistentTemporaryFile from urlparse import urlparse +import re class HackerNews(BasicNewsRecipe): title = 'Hacker News' @@ -14,8 +16,8 @@ class HackerNews(BasicNewsRecipe): description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.' publisher = 'Y Combinator' category = 'news, programming, it, technology' - masthead_url = 'http://i55.tinypic.com/2u6io76.png' - cover_url = 'http://i55.tinypic.com/2u6io76.png' + masthead_url = 'http://img585.imageshack.us/img585/5011/hnle.png' + cover_url = 'http://img585.imageshack.us/img585/5011/hnle.png' delay = 1 max_articles_per_feed = 30 use_embedded_content = False @@ -42,13 +44,43 @@ class HackerNews(BasicNewsRecipe): def get_hn_content(self, url): self.log('get_hn_content(' + url + ')') - # this could be improved - br = self.get_browser() - f = br.open(url) - html = f.read() - f.close() - return html - + soup = self.index_to_soup(url) + main = soup.find('tr').findNextSiblings('tr', limit=2)[1].td + + title = self.tag_to_string(main.find('td', 'title')) + link = main.find('td', 'title').find('a')['href'] + if link.startswith('item?'): + link = 'http://news.ycombinator.com/' + link + readable_link = link.rpartition('http://')[2].rpartition('https://')[2] + subtext = self.tag_to_string(main.find('td', 
'subtext'))
+
+        title_content_td = main.find('td', 'title').findParent('tr').findNextSiblings('tr', limit=3)[2].findAll('td', limit=2)[1]
+        title_content = u''
+        if not title_content_td.find('form'):
+            title_content_td.name ='div'
+            title_content = title_content_td.prettify()
+
+        comments = u''
+        for td in main.findAll('td', 'default'):
+            comhead = td.find('span', 'comhead')
+            if comhead:
+                com_title = u'<h4>' + self.tag_to_string(comhead).replace(' | link', '') + u'</h4>'
+                comhead.parent.extract()
+                br = td.find('br')
+                if br:
+                    br.extract()
+                reply = td.find('a', attrs = {'href' : re.compile('^reply?')})
+                if reply:
+                    reply.parent.extract()
+                td.name = 'div'
+                indent_width = (int(td.parent.find('td').img['width']) * 2) / 3
+                td['style'] = 'padding-left: ' + str(indent_width) + 'px'
+                comments = comments + com_title + td.prettify()
+
+        body = u'<h3>' + title + u'</h3><p>' + readable_link + u'<br/>' + subtext + u'</p><hr/>' + title_content + u'<hr/>'
+        body = body + comments
+        return u'<html><title>' + title + u'</title><body>' + body + '</body></html>'
+
     def get_obfuscated_article(self, url):
         if url.startswith('http://news.ycombinator.com'):
             content = self.get_hn_content(url)
@@ -65,6 +97,10 @@ class HackerNews(BasicNewsRecipe):
             content = u''
         else:
             content = self.get_readable_content(url)
+
+        f = open('/tmp/hn_content.html', 'w')
+        f.write(content)
+        f.close()
 
         self.temp_files.append(PersistentTemporaryFile('_fa.html'))
         self.temp_files[-1].write(content)
@@ -82,5 +118,11 @@ class HackerNews(BasicNewsRecipe):
     def populate_article_metadata(self, article, soup, first):
         article.text_summary = self.prettyify_url(article.url)
         article.summary = article.text_summary
+
+#    def parse_index(self):
+#        feeds = []
+#        feeds.append((u'Hacker News',[{'title': 'Testing', 'url': 'http://news.ycombinator.com/item?id=2935944'}]))
+#        return feeds
+