From 02f58a25092b6b0c039124c971ad9dfae0f6b0a8 Mon Sep 17 00:00:00 2001 From: Tom Scholl Date: Mon, 29 Aug 2011 17:49:50 +0100 Subject: [PATCH] Updated recipe for Hacker News --- recipes/hackernews.recipe | 60 +++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/recipes/hackernews.recipe b/recipes/hackernews.recipe index fa4b58864d..e57125465c 100644 --- a/recipes/hackernews.recipe +++ b/recipes/hackernews.recipe @@ -5,8 +5,10 @@ __license__ = 'GPL v3' Hacker News ''' from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag from calibre.ptempfile import PersistentTemporaryFile from urlparse import urlparse +import re class HackerNews(BasicNewsRecipe): title = 'Hacker News' @@ -14,8 +16,8 @@ class HackerNews(BasicNewsRecipe): description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.' publisher = 'Y Combinator' category = 'news, programming, it, technology' - masthead_url = 'http://i55.tinypic.com/2u6io76.png' - cover_url = 'http://i55.tinypic.com/2u6io76.png' + masthead_url = 'http://img585.imageshack.us/img585/5011/hnle.png' + cover_url = 'http://img585.imageshack.us/img585/5011/hnle.png' delay = 1 max_articles_per_feed = 30 use_embedded_content = False @@ -42,13 +44,43 @@ class HackerNews(BasicNewsRecipe): def get_hn_content(self, url): self.log('get_hn_content(' + url + ')') - # this could be improved - br = self.get_browser() - f = br.open(url) - html = f.read() - f.close() - return html - + soup = self.index_to_soup(url) + main = soup.find('tr').findNextSiblings('tr', limit=2)[1].td + + title = self.tag_to_string(main.find('td', 'title')) + link = main.find('td', 'title').find('a')['href'] + if link.startswith('item?'): + link = 'http://news.ycombinator.com/' + link + readable_link = link.rpartition('http://')[2].rpartition('https://')[2] + subtext = self.tag_to_string(main.find('td', 
'subtext'))
+
+        title_content_td = main.find('td', 'title').findParent('tr').findNextSiblings('tr', limit=3)[2].findAll('td', limit=2)[1]
+        title_content = u''
+        if not title_content_td.find('form'):
+            title_content_td.name ='div'
+            title_content = title_content_td.prettify()
+
+        comments = u''
+        for td in main.findAll('td', 'default'):
+            comhead = td.find('span', 'comhead')
+            if comhead:
+                com_title = u'<h4>' + self.tag_to_string(comhead).replace(' | link', '') + u'</h4>'
+                comhead.parent.extract()
+                br = td.find('br')
+                if br:
+                    br.extract()
+                reply = td.find('a', attrs = {'href' : re.compile('^reply?')})
+                if reply:
+                    reply.parent.extract()
+                td.name = 'div'
+                indent_width = (int(td.parent.find('td').img['width']) * 2) / 3
+                td['style'] = 'padding-left: ' + str(indent_width) + 'px'
+                comments = comments + com_title + td.prettify()
+
+        body = u'<h3>' + title + u'</h3><p>' + readable_link + u'<br/>' + subtext + u'</p><hr/>' + title_content + u'<hr/>'
+        body = body + comments
+        return u'<html><title>' + title + u'</title><body>' + body + '</body></html>'
+
     def get_obfuscated_article(self, url):
         if url.startswith('http://news.ycombinator.com'):
             content = self.get_hn_content(url)
@@ -65,6 +97,10 @@ class HackerNews(BasicNewsRecipe):
             content = u''
         else:
             content = self.get_readable_content(url)
+
+        f = open('/tmp/hn_content.html', 'w')
+        f.write(content)
+        f.close()
 
         self.temp_files.append(PersistentTemporaryFile('_fa.html'))
         self.temp_files[-1].write(content)
@@ -82,5 +118,11 @@ class HackerNews(BasicNewsRecipe):
     def populate_article_metadata(self, article, soup, first):
         article.text_summary = self.prettyify_url(article.url)
         article.summary = article.text_summary
+
+#    def parse_index(self):
+#        feeds = []
+#        feeds.append((u'Hacker News',[{'title': 'Testing', 'url': 'http://news.ycombinator.com/item?id=2935944'}]))
+#        return feeds
+