def get_readable_content(self, url):
    '''
    Download ``url`` and pass the raw page to the readability extractor.

    The call is logged, the page is fetched through the browser returned
    by ``self.get_browser()``, and the bytes read from the response are
    handed to ``self.extract_readable_article`` together with ``url``.

    :param url: Address of the article to download.
    :return: Whatever ``self.extract_readable_article(html, url)`` returns
             (its exact shape is defined by the base class, not here).
    :raises: Any exception raised while opening or reading ``url`` is
             propagated to the caller unchanged.
    '''
    self.log('get_readable_content(' + url + ')')
    br = self.get_browser()
    response = br.open(url)
    try:
        html = response.read()
    finally:
        # Close the response even if read() fails part-way through, so a
        # broken download does not leak the underlying connection.
        response.close()

    return self.extract_readable_article(html, url)
link.rpartition('http://')[2].rpartition('https://')[2] + subtext = self.tag_to_string(main.find('td', 'subtext')) + + title_content_td = main.find('td', 'title').findParent( + 'tr').findNextSiblings('tr', limit=3)[2].findAll('td', limit=2)[1] + title_content = u'' + if not title_content_td.find('form'): + title_content_td.name = 'div' + title_content = title_content_td.prettify() + + comments = u'' + for td in main.findAll('td', 'default'): + comhead = td.find('span', 'comhead') + if comhead: + com_title = u'
' + readable_link + \
+ u'
' + subtext + u'