#!/usr/bin/env python

__license__ = 'GPL v3'
'''
Hacker News
'''
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse
import re


class HNWithCommentsLink(BasicNewsRecipe):
    """Recipe that pairs Hacker News feed entries with their comment threads.

    Articles hosted on news.ycombinator.com are rebuilt from the item page
    (title, link, subtext, self-post text and comments); every other URL is
    passed through the readability extractor, or wrapped in a minimal page
    when the URL looks like an image.
    """

    title = 'HN With Actual Comments'
    __author__ = 'Tom Scholl & David Kerschner'
    description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
    publisher = 'Y Combinator'
    category = 'news, programming, it, technology'
    delay = 1
    max_articles_per_feed = 20
    oldest_article = 3
    use_embedded_content = False
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'en'
    requires_version = (0, 8, 16)

    feeds = [
        (u'Hacker News Frontpage', 'https://hnrss.org/frontpage'),
        (u'Ask Hacker News', 'https://hnrss.org/ask')
    ]

    # Temporary files are kept referenced here so they outlive the method
    # that created them.
    temp_files = []
    articles_are_obfuscated = True

    # URL suffixes treated as images by get_obfuscated_article().
    IMAGE_EXTENSIONS = (
        '.jpg', '.png', '.svg', '.gif', '.jpeg', '.tiff', '.bmp',
    )

    def get_readable_content(self, url):
        """Download ``url`` and return its readability-extracted article HTML."""
        self.log('get_readable_content(' + url + ')')
        br = self.get_browser()
        f = br.open(url)
        try:
            html = f.read()
        finally:
            # Close the response even when read() fails.
            f.close()
        return self.extract_readable_article(html, url)

    def get_hn_content(self, url):
        """Build a standalone HTML page from a Hacker News item page.

        The page contains the story title, a link to the story, the subtext
        line, the self-post text (when the cell holds no form) and every
        comment, indented according to its nesting depth.

        NOTE(review): the HTML tags in the string literals below were
        reconstructed; confirm them against the upstream recipe.
        """
        self.log('get_hn_content(' + url + ')')
        soup = self.index_to_soup(url)
        # First cell of the third top-level row holds the story and comments.
        main = soup.find('tr').findNextSiblings('tr', limit=2)[1].td

        title_element = main.select('td.title .titleline a')[0]
        title = self.tag_to_string(title_element)
        link = title_element['href']
        if link.startswith('item?'):
            # Self posts use a relative link back to the item page.
            link = 'https://news.ycombinator.com/' + link
        # Drop the scheme so the visible link text stays short.
        readable_link = link.rpartition('http://')[2].rpartition('https://')[2]
        subtext = self.tag_to_string(main.find('td', 'subtext'))

        title_row = main.find('td', 'title').findParent('tr')
        title_content_td = title_row.findNextSiblings(
            'tr', limit=3)[2].findAll('td', limit=2)[1]
        title_content = u''
        # A cell containing a form is skipped rather than treated as post text.
        if not title_content_td.find('form'):
            title_content_td.name = 'div'
            title_content = title_content_td.prettify()

        # '?' is escaped: the unescaped pattern made the 'y' optional instead
        # of matching the literal question mark of 'reply?id=...'.
        reply_href = re.compile(r'^reply\?')

        comment_parts = []
        for td in main.findAll('td', 'default'):
            comhead = td.find('span', 'comhead')
            if not comhead:
                continue
            com_title = (
                u'<h4>'
                + self.tag_to_string(comhead).replace(' | link', '')
                + u'</h4>'
            )
            comhead.parent.extract()
            br = td.find('br')
            if br:
                br.extract()
            reply = td.find('a', attrs={'href': reply_href})
            if reply:
                reply.parent.extract()
            td.name = 'div'
            # The first cell of the row carries an image whose width encodes
            # the nesting depth; floor division keeps the CSS value integral.
            indent_width = (int(td.parent.find('td').img['width']) * 2) // 3
            td['style'] = 'padding-left: ' + str(indent_width) + 'px'
            comment_parts.append(com_title + td.prettify())
        comments = u''.join(comment_parts)

        body = (
            u'<h3>' + title + u'</h3>'
            + u'<p><a href="' + link + u'">' + readable_link + u'</a><br/>'
            + u'<strong>' + subtext + u'</strong></p>'
            + title_content + u'<br/>'
        )
        body = body + comments
        return (
            u'<html><title>' + title + u'</title><body>'
            + body + u'</body></html>'
        )

    def parse_feeds(self):
        """Parse the feeds and remember the first feed's articles."""
        parsed = super().parse_feeds()
        # Guard against an empty result instead of raising IndexError.
        self.hn_articles = parsed[0].articles if parsed else []
        return parsed

    def get_obfuscated_article(self, url):
        """Write the article content for ``url`` to a temp file; return its path.

        Hacker News item pages go through get_hn_content(), image URLs are
        wrapped in a minimal HTML page, and anything else goes through
        get_readable_content().
        """
        self.log('get_obfuscated_article with url=' + url)
        if url.startswith('https://news.ycombinator.com'):
            content = self.get_hn_content(url)
        # TODO: use content-type header instead of url
        elif url.lower().endswith(self.IMAGE_EXTENSIONS):
            self.log('using image_content (' + url + ')')
            # NOTE(review): image wrapper markup reconstructed; confirm.
            content = u'<html><body><img src="' + url + u'"></body></html>'
        else:
            content = self.get_readable_content(url)

        if not isinstance(content, bytes):
            content = content.encode('utf-8')

        tmp = PersistentTemporaryFile('_fa.html')
        self.temp_files.append(tmp)
        try:
            tmp.write(content)
        finally:
            tmp.close()
        return tmp.name

    def is_link_wanted(self, url, tag):
        """Return False for links ending in ``.pdf``, True otherwise."""
        return not url.endswith('.pdf')

    def prettyify_url(self, url):
        """Return the hostname component of ``url``."""
        return urlparse(url).hostname

    def populate_article_metadata(self, article, soup, first):
        """Use the article URL's hostname as both summary fields."""
        article.text_summary = self.prettyify_url(article.url)
        article.summary = article.text_summary

    # Debugging aid: uncomment to build the book from a single known item.
    # def parse_index(self):
    #     feeds = []
    #     feeds.append((u'Hacker News',[{'title': 'Testing', 'url': 'https://news.ycombinator.com/item?id=2935944'}]))
    #     return feeds