mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Updated recipe for Hacker News
This commit is contained in:
parent
ecabc13152
commit
02f58a2509
@ -5,8 +5,10 @@ __license__ = 'GPL v3'
|
|||||||
Hacker News
|
Hacker News
|
||||||
'''
|
'''
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
from calibre.ebooks.BeautifulSoup import Tag
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
from calibre.ptempfile import PersistentTemporaryFile
|
||||||
from urlparse import urlparse
|
from urlparse import urlparse
|
||||||
|
import re
|
||||||
|
|
||||||
class HackerNews(BasicNewsRecipe):
|
class HackerNews(BasicNewsRecipe):
|
||||||
title = 'Hacker News'
|
title = 'Hacker News'
|
||||||
@ -14,8 +16,8 @@ class HackerNews(BasicNewsRecipe):
|
|||||||
description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
|
description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
|
||||||
publisher = 'Y Combinator'
|
publisher = 'Y Combinator'
|
||||||
category = 'news, programming, it, technology'
|
category = 'news, programming, it, technology'
|
||||||
masthead_url = 'http://i55.tinypic.com/2u6io76.png'
|
masthead_url = 'http://img585.imageshack.us/img585/5011/hnle.png'
|
||||||
cover_url = 'http://i55.tinypic.com/2u6io76.png'
|
cover_url = 'http://img585.imageshack.us/img585/5011/hnle.png'
|
||||||
delay = 1
|
delay = 1
|
||||||
max_articles_per_feed = 30
|
max_articles_per_feed = 30
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
@ -42,13 +44,43 @@ class HackerNews(BasicNewsRecipe):
|
|||||||
|
|
||||||
def get_hn_content(self, url):
    """Render a Hacker News item (discussion) page as a standalone HTML page.

    The page at ``url`` is fetched and parsed with ``self.index_to_soup``.
    The story title, its link, the subtext line, the optional story text
    and the comments are pulled out of the table layout and reassembled
    into a small HTML document.

    :param url: address of the Hacker News item page.
    :return: unicode HTML document containing the story and its comments.
    """
    self.log('get_hn_content(' + url + ')')
    soup = self.index_to_soup(url)
    # NOTE(review): relies on the page layout -- the item content is taken
    # from the first cell of the third sibling <tr> of the first <tr>.
    main = soup.find('tr').findNextSiblings('tr', limit=2)[1].td

    title_td = main.find('td', 'title')
    title = self.tag_to_string(title_td)
    link = title_td.find('a')['href']
    if link.startswith('item?'):
        # Relative link back to Hacker News itself: make it absolute.
        link = 'http://news.ycombinator.com/' + link
    # Link text shown to the reader, without the URL scheme.
    readable_link = link.rpartition('http://')[2].rpartition('https://')[2]
    subtext = self.tag_to_string(main.find('td', 'subtext'))

    # Second cell of the third row after the title row; it holds either
    # the story text or a form, and only the former is kept.
    title_content_td = title_td.findParent('tr').findNextSiblings(
        'tr', limit=3)[2].findAll('td', limit=2)[1]
    title_content = u''
    if not title_content_td.find('form'):
        title_content_td.name = 'div'
        title_content = title_content_td.prettify()

    # Compiled once outside the loop; '?' is escaped so that it matches the
    # literal "reply?" prefix instead of acting as a regex quantifier.
    reply_href = re.compile(r'^reply\?')

    comment_parts = []
    for td in main.findAll('td', 'default'):
        comhead = td.find('span', 'comhead')
        if not comhead:
            # No comment header in this cell: nothing to title, skip it.
            continue
        com_title = (u'<h4>' +
                     self.tag_to_string(comhead).replace(' | link', '') +
                     u'</h4>')
        comhead.parent.extract()
        line_break = td.find('br')
        if line_break:
            line_break.extract()
        reply = td.find('a', attrs={'href': reply_href})
        if reply:
            reply.parent.extract()
        td.name = 'div'
        # The width of the image in the row's first cell is scaled to 2/3
        # and used as left padding. Fall back to no indent rather than
        # failing the whole article if that image or its width is missing.
        try:
            indent_width = (int(td.parent.find('td').img['width']) * 2) // 3
        except (AttributeError, KeyError, TypeError, ValueError):
            indent_width = 0
        td['style'] = 'padding-left: ' + str(indent_width) + 'px'
        comment_parts.append(com_title + td.prettify())
    comments = u''.join(comment_parts)

    body = (u'<h3>' + title + u'</h3><p><a href="' + link + u'">' +
            readable_link + u'</a><br/><strong>' + subtext +
            u'</strong></p>' + title_content + u'<br/>')
    body = body + comments
    return u'<html><title>' + title + u'</title><body>' + body + '</body></html>'
|
||||||
|
|
||||||
def get_obfuscated_article(self, url):
|
def get_obfuscated_article(self, url):
|
||||||
if url.startswith('http://news.ycombinator.com'):
|
if url.startswith('http://news.ycombinator.com'):
|
||||||
content = self.get_hn_content(url)
|
content = self.get_hn_content(url)
|
||||||
@ -65,6 +97,10 @@ class HackerNews(BasicNewsRecipe):
|
|||||||
content = u'<html><body><img src="' + url + u'"></body></html>'
|
content = u'<html><body><img src="' + url + u'"></body></html>'
|
||||||
else:
|
else:
|
||||||
content = self.get_readable_content(url)
|
content = self.get_readable_content(url)
|
||||||
|
|
||||||
|
f = open('/tmp/hn_content.html', 'w')
|
||||||
|
f.write(content)
|
||||||
|
f.close()
|
||||||
|
|
||||||
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
|
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
|
||||||
self.temp_files[-1].write(content)
|
self.temp_files[-1].write(content)
|
||||||
@ -82,5 +118,11 @@ class HackerNews(BasicNewsRecipe):
|
|||||||
def populate_article_metadata(self, article, soup, first):
    """Set the article's summaries to the prettified form of its URL.

    ``soup`` and ``first`` are accepted as part of the method signature
    but are not used here.
    """
    pretty_url = self.prettyify_url(article.url)
    article.text_summary = pretty_url
    # Both summary fields carry the same text.
    article.summary = article.text_summary
|
||||||
|
|
||||||
|
# def parse_index(self):
|
||||||
|
# feeds = []
|
||||||
|
# feeds.append((u'Hacker News',[{'title': 'Testing', 'url': 'http://news.ycombinator.com/item?id=2935944'}]))
|
||||||
|
# return feeds
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user