mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-11-28 09:15:02 -05:00
87 lines
2.8 KiB
Python
87 lines
2.8 KiB
Python
#!/usr/bin/env python
|
|
|
|
__license__ = 'GPL v3'
|
|
'''
|
|
Hacker News
|
|
'''
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
from calibre.ptempfile import PersistentTemporaryFile
|
|
from urlparse import urlparse
|
|
|
|
class HackerNews(BasicNewsRecipe):
    '''
    News recipe for the Hacker News RSS feed.

    ``articles_are_obfuscated`` is enabled, so the content of each linked
    article is provided by :meth:`get_obfuscated_article`, which writes the
    HTML to use for a link into a temporary file and returns that file's
    name (see the BasicNewsRecipe API documentation for the exact contract).
    '''

    # Recipe metadata and BasicNewsRecipe settings.
    title = 'Hacker News'
    __author__ = 'Tom Scholl'
    description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
    publisher = 'Y Combinator'
    category = 'news, programming, it, technology'
    masthead_url = 'http://i55.tinypic.com/2u6io76.png'
    cover_url = 'http://i55.tinypic.com/2u6io76.png'
    delay = 1
    max_articles_per_feed = 30
    use_embedded_content = False
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'en'
    requires_version = (0, 8, 16)

    feeds = [
        (u'Hacker News', 'http://news.ycombinator.com/rss')
    ]

    # Temporary files written by get_obfuscated_article().
    # NOTE(review): this is a class-level list, so it is shared by every
    # instance of the recipe.
    temp_files = []
    articles_are_obfuscated = True

    # URL prefixes of pages served by Hacker News itself. Both schemes are
    # listed so that https links are not mistaken for external articles.
    _hn_url_prefixes = (
        'http://news.ycombinator.com',
        'https://news.ycombinator.com',
    )

    # URL suffixes (lower case) that are treated as direct links to an image.
    _image_extensions = ('.jpg', '.png', '.svg', '.gif', '.jpeg', '.tiff', '.bmp')

    def _read_url(self, url):
        '''
        Download `url` with the recipe's browser and return the response body
        exactly as returned by ``read()``.
        '''
        response = self.get_browser().open(url)
        try:
            return response.read()
        finally:
            # Release the connection even when read() raises.
            response.close()

    def get_readable_content(self, url):
        '''
        Download `url` and return the result of passing the page through
        ``extract_readable_article()``.
        '''
        self.log('get_readable_content(' + url + ')')
        return self.extract_readable_article(self._read_url(url), url)

    def get_hn_content(self, url):
        '''
        Download a page hosted on Hacker News and return it unmodified.
        '''
        self.log('get_hn_content(' + url + ')')
        # this could be improved
        return self._read_url(url)

    def get_obfuscated_article(self, url):
        '''
        Store the content to use for `url` in a temporary HTML file.

        * URLs on news.ycombinator.com (http or https) are stored as
          downloaded.
        * URLs ending in a known image extension (compared case
          insensitively) are not downloaded here; a minimal HTML page
          containing only an ``<img>`` tag pointing at the URL is stored.
        * Everything else is downloaded and reduced with
          :meth:`get_readable_content`.

        :param url: The article URL taken from the feed.
        :return: The name of the temporary file holding the content.
        '''
        if url.startswith(self._hn_url_prefixes):
            content = self.get_hn_content(url)
        # TODO: use content-type header instead of url
        elif url.lower().endswith(self._image_extensions):
            self.log('using image_content (' + url + ')')
            # Percent-encode the characters that could terminate the
            # attribute or the tag, so a hostile feed URL cannot inject
            # markup. Percent-encoding keeps the URL equivalent and does not
            # depend on how the generated HTML is parsed later.
            safe_url = url.replace('"', '%22').replace('<', '%3C').replace('>', '%3E')
            content = u'<html><body><img src="' + safe_url + u'"></body></html>'
        else:
            content = self.get_readable_content(url)

        # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
        # Text is encoded explicitly so that writing never depends on an
        # implicit (ASCII) conversion.
        # NOTE(review): assumes PersistentTemporaryFile opens the file in
        # binary mode by default -- confirm against calibre.ptempfile.
        if isinstance(content, type(u'')):
            content = content.encode('utf-8')

        temp_file = PersistentTemporaryFile('_fa.html')
        # Registered before writing, as in the original ordering, so the file
        # is tracked even if the write fails.
        self.temp_files.append(temp_file)
        try:
            temp_file.write(content)
        finally:
            temp_file.close()
        return temp_file.name

    def is_link_wanted(self, url, tag):
        '''
        Return False for links to PDF files (any letter case), True for all
        other links. `tag` is not used.
        '''
        return not url.lower().endswith('.pdf')

    def prettyify_url(self, url):
        '''
        Return the host name of `url`, or `url` itself when no host name can
        be parsed from it (``hostname`` is None in that case).
        '''
        return urlparse(url).hostname or url

    def populate_article_metadata(self, article, soup, first):
        '''
        Use the host name of the article's URL as both its text summary and
        its summary. `soup` and `first` are not used.
        '''
        article.text_summary = self.prettyify_url(article.url)
        article.summary = article.text_summary
|
|
|
|
|