Give me something to read and Let's get Critical by Barty

This commit is contained in:
Kovid Goyal 2011-11-24 09:52:35 +05:30
parent d52a9ded1f
commit c512404062
4 changed files with 202 additions and 22 deletions

View File

@ -0,0 +1,90 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class GiveMeSomethingToRead(BasicNewsRecipe):
title = u'Give Me Something To Read'
description = 'Curation / aggregation of articles on diverse topics'
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = False
timefmt = ' [%a, %d %b, %Y]'
oldest_article = 365
auto_cleanup = True
INDEX = 'http://givemesomethingtoread.com'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, system name, max number of articles to load)
('The Arts','arts',25),
('Science','science',30),
('Technology','technology',30),
('Politics','politics',20),
('Media','media',30),
('Crime','crime',15),
('Other articles','',10)
]
def parse_index(self):
self.cover_url = 'http://thegretchenshow.files.wordpress.com/2009/12/well-read-cat-small.jpg'
feeds = []
seen_urls = set([])
regex = re.compile( r'http://(www\.)?([^/:]+)', re.I)
for category in self.CATEGORIES:
(cat_name, tag, max_articles) = category
tagurl = '' if tag=='' else '/tagged/'+tag
self.log('Reading category:', cat_name)
articles = []
pageno = 1
while len(articles) < max_articles and pageno < 100:
page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
pageno += 1
self.log('\tReading page:', page)
try:
soup = self.index_to_soup(page)
except:
break
headers = soup.findAll('h2')
if len(headers) == .0:
break
for header in headers:
atag = header.find('a')
url = atag['href']
# skip promotionals and duplicate
if url.startswith('http://givemesomethingtoread') or url.startswith('/') or url in seen_urls:
continue
seen_urls.add(url)
title = self.tag_to_string(header)
self.log('\tFound article:', title)
#self.log('\t', url)
desc = header.parent.find('blockquote')
desc = self.tag_to_string(desc) if desc else ''
m = regex.match( url)
if m:
desc = "[%s] %s" % (m.group(2), desc)
#self.log('\t', desc)
date = ''
p = header.parent.previousSibling
# navigate up to find h3, which contains the date
while p:
if hasattr(p,'name') and p.name == 'h3':
date = self.tag_to_string(p)
break
p = p.previousSibling
articles.append({'title':title,'url':url,'description':desc,'date':date})
if len(articles) >= max_articles:
break
if articles:
feeds.append((cat_name, articles))
return feeds

View File

@ -0,0 +1,94 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class LetsGetCritical(BasicNewsRecipe):
title = u"Let's Get Critical"
description = 'Curation / aggregation of criticisms of the arts and culture '
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = False
timefmt = ' [%a, %d %b, %Y]'
oldest_article = 365
auto_cleanup = True
INDEX = 'http://www.letsgetcritical.org'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, system name, max number of articles to load)
('Architecture','architecture',30),
('Art','art',30),
('Books','books',30),
('Design','design',30),
('Digital','digital',30),
('Food','food',30),
('Movies','movies',30),
('Music','music',30),
('Television','television',30),
('Other articles','',10)
]
def parse_index(self):
self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
feeds = []
seen_urls = set([])
regex = re.compile( r'http://(www\.)?([^/:]+)', re.I)
for category in self.CATEGORIES:
(cat_name, tag, max_articles) = category
tagurl = '' if tag=='' else '/category/'+tag.lower()
self.log('Reading category:', cat_name)
articles = []
pageno = 1
while len(articles) < max_articles and pageno < 100:
page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
pageno += 1
self.log('\tReading page:', page)
try:
soup = self.index_to_soup(page)
except:
break
posts = soup.findAll('div',attrs={'class':'post_multi'})
if len(posts) == 0:
break
for post in posts:
dt = post.find('div',attrs={'class':'title'})
atag = dt.find('a')
url = atag['href']
# skip promotionals and duplicate
if url.startswith('http://letsgetcritical') or url.startswith('/') or url in seen_urls:
continue
seen_urls.add(url)
title = self.tag_to_string(atag)
self.log('\tFound article:', title)
self.log('\t', url)
desc = post.find('blockquote')
desc = self.tag_to_string(desc) if desc else ''
m = regex.match( url)
if m:
desc = "[%s] %s" % (m.group(2), desc)
#self.log('\t', desc)
date = ''
p = post.previousSibling
# navigate up sibling to find date
while p:
if hasattr(p,'class') and p['class'] == 'singledate':
date = self.tag_to_string(p)
break
p = p.previousSibling
articles.append({'title':title,'url':url,'description':desc,'date':date})
if len(articles) >= max_articles:
break
if articles:
feeds.append((cat_name, articles))
return feeds

View File

@ -6,11 +6,7 @@ www.nin.co.rs
''' '''
import re import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from contextlib import closing
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre import entity_to_unicode
class Nin(BasicNewsRecipe): class Nin(BasicNewsRecipe):
title = 'NIN online' title = 'NIN online'

View File

@ -11,7 +11,7 @@ import re
import urllib2 import urllib2
from contextlib import closing from contextlib import closing
from lxml import etree, html from lxml import etree
from PyQt4.Qt import QUrl from PyQt4.Qt import QUrl
from calibre import browser, url_slash_cleaner, prints from calibre import browser, url_slash_cleaner, prints