mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
Give me something to read and Let's get Critical by Barty
This commit is contained in:
parent
d52a9ded1f
commit
c512404062
90
recipes/givemesomethingtoread.recipe
Normal file
90
recipes/givemesomethingtoread.recipe
Normal file
@ -0,0 +1,90 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class GiveMeSomethingToRead(BasicNewsRecipe):
    """Recipe that scrapes the article links curated on givemesomethingtoread.com.

    The index is walked page by page, one configured category at a time, and
    every category that yields at least one article becomes a feed.
    """

    title = u'Give Me Something To Read'
    description = 'Curation / aggregation of articles on diverse topics'
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup = True
    INDEX = 'http://givemesomethingtoread.com'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, system name, max number of articles to load)
        ('The Arts', 'arts', 25),
        ('Science', 'science', 30),
        ('Technology', 'technology', 30),
        ('Politics', 'politics', 20),
        ('Media', 'media', 30),
        ('Crime', 'crime', 15),
        ('Other articles', '', 10),
    ]

    def _preceding_date(self, node):
        """Return the text of the nearest preceding sibling <h3> of *node*, or ''."""
        sibling = node.previousSibling
        # Compare against None explicitly: an empty text node is falsy and
        # would otherwise end the walk before the <h3> is reached.
        while sibling is not None:
            # Text nodes may not expose a usable ``name``; getattr keeps the
            # check safe for them.
            if getattr(sibling, 'name', None) == 'h3':
                return self.tag_to_string(sibling)
            sibling = sibling.previousSibling
        return ''

    def parse_index(self):
        """Scrape the category index pages and build the feed list.

        Returns:
            A list of ``(category name, articles)`` tuples. Each article is a
            dict with the keys ``title``, ``url``, ``description`` and
            ``date``. Categories for which no article was found are omitted.
        """
        self.cover_url = 'http://thegretchenshow.files.wordpress.com/2009/12/well-read-cat-small.jpg'
        feeds = []
        # Shared across categories so an article listed under several tags is
        # only included once (in the first category that lists it).
        seen_urls = set()
        # group(2) is the host name without a leading "www."; used to prefix
        # the description with the article's source site.
        host_re = re.compile(r'https?://(www\.)?([^/:]+)', re.I)

        for cat_name, tag, max_articles in self.CATEGORIES:
            tagurl = '/tagged/' + tag if tag else ''
            self.log('Reading category:', cat_name)

            articles = []
            pageno = 1

            # pageno < 100 is a safety cap so a misbehaving site cannot make
            # the recipe page forever.
            while len(articles) < max_articles and pageno < 100:
                page = self.INDEX + tagurl
                if pageno > 1:
                    page = '%s/page/%d' % (page, pageno)
                pageno += 1

                self.log('\tReading page:', page)
                try:
                    soup = self.index_to_soup(page)
                except Exception as err:
                    # A failed fetch usually means we ran past the last page;
                    # keep what was collected so far for this category.
                    self.log('\tCould not read page, stopping category:', err)
                    break

                headers = soup.findAll('h2')
                if not headers:
                    break

                for header in headers:
                    atag = header.find('a')
                    if atag is None:
                        # heading without a link: nothing to download
                        continue
                    url = atag.get('href')
                    if not url:
                        continue
                    # skip promotionals and duplicate
                    if url.startswith(('http://givemesomethingtoread', '/')) or url in seen_urls:
                        continue
                    seen_urls.add(url)

                    title = self.tag_to_string(header)
                    self.log('\tFound article:', title)

                    quote = header.parent.find('blockquote')
                    desc = self.tag_to_string(quote) if quote else ''
                    m = host_re.match(url)
                    if m:
                        desc = '[%s] %s' % (m.group(2), desc)

                    # the date lives in an <h3> that precedes the post container
                    date = self._preceding_date(header.parent)

                    articles.append({'title': title, 'url': url,
                                     'description': desc, 'date': date})
                    if len(articles) >= max_articles:
                        break

            if articles:
                feeds.append((cat_name, articles))

        return feeds
|
||||
|
94
recipes/letsgetcritical.recipe
Normal file
94
recipes/letsgetcritical.recipe
Normal file
@ -0,0 +1,94 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class LetsGetCritical(BasicNewsRecipe):
    """Recipe that scrapes the article links curated on letsgetcritical.org.

    The index is walked page by page, one configured category at a time, and
    every category that yields at least one article becomes a feed.
    """

    title = u"Let's Get Critical"
    description = 'Curation / aggregation of criticisms of the arts and culture '
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup = True
    INDEX = 'http://www.letsgetcritical.org'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, system name, max number of articles to load)
        ('Architecture', 'architecture', 30),
        ('Art', 'art', 30),
        ('Books', 'books', 30),
        ('Design', 'design', 30),
        ('Digital', 'digital', 30),
        ('Food', 'food', 30),
        ('Movies', 'movies', 30),
        ('Music', 'music', 30),
        ('Television', 'television', 30),
        ('Other articles', '', 10),
    ]

    @staticmethod
    def _has_class(node, wanted):
        """Return True if *node* is a tag whose class attribute contains *wanted*."""
        getter = getattr(node, 'get', None)
        if getter is None:
            # text nodes carry no attributes
            return False
        classes = getter('class')
        if not classes:
            return False
        # Depending on the parser the attribute is either a single string or
        # a list of class names; normalise to a list.
        if not isinstance(classes, (list, tuple)):
            classes = classes.split()
        return wanted in classes

    def _preceding_date(self, node):
        """Return the text of the nearest preceding 'singledate' sibling of *node*, or ''."""
        sibling = node.previousSibling
        # Compare against None explicitly: an empty text node is falsy and
        # would otherwise end the walk before the date element is reached.
        while sibling is not None:
            if self._has_class(sibling, 'singledate'):
                return self.tag_to_string(sibling)
            sibling = sibling.previousSibling
        return ''

    def parse_index(self):
        """Scrape the category index pages and build the feed list.

        Returns:
            A list of ``(category name, articles)`` tuples. Each article is a
            dict with the keys ``title``, ``url``, ``description`` and
            ``date``. Categories for which no article was found are omitted.
        """
        self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
        feeds = []
        # Shared across categories so an article filed under several
        # categories is only included once.
        seen_urls = set()
        # group(2) is the host name without a leading "www."; used to prefix
        # the description with the article's source site.
        host_re = re.compile(r'https?://(www\.)?([^/:]+)', re.I)

        for cat_name, tag, max_articles in self.CATEGORIES:
            tagurl = '/category/' + tag.lower() if tag else ''
            self.log('Reading category:', cat_name)

            articles = []
            pageno = 1

            # pageno < 100 is a safety cap so a misbehaving site cannot make
            # the recipe page forever.
            while len(articles) < max_articles and pageno < 100:
                page = self.INDEX + tagurl
                if pageno > 1:
                    page = '%s/page/%d' % (page, pageno)
                pageno += 1

                self.log('\tReading page:', page)
                try:
                    soup = self.index_to_soup(page)
                except Exception as err:
                    # A failed fetch usually means we ran past the last page;
                    # keep what was collected so far for this category.
                    self.log('\tCould not read page, stopping category:', err)
                    break

                posts = soup.findAll('div', attrs={'class': 'post_multi'})
                if not posts:
                    break

                for post in posts:
                    title_div = post.find('div', attrs={'class': 'title'})
                    atag = title_div.find('a') if title_div is not None else None
                    if atag is None:
                        # post without a titled link: nothing to download
                        continue
                    url = atag.get('href')
                    if not url:
                        continue
                    # skip promotionals and duplicate
                    if url.startswith(('http://letsgetcritical', '/')) or url in seen_urls:
                        continue
                    seen_urls.add(url)

                    title = self.tag_to_string(atag)
                    self.log('\tFound article:', title)
                    self.log('\t', url)

                    quote = post.find('blockquote')
                    desc = self.tag_to_string(quote) if quote else ''
                    m = host_re.match(url)
                    if m:
                        desc = '[%s] %s' % (m.group(2), desc)

                    # the date is in an earlier sibling marked class="singledate"
                    date = self._preceding_date(post)

                    articles.append({'title': title, 'url': url,
                                     'description': desc, 'date': date})
                    if len(articles) >= max_articles:
                        break

            if articles:
                feeds.append((cat_name, articles))

        return feeds
|
||||
|
@ -6,11 +6,7 @@ www.nin.co.rs
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre import strftime
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from contextlib import closing
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre import entity_to_unicode
|
||||
|
||||
class Nin(BasicNewsRecipe):
|
||||
title = 'NIN online'
|
||||
|
@ -11,7 +11,7 @@ import re
|
||||
import urllib2
|
||||
|
||||
from contextlib import closing
|
||||
from lxml import etree, html
|
||||
from lxml import etree
|
||||
from PyQt4.Qt import QUrl
|
||||
|
||||
from calibre import browser, url_slash_cleaner, prints
|
||||
|
Loading…
x
Reference in New Issue
Block a user