Give me something to read and Let's get Critical by Barty

Kovid Goyal 2011-11-24 09:52:35 +05:30
parent d52a9ded1f
commit c512404062
4 changed files with 202 additions and 22 deletions

@@ -0,0 +1,90 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe


class GiveMeSomethingToRead(BasicNewsRecipe):
    title = u'Give Me Something To Read'
    description = 'Curation / aggregation of articles on diverse topics'
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup = True

    INDEX = 'http://givemesomethingtoread.com'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, system name, max number of articles to load)
        ('The Arts', 'arts', 25),
        ('Science', 'science', 30),
        ('Technology', 'technology', 30),
        ('Politics', 'politics', 20),
        ('Media', 'media', 30),
        ('Crime', 'crime', 15),
        ('Other articles', '', 10)
    ]

    def parse_index(self):
        self.cover_url = 'http://thegretchenshow.files.wordpress.com/2009/12/well-read-cat-small.jpg'
        feeds = []
        seen_urls = set([])
        regex = re.compile(r'http://(www\.)?([^/:]+)', re.I)

        for category in self.CATEGORIES:
            (cat_name, tag, max_articles) = category
            tagurl = '' if tag == '' else '/tagged/' + tag
            self.log('Reading category:', cat_name)
            articles = []
            pageno = 1

            while len(articles) < max_articles and pageno < 100:
                page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
                pageno += 1
                self.log('\tReading page:', page)
                try:
                    soup = self.index_to_soup(page)
                except:
                    break

                headers = soup.findAll('h2')
                if len(headers) == 0:
                    break

                for header in headers:
                    atag = header.find('a')
                    url = atag['href']
                    # skip promotionals and duplicates
                    if url.startswith('http://givemesomethingtoread') or url.startswith('/') or url in seen_urls:
                        continue
                    seen_urls.add(url)
                    title = self.tag_to_string(header)
                    self.log('\tFound article:', title)
                    #self.log('\t', url)
                    desc = header.parent.find('blockquote')
                    desc = self.tag_to_string(desc) if desc else ''
                    m = regex.match(url)
                    if m:
                        desc = "[%s] %s" % (m.group(2), desc)
                    #self.log('\t', desc)
                    date = ''
                    p = header.parent.previousSibling
                    # navigate up to find h3, which contains the date
                    while p:
                        if hasattr(p, 'name') and p.name == 'h3':
                            date = self.tag_to_string(p)
                            break
                        p = p.previousSibling
                    articles.append({'title': title, 'url': url, 'description': desc, 'date': date})
                    if len(articles) >= max_articles:
                        break

            if articles:
                feeds.append((cat_name, articles))

        return feeds
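
Both new recipes share the same pagination idiom: page 1 is the bare category URL and later pages append /page/N. A minimal standalone sketch of just that URL construction, mirroring parse_index above (the page_url helper name is illustrative, not part of the recipe API):

INDEX = 'http://givemesomethingtoread.com'

def page_url(tag, pageno):
    # illustrative helper, not part of BasicNewsRecipe; mirrors the logic above
    tagurl = '' if tag == '' else '/tagged/' + tag
    # page 1 is the bare tag URL; later pages append /page/N
    return INDEX + tagurl if pageno == 1 else '%s%s/page/%d' % (INDEX, tagurl, pageno)

assert page_url('science', 1) == 'http://givemesomethingtoread.com/tagged/science'
assert page_url('science', 3) == 'http://givemesomethingtoread.com/tagged/science/page/3'
assert page_url('', 2) == 'http://givemesomethingtoread.com/page/2'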

@@ -0,0 +1,94 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe


class LetsGetCritical(BasicNewsRecipe):
    title = u"Let's Get Critical"
    description = 'Curation / aggregation of criticisms of the arts and culture'
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup = True

    INDEX = 'http://www.letsgetcritical.org'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, system name, max number of articles to load)
        ('Architecture', 'architecture', 30),
        ('Art', 'art', 30),
        ('Books', 'books', 30),
        ('Design', 'design', 30),
        ('Digital', 'digital', 30),
        ('Food', 'food', 30),
        ('Movies', 'movies', 30),
        ('Music', 'music', 30),
        ('Television', 'television', 30),
        ('Other articles', '', 10)
    ]

    def parse_index(self):
        self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
        feeds = []
        seen_urls = set([])
        regex = re.compile(r'http://(www\.)?([^/:]+)', re.I)

        for category in self.CATEGORIES:
            (cat_name, tag, max_articles) = category
            tagurl = '' if tag == '' else '/category/' + tag.lower()
            self.log('Reading category:', cat_name)
            articles = []
            pageno = 1

            while len(articles) < max_articles and pageno < 100:
                page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
                pageno += 1
                self.log('\tReading page:', page)
                try:
                    soup = self.index_to_soup(page)
                except:
                    break

                posts = soup.findAll('div', attrs={'class': 'post_multi'})
                if len(posts) == 0:
                    break

                for post in posts:
                    dt = post.find('div', attrs={'class': 'title'})
                    atag = dt.find('a')
                    url = atag['href']
                    # skip promotionals and duplicates
                    if url.startswith('http://letsgetcritical') or url.startswith('/') or url in seen_urls:
                        continue
                    seen_urls.add(url)
                    title = self.tag_to_string(atag)
                    self.log('\tFound article:', title)
                    self.log('\t', url)
                    desc = post.find('blockquote')
                    desc = self.tag_to_string(desc) if desc else ''
                    m = regex.match(url)
                    if m:
                        desc = "[%s] %s" % (m.group(2), desc)
                    #self.log('\t', desc)
                    date = ''
                    p = post.previousSibling
                    # navigate up the siblings to find the date div; guard with
                    # hasattr(p, 'name') so bare text nodes are skipped
                    while p:
                        if hasattr(p, 'name') and p.get('class') == 'singledate':
                            date = self.tag_to_string(p)
                            break
                        p = p.previousSibling
                    articles.append({'title': title, 'url': url, 'description': desc, 'date': date})
                    if len(articles) >= max_articles:
                        break

            if articles:
                feeds.append((cat_name, articles))

        return feeds
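
Both recipes recover each post's date by walking backwards through the preceding siblings of the post node until a date element turns up. A self-contained illustration of that walk (bs4 is used here for convenience; the recipes above run on calibre's bundled BeautifulSoup, where the spelling is previousSibling):

from bs4 import BeautifulSoup

html = '<h3>Nov 24, 2011</h3><div class="post"><h2><a href="#">A post</a></h2></div>'
soup = BeautifulSoup(html, 'html.parser')
post = soup.find('div', attrs={'class': 'post'})
p = post.previous_sibling
while p:
    # only tags carry a meaningful name; text nodes are skipped
    if hasattr(p, 'name') and p.name == 'h3':
        print(p.get_text())  # -> Nov 24, 2011
        break
    p = p.previous_sibling

Either recipe can be smoke-tested from the command line with ebook-convert, e.g. ebook-convert myfile.recipe out.epub --test, which fetches only a couple of articles per feed.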

@@ -6,11 +6,7 @@ www.nin.co.rs
'''
import re
-from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
-from contextlib import closing
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre import entity_to_unicode

class Nin(BasicNewsRecipe):
    title = 'NIN online'

@@ -81,7 +77,7 @@ class Nin(BasicNewsRecipe):
        return cover_url

    feeds = [(u'NIN Online', u'http://www.nin.co.rs/misc/rss.php?feed=RSS2.0')]

    def get_article_url(self, article):
        url = BasicNewsRecipe.get_article_url(self, article)
        return url.replace('.co.yu', '.co.rs')

@@ -11,7 +11,7 @@ import re
import urllib2
from contextlib import closing
-from lxml import etree, html
+from lxml import etree
from PyQt4.Qt import QUrl
from calibre import browser, url_slash_cleaner, prints

@@ -25,18 +25,18 @@ from calibre.gui2.store.web_store_dialog import WebStoreDialog
class LitResStore(BasicStoreConfig, StorePlugin):
    shop_url = u'http://www.litres.ru'
    #http://robot.litres.ru/pages/biblio_book/?art=174405

    def open(self, parent=None, detail_item=None, external=False):
        aff_id = u'?' + _get_affiliate_id()
        url = self.shop_url + aff_id
        detail_url = None
        if detail_item:
            # http://www.litres.ru/pages/biblio_book/?art=157074
            detail_url = self.shop_url + u'/pages/biblio_book/' + aff_id +\
                u'&art=' + urllib2.quote(detail_item)
        if external or self.config.get('open_external', False):
            open_url(QUrl(url_slash_cleaner(detail_url if detail_url else url)))
        else:

@@ -44,28 +44,28 @@ class LitResStore(BasicStoreConfig, StorePlugin):
            d.setWindowTitle(self.name)
            d.set_tags(self.config.get('tags', ''))
            d.exec_()

    def search(self, query, max_results=10, timeout=60):
        search_url = u'http://robot.litres.ru/pages/catalit_browser/?checkpoint=2000-01-02&'\
            'search=%s&limit=0,%s'
        search_url = search_url % (urllib2.quote(query), max_results)
        counter = max_results

        br = browser()
        br.addheaders.append(['Accept-Encoding', 'gzip'])
        with closing(br.open(search_url, timeout=timeout)) as r:
            ungzipResponse(r, br)
            raw = xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]
            parser = etree.XMLParser(recover=True, no_network=True)
            doc = etree.fromstring(raw, parser=parser)
            for data in doc.xpath('//*[local-name() = "fb2-book"]'):
                if counter <= 0:
                    break
                counter -= 1
                try:
                    sRes = self.create_search_result(data)
                except Exception as e:

@@ -75,10 +75,10 @@ class LitResStore(BasicStoreConfig, StorePlugin):
    def get_details(self, search_result, timeout=60):
        pass

    def create_search_result(self, data):
        xp_template = 'normalize-space(@{0})'
        sRes = SearchResult()
        sRes.drm = SearchResult.DRM_UNLOCKED
        sRes.detail_item = data.xpath(xp_template.format('hub_id'))

@@ -92,7 +92,7 @@ class LitResStore(BasicStoreConfig, StorePlugin):
        # cover vs cover_preview
        sRes.cover_url = data.xpath(xp_template.format('cover_preview'))
        sRes.price = format_price_in_RUR(sRes.price)

        types = data.xpath('//fb2-book//files/file/@type')
        fmt_set = _parse_ebook_formats(' '.join(types))
        sRes.formats = ', '.join(fmt_set)

@@ -134,8 +134,8 @@ def _get_affiliate_id():
def _parse_ebook_formats(formatsStr):
    '''
    Creates a set with displayable names of the formats
    :param formatsStr: string with comma separated book formats
        as it provided by ozon.ru
    :return: a list with displayable book formats
    '''

@@ -166,4 +166,4 @@ def _parse_ebook_formats(formatsStr):
        formats.add('LRF')
    if 'jar' in formatsUnstruct:
        formats.add('JAR')
    return formats
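
The search path above leans on two lxml behaviors worth noting: XMLParser(recover=True) keeps parsing malformed XML instead of raising, and the local-name()/normalize-space() XPath functions make the attribute extraction indifferent to namespaces and stray whitespace. A self-contained sketch against a made-up document (the real server returns a catalit catalog of fb2-book elements):

from lxml import etree

raw = '<catalog xmlns="urn:example"><fb2-book hub_id=" 157074 "/></catalog>'
parser = etree.XMLParser(recover=True, no_network=True)
doc = etree.fromstring(raw, parser=parser)
for data in doc.xpath('//*[local-name() = "fb2-book"]'):
    # normalize-space(@attr) yields the attribute value as a stripped string
    print(data.xpath('normalize-space(@hub_id)'))  # -> 157074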