Give me something to read and Let's get Critical by Barty

2025-09-29 15:31:08 -04:00 · 2011-11-24 09:52:35 +05:30 · 2011-11-24 09:52:35 +05:30 · c512404062
commit c512404062
parent d52a9ded1f
4 changed files with 202 additions and 22 deletions
--- a/recipes/givemesomethingtoread.recipe
+++ b/recipes/givemesomethingtoread.recipe
@@ -0,0 +1,90 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class GiveMeSomethingToRead(BasicNewsRecipe):
    """Recipe that builds feeds from the givemesomethingtoread.com index pages.

    Each entry of CATEGORIES becomes one feed; the articles themselves are
    the external links found in the ``h2`` headings of the site's
    (paginated) listing pages.
    """
    title          = u'Give Me Something To Read'
    description    = 'Curation / aggregation of articles on diverse topics'
    language = 'en'
    __author__     = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt        = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup   = True
    # base URL that every listing page address is built from
    INDEX          = 'http://givemesomethingtoread.com'
    CATEGORIES     = [
        # comment out categories you don't want
        # (user friendly name, system name, max number of articles to load)
        # an empty system name means the untagged front-page listing
        ('The Arts','arts',25),
        ('Science','science',30),
        ('Technology','technology',30),
        ('Politics','politics',20),
        ('Media','media',30),
        ('Crime','crime',15),
        ('Other articles','',10)
        ]
    def parse_index(self):
        """Walk every category's listing pages and collect its articles.

        Returns a list of ``(category name, articles)`` tuples. Each article
        is a dict with the keys ``title``, ``url``, ``description`` and
        ``date``. Categories for which nothing was found are left out.

        A page that cannot be fetched, or that contains no ``h2`` headings,
        ends the scan of the current category; the remaining categories are
        still processed.
        """
        self.cover_url = 'http://thegretchenshow.files.wordpress.com/2009/12/well-read-cat-small.jpg'
        feeds = []
        # shared by all categories so an article is listed only once overall
        seen_urls = set()
        # group(2) captures the host name without a leading "www."
        regex = re.compile( r'http://(www\.)?([^/:]+)', re.I)
        for cat_name, tag, max_articles in self.CATEGORIES:
            tagurl = '/tagged/' + tag if tag else ''
            self.log('Reading category:', cat_name)
            articles = []
            pageno = 1
            # stop at the per-category quota, or after page 99 as a safety cap
            while len(articles) < max_articles and pageno < 100:
                page = self.INDEX + tagurl
                if pageno > 1:
                    # the first page has no /page/N suffix
                    page = "%s/page/%d" % (page, pageno)
                pageno += 1
                self.log('\tReading page:', page)
                try:
                    soup = self.index_to_soup(page)
                except Exception:
                    # best effort: give up on this category but keep the
                    # articles gathered so far (and do not trap Ctrl-C)
                    self.log('\tCould not read page, stopping category:', page)
                    break
                headers = soup.findAll('h2')
                if not headers:
                    break
                for header in headers:
                    atag = header.find('a')
                    url = atag.get('href') if atag is not None else None
                    if not url:
                        # heading without a usable link: nothing to download
                        continue
                    # skip promotionals and duplicate
                    if url.startswith(('http://givemesomethingtoread', '/')) or url in seen_urls:
                        continue
                    seen_urls.add(url)
                    title = self.tag_to_string(header)
                    self.log('\tFound article:', title)
                    desc = header.parent.find('blockquote')
                    desc = self.tag_to_string(desc) if desc else ''
                    m = regex.match(url)
                    if m:
                        # prefix the description with the source host name
                        desc = "[%s] %s" %  (m.group(2), desc)
                    date = ''
                    p = header.parent.previousSibling
                    # navigate up to find h3, which contains the date
                    while p is not None:
                        # nodes without a name attribute simply never match
                        if getattr(p, 'name', None) == 'h3':
                            date = self.tag_to_string(p)
                            break
                        p = p.previousSibling
                    articles.append({'title':title,'url':url,'description':desc,'date':date})
                    if len(articles) >= max_articles:
                        break
            if articles:
                feeds.append((cat_name, articles))
        return feeds
--- a/recipes/letsgetcritical.recipe
+++ b/recipes/letsgetcritical.recipe
@@ -0,0 +1,94 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class LetsGetCritical(BasicNewsRecipe):
    """Recipe that builds feeds from the letsgetcritical.org index pages.

    Each entry of CATEGORIES becomes one feed; the articles themselves are
    the links found in the ``div.title`` element of every ``div.post_multi``
    block on the site's (paginated) listing pages.
    """
    title          = u"Let's Get Critical"
    description    = 'Curation / aggregation of criticisms of the arts and culture '
    language = 'en'
    __author__     = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt        = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup   = True
    # base URL that every listing page address is built from
    INDEX          = 'http://www.letsgetcritical.org'
    CATEGORIES     = [
        # comment out categories you don't want
        # (user friendly name, system name, max number of articles to load)
        # an empty system name means the uncategorised front-page listing
        ('Architecture','architecture',30),
        ('Art','art',30),
        ('Books','books',30),
        ('Design','design',30),
        ('Digital','digital',30),
        ('Food','food',30),
        ('Movies','movies',30),
        ('Music','music',30),
        ('Television','television',30),
        ('Other articles','',10)
        ]
    def parse_index(self):
        """Walk every category's listing pages and collect its articles.

        Returns a list of ``(category name, articles)`` tuples. Each article
        is a dict with the keys ``title``, ``url``, ``description`` and
        ``date``. Categories for which nothing was found are left out.

        A page that cannot be fetched, or that contains no ``post_multi``
        blocks, ends the scan of the current category; the remaining
        categories are still processed.
        """
        self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
        feeds = []
        # shared by all categories so an article is listed only once overall
        seen_urls = set()
        # group(2) captures the host name without a leading "www."
        regex = re.compile( r'http://(www\.)?([^/:]+)', re.I)
        for cat_name, tag, max_articles in self.CATEGORIES:
            tagurl = '/category/' + tag.lower() if tag else ''
            self.log('Reading category:', cat_name)
            articles = []
            pageno = 1
            # stop at the per-category quota, or after page 99 as a safety cap
            while len(articles) < max_articles and pageno < 100:
                page = self.INDEX + tagurl
                if pageno > 1:
                    # the first page has no /page/N suffix
                    page = "%s/page/%d" % (page, pageno)
                pageno += 1
                self.log('\tReading page:', page)
                try:
                    soup = self.index_to_soup(page)
                except Exception:
                    # best effort: give up on this category but keep the
                    # articles gathered so far (and do not trap Ctrl-C)
                    self.log('\tCould not read page, stopping category:', page)
                    break
                posts = soup.findAll('div',attrs={'class':'post_multi'})
                if not posts:
                    break
                for post in posts:
                    dt = post.find('div',attrs={'class':'title'})
                    atag = dt.find('a') if dt is not None else None
                    url = atag.get('href') if atag is not None else None
                    if not url:
                        # post without a title link: nothing to download
                        continue
                    # skip promotionals and duplicate
                    # NOTE(review): INDEX uses the "www." host, which this
                    # prefix does not match -- confirm whether links back to
                    # www.letsgetcritical.org should be skipped as well.
                    if url.startswith(('http://letsgetcritical', '/')) or url in seen_urls:
                        continue
                    seen_urls.add(url)
                    title = self.tag_to_string(atag)
                    self.log('\tFound article:', title)
                    self.log('\t', url)
                    desc = post.find('blockquote')
                    desc = self.tag_to_string(desc) if desc else ''
                    m = regex.match(url)
                    if m:
                        # prefix the description with the source host name
                        desc = "[%s] %s" %  (m.group(2), desc)
                    date = ''
                    p = post.previousSibling
                    # navigate up sibling to find date
                    while p is not None:
                        # Look the attribute up with get() so that a sibling
                        # lacking a class attribute (or lacking get() at all)
                        # is skipped instead of raising.
                        get_attr = getattr(p, 'get', None)
                        cls = get_attr('class') if get_attr is not None else None
                        # the class value may be a plain string or a list
                        if cls == 'singledate' or (
                                isinstance(cls, (list, tuple)) and 'singledate' in cls):
                            date = self.tag_to_string(p)
                            break
                        p = p.previousSibling
                    articles.append({'title':title,'url':url,'description':desc,'date':date})
                    if len(articles) >= max_articles:
                        break
            if articles:
                feeds.append((cat_name, articles))
        return feeds
--- a/recipes/nin.recipe
+++ b/recipes/nin.recipe
@@ -6,11 +6,7 @@ www.nin.co.rs
 '''
 import re
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 from contextlib import closing
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre import entity_to_unicode
 class Nin(BasicNewsRecipe):
    title                  = 'NIN online'
--- a/src/calibre/gui2/store/stores/litres_plugin.py
+++ b/src/calibre/gui2/store/stores/litres_plugin.py
@@ -11,7 +11,7 @@ import re
 import urllib2
 from contextlib import closing
-from lxml import etree, html
+from lxml import etree
 from PyQt4.Qt import QUrl
 from calibre import browser, url_slash_cleaner, prints