Give Me Something To Read and Let's Get Critical, by Barty

Kovid Goyal 2011-11-24 09:52:35 +05:30
parent d52a9ded1f
commit c512404062
4 changed files with 202 additions and 22 deletions


@@ -0,0 +1,90 @@
import re

from calibre.web.feeds.news import BasicNewsRecipe


class GiveMeSomethingToRead(BasicNewsRecipe):
    title = u'Give Me Something To Read'
    description = 'Curation / aggregation of articles on diverse topics'
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup = True

    INDEX = 'http://givemesomethingtoread.com'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, system name, max number of articles to load)
        ('The Arts', 'arts', 25),
        ('Science', 'science', 30),
        ('Technology', 'technology', 30),
        ('Politics', 'politics', 20),
        ('Media', 'media', 30),
        ('Crime', 'crime', 15),
        ('Other articles', '', 10)
    ]

    def parse_index(self):
        self.cover_url = 'http://thegretchenshow.files.wordpress.com/2009/12/well-read-cat-small.jpg'
        feeds = []
        seen_urls = set([])
        regex = re.compile(r'http://(www\.)?([^/:]+)', re.I)

        for category in self.CATEGORIES:
            (cat_name, tag, max_articles) = category
            tagurl = '' if tag == '' else '/tagged/' + tag
            self.log('Reading category:', cat_name)
            articles = []
            pageno = 1

            while len(articles) < max_articles and pageno < 100:
                page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
                pageno += 1
                self.log('\tReading page:', page)
                try:
                    soup = self.index_to_soup(page)
                except:
                    break

                headers = soup.findAll('h2')
                if len(headers) == 0:
                    break

                for header in headers:
                    atag = header.find('a')
                    url = atag['href']
                    # skip promotionals and duplicates
                    if url.startswith('http://givemesomethingtoread') or url.startswith('/') or url in seen_urls:
                        continue
                    seen_urls.add(url)
                    title = self.tag_to_string(header)
                    self.log('\tFound article:', title)
                    #self.log('\t', url)
                    desc = header.parent.find('blockquote')
                    desc = self.tag_to_string(desc) if desc else ''
                    m = regex.match(url)
                    if m:
                        desc = "[%s] %s" % (m.group(2), desc)
                    #self.log('\t', desc)
                    date = ''
                    p = header.parent.previousSibling
                    # navigate back through previous siblings to the h3 that holds the date
                    while p:
                        if hasattr(p, 'name') and p.name == 'h3':
                            date = self.tag_to_string(p)
                            break
                        p = p.previousSibling
                    articles.append({'title': title, 'url': url, 'description': desc, 'date': date})
                    if len(articles) >= max_articles:
                        break

            if articles:
                feeds.append((cat_name, articles))

        return feeds
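
For reference, a calibre recipe's parse_index() must return a list of
(feed title, article list) tuples, where each article is a plain dict.
A minimal sketch of the structure the loop above builds (the values are
invented for illustration):

feeds = [
    ('The Arts', [
        {
            'title': 'Some article',                # text of the <h2> heading
            'url': 'http://example.com/essay',      # external link inside the heading
            'description': '[example.com] teaser',  # blockquote text, prefixed with the source domain
            'date': 'Nov 24, 2011',                 # taken from the nearest preceding <h3>
        },
    ]),
]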


@@ -0,0 +1,94 @@
import re

from calibre.web.feeds.news import BasicNewsRecipe


class LetsGetCritical(BasicNewsRecipe):
    title = u"Let's Get Critical"
    description = 'Curation / aggregation of criticism of the arts and culture'
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup = True

    INDEX = 'http://www.letsgetcritical.org'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, system name, max number of articles to load)
        ('Architecture', 'architecture', 30),
        ('Art', 'art', 30),
        ('Books', 'books', 30),
        ('Design', 'design', 30),
        ('Digital', 'digital', 30),
        ('Food', 'food', 30),
        ('Movies', 'movies', 30),
        ('Music', 'music', 30),
        ('Television', 'television', 30),
        ('Other articles', '', 10)
    ]

    def parse_index(self):
        self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
        feeds = []
        seen_urls = set([])
        regex = re.compile(r'http://(www\.)?([^/:]+)', re.I)

        for category in self.CATEGORIES:
            (cat_name, tag, max_articles) = category
            tagurl = '' if tag == '' else '/category/' + tag.lower()
            self.log('Reading category:', cat_name)
            articles = []
            pageno = 1

            while len(articles) < max_articles and pageno < 100:
                page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
                pageno += 1
                self.log('\tReading page:', page)
                try:
                    soup = self.index_to_soup(page)
                except:
                    break

                posts = soup.findAll('div', attrs={'class': 'post_multi'})
                if len(posts) == 0:
                    break

                for post in posts:
                    dt = post.find('div', attrs={'class': 'title'})
                    atag = dt.find('a')
                    url = atag['href']
                    # skip promotionals and duplicates
                    if url.startswith('http://letsgetcritical') or url.startswith('/') or url in seen_urls:
                        continue
                    seen_urls.add(url)
                    title = self.tag_to_string(atag)
                    self.log('\tFound article:', title)
                    self.log('\t', url)
                    desc = post.find('blockquote')
                    desc = self.tag_to_string(desc) if desc else ''
                    m = regex.match(url)
                    if m:
                        desc = "[%s] %s" % (m.group(2), desc)
                    #self.log('\t', desc)
                    date = ''
                    p = post.previousSibling
                    # walk back through previous siblings to the div with class 'singledate'
                    while p:
                        if hasattr(p, 'name') and p.get('class') == 'singledate':
                            date = self.tag_to_string(p)
                            break
                        p = p.previousSibling
                    articles.append({'title': title, 'url': url, 'description': desc, 'date': date})
                    if len(articles) >= max_articles:
                        break

            if articles:
                feeds.append((cat_name, articles))

        return feeds
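
Both recipes use the same pagination idiom; pulled out on its own it
behaves as below (a standalone sketch, with the category chosen
arbitrarily). Either recipe can then be sanity-checked from the command
line with something like "ebook-convert lets_get_critical.recipe
output.epub --test" (the filename is a placeholder for wherever you
saved the source; --test limits the fetch to a couple of articles per
feed).

INDEX = 'http://www.letsgetcritical.org'
tagurl = '/category/books'
for pageno in (1, 2, 3):
    # page 1 is the bare category URL; later pages append /page/N
    page = '%s%s/page/%d' % (INDEX, tagurl, pageno) if pageno > 1 else INDEX + tagurl
    print page
# prints:
# http://www.letsgetcritical.org/category/books
# http://www.letsgetcritical.org/category/books/page/2
# http://www.letsgetcritical.org/category/books/page/3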


@@ -6,11 +6,7 @@ www.nin.co.rs
'''
import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
-from contextlib import closing
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre import entity_to_unicode
class Nin(BasicNewsRecipe):
    title = 'NIN online'
@@ -81,7 +77,7 @@ class Nin(BasicNewsRecipe):
        return cover_url

    feeds = [(u'NIN Online', u'http://www.nin.co.rs/misc/rss.php?feed=RSS2.0')]

    def get_article_url(self, article):
        url = BasicNewsRecipe.get_article_url(self, article)
        return url.replace('.co.yu', '.co.rs')


@@ -11,7 +11,7 @@ import re
import urllib2
from contextlib import closing
-from lxml import etree, html
+from lxml import etree
from PyQt4.Qt import QUrl
from calibre import browser, url_slash_cleaner, prints
@@ -25,18 +25,18 @@ from calibre.gui2.store.web_store_dialog import WebStoreDialog
class LitResStore(BasicStoreConfig, StorePlugin):
    shop_url = u'http://www.litres.ru'
    # http://robot.litres.ru/pages/biblio_book/?art=174405

    def open(self, parent=None, detail_item=None, external=False):
        aff_id = u'?' + _get_affiliate_id()
        url = self.shop_url + aff_id
        detail_url = None

        if detail_item:
            # http://www.litres.ru/pages/biblio_book/?art=157074
            detail_url = self.shop_url + u'/pages/biblio_book/' + aff_id +\
                u'&art=' + urllib2.quote(detail_item)

        if external or self.config.get('open_external', False):
            open_url(QUrl(url_slash_cleaner(detail_url if detail_url else url)))
        else:
@@ -44,28 +44,28 @@ class LitResStore(BasicStoreConfig, StorePlugin):
            d.setWindowTitle(self.name)
            d.set_tags(self.config.get('tags', ''))
            d.exec_()
    def search(self, query, max_results=10, timeout=60):
        search_url = u'http://robot.litres.ru/pages/catalit_browser/?checkpoint=2000-01-02&'\
            'search=%s&limit=0,%s'
        search_url = search_url % (urllib2.quote(query), max_results)

        counter = max_results
        br = browser()
        br.addheaders.append(['Accept-Encoding', 'gzip'])

        with closing(br.open(search_url, timeout=timeout)) as r:
            ungzipResponse(r, br)
            raw = xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]
            parser = etree.XMLParser(recover=True, no_network=True)
            doc = etree.fromstring(raw, parser=parser)

            for data in doc.xpath('//*[local-name() = "fb2-book"]'):
                if counter <= 0:
                    break
                counter -= 1

                try:
                    sRes = self.create_search_result(data)
                except Exception as e:
@@ -75,10 +75,10 @@ class LitResStore(BasicStoreConfig, StorePlugin):
    def get_details(self, search_result, timeout=60):
        pass

    def create_search_result(self, data):
        xp_template = 'normalize-space(@{0})'

        sRes = SearchResult()
        sRes.drm = SearchResult.DRM_UNLOCKED
        sRes.detail_item = data.xpath(xp_template.format('hub_id'))
@@ -92,7 +92,7 @@ class LitResStore(BasicStoreConfig, StorePlugin):
        # cover vs cover_preview
        sRes.cover_url = data.xpath(xp_template.format('cover_preview'))
        sRes.price = format_price_in_RUR(sRes.price)

        types = data.xpath('//fb2-book//files/file/@type')
        fmt_set = _parse_ebook_formats(' '.join(types))
        sRes.formats = ', '.join(fmt_set)
@@ -134,8 +134,8 @@ def _get_affiliate_id():
def _parse_ebook_formats(formatsStr):
    '''
    Creates a set with displayable names of the formats.

    :param formatsStr: string with comma separated book formats
                       as provided by ozon.ru
    :return: a set of displayable book format names
    '''
@@ -166,4 +166,4 @@ def _parse_ebook_formats(formatsStr):
        formats.add('LRF')
    if 'jar' in formatsUnstruct:
        formats.add('JAR')
    return formats
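
Only the 'lrf' and 'jar' branches of _parse_ebook_formats() are visible
in this hunk; assuming formatsUnstruct is simply a lowercased view of
formatsStr computed in the elided lines above, a hypothetical round
trip looks like:

print _parse_ebook_formats(u'lrf jar')
# set(['LRF', 'JAR'])  (a set, so the order is not guaranteed)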