diff --git a/recipes/givemesomethingtoread.recipe b/recipes/givemesomethingtoread.recipe
new file mode 100644
index 0000000000..09b758536f
--- /dev/null
+++ b/recipes/givemesomethingtoread.recipe
@@ -0,0 +1,90 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class GiveMeSomethingToRead(BasicNewsRecipe):
+    title = u'Give Me Something To Read'
+    description = 'Curation / aggregation of articles on diverse topics'
+    language = 'en'
+    __author__ = 'barty on mobileread.com forum'
+    max_articles_per_feed = 100
+    no_stylesheets = False
+    timefmt = ' [%a, %d %b, %Y]'
+    oldest_article = 365
+    auto_cleanup = True
+    INDEX = 'http://givemesomethingtoread.com'
+    CATEGORIES = [
+        # comment out categories you don't want
+        # (user friendly name, system name, max number of articles to load)
+        ('The Arts','arts',25),
+        ('Science','science',30),
+        ('Technology','technology',30),
+        ('Politics','politics',20),
+        ('Media','media',30),
+        ('Crime','crime',15),
+        ('Other articles','',10)
+    ]
+
+    def parse_index(self):
+        self.cover_url = 'http://thegretchenshow.files.wordpress.com/2009/12/well-read-cat-small.jpg'
+        feeds = []
+        seen_urls = set([])
+        regex = re.compile(r'http://(www\.)?([^/:]+)', re.I)
+
+        for category in self.CATEGORIES:
+
+            (cat_name, tag, max_articles) = category
+
+            tagurl = '' if tag=='' else '/tagged/'+tag
+            self.log('Reading category:', cat_name)
+
+            articles = []
+            pageno = 1
+
+            while len(articles) < max_articles and pageno < 100:
+
+                page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
+                pageno += 1
+
+                self.log('\tReading page:', page)
+                try:
+                    soup = self.index_to_soup(page)
+                except:
+                    break
+
+                headers = soup.findAll('h2')
+                if len(headers) == 0:
+                    break
+
+                for header in headers:
+                    atag = header.find('a')
+                    url = atag['href']
+                    # skip promotionals and duplicates
+                    if url.startswith('http://givemesomethingtoread') or url.startswith('/') or url in seen_urls:
+                        continue
+                    seen_urls.add(url)
+                    title = self.tag_to_string(header)
+                    self.log('\tFound article:', title)
+                    #self.log('\t', url)
+                    desc = header.parent.find('blockquote')
+                    desc = self.tag_to_string(desc) if desc else ''
+                    m = regex.match(url)
+                    if m:
+                        desc = "[%s] %s" % (m.group(2), desc)
+                    #self.log('\t', desc)
+                    date = ''
+                    p = header.parent.previousSibling
+                    # navigate up to find h3, which contains the date
+                    while p:
+                        if hasattr(p,'name') and p.name == 'h3':
+                            date = self.tag_to_string(p)
+                            break
+                        p = p.previousSibling
+                    articles.append({'title':title,'url':url,'description':desc,'date':date})
+                    if len(articles) >= max_articles:
+                        break
+
+            if articles:
+                feeds.append((cat_name, articles))
+
+        return feeds
+
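For reviewers unfamiliar with calibre recipes: the loop above builds the standard structure that parse_index() must return, a list of (feed_title, articles) tuples whose article dicts carry 'title', 'url', 'description' and 'date' keys. A minimal sketch of that shape, with made-up placeholder values:

    # Shape of the value parse_index() returns; every value here is an
    # illustrative placeholder, not real scraped data.
    feeds = [
        ('Science', [
            {'title': 'Example article',
             'url': 'http://example.com/story',
             'description': '[example.com] Teaser text pulled from the blockquote',
             'date': 'Jan 1, 2012'},
        ]),
    ]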
diff --git a/recipes/letsgetcritical.recipe b/recipes/letsgetcritical.recipe
new file mode 100644
index 0000000000..b22512642d
--- /dev/null
+++ b/recipes/letsgetcritical.recipe
@@ -0,0 +1,94 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class LetsGetCritical(BasicNewsRecipe):
+    title = u"Let's Get Critical"
+    description = 'Curation / aggregation of criticism of the arts and culture'
+    language = 'en'
+    __author__ = 'barty on mobileread.com forum'
+    max_articles_per_feed = 100
+    no_stylesheets = False
+    timefmt = ' [%a, %d %b, %Y]'
+    oldest_article = 365
+    auto_cleanup = True
+    INDEX = 'http://www.letsgetcritical.org'
+    CATEGORIES = [
+        # comment out categories you don't want
+        # (user friendly name, system name, max number of articles to load)
+        ('Architecture','architecture',30),
+        ('Art','art',30),
+        ('Books','books',30),
+        ('Design','design',30),
+        ('Digital','digital',30),
+        ('Food','food',30),
+        ('Movies','movies',30),
+        ('Music','music',30),
+        ('Television','television',30),
+        ('Other articles','',10)
+    ]
+
+    def parse_index(self):
+        self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
+        feeds = []
+        seen_urls = set([])
+        regex = re.compile(r'http://(www\.)?([^/:]+)', re.I)
+
+        for category in self.CATEGORIES:
+
+            (cat_name, tag, max_articles) = category
+
+            tagurl = '' if tag=='' else '/category/'+tag.lower()
+            self.log('Reading category:', cat_name)
+
+            articles = []
+            pageno = 1
+
+            while len(articles) < max_articles and pageno < 100:
+
+                page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
+                pageno += 1
+
+                self.log('\tReading page:', page)
+                try:
+                    soup = self.index_to_soup(page)
+                except:
+                    break
+
+                posts = soup.findAll('div',attrs={'class':'post_multi'})
+                if len(posts) == 0:
+                    break
+
+                for post in posts:
+                    dt = post.find('div',attrs={'class':'title'})
+                    atag = dt.find('a')
+                    url = atag['href']
+                    # skip promotionals and duplicates
+                    if url.startswith('http://letsgetcritical') or url.startswith('/') or url in seen_urls:
+                        continue
+                    seen_urls.add(url)
+                    title = self.tag_to_string(atag)
+                    self.log('\tFound article:', title)
+                    self.log('\t', url)
+                    desc = post.find('blockquote')
+                    desc = self.tag_to_string(desc) if desc else ''
+                    m = regex.match(url)
+                    if m:
+                        desc = "[%s] %s" % (m.group(2), desc)
+                    #self.log('\t', desc)
+                    date = ''
+                    p = post.previousSibling
+                    # navigate up the siblings to find the date
+                    while p:
+                        if hasattr(p,'class') and p['class'] == 'singledate':
+                            date = self.tag_to_string(p)
+                            break
+                        p = p.previousSibling
+                    articles.append({'title':title,'url':url,'description':desc,'date':date})
+                    if len(articles) >= max_articles:
+                        break
+
+            if articles:
+                feeds.append((cat_name, articles))
+
+        return feeds
+
diff --git a/recipes/nin.recipe b/recipes/nin.recipe
index bd31860126..ae09b3d0eb 100644
--- a/recipes/nin.recipe
+++ b/recipes/nin.recipe
@@ -6,11 +6,7 @@ www.nin.co.rs
 '''
 
 import re
-from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
-from contextlib import closing
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre import entity_to_unicode
 
 class Nin(BasicNewsRecipe):
     title = 'NIN online'
@@ -81,7 +77,7 @@ class Nin(BasicNewsRecipe):
         return cover_url
 
     feeds = [(u'NIN Online', u'http://www.nin.co.rs/misc/rss.php?feed=RSS2.0')]
-
+
     def get_article_url(self, article):
         url = BasicNewsRecipe.get_article_url(self, article)
         return url.replace('.co.yu', '.co.rs')
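Both new recipes prefix each article description with its source domain using the same regular expression; group(2) captures the host with any leading 'www.' stripped, stopping at the first '/' or ':'. A standalone sketch of that behaviour (the URL below is made up):

    import re

    # Same pattern both recipes compile in parse_index().
    regex = re.compile(r'http://(www\.)?([^/:]+)', re.I)

    m = regex.match('http://www.nytimes.com/2012/01/01/some-article')
    if m:
        # Prints: [nytimes.com] original teaser
        print("[%s] %s" % (m.group(2), 'original teaser'))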
diff --git a/src/calibre/gui2/store/stores/litres_plugin.py b/src/calibre/gui2/store/stores/litres_plugin.py
index 6223b61616..6f4c386dda 100644
--- a/src/calibre/gui2/store/stores/litres_plugin.py
+++ b/src/calibre/gui2/store/stores/litres_plugin.py
@@ -11,7 +11,7 @@ import re
 import urllib2
 from contextlib import closing
-from lxml import etree, html
+from lxml import etree
 
 from PyQt4.Qt import QUrl
 
 from calibre import browser, url_slash_cleaner, prints
@@ -25,18 +25,18 @@ from calibre.gui2.store.web_store_dialog import WebStoreDialog
 class LitResStore(BasicStoreConfig, StorePlugin):
     shop_url = u'http://www.litres.ru'
     #http://robot.litres.ru/pages/biblio_book/?art=174405
-
+
     def open(self, parent=None, detail_item=None, external=False):
-
+
         aff_id = u'?' + _get_affiliate_id()
-
+
         url = self.shop_url + aff_id
         detail_url = None
         if detail_item:
             # http://www.litres.ru/pages/biblio_book/?art=157074
             detail_url = self.shop_url + u'/pages/biblio_book/' + aff_id +\
-                u'&art=' + urllib2.quote(detail_item)
-
+                u'&art=' + urllib2.quote(detail_item)
+
         if external or self.config.get('open_external', False):
             open_url(QUrl(url_slash_cleaner(detail_url if detail_url else url)))
         else:
@@ -44,28 +44,28 @@ class LitResStore(BasicStoreConfig, StorePlugin):
             d.setWindowTitle(self.name)
             d.set_tags(self.config.get('tags', ''))
             d.exec_()
-
+
     def search(self, query, max_results=10, timeout=60):
         search_url = u'http://robot.litres.ru/pages/catalit_browser/?checkpoint=2000-01-02&'\
             'search=%s&limit=0,%s'
         search_url = search_url % (urllib2.quote(query), max_results)
-
+
         counter = max_results
         br = browser()
         br.addheaders.append( ['Accept-Encoding','gzip'] )
-
+
        with closing(br.open(search_url, timeout=timeout)) as r:
            ungzipResponse(r,br)
            raw= xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]
-
+
            parser = etree.XMLParser(recover=True, no_network=True)
            doc = etree.fromstring(raw, parser=parser)
            for data in doc.xpath('//*[local-name() = "fb2-book"]'):
                if counter <= 0:
                    break
                counter -= 1
-
+
                try:
                    sRes = self.create_search_result(data)
                except Exception as e:
@@ -75,10 +75,10 @@ class LitResStore(BasicStoreConfig, StorePlugin):
     def get_details(self, search_result, timeout=60):
         pass
-
+
     def create_search_result(self, data):
         xp_template = 'normalize-space(@{0})'
-
+
         sRes = SearchResult()
         sRes.drm = SearchResult.DRM_UNLOCKED
         sRes.detail_item = data.xpath(xp_template.format('hub_id'))
@@ -92,7 +92,7 @@ class LitResStore(BasicStoreConfig, StorePlugin):
         # cover vs cover_preview
         sRes.cover_url = data.xpath(xp_template.format('cover_preview'))
         sRes.price = format_price_in_RUR(sRes.price)
-
+
         types = data.xpath('//fb2-book//files/file/@type')
         fmt_set = _parse_ebook_formats(' '.join(types))
         sRes.formats = ', '.join(fmt_set)
@@ -134,8 +134,8 @@ def _get_affiliate_id():
 def _parse_ebook_formats(formatsStr):
     '''
     Creates a set with displayable names of the formats
-
-    :param formatsStr: string with comma separated book formats
+
+    :param formatsStr: string with comma separated book formats
        as it provided by ozon.ru
     :return: a list with displayable book formats
     '''
@@ -166,4 +166,4 @@ def _parse_ebook_formats(formatsStr):
         formats.add('LRF')
     if 'jar' in formatsUnstruct:
         formats.add('JAR')
-    return formats
\ No newline at end of file
+    return formats
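A note on the xp_template idiom used in create_search_result() above: wrapping an attribute in XPath's normalize-space() makes lxml return a trimmed plain string rather than a node list, so each lookup yields a ready-to-use value. A small self-contained sketch; the element below is a made-up stand-in for LitRes's <fb2-book> records, not real catalogue data:

    from lxml import etree

    xp_template = 'normalize-space(@{0})'

    # Hypothetical element mirroring the attributes the plugin reads.
    data = etree.fromstring(
        '<fb2-book hub_id="157074" price="99.00" cover_preview="http://example.org/c.jpg"/>')

    print(data.xpath(xp_template.format('hub_id')))         # 157074
    print(data.xpath(xp_template.format('cover_preview')))  # http://example.org/c.jpg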