"""Calibre news recipes that scrape the current issue of three magazines.

The three sections below correspond to ``recipes/psych.recipe``,
``recipes/smith.recipe`` and ``recipes/the_new_republic.recipe``.
"""
# NOTE(review): calibre normally keeps one recipe class per .recipe file --
# confirm how these sections are split across files before installing them.

import re
from collections import OrderedDict

from calibre.web.feeds.news import BasicNewsRecipe

# Byline markers. The word boundary keeps names such as "Kirby" from being
# mistaken for the "by" that introduces the author.
_BY_LOWER = re.compile(r'\bby\s')
_BY_UPPER = re.compile(r'\bBy\s')


def _split_byline(text, marker):
    """Split *text* at the last match of *marker*.

    Returns a ``(lead, author)`` tuple of stripped strings.  When *marker*
    does not occur, ``author`` is the empty string and ``lead`` is the whole
    stripped text.
    """
    last = None
    for last in marker.finditer(text):
        pass
    if last is None:
        return text.strip(), ''
    return text[:last.start()].strip(), text[last.end():].strip()


def _with_author(title, author):
    """Return *title* followed by ``(author)``, or *title* alone if no author."""
    if author:
        return title + u' (%s)' % author
    return title


# --- recipes/psych.recipe ---------------------------------------------------

class PsychologyToday(BasicNewsRecipe):
    """Fetch the current Psychology Today issue listed on its magazine page."""

    title = 'Psychology Today'
    __author__ = 'Rick Shang'

    description = (
        'This magazine takes information from the latest research in the'
        ' field of psychology and makes it useful to people in their everyday'
        ' lives. Its coverage encompasses self-improvement, relationships,'
        ' the mind-body connection, health, family, the workplace and culture.'
    )
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [dict(attrs={'class': [
        'print-title', 'print-submitted', 'print-content', 'print-footer',
        'print-source_url', 'print-links']})]
    # The BasicNewsRecipe option is ``remove_javascript``; ``no_javascript``
    # is not an attribute the framework reads.
    remove_javascript = True
    no_stylesheets = True

    _BASE_URL = 'http://www.psychologytoday.com'

    def parse_index(self):
        """Return ``[('Current Issue', articles)]`` for the current issue.

        Also sets ``self.cover_url`` and ``self.timefmt`` from the cover image
        when one is present.  Each article is a dict with ``title``, ``url``,
        ``date`` and ``description`` keys.  The lookup of the main content
        container is not guarded, so a changed page layout surfaces as an
        ``AttributeError``.
        """
        soup = self.index_to_soup(self._BASE_URL + '/magazine')

        # Go to the main body
        div = soup.find('div', attrs={'id': 'content-content'})

        # Find cover & date (the issue date is the cover image's title text)
        cover_item = div.find('div', attrs={'class': 'collections-header-image'})
        cover = cover_item.find('img', src=True) if cover_item is not None else None
        if cover is not None:
            self.cover_url = cover['src']
            date = (cover.get('title') or '').strip()
            if date:
                self.timefmt = u' [%s]' % date

        # (tag name, class filter, byline follows the description element?)
        groups = (
            ('div', 'collections-node-feature-info', False),
            ('div', 'collections-node-thumbnail-info', True),
            ('li', ['collection-item-list-odd', 'collection-item-list-even'], False),
        )
        articles = []
        for tag_name, css_class, byline_follows_description in groups:
            for post in div.findAll(tag_name, attrs={'class': css_class}):
                article = self._article_from_post(post, byline_follows_description)
                if article is not None:
                    articles.append(article)

        return [('Current Issue', articles)]

    def _article_from_post(self, post, byline_follows_description=False):
        """Build the article dict for one index entry.

        :param post: the tag wrapping one article teaser on the index page.
        :param byline_follows_description: when true, the author is read from
            the node right after the description element instead of from the
            ``collection-node-byline`` element.
        :return: the article dict, or ``None`` when the entry has no link or
            its article page has no print-version link.
        """
        link = post.find('a', href=True)
        if link is None:
            self.log.warn('Skipping entry without a link')
            return None

        description = post.find('div', attrs={'class': 'collection-node-description'})
        if byline_follows_description:
            byline = description.nextSibling if description is not None else None
        else:
            byline = post.find('div', attrs={'class': 'collection-node-byline'})
        lead, author = _split_byline(self.tag_to_string(byline), _BY_LOWER)
        # A byline without a "by" prefix is taken to be just the author name.
        title = _with_author(self.tag_to_string(post.find('h2')), author or lead)

        # The print link is only available on the article page itself.
        article_page = self.index_to_soup(self._BASE_URL + link['href'])
        print_item = article_page.find('li', attrs={'class': 'print_html first'})
        print_link = print_item.find('a', href=True) if print_item is not None else None
        if print_link is None:
            self.log.warn('No print version found, skipping:', title)
            return None

        url = self._BASE_URL + print_link['href']
        desc = self.tag_to_string(description).strip()
        self.log('Found article:', title)
        self.log('\t', url)
        self.log('\t', desc)
        return {'title': title, 'url': url, 'date': '', 'description': desc}


# --- recipes/smith.recipe ---------------------------------------------------

class Smithsonian(BasicNewsRecipe):
    """Fetch the newest Smithsonian Magazine issue from the issue archive."""

    title = 'Smithsonian Magazine'
    __author__ = 'Rick Shang'

    description = (
        'This magazine chronicles the arts, environment, sciences and popular'
        ' culture of the times. It is edited for modern, well-rounded'
        ' individuals with diverse, general interests. With your order, you'
        ' become a National Associate Member of the Smithsonian. Membership'
        ' benefits include your subscription to Smithsonian magazine, a'
        ' personalized membership card, discounts from the Smithsonian'
        ' catalog, and more.'
    )
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [dict(attrs={'id': [
        'articleTitle', 'subHead', 'byLine', 'articleImage', 'article-text']})]
    remove_tags = [dict(attrs={'class': ['related-articles-inpage', 'viewMorePhotos']})]
    # The BasicNewsRecipe option is ``remove_javascript``; ``no_javascript``
    # is not an attribute the framework reads.
    remove_javascript = True
    no_stylesheets = True

    def parse_index(self):
        """Return ``[(section title, articles), ...]`` in page order.

        Also sets ``self.timefmt`` from the issue heading and
        ``self.cover_url`` from the first image with a ``src``.  Each article
        is a dict with ``title``, ``url``, ``description`` and ``date`` keys.
        The archive and issue container lookups are not guarded, so a changed
        page layout surfaces as ``AttributeError``/``TypeError``.
        """
        # Go to the issue: the first link in the archive list
        archive = self.index_to_soup('http://www.smithsonianmag.com/issue/archive/')
        listing = archive.find('div', attrs={'id': 'archives'})
        issue = listing.find('ul', attrs={'class': 'clear-both'})
        current_issue_url = issue.find('a', href=True)['href']
        soup = self.index_to_soup(current_issue_url)

        # Go to the main body
        div = soup.find('div', attrs={'id': 'content-inset'})

        # Find date: whatever follows the last colon in the heading
        date = re.sub(r'.*:\W*', '', self.tag_to_string(div.find('h2')).strip())
        self.timefmt = u' [%s]' % date

        # Find cover
        cover = div.find('img', src=True)
        if cover is not None:
            self.cover_url = cover['src']

        feeds = OrderedDict()
        section_title = ''
        modules = div.findAll(
            'div', attrs={'class': ['plainModule', 'departments plainModule']})
        for post in modules:
            h3 = post.find('h3')
            if h3 is not None:
                # A module with a heading only announces the next section.
                section_title = self.tag_to_string(h3)
                continue

            link = post.find('a', href=True)
            if link is None:
                continue
            url = link['href'] + '?c=y&story=fullstory'

            subsection = post.find('p', attrs={'class': 'article-cat'})
            paragraphs = post.findAll('p', limit=2)
            prefix = ''
            if subsection is not None:
                prefix = self.tag_to_string(subsection) + ': '
                # The first <p> is the category, so the text is the second.
                text_tag = paragraphs[1] if len(paragraphs) > 1 else None
            else:
                text_tag = paragraphs[0] if paragraphs else None

            desc, author = _split_byline(self.tag_to_string(text_tag), _BY_UPPER)
            title = _with_author(prefix + self.tag_to_string(link).strip(), author)
            feeds.setdefault(section_title, []).append(
                {'title': title, 'url': url, 'description': desc, 'date': ''})

        return list(feeds.items())


# --- recipes/the_new_republic.recipe ----------------------------------------

class TNR(BasicNewsRecipe):
    """Fetch the current issue of The New Republic using its print pages."""

    title = 'The New Republic'
    __author__ = 'Rick Shang'

    description = (
        'The New Republic is a journal of opinion with an emphasis on politics'
        ' and domestic and international affairs. It carries feature articles'
        ' by staff and contributing editors. The second half of each issue is'
        ' devoted to book and the arts, theater, motion pictures, music and'
        ' art.'
    )
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    remove_tags = [dict(attrs={'class': ['print-logo', 'print-site_name', 'print-hr']})]
    # The BasicNewsRecipe option is ``remove_javascript``; ``no_javascript``
    # is not an attribute the framework reads.
    remove_javascript = True
    no_stylesheets = True

    _BASE_URL = 'http://www.tnr.com'

    def parse_index(self):
        """Return ``[(section title, articles), ...]`` in page order.

        Also sets ``self.timefmt`` from the issue date and ``self.cover_url``
        from the first image with a ``src``.  Each article is a dict with
        ``title``, ``url``, ``description`` and ``date`` keys.  The issue
        container lookups are not guarded, so a changed page layout surfaces
        as ``AttributeError``/``TypeError``.
        """
        # Go to the issue
        listing = self.index_to_soup(self._BASE_URL + '/magazine-issues')
        issue = listing.find('div', attrs={'id': 'current_issue'})

        # Find date
        date = self.tag_to_string(issue.find('div', attrs={'class': 'date'})).strip()
        self.timefmt = u' [%s]' % date

        # Go to the main body
        current_issue_url = self._BASE_URL + issue.find('a', href=True)['href']
        soup = self.index_to_soup(current_issue_url)
        div = soup.find('div', attrs={'class': 'article_detail_body'})

        # Find cover
        cover = div.find('img', src=True)
        if cover is not None:
            self.cover_url = cover['src']

        feeds = OrderedDict()
        section_title = ''
        subsection_title = ''
        for post in div.findAll('p'):
            em = post.find('em')
            b = post.find('b')
            a = post.find('a', href=True)
            if em is not None:
                # <em> starts a new section and resets the subsection.
                section_title = self.tag_to_string(em).strip()
                subsection_title = ''
            elif b is not None:
                subsection_title = self.tag_to_string(b).strip()
            elif a is not None:
                prefix = (subsection_title + ': ') if subsection_title else ''
                _, author = _split_byline(self.tag_to_string(post), _BY_LOWER)
                title = _with_author(prefix + self.tag_to_string(a).strip(), author)
                feeds.setdefault(section_title, []).append({
                    'title': title,
                    'url': self._print_url(a['href']),
                    'description': '',
                    'date': '',
                })

        return list(feeds.items())

    def _print_url(self, href):
        """Return the print-version URL for an article *href*.

        Site-relative links are made absolute under ``/print``; absolute
        links get ``/print`` inserted after the first ``www.tnr.com``.
        """
        if href.startswith('/'):
            return self._BASE_URL + '/print' + href
        return href.replace('www.tnr.com', 'www.tnr.com/print', 1)