Update Psychology Today, The Smithsonian and The New Republic

Kovid Goyal 2012-07-27 01:26:49 +05:30
parent d13e49b401
commit 9d90cfd756
3 changed files with 189 additions and 129 deletions


@@ -1,44 +1,79 @@
import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class PsychologyToday(BasicNewsRecipe):

    title = 'Psychology Today'
    __author__ = 'Rick Shang'

    description = 'This magazine takes information from the latest research in the field of psychology and makes it useful to people in their everyday lives. Its coverage encompasses self-improvement, relationships, the mind-body connection, health, family, the workplace and culture.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [dict(attrs={'class':['print-title', 'print-submitted', 'print-content', 'print-footer', 'print-source_url', 'print-links']})]
    no_javascript = True
    no_stylesheets = True

    def parse_index(self):
        articles = []
        soup = self.index_to_soup('http://www.psychologytoday.com/magazine')

        # Go to the main body
        div = soup.find('div', attrs={'id':'content-content'})

        # Find cover & date
        cover_item = div.find('div', attrs={'class':'collections-header-image'})
        cover = cover_item.find('img', src=True)
        self.cover_url = cover['src']
        date = self.tag_to_string(cover['title'])
        self.timefmt = u' [%s]'%date

        articles = []
        for post in div.findAll('div', attrs={'class':'collections-node-feature-info'}):
            title = self.tag_to_string(post.find('h2'))
            author_item = post.find('div', attrs={'class':'collection-node-byline'})
            author = re.sub(r'.*by\s', "", self.tag_to_string(author_item).strip())
            title = title + u' (%s)'%author
            article_page = self.index_to_soup('http://www.psychologytoday.com' + post.find('a', href=True)['href'])
            print_page = article_page.find('li', attrs={'class':'print_html first'})
            url = 'http://www.psychologytoday.com' + print_page.find('a', href=True)['href']
            desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip()
            self.log('Found article:', title)
            self.log('\t', url)
            self.log('\t', desc)
            articles.append({'title':title, 'url':url, 'date':'', 'description':desc})

        for post in div.findAll('div', attrs={'class':'collections-node-thumbnail-info'}):
            title = self.tag_to_string(post.find('h2'))
            author_item = post.find('div', attrs={'class':'collection-node-byline'})
            article_page = self.index_to_soup('http://www.psychologytoday.com' + post.find('a', href=True)['href'])
            print_page = article_page.find('li', attrs={'class':'print_html first'})
            description = post.find('div', attrs={'class':'collection-node-description'})
            author = re.sub(r'.*by\s', "", self.tag_to_string(description.nextSibling).strip())
            desc = self.tag_to_string(description).strip()
            url = 'http://www.psychologytoday.com' + print_page.find('a', href=True)['href']
            title = title + u' (%s)'%author
            self.log('Found article:', title)
            self.log('\t', url)
            self.log('\t', desc)
            articles.append({'title':title, 'url':url, 'date':'', 'description':desc})

        for post in div.findAll('li', attrs={'class':['collection-item-list-odd', 'collection-item-list-even']}):
            title = self.tag_to_string(post.find('h2'))
            author_item = post.find('div', attrs={'class':'collection-node-byline'})
            author = re.sub(r'.*by\s', "", self.tag_to_string(author_item).strip())
            title = title + u' (%s)'%author
            article_page = self.index_to_soup('http://www.psychologytoday.com' + post.find('a', href=True)['href'])
            print_page = article_page.find('li', attrs={'class':'print_html first'})
            url = 'http://www.psychologytoday.com' + print_page.find('a', href=True)['href']
            desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip()
            self.log('Found article:', title)
            self.log('\t', url)
            self.log('\t', desc)
            articles.append({'title':title, 'url':url, 'date':'', 'description':desc})

        return [('Current Issue', articles)]
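
Each of the three loops in the updated Psychology Today recipe extracts the author with the same regex: everything up to and including the word "by" is stripped from the byline text, and the result is appended to the article title in parentheses. A minimal standalone sketch of that step, using a made-up byline string rather than live markup from psychologytoday.com:

import re

# Hypothetical byline text, standing in for what tag_to_string() returns
# for a 'collection-node-byline' div on the magazine index page.
byline = 'Feature by Jane Doe'

# Drop everything up to and including 'by ', keeping only the author name.
author = re.sub(r'.*by\s', '', byline.strip())

# The recipe then decorates the article title with the author.
title = u'Sample Article' + u' (%s)' % author
print(title)  # Sample Article (Jane Doe)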


@@ -1,61 +1,67 @@
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict

class Smithsonian(BasicNewsRecipe):

    title = 'Smithsonian Magazine'
    __author__ = 'Rick Shang'

    description = 'This magazine chronicles the arts, environment, sciences and popular culture of the times. It is edited for modern, well-rounded individuals with diverse, general interests. With your order, you become a National Associate Member of the Smithsonian. Membership benefits include your subscription to Smithsonian magazine, a personalized membership card, discounts from the Smithsonian catalog, and more.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [dict(attrs={'id':['articleTitle', 'subHead', 'byLine', 'articleImage', 'article-text']})]
    remove_tags = [dict(attrs={'class':['related-articles-inpage', 'viewMorePhotos']})]
    no_javascript = True
    no_stylesheets = True

    def parse_index(self):
        # Go to the issue
        soup0 = self.index_to_soup('http://www.smithsonianmag.com/issue/archive/')
        div = soup0.find('div', attrs={'id':'archives'})
        issue = div.find('ul', attrs={'class':'clear-both'})
        current_issue_url = issue.find('a', href=True)['href']
        soup = self.index_to_soup(current_issue_url)

        # Go to the main body
        div = soup.find('div', attrs={'id':'content-inset'})

        # Find date
        date = re.sub(r'.*\:\W*', "", self.tag_to_string(div.find('h2')).strip())
        self.timefmt = u' [%s]'%date

        # Find cover
        self.cover_url = div.find('img', src=True)['src']

        feeds = OrderedDict()
        section_title = ''
        subsection_title = ''
        for post in div.findAll('div', attrs={'class':['plainModule', 'departments plainModule']}):
            articles = []
            prefix = ''
            h3 = post.find('h3')
            if h3 is not None:
                section_title = self.tag_to_string(h3)
            else:
                subsection = post.find('p', attrs={'class':'article-cat'})
                link = post.find('a', href=True)
                url = link['href'] + '?c=y&story=fullstory'
                if subsection is not None:
                    subsection_title = self.tag_to_string(subsection)
                    prefix = (subsection_title + ': ')
                    description = self.tag_to_string(post('p', limit=2)[1]).strip()
                else:
                    description = self.tag_to_string(post.find('p')).strip()
                desc = re.sub(r'\sBy\s.*', '', description, re.DOTALL)
                author = re.sub(r'.*By\s', '', description, re.DOTALL)
                title = prefix + self.tag_to_string(link).strip() + u' (%s)'%author
                articles.append({'title':title, 'url':url, 'description':desc, 'date':''})

            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        ans = [(key, val) for key, val in feeds.iteritems()]
        return ans
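
Both this Smithsonian recipe and the New Republic recipe below collect articles into an OrderedDict keyed by section title, so sections are emitted in the order they are first seen on the issue page, and then convert the dict into the list of (section, articles) tuples that parse_index() must return. A small sketch of that accumulation pattern, using made-up section names and article entries:

from collections import OrderedDict

feeds = OrderedDict()

# Hypothetical (section, article) pairs in the order they appear on the page.
found = [
    ('Features', {'title': 'A', 'url': 'http://example.com/a', 'description': '', 'date': ''}),
    ('Departments', {'title': 'B', 'url': 'http://example.com/b', 'description': '', 'date': ''}),
    ('Features', {'title': 'C', 'url': 'http://example.com/c', 'description': '', 'date': ''}),
]

for section_title, article in found:
    if section_title not in feeds:
        feeds[section_title] = []
    feeds[section_title].append(article)

# The recipes call feeds.iteritems() under calibre's Python 2 runtime;
# items() is the equivalent spelling here.
ans = [(key, val) for key, val in feeds.items()]
# -> [('Features', [A, C]), ('Departments', [B])]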


@@ -1,45 +1,64 @@
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict

class The_New_Republic(BasicNewsRecipe):

    title = 'The New Republic'
    __author__ = 'Rick Shang'

    description = 'The New Republic is a journal of opinion with an emphasis on politics and domestic and international affairs. It carries feature articles by staff and contributing editors. The second half of each issue is devoted to books and the arts, theater, motion pictures, music and art.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    remove_tags = [dict(attrs={'class':['print-logo', 'print-site_name', 'print-hr']})]
    no_javascript = True
    no_stylesheets = True

    def parse_index(self):
        # Go to the issue
        soup0 = self.index_to_soup('http://www.tnr.com/magazine-issues')
        issue = soup0.find('div', attrs={'id':'current_issue'})

        # Find date
        date = self.tag_to_string(issue.find('div', attrs={'class':'date'})).strip()
        self.timefmt = u' [%s]'%date

        # Go to the main body
        current_issue_url = 'http://www.tnr.com' + issue.find('a', href=True)['href']
        soup = self.index_to_soup(current_issue_url)
        div = soup.find('div', attrs={'class':'article_detail_body'})

        # Find cover
        self.cover_url = div.find('img', src=True)['src']

        feeds = OrderedDict()
        section_title = ''
        subsection_title = ''
        for post in div.findAll('p'):
            articles = []
            em = post.find('em')
            b = post.find('b')
            a = post.find('a', href=True)
            if em is not None:
                section_title = self.tag_to_string(em).strip()
                subsection_title = ''
            elif b is not None:
                subsection_title = self.tag_to_string(b).strip()
            elif a is not None:
                prefix = (subsection_title + ': ') if subsection_title else ''
                url = re.sub('www.tnr.com', 'www.tnr.com/print', a['href'])
                author = re.sub(r'.*by\s', '', self.tag_to_string(post), re.DOTALL)
                title = prefix + self.tag_to_string(a).strip() + u' (%s)'%author
                articles.append({'title':title, 'url':url, 'description':'', 'date':''})

            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        ans = [(key, val) for key, val in feeds.iteritems()]
        return ans
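
Instead of overriding print_version() as the previous version of this recipe did, the updated recipe rewrites each story URL up front so that calibre fetches the print-friendly page directly. A minimal sketch of that rewrite on a made-up article URL:

import re

# Hypothetical article link taken from the issue's table of contents.
href = 'http://www.tnr.com/article/politics/sample-story'

# Insert '/print' after the host so the print version is downloaded.
url = re.sub('www.tnr.com', 'www.tnr.com/print', href)
print(url)  # http://www.tnr.com/print/article/politics/sample-story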