Update Psychology Today, The Smithsonian and The New Republic

2025-07-09 03:04:10 -04:00 · 2012-07-27 01:26:49 +05:30 · 2012-07-27 01:26:49 +05:30 · 9d90cfd756
commit 9d90cfd756
parent d13e49b401
3 changed files with 189 additions and 129 deletions
--- a/recipes/psych.recipe
+++ b/recipes/psych.recipe
@ -1,44 +1,79 @@
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe

-from calibre.ptempfile import PersistentTemporaryFile
-from calibre.web.feeds.news import BasicNewsRecipe

-class AdvancedUserRecipe1275708473(BasicNewsRecipe):
-    title          = u'Psychology Today'
-    _author__ = 'rty'
-    publisher = u'www.psychologytoday.com'
-    category = u'Psychology'
-    max_articles_per_feed = 100
-    remove_javascript = True
-    use_embedded_content   = False
-    no_stylesheets = True
+class PsychologyToday(BasicNewsRecipe):
+
+    title       = 'Psychology Today'
+    __author__  = 'Rick Shang'
+
+    description = 'This magazine takes information from the latest research in the field of psychology and makes it useful to people in their everyday lives. Its coverage encompasses self-improvement, relationships, the mind-body connection, health, family, the workplace and culture.'
    language = 'en'
-    temp_files = []
-    articles_are_obfuscated = True
-    remove_tags = [
-                    dict(name='div', attrs={'class':['print-source_url','field-items','print-footer']}),
-                    dict(name='span', attrs={'class':'print-footnote'}),
-                  ]
-    remove_tags_before  = dict(name='h1', attrs={'class':'print-title'})
-    remove_tags_after     = dict(name='div', attrs={'class':['field-items','print-footer']})
+    category = 'news'
+    encoding = 'UTF-8'
+    keep_only_tags = [dict(attrs={'class':['print-title', 'print-submitted', 'print-content', 'print-footer', 'print-source_url', 'print-links']})]
+    no_javascript = True
+    no_stylesheets = True

-    feeds          = [(u'Contents', u'http://www.psychologytoday.com/articles/index.rss')]

-    def get_article_url(self, article):
-       return article.get('link',  None)
+    def parse_index(self):
+        """Build the 'Current Issue' feed from the magazine landing page;
+        sets self.cover_url and self.timefmt as side effects and returns
+        [('Current Issue', articles)] with print-view article URLs."""
+        soup = self.index_to_soup('http://www.psychologytoday.com/magazine')
+        #Go to the main body
+        div = soup.find('div',attrs={'id':'content-content'})
+        #Find cover & date
+        cover_item = div.find('div', attrs={'class':'collections-header-image'})
+        cover = cover_item.find('img',src=True)
+        self.cover_url = cover['src']
+        date = self.tag_to_string(cover['title'])
+        self.timefmt = u' [%s]'%date
+
+        articles = []
+        for post in div.findAll('div', attrs={'class':'collections-node-feature-info'}):
+            title = self.tag_to_string(post.find('h2'))
+            author_item=post.find('div', attrs={'class':'collection-node-byline'})
+            author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip())
+            title = title + u' (%s)'%author
+            article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href'])
+            print_page=article_page.find('li', attrs={'class':'print_html first'})
+            url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href']
+            desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip()
+            self.log('Found article:', title)
+            self.log('\t', url)
+            self.log('\t', desc)
+            articles.append({'title':title, 'url':url, 'date':'','description':desc})
+
+        for post in div.findAll('div', attrs={'class':'collections-node-thumbnail-info'}):
+            title = self.tag_to_string(post.find('h2'))
+            article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href'])
+            print_page=article_page.find('li', attrs={'class':'print_html first'})
+            description = post.find('div', attrs={'class':'collection-node-description'})
+            #Thumbnail entries: the byline is read from the node following the description
+            author = re.sub(r'.*by\s',"",self.tag_to_string(description.nextSibling).strip())
+            desc = self.tag_to_string(description).strip()
+            url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href']
+            title = title + u' (%s)'%author
+            self.log('Found article:', title)
+            self.log('\t', url)
+            self.log('\t', desc)
+            articles.append({'title':title, 'url':url, 'date':'','description':desc})
+
+        for post in div.findAll('li', attrs={'class':['collection-item-list-odd','collection-item-list-even']}):
+            title = self.tag_to_string(post.find('h2'))
+            author_item=post.find('div', attrs={'class':'collection-node-byline'})
+            author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip())
+            title = title + u' (%s)'%author
+            article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href'])
+            print_page=article_page.find('li', attrs={'class':'print_html first'})
+            url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href']
+            desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip()
+            self.log('Found article:', title)
+            self.log('\t', url)
+            self.log('\t', desc)
+            articles.append({'title':title, 'url':url, 'date':'','description':desc})
+
+        return [('Current Issue', articles)]

-    def get_obfuscated_article(self, url):
-        br = self.get_browser()
-        br.open(url)
-        response = br.follow_link(url_regex = r'/print/[0-9]+', nr = 0)
-        html = response.read()
-        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
-        self.temp_files[-1].write(html)
-        self.temp_files[-1].close()
-        return self.temp_files[-1].name

-    def get_cover_url(self):
-        index = 'http://www.psychologytoday.com/magazine/'
-        soup = self.index_to_soup(index)
-        for image in soup.findAll('img',{ "class" : "imagefield imagefield-field_magazine_cover" }):
-              return image['src'] + '.jpg'
-        return None
--- a/recipes/smith.recipe
+++ b/recipes/smith.recipe
@ -1,61 +1,67 @@
 import re
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from collections import OrderedDict

-class SmithsonianMagazine(BasicNewsRecipe):
-    title          = u'Smithsonian Magazine'
-    language       = 'en'
-    __author__     = 'Krittika Goyal and TerminalVeracity'
-    oldest_article = 31#days
-    max_articles_per_feed = 50
-    use_embedded_content = False
-    recursions = 1
-    cover_url = 'http://sphotos.xx.fbcdn.net/hphotos-snc7/431147_10150602715983253_764313347_n.jpg'
-    match_regexps = ['&page=[2-9]$']
-    preprocess_regexps = [
-        (re.compile(r'for more of Smithsonian\'s coverage on history, science and nature.', re.DOTALL), lambda m: '')
-        ]
-    extra_css             = """
-                               h1{font-size: large; margin: .2em 0}
-                               h2{font-size: medium; margin: .2em 0}
-                               h3{font-size: medium; margin: .2em 0}
-                               #byLine{margin: .2em 0}
-                               .articleImageCaptionwide{font-style: italic}
-                               .wp-caption-text{font-style: italic}
-                               img{display: block}
-                            """
+class Smithsonian(BasicNewsRecipe):

+    title       = 'Smithsonian Magazine'
+    __author__  = 'Rick Shang'

-    remove_stylesheets = True
-    remove_tags_after  = dict(name='div', attrs={'class':['post','articlePaginationWrapper']})
-    remove_tags = [
-       dict(name='iframe'),
-       dict(name='div', attrs={'class':['article_sidebar_border','viewMorePhotos','addtoany_share_save_container','meta','social','OUTBRAIN','related-articles-inpage']}),
-       dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large', 'most-popular-body_large','comment_section','article-related']}),
-       dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}),
-       dict(name='h4', attrs={'id':'related-topics'}),
-       dict(name='table'),
-       dict(name='a', attrs={'href':['/subArticleBottomWeb','/subArticleTopWeb','/subArticleTopMag','/subArticleBottomMag']}),
-       dict(name='a', attrs={'name':'comments_shaded'}),
-    ]
+    description = 'This magazine chronicles the arts, environment, sciences and popular culture of the times. It is edited for modern, well-rounded individuals with diverse, general interests. With your order, you become a National Associate Member of the Smithsonian. Membership benefits include your subscription to Smithsonian magazine, a personalized membership card, discounts from the Smithsonian catalog, and more.'
+    language = 'en'
+    category = 'news'
+    encoding = 'UTF-8'
+    keep_only_tags = [dict(attrs={'id':['articleTitle', 'subHead', 'byLine', 'articleImage', 'article-text']})]
+    remove_tags = [dict(attrs={'class':['related-articles-inpage', 'viewMorePhotos']})]
+    no_javascript = True
+    no_stylesheets = True

+    def parse_index(self):
+        """Build per-section feeds from the issue linked first on the archive
+        page; sets self.timefmt and self.cover_url as side effects and returns
+        a list of (section_title, [article dict, ...]) in first-seen order."""
+        #Go to the issue
+        soup0 = self.index_to_soup('http://www.smithsonianmag.com/issue/archive/')
+        div = soup0.find('div',attrs={'id':'archives'})
+        issue = div.find('ul',attrs={'class':'clear-both'})
+        current_issue_url = issue.find('a', href=True)['href']
+        soup = self.index_to_soup(current_issue_url)

-    feeds          = [
-('History and Archeology',
- 'http://feeds.feedburner.com/smithsonianmag/history-archaeology'),
-('People and Places',
- 'http://feeds.feedburner.com/smithsonianmag/people-places'),
-('Science and Nature',
- 'http://feeds.feedburner.com/smithsonianmag/science-nature'),
-('Arts and Culture',
- 'http://feeds.feedburner.com/smithsonianmag/arts-culture'),
-('Travel',
- 'http://feeds.feedburner.com/smithsonianmag/travel'),
-]
+        #Go to the main body
+        div = soup.find ('div', attrs={'id':'content-inset'})
+
+        #Find date
+        date = re.sub(r'.*\:\W*', "", self.tag_to_string(div.find('h2')).strip())
+        self.timefmt = u' [%s]'%date
+
+        #Find cover
+        self.cover_url = div.find('img',src=True)['src']
+
+        feeds = OrderedDict()
+        section_title = ''
+        for post in div.findAll('div', attrs={'class':['plainModule', 'departments plainModule']}):
+            articles = []
+            prefix = ''
+            h3=post.find('h3')
+            if h3 is not None:
+                section_title = self.tag_to_string(h3)
+            else:
+                subsection=post.find('p',attrs={'class':'article-cat'})
+                link=post.find('a',href=True)
+                url=link['href']+'?c=y&story=fullstory'
+                if subsection is not None:
+                    subsection_title = self.tag_to_string(subsection)
+                    prefix = (subsection_title+': ')
+                    description=self.tag_to_string(post('p', limit=2)[1]).strip()
+                else:
+                    description=self.tag_to_string(post.find('p')).strip()
+                #flags= must be a keyword: the 4th positional argument of re.sub is count
+                desc=re.sub(r'\sBy\s.*', '', description, flags=re.DOTALL)
+                author=re.sub(r'.*By\s', '', description, flags=re.DOTALL)
+                title=prefix + self.tag_to_string(link).strip()+ u' (%s)'%author
+                articles.append({'title':title, 'url':url, 'description':desc, 'date':''})
+
+            if articles:
+                feeds.setdefault(section_title, []).extend(articles)
+        return list(feeds.items())

-    def preprocess_html(self, soup):
-        story = soup.find(name='div', attrs={'id':'article-body'})
-        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
-        body = soup.find(name='body')
-        body.insert(0, story)
-        return soup
--- a/recipes/the_new_republic.recipe
+++ b/recipes/the_new_republic.recipe
@ -1,45 +1,64 @@
-from calibre.web.feeds.news import BasicNewsRecipe
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from collections import OrderedDict

-class The_New_Republic(BasicNewsRecipe):
-    title = 'The New Republic'
-    __author__ = 'cix3'
+class TNR(BasicNewsRecipe):
+
+    title       = 'The New Republic'
+    __author__  = 'Rick Shang'
+
+    description = 'The New Republic is a journal of opinion with an emphasis on politics and domestic and international affairs. It carries feature articles by staff and contributing editors. The second half of each issue is devoted to book and the arts, theater, motion pictures, music and art.'
    language = 'en'
-    description = 'Intelligent, stimulating and rigorous examination of American politics, foreign policy and culture'
-    timefmt = ' [%b %d, %Y]'
-
-    oldest_article = 7
-    max_articles_per_feed = 100
+    category = 'news'
+    encoding = 'UTF-8'
+    remove_tags = [dict(attrs={'class':['print-logo','print-site_name','print-hr']})]
+    no_javascript = True
    no_stylesheets = True

-    remove_tags = [
-            dict(name='div', attrs={'class':['print-logo', 'print-site_name', 'img-left', 'print-source_url']}),
-            dict(name='hr', attrs={'class':'print-hr'}), dict(name='img')
-            ]

-    feeds = [
-        ('Politics', 'http://www.tnr.com/rss/articles/Politics'),
-        ('Books and Arts', 'http://www.tnr.com/rss/articles/Books-and-Arts'),
-        ('Economy', 'http://www.tnr.com/rss/articles/Economy'),
-        ('Environment and Energy', 'http://www.tnr.com/rss/articles/Environment-%2526-Energy'),
-        ('Health Care', 'http://www.tnr.com/rss/articles/Health-Care'),
-        ('Metro Policy', 'http://www.tnr.com/rss/articles/Metro-Policy'),
-        ('World', 'http://www.tnr.com/rss/articles/World'),
-        ('Film', 'http://www.tnr.com/rss/articles/Film'),
-        ('Books', 'http://www.tnr.com/rss/articles/books'),
-        ('The Book', 'http://www.tnr.com/rss/book'),
-        ('Jonathan Chait', 'http://www.tnr.com/rss/blogs/Jonathan-Chait'),
-        ('The Plank', 'http://www.tnr.com/rss/blogs/The-Plank'),
-        ('The Treatment', 'http://www.tnr.com/rss/blogs/The-Treatment'),
-        ('The Spine', 'http://www.tnr.com/rss/blogs/The-Spine'),
-        ('The Vine', 'http://www.tnr.com/rss/blogs/The-Vine'),
-        ('The Avenue', 'http://www.tnr.com/rss/blogs/The-Avenue'),
-        ('William Galston', 'http://www.tnr.com/rss/blogs/William-Galston'),
-        ('Simon Johnson', 'http://www.tnr.com/rss/blogs/Simon-Johnson'),
-        ('Ed Kilgore', 'http://www.tnr.com/rss/blogs/Ed-Kilgore'),
-        ('Damon Linker', 'http://www.tnr.com/rss/blogs/Damon-Linker'),
-        ('John McWhorter', 'http://www.tnr.com/rss/blogs/John-McWhorter')
-            ]
+    def parse_index(self):
+        """Build per-section feeds from the current issue's contents page;
+        sets self.timefmt and self.cover_url as side effects and returns
+        a list of (section_title, [article dict, ...]) in first-seen order."""

-    def print_version(self, url):
-        return url.replace('http://www.tnr.com/', 'http://www.tnr.com/print/')
+        #Go to the issue
+        soup0 = self.index_to_soup('http://www.tnr.com/magazine-issues')
+        issue = soup0.find('div',attrs={'id':'current_issue'})

+        #Find date
+        date = self.tag_to_string(issue.find('div',attrs={'class':'date'})).strip()
+        self.timefmt = u' [%s]'%date
+
+        #Go to the main body
+        current_issue_url = 'http://www.tnr.com' + issue.find('a', href=True)['href']
+        soup = self.index_to_soup(current_issue_url)
+        div = soup.find ('div', attrs={'class':'article_detail_body'})
+
+
+        #Find cover
+        self.cover_url = div.find('img',src=True)['src']
+
+        feeds = OrderedDict()
+        section_title = ''
+        subsection_title = ''
+        for post in div.findAll('p'):
+            articles = []
+            em=post.find('em')
+            b=post.find('b')
+            a=post.find('a',href=True)
+            if em is not None:
+                section_title = self.tag_to_string(em).strip()
+                subsection_title = ''
+            elif b is not None:
+                subsection_title=self.tag_to_string(b).strip()
+            elif a is not None:
+                prefix = (subsection_title+': ') if subsection_title else ''
+                url=re.sub('www.tnr.com','www.tnr.com/print', a['href'])
+                #flags= must be a keyword: the 4th positional argument of re.sub is count
+                author=re.sub(r'.*by\s', '', self.tag_to_string(post), flags=re.DOTALL)
+                title=prefix + self.tag_to_string(a).strip()+ u' (%s)'%author
+                articles.append({'title':title, 'url':url, 'description':'', 'date':''})
+
+            if articles:
+                feeds.setdefault(section_title, []).extend(articles)
+        return list(feeds.items())