Update The New Republic

This commit is contained in:
Kovid Goyal 2013-05-01 22:56:54 +05:30
parent f013d5e371
commit c504bbab3d

View File

@@ -1,68 +1,63 @@
import re
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe
class TNR(BasicNewsRecipe):

    '''Calibre news recipe for The New Republic (newrepublic.com).

    Logs in with the user's subscription credentials, scrapes the
    current-issue page, and builds a single feed whose article URLs
    point at the printable version of each article.
    '''

    title = 'The New Republic'
    __author__ = 'Krittika Goyal'
    description = '''The New Republic is a journal of opinion with an emphasis
    on politics and domestic and international affairs. It carries feature
    articles by staff and contributing editors. The second half of each issue
    is devoted to book and the arts, theater, motion pictures, music and art.'''

    language = 'en'
    encoding = 'UTF-8'
    # A subscription is required; calibre will prompt for credentials
    # and get_browser() below performs the actual login.
    needs_subscription = True

    # Strip HTML comments and <script> blocks from fetched pages.
    # parse_index() also applies these by hand to the raw index page.
    preprocess_regexps = [
        (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
        (re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
    ]

    def get_browser(self):
        '''Return a mechanize browser logged in to newrepublic.com.

        Raises ValueError when the credentials are rejected, i.e. the
        page returned after submitting the form lacks 'SIGN OUT'.
        '''
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://www.newrepublic.com/user')
        br.select_form(nr=1)
        # The login form's username control has been seen under two
        # names; try 'user' first and fall back to 'name'.
        # NOTE: narrowed from a bare `except:` so SystemExit and
        # KeyboardInterrupt are no longer swallowed here.
        try:
            br['user'] = self.username
        except Exception:
            br['name'] = self.username
        br['pass'] = self.password
        self.log('Logging in...')
        raw = br.submit().read()
        if 'SIGN OUT' not in raw:
            raise ValueError(
                'Failed to log in to tnr.com, check your username and password')
        self.log('Logged in successfully')
        return br

    def parse_index(self):
        '''Build the article list from the current-issue page.

        Returns a list with a single (feed_title, articles) tuple; each
        article dict points at the printable node page for the article.
        '''
        raw = self.index_to_soup('http://www.newrepublic.com/current-issue', raw=True)
        # raw = self.index_to_soup(open('/t/raw.html').read().decode('utf-8'), raw=True)
        # preprocess_regexps is normally applied to article pages only,
        # so run the same substitutions over the raw index page by hand.
        for pat, sub in self.preprocess_regexps:
            raw = pat.sub(sub, raw)
        soup = self.index_to_soup(raw)
        feed_title = 'The New Republic Magazine Articles'

        articles = []
        for div in soup.findAll('div', attrs={'class':lambda x: x and 'field-item' in x.split()}):
            # Skip author links: take the first anchor whose class is
            # not 'author'.
            a = div.find('a', href=True, attrs={'class':lambda x: x != 'author'})
            if a is not None:
                art_title = self.tag_to_string(a)
                url = a.get('href')
                # Article URLs embed a numeric node id; rewrite to the
                # printable version of that node.
                num = re.search(r'/(\d+)/', url)
                if num is not None:
                    art = num.group(1)
                    url = 'http://www.newrepublic.com/node/%s/print'%art
                    self.log.info('\tFound article:', art_title, 'at', url)
                    article = {'title':art_title, 'url':url, 'description':'', 'date':''}
                    articles.append(article)

        return [(feed_title, articles)]