Update Chronicle of Higher Education

2025-07-09 03:04:10 -04:00 · 2012-09-25 11:08:02 +05:30 · 2012-09-25 11:08:02 +05:30 · 984d2b8b76
commit 984d2b8b76
parent 952f5709b0
1 changed file with 11 additions and 5 deletions
--- a/recipes/chronicle_higher_ed.recipe
+++ b/recipes/chronicle_higher_ed.recipe
@@ -1,3 +1,4 @@
+import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from collections import OrderedDict

@@ -14,7 +15,8 @@ class Chronicle(BasicNewsRecipe):
            dict(name='div', attrs={'class':'article'}),
            ]
    remove_tags = [dict(name='div',attrs={'class':['related module1','maintitle']}),
-            dict(name='div', attrs={'id':['section-nav','icon-row']})]
+            dict(name='div', attrs={'id':['section-nav','icon-row', 'enlarge-popup']}),
+            dict(name='a', attrs={'class':'show-enlarge enlarge'})]
    no_javascript = True
    no_stylesheets = True

@@ -31,7 +33,6 @@ class Chronicle(BasicNewsRecipe):
        return br

    def parse_index(self):
-
        #Go to the issue
        soup0 = self.index_to_soup('http://chronicle.com/section/Archives/39/')
        issue = soup0.find('ul',attrs={'class':'feature-promo-list'}).li
@@ -42,9 +43,12 @@ class Chronicle(BasicNewsRecipe):
        self.timefmt = u' [%s]'%dates

        #Find cover
-        cover=soup0.find('div',attrs={'class':'promo'}).findNext('div')
-        self.cover_url="http://chronicle.com"+cover.find('img')['src']
-
+        cover=soup0.find('div',attrs={'class':'side-content'}).find(attrs={'src':re.compile("photos/biz/Current")})
+        if cover is not None:
+            if "chronicle.com" in cover['src']:
+                self.cover_url=cover['src']
+            else:
+                self.cover_url="http://chronicle.com" + cover['src']
        #Go to the main body
        soup = self.index_to_soup(issueurl)
        div = soup.find ('div', attrs={'id':'article-body'})
@@ -74,8 +78,10 @@ class Chronicle(BasicNewsRecipe):
    def preprocess_html(self,soup):
        #process all the images
        for div in soup.findAll('div', attrs={'class':'tableauPlaceholder'}):
+
            noscripts=div.find('noscript').a
            div.replaceWith(noscripts)
        for div0 in soup.findAll('div',text='Powered by Tableau'):
            div0.extract()
        return soup
+