Update The Independent

2025-10-26 00:02:25 -04:00 · 2012-06-09 08:18:30 +05:30 · 2012-06-09 08:18:30 +05:30 · c2edf7a890
commit c2edf7a890
parent 47a1649a22
1 changed files with 82 additions and 21 deletions
--- a/recipes/independent.recipe
+++ b/recipes/independent.recipe
@ -15,6 +15,10 @@ class TheIndependentNew(BasicNewsRecipe):
    # Flag to enable/disable image fetching (not business)
    _FETCH_IMAGES = True
    # Maximum number of images taken from a linked gallery (respects
    # _FETCH_IMAGES). Use -1 for no limit; when a gallery holds more images
    # than this value, only that many are fetched and the "view gallery"
    # link is kept.
    _MAX_GALLERY_IMAGES = -1
    # URL of the star image used for converting a rating to stars
    _STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png'
@ -41,6 +45,7 @@ class TheIndependentNew(BasicNewsRecipe):
                               dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
                               dict(attrs={'class' : ['autoplay','openBiogPopup']}),
                               dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
                               dict(name='img',attrs={'alt' : ['view gallery']}),
                               dict(attrs={'style' : re.compile('.*')}),
                             ]
@ -171,20 +176,38 @@ class TheIndependentNew(BasicNewsRecipe):
                for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
                    if item.img is not None:
                        #use full size image
                        images = []
                        img = item.findNext('img')
                        if  not '?action=gallery' in item['href']:
                            img['src'] = item['href']
                        #insert caption if available
                        if img.get('title') and (len(img['title']) > 1):
                            tag = Tag(soup,'h3')
                            text = ''
                            try:
                                text = img['data-title']
                            except:
                                pass
                            if img.get('title') and (len(img['title']) > 1):
                                text = NavigableString(img['title'])
                            tag.insert(0,text)
-
+                            images.append((img, tag))
                        else:
                            gallery_images, remove_link = self._get_gallery_images(item['href'])
                            images = images + gallery_images
                            if remove_link:
                                gal_link = soup.find('a',attrs={'id' : 'view-gallery'})
                                if gal_link:
                                    gal_link.extract()
                            img.extract()
                        for (img, title) in images:
                            #insert caption if available
                            if title:
                                #picture before text
                                img.extract()
                                item.insert(0,img)
-                            item.insert(1,tag)
+                                item.insert(1,title)
                            # remove link
                            item.name = "div"
@ -324,6 +347,44 @@ class TheIndependentNew(BasicNewsRecipe):
        return soup
    def _get_gallery_images(self,url):
        """Collect the images of the gallery whose first page is ``url``.

        The number of pages is read from the ``counter`` div of the first
        page (text of the form ``"<current>/<total>"``); if it cannot be
        parsed the gallery is treated as a single page. Page ``n`` (for
        ``n >= 2``) is fetched from ``url + '&ino=' + str(n)``.

        Returns a tuple ``(images, remove_link)``:

        * ``images`` -- list of ``(image, title)`` pairs in reverse page
          order (presumably because the caller inserts each one at the
          front of its container -- confirm against the caller).
        * ``remove_link`` -- ``False`` when the gallery was truncated to
          ``_MAX_GALLERY_IMAGES``, so the link to the full gallery should
          be kept; ``True`` otherwise.
        """
        gallery_soup = self.index_to_soup(url)
        images = []
        remove_link = True
        try:
            counter = gallery_soup.find('div',attrs={'id' : ['counter']})
            total = int(counter.contents[0].split('/')[1].rstrip())
        except Exception:
            # Best effort: missing or malformed counter means one image.
            total = 1

        # A negative limit means "no limit".
        if self._MAX_GALLERY_IMAGES >= 0 and total > self._MAX_GALLERY_IMAGES:
            total = self._MAX_GALLERY_IMAGES
            remove_link = False

        for page in range(1, total + 1):
            if page > 1:
                # Page 1 is already loaded; only fetch pages we will use
                # (never request the page after the last one).
                gallery_soup = self.index_to_soup(url + '&ino=' + str(page))
            found = self._get_image_from_gallery(gallery_soup)
            if not found:
                # The helper may signal failure with None instead of a pair.
                continue
            image, title = found
            if image:
                images.append((image,title))

        images.reverse()
        return images, remove_link
    def _get_image_from_gallery(self,soup):
        """Extract the main image and its caption from one gallery page.

        ``soup`` is the parsed gallery page. The image is the first ``img``
        inside the div with id ``main-image``; the caption is the div with
        id ``image-title``.

        Always returns a 2-tuple ``(image, title)`` so callers can unpack
        the result safely. ``title`` is ``None`` when the page has no
        caption; ``(None, None)`` is returned when no image is found or the
        page cannot be processed.
        """
        try:
            container = soup.find('div',attrs={'id' : ['main-image']})
            image = container.find('img')
            if not image:
                return None, None
            title = soup.find('div',attrs={'id' : ['image-title']})
            return image, title
        except Exception:
            # Best effort: a broken gallery page must not abort the article.
            # Parenthesised print behaves the same on Python 2 and 3.
            print('error fetching gallery image')
            return None, None
    def _recurisvely_linearise_tag_tree(
        self,
        item,