diff --git a/recipes/independent.recipe b/recipes/independent.recipe index 7090b64077..5e746145ee 100644 --- a/recipes/independent.recipe +++ b/recipes/independent.recipe @@ -15,6 +15,10 @@ class TheIndependentNew(BasicNewsRecipe): #Flag to enable/disable image fetching (not business) _FETCH_IMAGES = True + #Set max gallery images here (respects _FETCH_IMAGES) + # -1 for infinite + _MAX_GALLERY_IMAGES = -1 + #used for converting rating to stars _STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png' @@ -41,6 +45,7 @@ class TheIndependentNew(BasicNewsRecipe): dict(attrs={'id' : ['RelatedArtTag','renderBiography']}), dict(attrs={'class' : ['autoplay','openBiogPopup']}), dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}), + dict(name='img',attrs={'alt' : ['view gallery']}), dict(attrs={'style' : re.compile('.*')}), ] @@ -119,15 +124,15 @@ class TheIndependentNew(BasicNewsRecipe): if len(para.contents) and isinstance(para.contents[0],NavigableString) \ and para.contents[0] == 'ADVERTORIAL FEATURE': return None - - # remove Suggested Topics + + # remove Suggested Topics items_to_extract = [] - + for item in soup.findAll('div',attrs={'class' : re.compile('.*RelatedArtTag.*')}): items_to_extract.append(item) - + for item in items_to_extract: - item.extract() + item.extract() items_to_extract = [] slideshow_elements = [] @@ -171,25 +176,43 @@ class TheIndependentNew(BasicNewsRecipe): for item in element.findAll('a',attrs={'href' : re.compile('.*')}): if item.img is not None: #use full size image + images = [] + img = item.findNext('img') - img['src'] = item['href'] - - #insert caption if available - if img.get('title') and (len(img['title']) > 1): + if not '?action=gallery' in item['href']: + img['src'] = item['href'] tag = Tag(soup,'h3') - text = NavigableString(img['title']) + text = '' + try: + text = img['data-title'] + except: + pass + + if img.get('title') and (len(img['title']) > 1): + text = NavigableString(img['title']) tag.insert(0,text) - - #picture before text + images.append((img, tag)) + else: + gallery_images, remove_link = self._get_gallery_images(item['href']) + images = images + gallery_images + if remove_link: + gal_link = soup.find('a',attrs={'id' : 'view-gallery'}) + if gal_link: + gal_link.extract() img.extract() - item.insert(0,img) - item.insert(1,tag) + for (img, title) in images: + #insert caption if available + if title: + #picture before text + img.extract() + item.insert(0,img) + item.insert(1,title) - # remove link - item.name = "div" - item["class"]='image' - del item["href"] + # remove link + item.name = "div" + item["class"]='image' + del item["href"] #remove empty subtitles @@ -317,13 +340,51 @@ class TheIndependentNew(BasicNewsRecipe): for item in items_to_extract: item.extract() - - # nickredding's fix for non-justified text + + # nickredding's fix for non-justified text for ptag in soup.findAll('p',attrs={'align':'left'}): del(ptag['align']) - + return soup + def _get_gallery_images(self,url): + gallery_soup = self.index_to_soup(url) + images = [] + remove_link = True + total = 1 + try: + counter = gallery_soup.find('div',attrs={'id' : ['counter']}) + total = counter.contents[0].split('/') + total = int(total[1].rstrip()) + except: + total = 1 + + if self._MAX_GALLERY_IMAGES >= 0 and total > self._MAX_GALLERY_IMAGES: + total = self._MAX_GALLERY_IMAGES + remove_link = False + + for i in range(1, total +1): + image, title = self._get_image_from_gallery(gallery_soup) + if image: + images.append((image,title)) + next = url + '&ino=' + str(i + 1) + gallery_soup = self.index_to_soup(next) + images.reverse() + return images, remove_link + + def _get_image_from_gallery(self,soup): + try: + container = soup.find('div',attrs={'id' : ['main-image']}) + image = container.find('img') + if image: + title = soup.find('div',attrs={'id' : ['image-title']}) + return image, title + except: + print 'error fetching gallery image' + return None + + + def _recurisvely_linearise_tag_tree( self, item,