Update The Independent

This commit is contained in:
Kovid Goyal 2012-06-09 08:18:30 +05:30
parent 47a1649a22
commit c2edf7a890

View File

@ -15,6 +15,10 @@ class TheIndependentNew(BasicNewsRecipe):
#Flag to enable/disable image fetching (not business) #Flag to enable/disable image fetching (not business)
_FETCH_IMAGES = True _FETCH_IMAGES = True
#Set max gallery images here (respects _FETCH_IMAGES)
# -1 for infinite
_MAX_GALLERY_IMAGES = -1
#used for converting rating to stars #used for converting rating to stars
_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png' _STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png'
@ -41,6 +45,7 @@ class TheIndependentNew(BasicNewsRecipe):
dict(attrs={'id' : ['RelatedArtTag','renderBiography']}), dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
dict(attrs={'class' : ['autoplay','openBiogPopup']}), dict(attrs={'class' : ['autoplay','openBiogPopup']}),
dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}), dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
dict(name='img',attrs={'alt' : ['view gallery']}),
dict(attrs={'style' : re.compile('.*')}), dict(attrs={'style' : re.compile('.*')}),
] ]
@ -171,20 +176,38 @@ class TheIndependentNew(BasicNewsRecipe):
for item in element.findAll('a',attrs={'href' : re.compile('.*')}): for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
if item.img is not None: if item.img is not None:
#use full size image #use full size image
images = []
img = item.findNext('img') img = item.findNext('img')
if not '?action=gallery' in item['href']:
img['src'] = item['href'] img['src'] = item['href']
#insert caption if available
if img.get('title') and (len(img['title']) > 1):
tag = Tag(soup,'h3') tag = Tag(soup,'h3')
text = ''
try:
text = img['data-title']
except:
pass
if img.get('title') and (len(img['title']) > 1):
text = NavigableString(img['title']) text = NavigableString(img['title'])
tag.insert(0,text) tag.insert(0,text)
images.append((img, tag))
else:
gallery_images, remove_link = self._get_gallery_images(item['href'])
images = images + gallery_images
if remove_link:
gal_link = soup.find('a',attrs={'id' : 'view-gallery'})
if gal_link:
gal_link.extract()
img.extract()
for (img, title) in images:
#insert caption if available
if title:
#picture before text #picture before text
img.extract() img.extract()
item.insert(0,img) item.insert(0,img)
item.insert(1,tag) item.insert(1,title)
# remove link # remove link
item.name = "div" item.name = "div"
@ -324,6 +347,44 @@ class TheIndependentNew(BasicNewsRecipe):
return soup return soup
def _get_gallery_images(self,url):
    """Collect the images of a gallery by walking its pages one by one.

    The first gallery page is fetched from ``url``; every further page is
    fetched from ``url + '&ino=<n>'`` with ``n`` starting at 2.

    Returns a tuple ``(images, remove_link)``:

    * ``images`` is a list of ``(image, title)`` pairs as produced by
      ``self._get_image_from_gallery``, in reverse page order.
    * ``remove_link`` is True when every image the gallery advertises was
      requested, and False when the count was cut down to
      ``self._MAX_GALLERY_IMAGES``.
    """
    gallery_soup = self.index_to_soup(url)
    images = []
    remove_link = True

    # The 'counter' div is parsed as "<something>/<total>"; presumably it
    # reads like "1/12". If it is missing or malformed, fall back to a
    # single image rather than failing the whole article.
    try:
        counter = gallery_soup.find('div',attrs={'id' : ['counter']})
        total = int(counter.contents[0].split('/')[1].rstrip())
    except Exception:
        total = 1

    # A negative _MAX_GALLERY_IMAGES means "no limit".
    if self._MAX_GALLERY_IMAGES >= 0 and total > self._MAX_GALLERY_IMAGES:
        total = self._MAX_GALLERY_IMAGES
        remove_link = False

    for i in range(1, total + 1):
        # Tolerate a helper that reports failure as a bare None instead
        # of a pair, so one broken page cannot abort the whole gallery.
        found = self._get_image_from_gallery(gallery_soup)
        image, title = found if found else (None, None)
        if image:
            images.append((image, title))
        # Only fetch a following page when there is one left to process;
        # this avoids a useless request after the last image.
        if i < total:
            next_url = url + '&ino=' + str(i + 1)
            gallery_soup = self.index_to_soup(next_url)

    # Handed back last-page-first; the visible caller inserts each image
    # at index 0 of the link tag, which restores the original order.
    images.reverse()
    return images, remove_link
def _get_image_from_gallery(self,soup):
    """Extract the main image and its title from one parsed gallery page.

    Looks for the first ``img`` inside the div with id ``main-image`` and
    for the div with id ``image-title``.

    Returns a pair ``(image, title)``. ``title`` may be None when the page
    has no title div. When no image can be found, or the lookup fails,
    ``(None, None)`` is returned so that callers can always unpack the
    result into two names.
    """
    try:
        container = soup.find('div',attrs={'id' : ['main-image']})
        # container is None when the div is absent; the resulting
        # AttributeError is handled below as "no image on this page".
        image = container.find('img')
        if image:
            title = soup.find('div',attrs={'id' : ['image-title']})
            return image, title
    except Exception:
        # Best effort: a broken gallery page must not abort the download.
        print('error fetching gallery image')
    return None, None
def _recurisvely_linearise_tag_tree( def _recurisvely_linearise_tag_tree(
self, self,
item, item,