mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Update The Independent
This commit is contained in:
parent
47a1649a22
commit
c2edf7a890
@ -15,6 +15,10 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
#Flag to enable/disable image fetching (not business)
|
||||
_FETCH_IMAGES = True
|
||||
|
||||
#Set max gallery images here (respects _FETCH_IMAGES)
|
||||
# -1 for infinite
|
||||
_MAX_GALLERY_IMAGES = -1
|
||||
|
||||
|
||||
#used for converting rating to stars
|
||||
_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png'
|
||||
@ -41,6 +45,7 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
|
||||
dict(attrs={'class' : ['autoplay','openBiogPopup']}),
|
||||
dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
|
||||
dict(name='img',attrs={'alt' : ['view gallery']}),
|
||||
dict(attrs={'style' : re.compile('.*')}),
|
||||
]
|
||||
|
||||
@ -119,15 +124,15 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
if len(para.contents) and isinstance(para.contents[0],NavigableString) \
|
||||
and para.contents[0] == 'ADVERTORIAL FEATURE':
|
||||
return None
|
||||
|
||||
# remove Suggested Topics
|
||||
|
||||
# remove Suggested Topics
|
||||
items_to_extract = []
|
||||
|
||||
|
||||
for item in soup.findAll('div',attrs={'class' : re.compile('.*RelatedArtTag.*')}):
|
||||
items_to_extract.append(item)
|
||||
|
||||
|
||||
for item in items_to_extract:
|
||||
item.extract()
|
||||
item.extract()
|
||||
|
||||
items_to_extract = []
|
||||
slideshow_elements = []
|
||||
@ -171,25 +176,43 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
|
||||
if item.img is not None:
|
||||
#use full size image
|
||||
images = []
|
||||
|
||||
img = item.findNext('img')
|
||||
|
||||
img['src'] = item['href']
|
||||
|
||||
#insert caption if available
|
||||
if img.get('title') and (len(img['title']) > 1):
|
||||
if not '?action=gallery' in item['href']:
|
||||
img['src'] = item['href']
|
||||
tag = Tag(soup,'h3')
|
||||
text = NavigableString(img['title'])
|
||||
text = ''
|
||||
try:
|
||||
text = img['data-title']
|
||||
except:
|
||||
pass
|
||||
|
||||
if img.get('title') and (len(img['title']) > 1):
|
||||
text = NavigableString(img['title'])
|
||||
tag.insert(0,text)
|
||||
|
||||
#picture before text
|
||||
images.append((img, tag))
|
||||
else:
|
||||
gallery_images, remove_link = self._get_gallery_images(item['href'])
|
||||
images = images + gallery_images
|
||||
if remove_link:
|
||||
gal_link = soup.find('a',attrs={'id' : 'view-gallery'})
|
||||
if gal_link:
|
||||
gal_link.extract()
|
||||
img.extract()
|
||||
item.insert(0,img)
|
||||
item.insert(1,tag)
|
||||
for (img, title) in images:
|
||||
#insert caption if available
|
||||
if title:
|
||||
#picture before text
|
||||
img.extract()
|
||||
item.insert(0,img)
|
||||
item.insert(1,title)
|
||||
|
||||
# remove link
|
||||
item.name = "div"
|
||||
item["class"]='image'
|
||||
del item["href"]
|
||||
# remove link
|
||||
item.name = "div"
|
||||
item["class"]='image'
|
||||
del item["href"]
|
||||
|
||||
|
||||
#remove empty subtitles
|
||||
@ -317,13 +340,51 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
|
||||
for item in items_to_extract:
|
||||
item.extract()
|
||||
|
||||
# nickredding's fix for non-justified text
|
||||
|
||||
# nickredding's fix for non-justified text
|
||||
for ptag in soup.findAll('p',attrs={'align':'left'}):
|
||||
del(ptag['align'])
|
||||
|
||||
|
||||
return soup
|
||||
|
||||
def _get_gallery_images(self,url):
    """Collect the images of a gallery, fetching one gallery page per image.

    url is the gallery link; page ``k`` (for k > 1) is requested as
    ``url + '&ino=' + str(k)``.

    Returns a tuple ``(images, remove_link)``:
      * images is a list of ``(image, title)`` pairs in reverse page order
        (the caller inserts each one at position 0, which restores the
        original order).
      * remove_link is False when the gallery was truncated because of
        _MAX_GALLERY_IMAGES, True otherwise.
    """
    gallery_soup = self.index_to_soup(url)
    images = []
    remove_link = True

    # The counter div is expected to read "<current>/<total>". If it is
    # missing or malformed, fall back to treating the page as one image.
    try:
        counter = gallery_soup.find('div',attrs={'id' : ['counter']})
        total = int(counter.contents[0].split('/')[1].rstrip())
    except Exception:
        total = 1

    # A negative _MAX_GALLERY_IMAGES means "no limit".
    if self._MAX_GALLERY_IMAGES >= 0 and total > self._MAX_GALLERY_IMAGES:
        total = self._MAX_GALLERY_IMAGES
        remove_link = False

    for i in range(1, total + 1):
        # The helper may signal failure with None; do not unpack blindly.
        result = self._get_image_from_gallery(gallery_soup)
        if result:
            image, title = result
            if image:
                images.append((image, title))
        # Only fetch the following page when there is one left to read;
        # avoids a wasted request after the last image.
        if i < total:
            gallery_soup = self.index_to_soup(url + '&ino=' + str(i + 1))

    images.reverse()
    return images, remove_link
|
||||
|
||||
def _get_image_from_gallery(self,soup):
    """Extract the main image and its title element from a gallery page.

    soup is the parsed gallery page. The image is the first ``img`` inside
    the div with id ``main-image``; the title is the div with id
    ``image-title`` (None if the page has none).

    Returns ``(image, title)`` on success and ``(None, None)`` when no image
    could be found, so callers can always unpack the result into two names.
    """
    try:
        container = soup.find('div',attrs={'id' : ['main-image']})
        image = container.find('img')
        if image:
            title = soup.find('div',attrs={'id' : ['image-title']})
            return image, title
    except Exception:
        # Best effort: a malformed page must not abort the whole article.
        # Call form prints the same text on Python 2 and Python 3.
        print('error fetching gallery image')
    return None, None
|
||||
|
||||
|
||||
|
||||
def _recurisvely_linearise_tag_tree(
|
||||
self,
|
||||
item,
|
||||
|
Loading…
x
Reference in New Issue
Block a user