This commit is contained in:
Kovid Goyal 2011-11-26 07:58:02 +05:30
parent ac8cbdd168
commit d134be5b1b

View File

@ -39,7 +39,9 @@ class TheIndependentNew(BasicNewsRecipe):
encoding = 'utf-8'
remove_tags =[
dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
dict(attrs={'class' : ['autoplay','openBiogPopup']})
dict(attrs={'class' : ['autoplay','openBiogPopup']}),
dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
dict(attrs={'style' : re.compile('.*')}),
]
keep_only_tags =[dict(attrs={'id':'main'})]
@ -113,6 +115,7 @@ class TheIndependentNew(BasicNewsRecipe):
return None
items_to_extract = []
slideshow_elements = []
for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
remove = True
@ -131,6 +134,7 @@ class TheIndependentNew(BasicNewsRecipe):
if (pattern.search(item['class'])) is not None:
if self._FETCH_IMAGES:
remove = False
slideshow_elements.append(item)
else:
remove = True
@ -148,7 +152,8 @@ class TheIndependentNew(BasicNewsRecipe):
items_to_extract = []
if self._FETCH_IMAGES:
for item in soup.findAll('a',attrs={'href' : re.compile('.*')}):
for element in slideshow_elements:
for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
if item.img is not None:
#use full size image
img = item.findNext('img')
@ -156,7 +161,7 @@ class TheIndependentNew(BasicNewsRecipe):
img['src'] = item['href']
#insert caption if available
if img['title'] is not None and (len(img['title']) > 1):
if img.get('title') and (len(img['title']) > 1):
tag = Tag(soup,'h3')
text = NavigableString(img['title'])
tag.insert(0,text)
@ -283,7 +288,7 @@ class TheIndependentNew(BasicNewsRecipe):
items_to_extract = []
for item in soup.findAll('div', attrs={'class' : 'image'}):
img = item.findNext('img')
if img is not None and img['src'] is not None:
if img and img.get('src'):
# broken images still point to remote url
pattern = re.compile('http://www.independent.co.uk.*')
if pattern.match(img["src"]) is not None: