This commit is contained in:
Kovid Goyal 2011-11-26 07:58:02 +05:30
parent ac8cbdd168
commit d134be5b1b

View File

@@ -39,7 +39,9 @@ class TheIndependentNew(BasicNewsRecipe):
encoding = 'utf-8' encoding = 'utf-8'
remove_tags =[ remove_tags =[
dict(attrs={'id' : ['RelatedArtTag','renderBiography']}), dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
dict(attrs={'class' : ['autoplay','openBiogPopup']}) dict(attrs={'class' : ['autoplay','openBiogPopup']}),
dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
dict(attrs={'style' : re.compile('.*')}),
] ]
keep_only_tags =[dict(attrs={'id':'main'})] keep_only_tags =[dict(attrs={'id':'main'})]
@@ -113,6 +115,7 @@ class TheIndependentNew(BasicNewsRecipe):
return None return None
items_to_extract = [] items_to_extract = []
slideshow_elements = []
for item in soup.findAll(attrs={'class' : re.compile("widget.*")}): for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
remove = True remove = True
@@ -131,6 +134,7 @@ class TheIndependentNew(BasicNewsRecipe):
if (pattern.search(item['class'])) is not None: if (pattern.search(item['class'])) is not None:
if self._FETCH_IMAGES: if self._FETCH_IMAGES:
remove = False remove = False
slideshow_elements.append(item)
else: else:
remove = True remove = True
@@ -148,28 +152,29 @@ class TheIndependentNew(BasicNewsRecipe):
items_to_extract = [] items_to_extract = []
if self._FETCH_IMAGES: if self._FETCH_IMAGES:
for item in soup.findAll('a',attrs={'href' : re.compile('.*')}): for element in slideshow_elements:
if item.img is not None: for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
#use full size image if item.img is not None:
img = item.findNext('img') #use full size image
img = item.findNext('img')
img['src'] = item['href'] img['src'] = item['href']
#insert caption if available #insert caption if available
if img['title'] is not None and (len(img['title']) > 1): if img.get('title') and (len(img['title']) > 1):
tag = Tag(soup,'h3') tag = Tag(soup,'h3')
text = NavigableString(img['title']) text = NavigableString(img['title'])
tag.insert(0,text) tag.insert(0,text)
#picture before text #picture before text
img.extract() img.extract()
item.insert(0,img) item.insert(0,img)
item.insert(1,tag) item.insert(1,tag)
# remove link # remove link
item.name = "div" item.name = "div"
item["class"]='image' item["class"]='image'
del item["href"] del item["href"]
#remove empty subtitles #remove empty subtitles
@@ -283,7 +288,7 @@ class TheIndependentNew(BasicNewsRecipe):
items_to_extract = [] items_to_extract = []
for item in soup.findAll('div', attrs={'class' : 'image'}): for item in soup.findAll('div', attrs={'class' : 'image'}):
img = item.findNext('img') img = item.findNext('img')
if img is not None and img['src'] is not None: if img and img.get('src'):
# broken images still point to remote url # broken images still point to remote url
pattern = re.compile('http://www.independent.co.uk.*') pattern = re.compile('http://www.independent.co.uk.*')
if pattern.match(img["src"]) is not None: if pattern.match(img["src"]) is not None: