mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
...
This commit is contained in:
parent
ac8cbdd168
commit
d134be5b1b
@ -39,7 +39,9 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
encoding = 'utf-8'
|
||||
remove_tags =[
|
||||
dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
|
||||
dict(attrs={'class' : ['autoplay','openBiogPopup']})
|
||||
dict(attrs={'class' : ['autoplay','openBiogPopup']}),
|
||||
dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
|
||||
dict(attrs={'style' : re.compile('.*')}),
|
||||
]
|
||||
|
||||
keep_only_tags =[dict(attrs={'id':'main'})]
|
||||
@ -113,6 +115,7 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
return None
|
||||
|
||||
items_to_extract = []
|
||||
slideshow_elements = []
|
||||
|
||||
for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
|
||||
remove = True
|
||||
@ -131,6 +134,7 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
if (pattern.search(item['class'])) is not None:
|
||||
if self._FETCH_IMAGES:
|
||||
remove = False
|
||||
slideshow_elements.append(item)
|
||||
else:
|
||||
remove = True
|
||||
|
||||
@ -148,7 +152,8 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
items_to_extract = []
|
||||
|
||||
if self._FETCH_IMAGES:
|
||||
for item in soup.findAll('a',attrs={'href' : re.compile('.*')}):
|
||||
for element in slideshow_elements:
|
||||
for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
|
||||
if item.img is not None:
|
||||
#use full size image
|
||||
img = item.findNext('img')
|
||||
@ -156,7 +161,7 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
img['src'] = item['href']
|
||||
|
||||
#insert caption if available
|
||||
if img['title'] is not None and (len(img['title']) > 1):
|
||||
if img.get('title') and (len(img['title']) > 1):
|
||||
tag = Tag(soup,'h3')
|
||||
text = NavigableString(img['title'])
|
||||
tag.insert(0,text)
|
||||
@ -283,7 +288,7 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
items_to_extract = []
|
||||
for item in soup.findAll('div', attrs={'class' : 'image'}):
|
||||
img = item.findNext('img')
|
||||
if img is not None and img['src'] is not None:
|
||||
if img and img.get('src'):
|
||||
# broken images still point to remote url
|
||||
pattern = re.compile('http://www.independent.co.uk.*')
|
||||
if pattern.match(img["src"]) is not None:
|
||||
|
Loading…
x
Reference in New Issue
Block a user