mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
...
This commit is contained in:
parent
ac8cbdd168
commit
d134be5b1b
@ -39,7 +39,9 @@ class TheIndependentNew(BasicNewsRecipe):
|
|||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
remove_tags =[
|
remove_tags =[
|
||||||
dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
|
dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
|
||||||
dict(attrs={'class' : ['autoplay','openBiogPopup']})
|
dict(attrs={'class' : ['autoplay','openBiogPopup']}),
|
||||||
|
dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
|
||||||
|
dict(attrs={'style' : re.compile('.*')}),
|
||||||
]
|
]
|
||||||
|
|
||||||
keep_only_tags =[dict(attrs={'id':'main'})]
|
keep_only_tags =[dict(attrs={'id':'main'})]
|
||||||
@ -113,6 +115,7 @@ class TheIndependentNew(BasicNewsRecipe):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
items_to_extract = []
|
items_to_extract = []
|
||||||
|
slideshow_elements = []
|
||||||
|
|
||||||
for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
|
for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
|
||||||
remove = True
|
remove = True
|
||||||
@ -131,6 +134,7 @@ class TheIndependentNew(BasicNewsRecipe):
|
|||||||
if (pattern.search(item['class'])) is not None:
|
if (pattern.search(item['class'])) is not None:
|
||||||
if self._FETCH_IMAGES:
|
if self._FETCH_IMAGES:
|
||||||
remove = False
|
remove = False
|
||||||
|
slideshow_elements.append(item)
|
||||||
else:
|
else:
|
||||||
remove = True
|
remove = True
|
||||||
|
|
||||||
@ -148,7 +152,8 @@ class TheIndependentNew(BasicNewsRecipe):
|
|||||||
items_to_extract = []
|
items_to_extract = []
|
||||||
|
|
||||||
if self._FETCH_IMAGES:
|
if self._FETCH_IMAGES:
|
||||||
for item in soup.findAll('a',attrs={'href' : re.compile('.*')}):
|
for element in slideshow_elements:
|
||||||
|
for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
|
||||||
if item.img is not None:
|
if item.img is not None:
|
||||||
#use full size image
|
#use full size image
|
||||||
img = item.findNext('img')
|
img = item.findNext('img')
|
||||||
@ -156,7 +161,7 @@ class TheIndependentNew(BasicNewsRecipe):
|
|||||||
img['src'] = item['href']
|
img['src'] = item['href']
|
||||||
|
|
||||||
#insert caption if available
|
#insert caption if available
|
||||||
if img['title'] is not None and (len(img['title']) > 1):
|
if img.get('title') and (len(img['title']) > 1):
|
||||||
tag = Tag(soup,'h3')
|
tag = Tag(soup,'h3')
|
||||||
text = NavigableString(img['title'])
|
text = NavigableString(img['title'])
|
||||||
tag.insert(0,text)
|
tag.insert(0,text)
|
||||||
@ -283,7 +288,7 @@ class TheIndependentNew(BasicNewsRecipe):
|
|||||||
items_to_extract = []
|
items_to_extract = []
|
||||||
for item in soup.findAll('div', attrs={'class' : 'image'}):
|
for item in soup.findAll('div', attrs={'class' : 'image'}):
|
||||||
img = item.findNext('img')
|
img = item.findNext('img')
|
||||||
if img is not None and img['src'] is not None:
|
if img and img.get('src'):
|
||||||
# broken images still point to remote url
|
# broken images still point to remote url
|
||||||
pattern = re.compile('http://www.independent.co.uk.*')
|
pattern = re.compile('http://www.independent.co.uk.*')
|
||||||
if pattern.match(img["src"]) is not None:
|
if pattern.match(img["src"]) is not None:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user