Update Smithsonian and Business Week Magazine
commit bd9e63ca92
parent 71f61ac049
@@ -12,7 +12,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
     category = 'news'
     encoding = 'UTF-8'
     keep_only_tags = [
-        dict(name='div', attrs={'id':'article_body_container'}),
+        dict(name='div', attrs={'id':['article_body_container','story_body']}),
     ]
     remove_tags = [dict(name='ui'),dict(name='li'),dict(name='div', attrs={'id':['share-email']})]
     no_javascript = True
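Note on the keep_only_tags change: BeautifulSoup treats a list as an attribute value as "match any of these", so the single entry now keeps the article body under both the old and the new page layout. A minimal sketch of that matching behaviour, using bs4 and invented HTML (the recipe itself uses the BeautifulSoup bundled with calibre, which matches list values the same way):

    from bs4 import BeautifulSoup

    html = '''
    <div id="article_body_container">old layout</div>
    <div id="story_body">new layout</div>
    <div id="sidebar">dropped</div>
    '''
    soup = BeautifulSoup(html, 'html.parser')
    # A list as the attribute value matches any id in the list.
    for div in soup.find_all('div', attrs={'id': ['article_body_container', 'story_body']}):
        print(div['id'])  # article_body_container, then story_body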
@@ -26,7 +26,6 @@ class BusinessWeekMagazine(BasicNewsRecipe):
 
         #Find date
         mag=soup.find('h2',text='Magazine')
-        self.log(mag)
         dates=self.tag_to_string(mag.findNext('h3'))
         self.timefmt = u' [%s]'%dates
 
@@ -34,15 +33,16 @@ class BusinessWeekMagazine(BasicNewsRecipe):
         div0 = soup.find('div', attrs={'class':'column left'})
         section_title = ''
         feeds = OrderedDict()
-        for div in div0.findAll(['h4','h5']):
+        for div in div0.findAll('a', attrs={'class': None}):
             articles = []
             section_title = self.tag_to_string(div.findPrevious('h3')).strip()
-            title=self.tag_to_string(div.a).strip()
-            url=div.a['href']
+            title=self.tag_to_string(div).strip()
+            url=div['href']
             soup0 = self.index_to_soup(url)
-            urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
-            articles.append({'title':title, 'url':urlprint, 'description':'', 'date':''})
+            urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})
+            if urlprint is not None:
+                url=urlprint['href']
+            articles.append({'title':title, 'url':url, 'description':'', 'date':''})
 
             if articles:
                 if section_title not in feeds:
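This hunk fixes a crash as well as the selector: find() returns None when no printer-friendly link exists on the article page, and the old code subscripted that result directly, raising TypeError on such pages. The new code keeps the regular article URL as a fallback. A runnable sketch of the guard, with an invented one-link page:

    import re
    from bs4 import BeautifulSoup

    soup0 = BeautifulSoup('<a href="/story">Story</a>', 'html.parser')
    url = '/story'  # the non-print URL already in hand
    urlprint = soup0.find('a', attrs={'href': re.compile('.*printer.*')})
    # The old code did find(...)['href'], a TypeError whenever find() returns None.
    if urlprint is not None:
        url = urlprint['href']
    print(url)  # falls back to '/story' here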
@@ -50,19 +50,21 @@ class BusinessWeekMagazine(BasicNewsRecipe):
                 feeds[section_title] += articles
         div1 = soup.find('div', attrs={'class':'column center'})
         section_title = ''
-        for div in div1.findAll(['h4','h5']):
+        for div in div1.findAll('a'):
             articles = []
             desc=self.tag_to_string(div.findNext('p')).strip()
             section_title = self.tag_to_string(div.findPrevious('h3')).strip()
-            title=self.tag_to_string(div.a).strip()
-            url=div.a['href']
+            title=self.tag_to_string(div).strip()
+            url=div['href']
             soup0 = self.index_to_soup(url)
-            urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
-            articles.append({'title':title, 'url':urlprint, 'description':desc, 'date':''})
+            urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})
+            if urlprint is not None:
+                url=urlprint['href']
+            articles.append({'title':title, 'url':url, 'description':desc, 'date':''})
             if articles:
                 if section_title not in feeds:
                     feeds[section_title] = []
                 feeds[section_title] += articles
 
         ans = [(key, val) for key, val in feeds.iteritems()]
         return ans
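The closing lines show the recipe's feed-accumulation pattern: a section title can recur while walking the index, so articles are appended to an existing key rather than overwriting it, and OrderedDict preserves the order in which sections were first seen. A small Python 2 sketch (matching the recipe's iteritems()) with invented section names:

    from collections import OrderedDict

    feeds = OrderedDict()
    index = [('Opening Remarks', 'a1'), ('Features', 'a2'), ('Opening Remarks', 'a3')]
    for section_title, article in index:
        if section_title not in feeds:
            feeds[section_title] = []
        feeds[section_title] += [article]
    ans = [(key, val) for key, val in feeds.iteritems()]
    # [('Opening Remarks', ['a1', 'a3']), ('Features', ['a2'])]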

@@ -49,16 +49,20 @@ class Smithsonian(BasicNewsRecipe):
                     self.log('Found section:', section_title)
                 else:
                     link=post.find('a',href=True)
+                    article_cat=link.findPrevious('p', attrs={'class':'article-cat'})
                     url=link['href']+'?c=y&story=fullstory'
-                    description=self.tag_to_string(post.find('p')).strip()
-                    desc=re.sub('\sBy\s.*', '', description, re.DOTALL)
-                    author=re.sub('.*By\s', '', description, re.DOTALL)
-                    title=self.tag_to_string(link).strip()+ u' (%s)'%author
+                    description=self.tag_to_string(post.findAll('p')[-1]).strip()
+                    title=self.tag_to_string(link).strip()
+                    if article_cat is not None:
+                        title += u' (%s)'%self.tag_to_string(article_cat).strip()
                     self.log('\tFound article:', title)
-                    articles.append({'title':title, 'url':url, 'description':desc, 'date':''})
+                    articles.append({'title':title, 'url':url, 'description':description, 'date':''})
 
             if articles:
-                feeds[section_title] = articles
+                if section_title not in feeds:
+                    feeds[section_title] = []
+                feeds[section_title] += articles
+                articles = []
 
         ans = [(key, val) for key, val in feeds.iteritems()]
         return ans
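Two things are worth noting in the Smithsonian hunk. First, the removed re.sub calls passed re.DOTALL as the fourth positional argument of re.sub, which is count, not flags (re.DOTALL equals 16, so the flag never took effect); taking the title suffix from the article-cat paragraph sidesteps that parsing entirely. Second, feeds[section_title] = articles silently discarded a section's earlier articles whenever the title recurred; the replacement merges and then resets the accumulator, mirroring the Business Week fix above. A sketch of the re.sub pitfall, with an invented description string:

    import re

    description = 'A teaser paragraph By Jane Doe'
    # Removed code: re.sub('\sBy\s.*', '', description, re.DOTALL)
    # The fourth positional parameter of re.sub is count, so that call
    # meant "replace at most 16 matches" and DOTALL was never applied.
    print(re.sub(r'\sBy\s.*', '', description, flags=re.DOTALL))  # 'A teaser paragraph'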