Update Newsweek recipe for new site

This commit is contained in:
Kovid Goyal 2010-06-01 11:08:16 -06:00
parent ce67fe9797
commit ef0af86b19


@ -1,189 +1,76 @@
__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

import string
from calibre.web.feeds.news import BasicNewsRecipe

class Newsweek(BasicNewsRecipe):

    title          = 'Newsweek'
    __author__     = 'Kovid Goyal'
    description    = 'Weekly news and current affairs in the US'
    language       = 'en'
    encoding       = 'utf-8'
    no_stylesheets = True
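
    # The redesigned newsweek.com lists every section on one topics page and
    # marks up article teasers with RDFa properties (dc:title, dc:creator,
    # dc:abstract, dc:created); the index parsing below relies on that markup.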

    BASE_URL = 'http://www.newsweek.com'
    INDEX = BASE_URL+'/topics.html'

    keep_only_tags = dict(name='article', attrs={'class':'article-text'})
    # Strip ad markup and the leftover RDFa property attributes
    remove_tags = [dict(attrs={'data-dartad':True})]
    remove_attributes = ['property']
    def postprocess_html(self, soup, first):
        # Downstream e-book renderers don't always understand HTML5
        # sectioning tags, so flatten them to plain divs
        for tag in soup.findAll(name=['article', 'header']):
            tag.name = 'div'
        return soup
    def newsweek_sections(self):
        # Yield (section name, section URL) pairs from the topics index
        soup = self.index_to_soup(self.INDEX)
        for a in soup.findAll('a', title='Primary tag', href=True):
            yield (string.capitalize(self.tag_to_string(a)),
                    self.BASE_URL+a['href'])
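    # Illustrative yield (hypothetical values; the real names and URLs come
    # from the live topics page): ('Business', 'http://www.newsweek.com/business')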
    def newsweek_parse_section_page(self, soup):
        # Article teasers are <article class="stream-item"> elements whose
        # RDFa metadata carries the title, author, abstract and date
        for article in soup.findAll('article', about=True,
                attrs={'class':'stream-item'}):
            title = article.find(attrs={'property': 'dc:title'})
            if title is None: continue
            title = self.tag_to_string(title)
            url = self.BASE_URL + article['about']
            desc = ''
            author = article.find(attrs={'property':'dc:creator'})
            if author:
                desc = u'by %s. '%self.tag_to_string(author)
            p = article.find(attrs={'property':'dc:abstract'})
            if p is not None:
                # Remove links from the abstract before extracting its text
                for a in p.findAll('a'): a.extract()
                desc += self.tag_to_string(p)
            t = article.find('time', attrs={'property':'dc:created'})
            date = ''
            if t is not None:
                date = u' [%s]'%self.tag_to_string(t)
            self.log('\tFound article:', title, 'at', url)
            self.log('\t\t', desc)
            yield {'title':title, 'url':url, 'description':desc, 'date':date}
    def parse_index(self):
        sections = []
        for section, shref in self.newsweek_sections():
            self.log('Processing section', section, shref)
            articles = []
            soups = [self.index_to_soup(shref)]
            # Section listings are paginated; also fetch the second page
            # if a rel="next" link is present
            na = soups[0].find('a', rel='next')
            if na:
                soups.append(self.index_to_soup(self.BASE_URL+na['href']))
            for soup in soups:
                articles.extend(self.newsweek_parse_section_page(soup))
                if self.test and len(articles) > 1:
                    break
            if articles:
                sections.append((section, articles))
            if self.test and len(sections) > 1:
                break
        return sections
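
A quick way to sanity-check a recipe change like this is to feed the saved
recipe file straight to calibre's converter; the --test flag is what sets the
self.test attribute the loops above check, limiting the run to a couple of
articles per section. The filename newsweek.recipe is just an example name
for the code above:

ebook-convert newsweek.recipe newsweek.epub --test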