Fix #809329 ("Time" fetch news script fails)

This commit is contained in:
Kovid Goyal 2011-07-12 11:29:51 -06:00
parent 216b6f3557
commit 24ab3f6cc4

View File

@@ -8,47 +8,33 @@ time.com
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from lxml import html
class Time(BasicNewsRecipe): class Time(BasicNewsRecipe):
#recipe_disabled = ('This recipe has been disabled as TIME no longer' #recipe_disabled = ('This recipe has been disabled as TIME no longer'
# ' publish complete articles on the web.') # ' publish complete articles on the web.')
title = u'Time' title = u'Time'
__author__ = 'Kovid Goyal and Sujata Raman' __author__ = 'Kovid Goyal'
description = 'Weekly magazine' description = 'Weekly magazine'
encoding = 'utf-8' encoding = 'utf-8'
no_stylesheets = True no_stylesheets = True
language = 'en' language = 'en'
remove_javascript = True remove_javascript = True
extra_css = ''' h1 {font-family:georgia,serif;color:#000000;}
.mainHd{font-family:georgia,serif;color:#000000;}
h2 {font-family:Arial,Sans-serif;}
.name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; }
.date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;}
.byline{font-family:Arial,Sans-serif; font-size:x-small ;}
.photoBkt{ font-size:x-small ;}
.vertPhoto{font-size:x-small ;}
.credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
.credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
.artTxt{font-family:georgia,serif;}
#content{font-family:georgia,serif;}
.caption{font-family:georgia,serif; font-size:x-small;color:#333333;}
.credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
a:link{color:#CC0000;}
.breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;}
'''
keep_only_tags = [
{
'class':['artHd', 'articleContent',
'entry-title','entry-meta', 'entry-content', 'thumbnail']
},
]
remove_tags = [
{'class':['content-tools', 'quigo', 'see',
'first-tier-social-tools', 'navigation', 'enlarge lightbox']},
{'id':['share-tools']},
{'rel':'lightbox'},
]
keep_only_tags = [ dict(name ="div",attrs = {"id" :["content"]}) ,
dict(name ="div",attrs = {"class" :["artHd","artTxt","photoBkt","vertPhoto","image","copy"]}) ,]
remove_tags = [ dict(name ="div",attrs = {'class':['articleFooterNav','listsByTopic','articleTools2','relatedContent','sideContent','topBannerWrap','articlePagination','nextUp',"rtCol","pagination","enlarge","contentTools2",]}),
dict(name ="span",attrs = {'class':['see']}),
dict(name ="div",attrs = {'id':['header','articleSideBar',"articleTools","articleFooter","cmBotLt","quigoPackage"]}),
dict(name ="a",attrs = {'class':['listLink']}),
dict(name ="ul",attrs = {'id':['shareSocial','tabs']}),
dict(name ="li",attrs = {'class':['back']}),
dict(name ="ul",attrs = {'class':['navCount']}),
]
recursions = 10 recursions = 10
match_regexps = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html',r'http://www.time.com/time/specials/packages/article/.*'] match_regexps = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html',r'http://www.time.com/time/specials/packages/article/.*']
@@ -56,10 +42,11 @@ class Time(BasicNewsRecipe):
r'<meta .+/>'), lambda m:'')] r'<meta .+/>'), lambda m:'')]
def parse_index(self): def parse_index(self):
soup = self.index_to_soup('http://www.time.com/time/magazine') raw = self.index_to_soup('http://www.time.com/time/magazine', raw=True)
img = soup.find('a', title="View Large Cover", href=True) root = html.fromstring(raw)
if img is not None: img = root.xpath('//a[.="View Large Cover" and @href]')
cover_url = 'http://www.time.com'+img['href'] if img:
cover_url = 'http://www.time.com' + img[0].get('href')
try: try:
nsoup = self.index_to_soup(cover_url) nsoup = self.index_to_soup(cover_url)
img = nsoup.find('img', src=re.compile('archive/covers')) img = nsoup.find('img', src=re.compile('archive/covers'))
@@ -70,46 +57,48 @@ class Time(BasicNewsRecipe):
feeds = [] feeds = []
parent = soup.find(id='tocGuts') parent = root.xpath('//div[@class="content-main-aside"]')[0]
for seched in parent.findAll(attrs={'class':'toc_seched'}): for sec in parent.xpath(
section = self.tag_to_string(seched).capitalize() 'descendant::section[contains(@class, "sec-mag-section")]'):
articles = list(self.find_articles(seched)) h3 = sec.xpath('./h3')
feeds.append((section, articles)) if h3:
section = html.tostring(h3[0], encoding=unicode,
method='text').strip().capitalize()
self.log('Found section', section)
articles = list(self.find_articles(sec))
if articles:
feeds.append((section, articles))
return feeds return feeds
def find_articles(self, seched): def find_articles(self, sec):
for a in seched.findNextSiblings( attrs={'class':['toc_hed','rule2']}):
if a.name in "div":
break
else:
yield {
'title' : self.tag_to_string(a),
'url' : 'http://www.time.com'+a['href'],
'date' : '',
'description' : self.article_description(a)
}
for article in sec.xpath('./article'):
h2 = article.xpath('./*[@class="entry-title"]')
def article_description(self, a): if not h2: continue
ans = [] a = h2[0].xpath('./a[@href]')
while True: if not a: continue
t = a.nextSibling title = html.tostring(a[0], encoding=unicode,
if t is None: method='text').strip()
break if not title: continue
a = t url = a[0].get('href')
if getattr(t, 'name', False): if url.startswith('/'):
if t.get('class', '') == 'toc_parens' or t.name == 'br': url = 'http://www.time.com'+url
continue desc = ''
if t.name in ('div', 'a'): p = article.xpath('./*[@class="entry-content"]')
break if p:
ans.append(self.tag_to_string(t)) desc = html.tostring(p[0], encoding=unicode,
else: method='text')
ans.append(unicode(t)) self.log('\t', title, ':\n\t\t', desc)
return u' '.join(ans).replace(u'\xa0', u'').strip() yield {
'title' : title,
'url' : url,
'date' : '',
'description' : desc
}
def postprocess_html(self,soup,first): def postprocess_html(self,soup,first):
for tag in soup.findAll(attrs ={'class':['artPag','pagination']}): for tag in soup.findAll(attrs ={'class':['artPag','pagination']}):
tag.extract() tag.extract()
return soup return soup