This commit is contained in:
Kovid Goyal 2012-08-25 00:01:55 +05:30
parent 016ec7ade3
commit 872a1434c7

View File

@ -2,6 +2,7 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
''' '''
time.com time.com
''' '''
@ -11,28 +12,23 @@ from calibre.web.feeds.news import BasicNewsRecipe
from lxml import html from lxml import html
class Time(BasicNewsRecipe): class Time(BasicNewsRecipe):
#recipe_disabled = ('This recipe has been disabled as TIME no longer'
# ' publish complete articles on the web.')
title = u'Time' title = u'Time'
__author__ = 'Kovid Goyal' __author__ = 'Kovid Goyal, Rick Shang'
description = ('Weekly US magazine.') description = ('Weekly US magazine.')
encoding = 'utf-8' encoding = 'utf-8'
no_stylesheets = True no_stylesheets = True
language = 'en' language = 'en'
remove_javascript = True remove_javascript = True
#needs_subscription = 'optional' needs_subscription = 'optional'
keep_only_tags = [ keep_only_tags = [
{ {
'class':['artHd', 'articleContent', 'class':['tout1', 'entry-content', 'external-gallery-img', 'image-meta']
'entry-title','entry-meta', 'entry-content', 'thumbnail']
}, },
] ]
remove_tags = [ remove_tags = [
{'class':['content-tools', 'quigo', 'see', {'class':['thumbnail', 'button']},
'first-tier-social-tools', 'navigation', 'enlarge lightbox']},
{'id':['share-tools']},
{'rel':'lightbox'},
] ]
recursions = 10 recursions = 10
@ -43,14 +39,19 @@ class Time(BasicNewsRecipe):
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
if False and self.username and self.password:
# This site uses javascript in its login process # This site uses javascript in its login process
res = br.open('http://www.time.com/time/magazine') if False and self.username is not None and self.password is not None:
br.select_form(nr=1) br.open('http://www.time.com/time/magazine')
br.select_form(predicate=lambda f: 'action' in f.attrs and f.attrs['action'] == 'https://auth.time.com/login.php')
br['username'] = self.username br['username'] = self.username
br['password'] = self.password br['password'] = self.password
res = br.submit() br['magcode'] = ['TD']
raw = res.read() br.find_control('turl').readonly = False
br['turl'] = 'http://www.time.com/time/magazine'
br.find_control('rurl').readonly = False
br['rurl'] = 'http://www.time.com/time/magazine'
br['remember'] = False
raw = br.submit().read()
if '>Log Out<' not in raw: if '>Log Out<' not in raw:
raise ValueError('Failed to login to time.com, check' raise ValueError('Failed to login to time.com, check'
' your username and password') ' your username and password')
@ -70,6 +71,9 @@ class Time(BasicNewsRecipe):
except: except:
self.log.exception('Failed to fetch cover') self.log.exception('Failed to fetch cover')
dates = ''.join(root.xpath('//time[@class="updated"]/text()'))
if dates:
self.timefmt = ' [%s]'%dates
feeds = [] feeds = []
parent = root.xpath('//div[@class="content-main-aside"]')[0] parent = root.xpath('//div[@class="content-main-aside"]')[0]
@ -97,6 +101,8 @@ class Time(BasicNewsRecipe):
method='text').strip() method='text').strip()
if not title: continue if not title: continue
url = a[0].get('href') url = a[0].get('href')
# url = re.sub('/magazine/article/0,9171','/subscriber/printout/0,8816',
# url)
if url.startswith('/'): if url.startswith('/'):
url = 'http://www.time.com'+url url = 'http://www.time.com'+url
desc = '' desc = ''
@ -111,9 +117,3 @@ class Time(BasicNewsRecipe):
'date' : '', 'date' : '',
'description' : desc 'description' : desc
} }
def postprocess_html(self,soup,first):
for tag in soup.findAll(attrs ={'class':['artPag','pagination']}):
tag.extract()
return soup