calibre/recipes/time_magazine.recipe

114 lines
4.9 KiB
Python

#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
'''
time.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Time(BasicNewsRecipe):
title = u'Time'
__author__ = 'Kovid Goyal and Sujata Raman'
description = 'Weekly magazine'
encoding = 'utf-8'
no_stylesheets = True
language = 'en'
remove_javascript = True
extra_css = ''' h1 {font-family:georgia,serif;color:#000000;}
.mainHd{font-family:georgia,serif;color:#000000;}
h2 {font-family:Arial,Sans-serif;}
.name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; }
.date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;}
.byline{font-family:Arial,Sans-serif; font-size:x-small ;}
.photoBkt{ font-size:x-small ;}
.vertPhoto{font-size:x-small ;}
.credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
.credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
.artTxt{font-family:georgia,serif;}
#content{font-family:georgia,serif;}
.caption{font-family:georgia,serif; font-size:x-small;color:#333333;}
.credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
a:link{color:#CC0000;}
.breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;}
'''
keep_only_tags = [ dict(name ="div",attrs = {"id" :["content"]}) ,
dict(name ="div",attrs = {"class" :["artHd","artTxt","photoBkt","vertPhoto","image","copy"]}) ,]
remove_tags = [ dict(name ="div",attrs = {'class':['articleFooterNav','listsByTopic','articleTools2','relatedContent','sideContent','topBannerWrap','articlePagination','nextUp',"rtCol","pagination","enlarge","contentTools2",]}),
dict(name ="span",attrs = {'class':['see']}),
dict(name ="div",attrs = {'id':['header','articleSideBar',"articleTools","articleFooter","cmBotLt","quigoPackage"]}),
dict(name ="a",attrs = {'class':['listLink']}),
dict(name ="ul",attrs = {'id':['shareSocial','tabs']}),
dict(name ="li",attrs = {'class':['back']}),
dict(name ="ul",attrs = {'class':['navCount']}),
]
recursions = 10
match_regexps = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html',r'http://www.time.com/time/specials/packages/article/.*']
preprocess_regexps = [(re.compile(
r'<meta .+/>'), lambda m:'')]
def parse_index(self):
soup = self.index_to_soup('http://www.time.com/time/magazine')
img = soup.find('a', title="View Large Cover", href=True)
if img is not None:
cover_url = 'http://www.time.com'+img['href']
try:
nsoup = self.index_to_soup(cover_url)
img = nsoup.find('img', src=re.compile('archive/covers'))
if img is not None:
self.cover_url = img['src']
except:
self.log.exception('Failed to fetch cover')
feeds = []
parent = soup.find(id='tocGuts')
for seched in parent.findAll(attrs={'class':'toc_seched'}):
section = self.tag_to_string(seched).capitalize()
articles = list(self.find_articles(seched))
feeds.append((section, articles))
return feeds
def find_articles(self, seched):
for a in seched.findNextSiblings( attrs={'class':['toc_hed','rule2']}):
if a.name in "div":
break
else:
yield {
'title' : self.tag_to_string(a),
'url' : 'http://www.time.com'+a['href'],
'date' : '',
'description' : self.article_description(a)
}
def article_description(self, a):
ans = []
while True:
t = a.nextSibling
if t is None:
break
a = t
if getattr(t, 'name', False):
if t.get('class', '') == 'toc_parens' or t.name == 'br':
continue
if t.name in ('div', 'a'):
break
ans.append(self.tag_to_string(t))
else:
ans.append(unicode(t))
return u' '.join(ans).replace(u'\xa0', u'').strip()
def postprocess_html(self,soup,first):
for tag in soup.findAll(attrs ={'class':['artPag','pagination']}):
tag.extract()
return soup