Fix #809329 ("Time" fetch news script fails)
commit 24ab3f6cc4
parent 216b6f3557
@@ -8,47 +8,33 @@ time.com
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from lxml import html
 
 class Time(BasicNewsRecipe):
     #recipe_disabled = ('This recipe has been disabled as TIME no longer'
     #        ' publish complete articles on the web.')
     title = u'Time'
-    __author__ = 'Kovid Goyal and Sujata Raman'
+    __author__ = 'Kovid Goyal'
     description = 'Weekly magazine'
     encoding = 'utf-8'
     no_stylesheets = True
     language = 'en'
     remove_javascript = True
 
-    extra_css = ''' h1 {font-family:georgia,serif;color:#000000;}
-                    .mainHd{font-family:georgia,serif;color:#000000;}
-                    h2 {font-family:Arial,Sans-serif;}
-                    .name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; }
-                    .date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;}
-                    .byline{font-family:Arial,Sans-serif; font-size:x-small ;}
-                    .photoBkt{ font-size:x-small ;}
-                    .vertPhoto{font-size:x-small ;}
-                    .credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
-                    .credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
-                    .artTxt{font-family:georgia,serif;}
-                    #content{font-family:georgia,serif;}
-                    .caption{font-family:georgia,serif; font-size:x-small;color:#333333;}
-                    .credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
-                    a:link{color:#CC0000;}
-                    .breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;}
-                '''
+    keep_only_tags = [
+        {
+            'class':['artHd', 'articleContent',
+                'entry-title','entry-meta', 'entry-content', 'thumbnail']
+        },
+    ]
+    remove_tags = [
+        {'class':['content-tools', 'quigo', 'see',
+            'first-tier-social-tools', 'navigation', 'enlarge lightbox']},
+        {'id':['share-tools']},
+        {'rel':'lightbox'},
+    ]
 
-    keep_only_tags = [ dict(name ="div",attrs = {"id" :["content"]}) ,
-            dict(name ="div",attrs = {"class" :["artHd","artTxt","photoBkt","vertPhoto","image","copy"]}) ,]
-    remove_tags = [ dict(name ="div",attrs = {'class':['articleFooterNav','listsByTopic','articleTools2','relatedContent','sideContent','topBannerWrap','articlePagination','nextUp',"rtCol","pagination","enlarge","contentTools2",]}),
-            dict(name ="span",attrs = {'class':['see']}),
-            dict(name ="div",attrs = {'id':['header','articleSideBar',"articleTools","articleFooter","cmBotLt","quigoPackage"]}),
-            dict(name ="a",attrs = {'class':['listLink']}),
-            dict(name ="ul",attrs = {'id':['shareSocial','tabs']}),
-            dict(name ="li",attrs = {'class':['back']}),
-            dict(name ="ul",attrs = {'class':['navCount']}),
-            ]
 
     recursions = 10
     match_regexps = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html',r'http://www.time.com/time/specials/packages/article/.*']
 
@@ -56,10 +42,11 @@ class Time(BasicNewsRecipe):
         r'<meta .+/>'), lambda m:'')]
 
     def parse_index(self):
-        soup = self.index_to_soup('http://www.time.com/time/magazine')
-        img = soup.find('a', title="View Large Cover", href=True)
-        if img is not None:
-            cover_url = 'http://www.time.com'+img['href']
+        raw = self.index_to_soup('http://www.time.com/time/magazine', raw=True)
+        root = html.fromstring(raw)
+        img = root.xpath('//a[.="View Large Cover" and @href]')
+        if img:
+            cover_url = 'http://www.time.com' + img[0].get('href')
             try:
                 nsoup = self.index_to_soup(cover_url)
                 img = nsoup.find('img', src=re.compile('archive/covers'))
@@ -70,46 +57,48 @@ class Time(BasicNewsRecipe):
 
         feeds = []
-        parent = soup.find(id='tocGuts')
-        for seched in parent.findAll(attrs={'class':'toc_seched'}):
-            section = self.tag_to_string(seched).capitalize()
-            articles = list(self.find_articles(seched))
-            feeds.append((section, articles))
+        parent = root.xpath('//div[@class="content-main-aside"]')[0]
+        for sec in parent.xpath(
+                'descendant::section[contains(@class, "sec-mag-section")]'):
+            h3 = sec.xpath('./h3')
+            if h3:
+                section = html.tostring(h3[0], encoding=unicode,
+                        method='text').strip().capitalize()
+                self.log('Found section', section)
+                articles = list(self.find_articles(sec))
+                if articles:
+                    feeds.append((section, articles))
 
         return feeds
 
-    def find_articles(self, seched):
-        for a in seched.findNextSiblings( attrs={'class':['toc_hed','rule2']}):
-            if a.name in "div":
-                break
-            else:
-                yield {
-                    'title' : self.tag_to_string(a),
-                    'url' : 'http://www.time.com'+a['href'],
-                    'date' : '',
-                    'description' : self.article_description(a)
-                }
-
-    def article_description(self, a):
-        ans = []
-        while True:
-            t = a.nextSibling
-            if t is None:
-                break
-            a = t
-            if getattr(t, 'name', False):
-                if t.get('class', '') == 'toc_parens' or t.name == 'br':
-                    continue
-                if t.name in ('div', 'a'):
-                    break
-                ans.append(self.tag_to_string(t))
-            else:
-                ans.append(unicode(t))
-        return u' '.join(ans).replace(u'\xa0', u'').strip()
+    def find_articles(self, sec):
+        for article in sec.xpath('./article'):
+            h2 = article.xpath('./*[@class="entry-title"]')
+            if not h2: continue
+            a = h2[0].xpath('./a[@href]')
+            if not a: continue
+            title = html.tostring(a[0], encoding=unicode,
+                        method='text').strip()
+            if not title: continue
+            url = a[0].get('href')
+            if url.startswith('/'):
+                url = 'http://www.time.com'+url
+            desc = ''
+            p = article.xpath('./*[@class="entry-content"]')
+            if p:
+                desc = html.tostring(p[0], encoding=unicode,
+                        method='text')
+            self.log('\t', title, ':\n\t\t', desc)
+            yield {
+                'title' : title,
+                'url' : url,
+                'date' : '',
+                'description' : desc
+            }
 
     def postprocess_html(self,soup,first):
         for tag in soup.findAll(attrs ={'class':['artPag','pagination']}):
             tag.extract()
         return soup
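In short, the fix drops the BeautifulSoup lookups against the retired `tocGuts` table of contents and walks the redesigned magazine page with lxml XPath instead. Below is a minimal standalone sketch of that traversal, distilled from the `parse_index` and `find_articles` bodies above; it assumes Python 2 (the recipe relies on the `unicode` builtin) and that time.com still serves the `content-main-aside` / `sec-mag-section` markup these selectors target.

```python
# Standalone sketch of the lxml-based index traversal (Python 2 assumed,
# as the recipe uses the `unicode` builtin; markup is the 2011-era layout).
import urllib2
from lxml import html

raw = urllib2.urlopen('http://www.time.com/time/magazine').read()
root = html.fromstring(raw)

# Sections sit under the content-main-aside container; each is a
# <section class="sec-mag-section ..."> whose <h3> holds the section title.
parent = root.xpath('//div[@class="content-main-aside"]')[0]
for sec in parent.xpath(
        'descendant::section[contains(@class, "sec-mag-section")]'):
    h3 = sec.xpath('./h3')
    if not h3:
        continue
    print html.tostring(h3[0], encoding=unicode, method='text').strip()
    # Each article is a direct <article> child; its link lives inside
    # the element carrying class "entry-title".
    for a in sec.xpath('./article/*[@class="entry-title"]/a[@href]'):
        title = html.tostring(a, encoding=unicode, method='text').strip()
        url = a.get('href')
        if url.startswith('/'):
            url = 'http://www.time.com' + url
        print '\t', title, '->', url
```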
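One thing the commit deliberately keeps is the multi-page handling: `recursions = 10` lets `BasicNewsRecipe` follow links out of each fetched article page, and `match_regexps` limits that recursion to numbered continuation pages and the specials packages. A quick sanity check of what those patterns accept; the URLs here are hypothetical examples shaped like the links the recipe follows, not real articles.

```python
import re

# The two patterns kept by the recipe.
patterns = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html',
            r'http://www.time.com/time/specials/packages/article/.*']

# Hypothetical URLs in the shape the patterns target.
tests = ['/time/magazine/article/0,9171,123456-2,00.html',    # page 2
         '/time/magazine/article/0,9171,123456,00.html',      # page 1
         'http://www.time.com/time/specials/packages/article/0,1,2.html']

for url in tests:
    matched = any(re.search(p, url) for p in patterns)
    print url, '->', 'followed' if matched else 'skipped'
```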