Mirror of https://github.com/kovidgoyal/calibre.git
Update Newsweek

parent 7b284b949f
commit b9bcb7008c

@@ -1,94 +1,113 @@
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.jsnews import JavascriptRecipe
+from cssselect import HTMLTranslator
+from lxml.etree import XPath
+import datetime
+
+
+def CSSSelect(expr):
+    return XPath(HTMLTranslator().css_to_xpath(expr))
+
+
+BASE = 'http://www.newsweek.com'
+
+
+def href_to_url(a):
+    return BASE + a.get('href') + '?piano_t=1'
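As an aside, the CSSSelect() and href_to_url() helpers added above are plain cssselect/lxml plumbing: css_to_xpath() translates a CSS selector into an equivalent XPath expression, XPath() compiles it into a reusable callable, and href_to_url() glues the site base onto a relative href. A minimal standalone sketch of how they behave (the HTML snippet and link path are invented for illustration):

    from cssselect import HTMLTranslator
    from lxml.etree import XPath
    import lxml.html

    def CSSSelect(expr):
        # translate the CSS selector to XPath and compile it into a callable
        return XPath(HTMLTranslator().css_to_xpath(expr))

    BASE = 'http://www.newsweek.com'

    def href_to_url(a):
        # build the absolute URL, with the '?piano_t=1' query flag the recipe appends
        return BASE + a.get('href') + '?piano_t=1'

    root = lxml.html.fromstring(
        '<nav class="main-menu"><a href="/this-weeks-edition">This Week\'s Edition</a></nav>')
    for a in CSSSelect('nav.main-menu a[href]')(root):
        print(href_to_url(a))  # -> http://www.newsweek.com/this-weeks-edition?piano_t=1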
-class Newsweek(BasicNewsRecipe):
+class Newsweek(JavascriptRecipe):
 
     title = 'Newsweek'
     __author__ = 'Kovid Goyal'
-    description = 'Weekly news and current affairs in the US'
+    description = 'Weekly news and current affairs in the US. Requires a subscription.'
     language = 'en'
     encoding = 'utf-8'
     no_stylesheets = True
-    recipe_disabled = ('Newsweek was taken over by The Daily Beast,'
-            ' newsweek.com no longer exists, so this recipe '
-            ' has been disabled.')
-
-    BASE_URL = 'http://www.newsweek.com'
+    requires_version = (1, 40, 0)
+
+    keep_only_tags = ['article.content-fullwidth']
+    remove_tags = [
+        'meta', '.block-openadstream', '.block-ibtmedia-social', '.issue-next',
+        '.most-popular', '.ibt-media-stories', '.user-btn-group',
+        '#taboola-below-main-column', '.trc_related_container',
+    ]
+    LOGIN = 'https://bar.piano-media.com/lite/authent/login//custom/newsweek/?service_id=25&loc=http%3A%2F%2Fwww.newsweek.com%2F'  # noqa
 
-    topics = {
-        'Culture' : '/tag/culture.html',
-        'Business' : '/tag/business.html',
-        'Society' : '/tag/society.html',
-        'Science' : '/tag/science.html',
-        'Education' : '/tag/education.html',
-        'Politics' : '/tag/politics.html',
-        'Health' : '/tag/health.html',
-        'World' : '/tag/world.html',
-        'Nation' : '/tag/nation.html',
-        'Technology' : '/tag/technology.html',
-        'Game Changers' : '/tag/game-changers.html',
-    }
-
-    keep_only_tags = dict(name='article', attrs={'class':'article-text'})
-    remove_tags = [dict(attrs={'data-dartad':True})]
-    remove_attributes = ['property']
+    needs_subscription = True
+
+    def do_login(self, br, username, password):
+        br.visit(self.LOGIN)
+        form = br.select_form('#pianomedia_login_form')
+        form['login'] = username
+        form['password'] = password
+        br.submit()
+
+    def get_publication_data(self, browser):
+        browser.wait_for_element('nav.main-menu a[href]')
+        root = self.index_to_soup(browser.html)
+        for a in CSSSelect('nav.main-menu a[href]')(root):
+            if a.text and a.text.strip() == 'This Week\'s Edition':
+                return self.get_newsweek_publication_data(browser, href_to_url(a))
 
-    def postprocess_html(self, soup, first):
-        for tag in soup.findAll(name=['article', 'header']):
-            tag.name = 'div'
-        return soup
-
-    def newsweek_sections(self):
-        for topic_name, topic_url in self.topics.iteritems():
-            yield (topic_name,
-                   self.BASE_URL+topic_url)
+    def get_newsweek_publication_data(self, browser, url):
+        root = self.index_to_soup(url)
+        sel = lambda expr: CSSSelect(expr)(root)
+        ans = {}
+
+        for img in sel('div.cover-story div.info img[src]'):
+            if '_Cover_' in img.get('title', ''):
+                ans['cover'] = browser.get_resource(img.get('src'))
+                break
+
+        for title in root.xpath('//title'):
+            raw = title.text
+            if raw:
+                self.timefmt = datetime.datetime.strptime(raw, '%Y/%m/%d').strftime(' [%b %d]')
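For reference, the strptime/strftime pair above just reformats the date taken from the page <title> into the recipe's timefmt string; a tiny sketch with an invented date:

    import datetime

    raw = '2015/03/06'  # hypothetical <title> text in the YYYY/MM/DD form the recipe expects
    suffix = datetime.datetime.strptime(raw, '%Y/%m/%d').strftime(' [%b %d]')
    print(suffix)  # -> ' [Mar 06]'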
 
-    def newsweek_parse_section_page(self, soup):
-        for article in soup.findAll('article', about=True,
-                                    attrs={'class':'stream-item'}):
-            title = article.find(attrs={'property': 'dc:title'})
-            if title is None: continue
-            title = self.tag_to_string(title)
-            url = self.BASE_URL + article['about']
-            desc = ''
-            author = article.find({'property':'dc:creator'})
-            if author:
-                desc = u'by %s. '%self.tag_to_string(author)
-            p = article.find(attrs={'property':'dc:abstract'})
-            if p is not None:
-                for a in p.find('a'): a.extract()
-                desc += self.tag_to_string(p)
-            t = article.find('time', attrs={'property':'dc:created'})
-            date = ''
-            if t is not None:
-                date = u' [%s]'%self.tag_to_string(t)
-            self.log('\tFound article:', title, 'at', url)
-            self.log('\t\t', desc)
-            yield {'title':title, 'url':url, 'description':desc, 'date':date}
-
-    def parse_index(self):
-        sections = []
-        for section, shref in self.newsweek_sections():
-            self.log('Processing section', section, shref)
-            articles = []
-            try:
-                soups = [self.index_to_soup(shref)]
-            except:
-                self.log.warn('Section %s not found, skipping'%section)
-                continue
-            na = soups[0].find('a', rel='next')
-            if na:
-                soups.append(self.index_to_soup(self.BASE_URL+na['href']))
-            for soup in soups:
-                articles.extend(self.newsweek_parse_section_page(soup))
-                if self.test and len(articles) > 1:
-                    break
-            if articles:
-                sections.append((section, articles))
-            if self.test and len(sections) > 1:
-                break
-        return sections
+        sections = []
+        for div in sel('div.cover-story div.info'):
+            url = None
+            for a in div.xpath('descendant::a[@href]'):
+                url = href_to_url(a)
+                break
+            for s in div.xpath('descendant::div[@class="summary"]'):
+                sections.append(('Cover Story', [{'title':'Cover Story', 'date':'', 'url':url, 'description':self.tag_to_string(s)}]))
+                break
+
+        features = []
+        for li in sel('div.features li'):
+            url = None
+            for a in li.xpath('descendant::a[@class="article-link"]'):
+                url = href_to_url(a)
+                features.append({'title':self.tag_to_string(a), 'url':url})
+                break
+        if features:
+            sections.append(('Features', features))
+
+        for div in sel('div.issue-list-block'):
+            for d in div.xpath('descendant::div[@class="block-title"]'):
+                section_title = self.tag_to_string(d)
+                articles = []
+                break
+            else:
+                continue
+            for li in div.xpath('descendant::li'):
+                desc = ''
+                for d in li.xpath('descendant::div[@class="summary"]'):
+                    desc = self.tag_to_string(d)
+                    break
+                for a in li.xpath('descendant::a[@class="article-link"]'):
+                    articles.append({'title':self.tag_to_string(a), 'url':href_to_url(a), 'description':desc})
+                    break
+            if articles:
+                sections.append((section_title, articles))
+
+        ans['index'] = sections
+        return ans
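One subtlety in the issue-list loop above is the for/else: the else branch runs only when the inner loop over block-title divs finishes without hitting break, i.e. when a block has no title, and the continue then skips that block entirely. A minimal sketch of the same control flow, with made-up dictionaries standing in for the lxml elements:

    # Blocks without a title are skipped, mirroring the for/else/continue in
    # get_newsweek_publication_data(); the data here is purely illustrative.
    blocks = [{'title': 'Features', 'items': ['a', 'b']}, {'items': ['c']}]
    sections = []
    for block in blocks:
        for title in ([block['title']] if 'title' in block else []):
            section_title = title
            articles = []
            break
        else:
            continue  # no title found, skip this block
        articles.extend(block['items'])
        if articles:
            sections.append((section_title, articles))
    print(sections)  # -> [('Features', ['a', 'b'])]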
+
+    def preprocess_stage1(self, article, browser, url, recursion_level):
+        # Parallax images in the articles are loaded as background images
+        # on <span> tags. Convert them to normal images.
+        for span in browser.css_select('span.parallax-image', all=True):
+            bg = unicode(span.styleProperty('background-image', span.InlineStyle))
+            if bg:
+                url = bg.partition('(')[-1][:-1]
+                span.appendInside('<img src="%s"></img>' % url)
+                span.setAttribute('style', '')
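The string slicing in preprocess_stage1 above peels the bare URL out of a CSS background-image value; a short illustration with an invented value:

    bg = 'url(http://example.com/images/cover.jpg)'  # hypothetical inline-style value
    url = bg.partition('(')[-1][:-1]  # keep everything after 'url(' and drop the trailing ')'
    print(url)  # -> http://example.com/images/cover.jpg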
+
+    def postprocess_html(self, article, root, url, recursion_level):
+        for x in root.xpath('//*[@id="piano-root"]'):
+            x.getparent().remove(x)
+        return root
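Finally, the new postprocess_html() removes the element with id="piano-root" (what appears to be the Piano paywall markup) by detaching it from its parent; the same lxml idiom in isolation, run on a made-up fragment:

    import lxml.html

    root = lxml.html.fromstring(
        '<div><div id="piano-root">subscription overlay</div><p>article text</p></div>')
    for x in root.xpath('//*[@id="piano-root"]'):
        x.getparent().remove(x)  # detach the matched element from the tree
    print(lxml.html.tostring(root))  # only <div><p>article text</p></div> remains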