diff --git a/recipes/newsweek.recipe b/recipes/newsweek.recipe
index 03466b5b3d..a59dff0ec9 100644
--- a/recipes/newsweek.recipe
+++ b/recipes/newsweek.recipe
@@ -1,94 +1,113 @@
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.jsnews import JavascriptRecipe
+from cssselect import HTMLTranslator
+from lxml.etree import XPath
+import datetime
 
-class Newsweek(BasicNewsRecipe):
+def CSSSelect(expr):
+    return XPath(HTMLTranslator().css_to_xpath(expr))
+
+BASE = 'http://www.newsweek.com'
+def href_to_url(a):
+    return BASE + a.get('href') + '?piano_t=1'
+
+class Newsweek(JavascriptRecipe):
 
     title = 'Newsweek'
     __author__ = 'Kovid Goyal'
-    description = 'Weekly news and current affairs in the US'
+    description = 'Weekly news and current affairs in the US. Requires a subscription.'
     language = 'en'
    encoding = 'utf-8'
     no_stylesheets = True
-    recipe_disabled = ('Newsweek was taken over by The Daily Beast,'
-            ' newsweek.com no longer exists, so this recipe '
-            ' has been disabled.')
+    requires_version = (1, 40, 0)
 
-    BASE_URL = 'http://www.newsweek.com'
+    keep_only_tags = ['article.content-fullwidth']
+    remove_tags = [
+        'meta', '.block-openadstream', '.block-ibtmedia-social', '.issue-next',
+        '.most-popular', '.ibt-media-stories', '.user-btn-group',
+        '#taboola-below-main-column', '.trc_related_container',
+    ]
+    LOGIN = 'https://bar.piano-media.com/lite/authent/login//custom/newsweek/?service_id=25&loc=http%3A%2F%2Fwww.newsweek.com%2F'  # noqa
 
-    topics = {
-        'Culture' : '/tag/culture.html',
-        'Business' : '/tag/business.html',
-        'Society' : '/tag/society.html',
-        'Science' : '/tag/science.html',
-        'Education' : '/tag/education.html',
-        'Politics' : '/tag/politics.html',
-        'Health' : '/tag/health.html',
-        'World' : '/tag/world.html',
-        'Nation' : '/tag/nation.html',
-        'Technology' : '/tag/technology.html',
-        'Game Changers' : '/tag/game-changers.html',
-    }
+    needs_subscription = True
 
+    def do_login(self, br, username, password):
+        br.visit(self.LOGIN)
+        form = br.select_form('#pianomedia_login_form')
+        form['login'] = username
+        form['password'] = password
+        br.submit()
 
-    keep_only_tags = dict(name='article', attrs={'class':'article-text'})
-    remove_tags = [dict(attrs={'data-dartad':True})]
-    remove_attributes = ['property']
+    def get_publication_data(self, browser):
+        browser.wait_for_element('nav.main-menu a[href]')
+        root = self.index_to_soup(browser.html)
+        for a in CSSSelect('nav.main-menu a[href]')(root):
+            if a.text and a.text.strip() == 'This Week\'s Edition':
+                return self.get_newsweek_publication_data(browser, href_to_url(a))
 
-    def postprocess_html(self, soup, first):
-        for tag in soup.findAll(name=['article', 'header']):
-            tag.name = 'div'
-        return soup
+    def get_newsweek_publication_data(self, browser, url):
+        root = self.index_to_soup(url)
+        sel = lambda expr: CSSSelect(expr)(root)
+        ans = {}
 
-    def newsweek_sections(self):
-        for topic_name, topic_url in self.topics.iteritems():
-            yield (topic_name,
-                    self.BASE_URL+topic_url)
+        for img in sel('div.cover-story div.info img[src]'):
+            if '_Cover_' in img.get('title', ''):
+                ans['cover'] = browser.get_resource(img.get('src'))
+                break
+        for title in root.xpath('//title'):
+            raw = title.text
+            if raw:
+                self.timefmt = datetime.datetime.strptime(raw, '%Y/%m/%d').strftime(' [%b %d]')
 
-
-    def newsweek_parse_section_page(self, soup):
-        for article in soup.findAll('article', about=True,
-                attrs={'class':'stream-item'}):
-            title = article.find(attrs={'property': 'dc:title'})
-            if title is None: continue
-            title = self.tag_to_string(title)
-            url = self.BASE_URL + article['about']
-            desc = ''
-            author = article.find({'property':'dc:creator'})
-            if author:
-                desc = u'by %s. '%self.tag_to_string(author)
-            p = article.find(attrs={'property':'dc:abstract'})
-            if p is not None:
-                for a in p.find('a'): a.extract()
-                desc += self.tag_to_string(p)
-            t = article.find('time', attrs={'property':'dc:created'})
-            date = ''
-            if t is not None:
-                date = u' [%s]'%self.tag_to_string(t)
-            self.log('\tFound article:', title, 'at', url)
-            self.log('\t\t', desc)
-            yield {'title':title, 'url':url, 'description':desc, 'date':date}
-
-
-    def parse_index(self):
         sections = []
-        for section, shref in self.newsweek_sections():
-            self.log('Processing section', section, shref)
-            articles = []
-            try:
-                soups = [self.index_to_soup(shref)]
-            except:
-                self.log.warn('Section %s not found, skipping'%section)
+        for div in sel('div.cover-story div.info'):
+            url = None
+            for a in div.xpath('descendant::a[@href]'):
+                url = href_to_url(a)
+                break
+            for s in div.xpath('descendant::div[@class="summary"]'):
+                sections.append(('Cover Story', [{'title':'Cover Story', 'date':'', 'url':url, 'description':self.tag_to_string(s)}]))
+                break
+        features = []
+        for li in sel('div.features li'):
+            url = None
+            for a in li.xpath('descendant::a[@class="article-link"]'):
+                url = href_to_url(a)
+                features.append({'title':self.tag_to_string(a), 'url':url})
+                break
+        if features:
+            sections.append(('Features', features))
+
+        for div in sel('div.issue-list-block'):
+            for d in div.xpath('descendant::div[@class="block-title"]'):
+                section_title = self.tag_to_string(d)
+                articles = []
+                break
+            else:
                 continue
-            na = soups[0].find('a', rel='next')
-            if na:
-                soups.append(self.index_to_soup(self.BASE_URL+na['href']))
-            for soup in soups:
-                articles.extend(self.newsweek_parse_section_page(soup))
-            if self.test and len(articles) > 1:
-                break
+            for li in div.xpath('descendant::li'):
+                desc = ''
+                for d in li.xpath('descendant::div[@class="summary"]'):
+                    desc = self.tag_to_string(d)
+                    break
+                for a in li.xpath('descendant::a[@class="article-link"]'):
+                    articles.append({'title':self.tag_to_string(a), 'url':href_to_url(a), 'description':desc})
+                    break
             if articles:
-                sections.append((section, articles))
-            if self.test and len(sections) > 1:
-                break
-        return sections
+                sections.append((section_title, articles))
+        ans['index'] = sections
+        return ans
 
+    def preprocess_stage1(self, article, browser, url, recursion_level):
+        # Parallax images in the articles are loaded as background images
+        # on <span> tags. Convert them to normal images.
+        for span in browser.css_select('span.parallax-image', all=True):
+            bg = unicode(span.styleProperty('background-image', span.InlineStyle))
+            if bg:
+                url = bg.partition('(')[-1][:-1]
+                span.appendInside('<img src="%s">' % url)
+                span.setAttribute('style', '')
 
+    def postprocess_html(self, article, root, url, recursion_level):
+        for x in root.xpath('//*[@id="piano-root"]'):
+            x.getparent().remove(x)
+        return root
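
Notes on the new scraping approach (reviewer sketches, not part of the diff):

The CSSSelect helper at the top of the new recipe compiles a CSS selector into a reusable XPath matcher: cssselect's HTMLTranslator translates the selector into an XPath expression string, and lxml.etree.XPath compiles that string once so it can be applied to any parsed tree. A minimal standalone sketch of the same pattern; the sample markup below is illustrative, not newsweek.com's actual HTML:

    from cssselect import HTMLTranslator
    from lxml import html
    from lxml.etree import XPath

    def CSSSelect(expr):
        # Translate the CSS selector to an XPath string, then compile it
        # into a callable that can be reused on any parsed lxml tree.
        return XPath(HTMLTranslator().css_to_xpath(expr))

    root = html.fromstring(
        '<nav class="main-menu">'
        '<a href="/editions/current">This Week\'s Edition</a></nav>')
    links = CSSSelect('nav.main-menu a[href]')(root)
    # links[0].text        -> "This Week's Edition"
    # links[0].get('href') -> '/editions/current'

In preprocess_stage1, the expression bg.partition('(')[-1][:-1] pulls the URL out of a CSS background-image value: partition('(') splits at the first opening parenthesis, [-1] keeps the part after it, and [:-1] drops the trailing ')'. For example (the value here is made up):

    bg = 'url(http://example.com/parallax.jpg)'
    url = bg.partition('(')[-1][:-1]
    # url -> 'http://example.com/parallax.jpg'

Note that this simple slice assumes the site serves unquoted url(...) values; a quoted value such as url("...") would keep its quotes after slicing.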