from calibre.web.feeds.jsnews import JavascriptRecipe
import datetime

# Site root; hrefs scraped from the pages are appended to this.
BASE = 'http://www.newsweek.com'


def href_to_url(a, add_piano=False):
    """Build an absolute URL from the ``href`` attribute of element *a*.

    The href is appended to BASE as-is. When *add_piano* is true the query
    string ``?piano_t=1`` is appended as well.
    """
    return BASE + a.get('href') + ('?piano_t=1' if add_piano else '')


class Newsweek(JavascriptRecipe):
    """Recipe that logs in to Newsweek and builds an index of the current issue."""

    title = 'Newsweek'
    __author__ = 'Kovid Goyal'
    description = 'Weekly news and current affairs in the US. Requires a subscription.'
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True
    requires_version = (1, 40, 0)

    keep_only_tags = ['article.content-fullwidth']
    remove_tags = [
        'meta', '.block-openadstream', '.block-ibtmedia-social', '.issue-next',
        '.most-popular', '.ibt-media-stories', '.user-btn-group',
        '#taboola-below-main-column', '.trc_related_container',
        '#block-nw-magazine-magazine-more-from-issue', '.block-ibtmedia-top-stories',
    ]
    LOGIN = 'https://bar.piano-media.com/lite/authent/login//custom/newsweek/?service_id=25&loc=http%3A%2F%2Fwww.newsweek.com%2F'  # noqa

    needs_subscription = True

    def do_login(self, br, username, password):
        """Visit the LOGIN page, fill in the login form and submit it."""
        br.visit(self.LOGIN)
        form = br.select_form('#pianomedia_login_form')
        form['login'] = username
        form['password'] = password
        br.submit()

    def get_publication_data(self, browser):
        """Find the "This Week's Edition" link in the main menu and index it.

        Returns the result of get_newsweek_publication_data() for that link,
        or None (implicitly) when no such link is present in the menu.
        """
        browser.wait_for_element('nav.main-menu a[href]')
        root = self.index_to_soup(browser.html)
        for a in root.xpath('''descendant-or-self::nav[@class and contains(concat(' ', normalize-space(@class), ' '), ' main-menu ')]/descendant-or-self::*/a[@href]'''):
            if a.text and a.text.strip() == 'This Week\'s Edition':
                return self.get_newsweek_publication_data(browser, href_to_url(a, True))

    def get_newsweek_publication_data(self, browser, url):
        """Parse the issue page at *url* into a publication-data dict.

        The returned dict has an ``'index'`` key holding a list of
        ``(section_title, [article_dict, ...])`` tuples and, when a cover
        image is found, a ``'cover'`` key holding the fetched image resource.
        Also sets ``self.timefmt`` from the page's <title> text.
        """
        root = self.index_to_soup(url)
        sel = root.xpath
        ans = {}

        # Cover: first image in the cover-story info block whose title
        # attribute contains '_Cover_'.
        for img in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' cover-story ')]/descendant-or-self::*/div[@class and contains(concat(' ', normalize-space(@class), ' '), ' info ')]/descendant-or-self::*/img[@src]'''):
            if '_Cover_' in img.get('title', ''):
                ans['cover'] = browser.get_resource(img.get('src'))
                break

        # The <title> text is parsed as a YYYY/MM/DD date; strptime raises
        # ValueError if it has any other shape.
        for title in root.xpath('//title'):
            raw = title.text
            if raw:
                self.timefmt = datetime.datetime.strptime(raw, '%Y/%m/%d').strftime(' [%b %d]')

        sections = []

        # Cover story: first link and first summary of each info block.
        for div in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' cover-story ')]/descendant-or-self::*/div[@class and contains(concat(' ', normalize-space(@class), ' '), ' info ')]'''):
            url = None
            for a in div.xpath('descendant::a[@href]'):
                url = href_to_url(a)
                break
            for s in div.xpath('descendant::div[@class="summary"]'):
                sections.append(('Cover Story', [{'title':'Cover Story', 'date':'', 'url':url, 'description':self.tag_to_string(s)}]))
                break

        # Features: first article link of every <li> in the features block.
        features = []
        for li in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' features ')]/descendant-or-self::*/li'''):
            url = None
            for a in li.xpath('descendant::a[@class="article-link"]'):
                url = href_to_url(a)
                features.append({'title':self.tag_to_string(a), 'url':url})
                break
        if features:
            sections.append(('Features', features))

        # Remaining sections: one per issue-list-block that has a block-title.
        for div in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' issue-list-block ')]'''):
            for d in div.xpath('descendant::div[@class="block-title"]'):
                section_title = self.tag_to_string(d)
                articles = []
                break
            else:
                # No title element: skip this block entirely.
                continue
            for li in div.xpath('descendant::li'):
                desc = ''
                for d in li.xpath('descendant::div[@class="summary"]'):
                    desc = self.tag_to_string(d)
                    break
                for a in li.xpath('descendant::a[@class="article-link"]'):
                    articles.append({'title':self.tag_to_string(a), 'url':href_to_url(a), 'description':desc})
                    break
            if articles:
                sections.append((section_title, articles))

        ans['index'] = sections
        return ans

    def load_complete(self, browser, url, recursion_level):
        """Wait for the article body element, then return browser.load_completed."""
        browser.wait_for_element('div.article-body')
        return browser.load_completed  # This is needed to allow the parallax images to load

    def preprocess_stage1(self, article, browser, url, recursion_level):
        """Replace CSS background images on parallax spans with <img> elements."""
        # Parallax images in the articles are loaded as background images
        # on span tags. Convert them to normal images.
        for span in browser.css_select('span.parallax-image', all=True):
            bg = unicode(span.styleProperty('background-image', span.InlineStyle))
            if bg:
                # bg looks like url(...): keep what follows the first '(' and
                # drop the trailing ')'. Also drop optional CSS quotes so they
                # do not end up inside the src attribute.
                url = bg.strip().partition('(')[-1][:-1].strip('\'"')
                # The markup must contain a %s placeholder; formatting an
                # empty string with an argument raises TypeError.
                span.appendInside('<img src="%s"></img>' % url)
                span.setAttribute('style', '')
        browser.run_for_a_time(0.1)  # This is needed to give the DOM time to update

    def postprocess_html(self, article, root, url, recursion_level):
        """Strip the inline style from <body>, remove #piano-root, return *root*."""
        for x in root.xpath('//*[local-name()="body" and @style]'):
            del x.attrib['style']  # body has a fixed height, which causes problems with epub viewers
        for x in root.xpath('//*[@id="piano-root"]'):
            x.getparent().remove(x)
        return root