Mirror of https://github.com/kovidgoyal/calibre.git

commit 1b413d27f9
parent 3e70e190dd

    Update Newsweek

@@ -1,116 +1,83 @@
-from calibre.web.feeds.jsnews import JavascriptRecipe
-import datetime
+from calibre.web.feeds.news import BasicNewsRecipe
+from collections import defaultdict
 
 BASE = 'http://www.newsweek.com'
 def href_to_url(a, add_piano=False):
-    return BASE + a.get('href') + ('?piano_t=1' if add_piano else '')
+    return BASE + a.get('href') + ('?piano_d=1' if add_piano else '')
 
-class Newsweek(JavascriptRecipe):
+def class_sels(*args):
+    q = set(args)
+    return dict(attrs={'class':lambda x: x and set(x.split()).intersection(q)})
+
+class Newsweek(BasicNewsRecipe):
 
     title = 'Newsweek'
     __author__ = 'Kovid Goyal'
-    description = 'Weekly news and current affairs in the US. Requires a subscription.'
+    description = 'Weekly news and current affairs in the US'
     language = 'en'
     encoding = 'utf-8'
     no_stylesheets = True
     requires_version = (1, 40, 0)
 
-    keep_only_tags = ['article.content-fullwidth']
+    keep_only_tags = class_sels('article-header', 'article-body', 'header-image')
     remove_tags = [
-        'meta', '.block-openadstream', '.block-ibtmedia-social', '.issue-next',
-        '.most-popular', '.ibt-media-stories', '.user-btn-group',
-        '#taboola-below-main-column', '.trc_related_container',
-        '#block-nw-magazine-magazine-more-from-issue', '.block-ibtmedia-top-stories',
+        dict(name='meta'),
+        class_sels(
+            'block-openadstream', 'block-ibtmedia-social', 'issue-next',
+            'most-popular', 'ibt-media-stories', 'user-btn-group',
+            'trial-link', 'trc_related_container',
+            'block-ibtmedia-top-stories'
+        ),
+        dict(id=['taboola-below-main-column', 'piano-root', 'block-nw-magazine-magazine-more-from-issue']),
     ]
-    LOGIN = 'https://bar.piano-media.com/lite/authent/login//custom/newsweek/?service_id=25&loc=http%3A%2F%2Fwww.newsweek.com%2F'  # noqa
+    remove_attributes = ['style']
 
-    needs_subscription = True
-    def do_login(self, br, username, password):
-        br.visit(self.LOGIN)
-        form = br.select_form('#pianomedia_login_form')
-        form['login'] = username
-        form['password'] = password
-        br.submit()
-
-    def get_publication_data(self, browser):
-        browser.wait_for_element('nav.main-menu a[href]')
-        root = self.index_to_soup(browser.html)
-        for a in root.xpath('''descendant-or-self::nav[@class and contains(concat(' ', normalize-space(@class), ' '), ' main-menu ')]/descendant-or-self::*/a[@href]'''):
-            if a.text and a.text.strip() == 'This Week\'s Edition':
-                return self.get_newsweek_publication_data(browser, href_to_url(a, True))
-
-    def get_newsweek_publication_data(self, browser, url):
-        root = self.index_to_soup(url)
-        sel = lambda expr: root.xpath(expr)
-        ans = {}
-
-        for img in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' cover-story ')]/descendant-or-self::*/div[@class and contains(concat(' ', normalize-space(@class), ' '), ' info ')]/descendant-or-self::*/img[@src]'''):
-            if '_Cover_' in img.get('title', ''):
-                ans['cover'] = browser.get_resource(img.get('src'))
-                break
-        for title in root.xpath('//title'):
-            raw = title.text
-            if raw:
-                self.timefmt = datetime.datetime.strptime(raw, '%Y/%m/%d').strftime(' [%b %d]')
-
-        sections = []
-        for div in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' cover-story ')]/descendant-or-self::*/div[@class and contains(concat(' ', normalize-space(@class), ' '), ' info ')]'''):
-            url = None
-            for a in div.xpath('descendant::a[@href]'):
-                url = href_to_url(a)
-                break
-            for s in div.xpath('descendant::div[@class="summary"]'):
-                sections.append(('Cover Story', [{'title':'Cover Story', 'date':'', 'url':url, 'description':self.tag_to_string(s)}]))
-                break
-        features = []
-        for li in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' features ')]/descendant-or-self::*/li'''):
-            url = None
-            for a in li.xpath('descendant::a[@class="article-link"]'):
-                url = href_to_url(a)
-                features.append({'title':self.tag_to_string(a), 'url':url})
-                break
-        if features:
-            sections.append(('Features', features))
-
-        for div in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' issue-list-block ')]'''):
-            for d in div.xpath('descendant::div[@class="block-title"]'):
-                section_title = self.tag_to_string(d)
-                articles = []
-                break
-            else:
-                continue
-            for li in div.xpath('descendant::li'):
-                desc = ''
-                for d in li.xpath('descendant::div[@class="summary"]'):
-                    desc = self.tag_to_string(d)
-                    break
-                for a in li.xpath('descendant::a[@class="article-link"]'):
-                    articles.append({'title':self.tag_to_string(a), 'url':href_to_url(a), 'description':desc})
-                    break
-            if articles:
-                sections.append((section_title, articles))
-
-        ans['index'] = sections
-        return ans
-
-    def load_complete(self, browser, url, recursion_level):
-        browser.wait_for_element('div.article-body')
-        return browser.load_completed  # This is needed to allow the parallax images to load
-
-    def preprocess_stage1(self, article, browser, url, recursion_level):
+    def parse_index(self):
+        root = self.index_to_soup('http://www.newsweek.com/archive', as_tree=True)
+        li = root.xpath('//ul[contains(@class, "magazine-archive-items")]/li')[0]
+        a = li.xpath('descendant::a[@href]')[0]
+        url = href_to_url(a, add_piano=True)
+        self.timefmt = self.tag_to_string(a)
+        img = li.xpath('descendant::a[@href]/img[@src]')[0]
+        self.cover_url = img.get('src')
+        root = self.index_to_soup(url, as_tree=True)
+        div = root.xpath('//div[@id="block-nw-magazine-magazine-cover-story"]')[0]
+        a = div.xpath('descendant::a[@href]')[0]
+        index = [('Cover', [{'title':'Cover story', 'url':href_to_url(a)}])]
+        sections = defaultdict(list)
+        div = root.xpath('//div[@id="block-nw-magazine-magazine-issue-story-list"]')[0]
+        for a in div.xpath('descendant::h3/a[@href and contains(@class, "article-link")]'):
+            title = self.tag_to_string(a)
+            li = a.xpath('ancestor::li')[0]
+            desc = ''
+            s = li.xpath('descendant::div[@class="summary"]')
+            if s:
+                desc = self.tag_to_string(s[0])
+            sec = li.xpath('descendant::div[@class="category"]')
+            if sec:
+                sec = self.tag_to_string(sec[0])
+            else:
+                sec = 'Articles'
+            sections[sec].append({'title':title, 'url':href_to_url(a), 'description':desc})
+            self.log(title, url)
+            if desc:
+                self.log('\t' + desc)
+            self.log('')
+        for k in sorted(sections):
+            index.append((k, sections[k]))
+        return index
+
+    def print_version(self, url):
+        return url + '?piano_d=1'
+
+    def preprocess_html(self, soup):
         # Parallax images in the articles are loaded as background images
         # on <span> tags. Convert them to normal images.
-        for span in browser.css_select('span.parallax-image', all=True):
-            bg = unicode(span.styleProperty('background-image', span.InlineStyle))
-            if bg:
-                url = bg.strip().partition('(')[-1][:-1]
-                span.appendInside('<img src="%s"></img>' % url)
-                span.setAttribute('style', '')
-        browser.run_for_a_time(0.1)  # This is needed to give the DOM time to update
-
-    def postprocess_html(self, article, root, url, recursion_level):
-        for x in root.xpath('//*[local-name()="body" and @style]'):
-            del x.attrib['style']  # body has a fixed height, which causes problems with epub viewers
-        for x in root.xpath('//*[@id="piano-root"]'):
-            x.getparent().remove(x)
-        return root
+        for span in soup.findAll('span', attrs={'class':lambda x: x and 'parallax' in x.split()}):
+            s = span.find(style=True)
+            if s is not None:
+                url = s['style'].partition('(')[-1][:-1]
+                s['style'] = 'display: block'
+                s.name = 'img'
+                s['src'] = url
+        return soup
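
Review note: the heart of this change is the switch from JavascriptRecipe's CSS-selector strings to BeautifulSoup-style attribute matchers built by the new class_sels() helper. A minimal standalone sketch of the matching logic (the sample class strings are invented; note that the recipe's lambda calls split() on the attribute value, so the soup passes class as a plain string):

def class_sels(*args):
    # Build a soup attrs dict matching any tag whose class attribute
    # shares at least one name with the given CSS classes.
    q = set(args)
    return dict(attrs={'class': lambda x: x and set(x.split()).intersection(q)})

sel = class_sels('article-header', 'article-body')
match = sel['attrs']['class']
print(bool(match('article-header main')))  # True: 'article-header' overlaps
print(bool(match('sidebar')))              # False: no overlap
print(bool(match(None)))                   # False: tags without a class never match

Returning a plain dict keyed on attrs lets the same helper feed both keep_only_tags and remove_tags.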
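
For context, BasicNewsRecipe.parse_index() returns a list of (section title, list of article dicts) two-tuples, where each dict carries at least 'title' and 'url', optionally 'description' and 'date'. A sketch of the shape the new recipe builds, seeded with the cover story and then grouped by each article's category div (all values invented):

index = [
    ('Cover', [
        {'title': 'Cover story', 'url': 'http://www.newsweek.com/example-cover'},
    ]),
    ('Articles', [
        {'title': 'Some article', 'url': 'http://www.newsweek.com/some-article',
         'description': 'One-line summary pulled from the summary div'},
    ]),
]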
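
The parallax rewrite in preprocess_html() recovers the image URL from the inline style by string slicing rather than CSS parsing: partition('(') keeps everything after the first '(' and the [:-1] slice drops the trailing ')'. A quick sketch (the style string is invented; the approach assumes an unquoted url(...) that ends the declaration):

style = 'background-image: url(http://www.newsweek.com/sites/cover.jpg)'
tail = style.partition('(')[-1]  # 'http://www.newsweek.com/sites/cover.jpg)'
url = tail[:-1]                  # strip the closing ')'
print(url)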