mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
196 lines
6.6 KiB
Python
196 lines
6.6 KiB
Python
#!/usr/bin/env python
|
|
import json
|
|
import re
|
|
|
|
import html5lib
|
|
import mechanize
|
|
from lxml import html
|
|
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
|
|
|
|
def classes(classes):
    """Build a tag-matching spec that selects elements by CSS class name.

    ``classes`` is a string of class names separated by single spaces.  The
    result is a dict with one ``attrs`` key; its ``class`` entry is a
    predicate that takes an element's class attribute value and returns the
    frozenset of names it shares with ``classes`` (empty when there is no
    overlap).  A falsy attribute value (``None`` or ``''``) is returned
    unchanged.
    """
    wanted = frozenset(classes.split(' '))

    def shares_class(value):
        # A missing or empty class attribute can never match; hand it back
        # untouched so the caller sees the same falsy value it passed in.
        if not value:
            return value
        return frozenset(value.split()) & wanted

    return {'attrs': {'class': shares_class}}
|
|
|
|
|
|
def as_article(source, log):
    """Turn a record with ``url`` and ``title`` keys into an article dict.

    The description is the record's ``field_subtitle`` (when present and
    non-empty) followed by ``' by '`` plus ``field_display_authors`` (when
    present and non-empty).  ``log`` is called once with the title and the
    URL before the article dict is returned.
    """
    link, heading = source['url'], source['title']

    summary = ''
    subtitle = source.get('field_subtitle')
    if subtitle:
        summary = summary + subtitle
    authors = source.get('field_display_authors')
    if authors:
        summary = summary + ' by ' + authors

    log(heading, link)
    return {'url': link, 'title': heading, 'description': summary}
|
|
|
|
|
|
def get_issue_data(br, log, node_id='1126213', year='2020', volnum='99', issue_vol='5'):
    """Fetch one issue's sections and articles from the site's search endpoint.

    ``br`` is the browser object used to open the requests and ``log`` is
    called with progress output (section titles, then a tab-prefixed line
    per article).  ``year``, ``volnum`` and ``issue_vol`` select the issue
    record; ``node_id`` selects the articles attached to that issue.

    Returns a list of ``(section_title, articles)`` tuples: first the
    issue's special section, then the remaining sections in sorted title
    order.  Every article is a dict with ``title``, ``description`` and
    ``url`` keys.
    """
    request_headers = {
        'Accept': 'application/json, text/plain, */*',
        'Content-Type': 'application/json;charset=UTF-8',
        'Origin': 'https://www.foreignaffairs.com',
        'Referer': 'https://www.foreignaffairs.com',
    }

    def build_query(**terms):
        # 'size' and 'filter' steer the query itself; every remaining
        # keyword becomes one field condition.
        limit = terms.pop('size', 1)
        use_filter = terms.pop('filter', None)
        if use_filter:
            clause, matcher = 'filter', 'terms'
        else:
            clause, matcher = 'must', 'term'
        conditions = {
            clause: [{matcher: {field: value}} for field, value in terms.items()]
        }
        return {
            'from': 0,
            'post_filter': {'bool': conditions},
            '_source': {
                'includes': [
                    'nid', 'path', 'title', 'field_subtitle', 'field_display_authors',
                    'fa_node_type_or_subtype',
                    'field_issue_sspecial_articles__nid',
                    'field_issue_sspecial_header',
                ],
            },
            'query': {'match_all': {}},
            'sort': [{'field_sequence': 'asc'}, {'fa_normalized_date': 'desc'}],
            'size': limit,
        }

    def run_search(query):
        # POST the JSON-encoded query and return the list of hit records.
        request = mechanize.Request(
            url='https://www.foreignaffairs.com/fa-search.php',
            data=json.dumps(query),
            headers=request_headers,
            method='POST',
        )
        response = br.open(request)
        payload = json.loads(response.read())
        return payload['hits']['hits']

    def to_article(record):
        # Every field in a record is a list; only its first entry is used.
        heading = record['title'][0]
        subtitles = record.get('field_subtitle')
        summary = subtitles[0] if subtitles else ''
        authors = record.get('field_display_authors')
        if authors:
            summary += ' By ' + authors[0]
        link = 'https://www.foreignaffairs.com' + record['path'][0]
        return {'title': heading, 'description': summary, 'url': link}

    # Look up the issue record itself to learn its special section.
    issue_hits = run_search(build_query(
        fa_node_type_or_subtype='Issue',
        field_issue_volume=issue_vol, field_issue_year=year,
        field_issue_volume_number=volnum,
    ))
    issue = issue_hits[0]['_source']
    special_title = issue['field_issue_sspecial_header'][0]
    special_nids = issue['field_issue_sspecial_articles__nid']
    special_hits = run_search(
        build_query(nid=special_nids, filter=True, size=len(special_nids)))

    log(special_title)
    special_articles = []
    for hit in special_hits:
        article = to_article(hit['_source'])
        special_articles.append(article)
        log('\t', article['title'], article['url'])
    feeds = [(special_title, special_articles)]

    # Group the articles attached to the issue node by their type/subtype.
    by_section = {}
    for hit in run_search(build_query(field_issue__nid=node_id, size=50)):
        record = hit['_source']
        section = record['fa_node_type_or_subtype'][0]
        if section not in by_section:
            by_section[section] = []
        by_section[section].append(to_article(record))

    for section in sorted(by_section):
        section_articles = by_section[section]
        log(section)
        if not section_articles:
            continue
        for article in section_articles:
            log('\t', article['title'], article['url'])
        feeds.append((section, section_articles))

    return feeds
|
|
|
|
|
|
class ForeignAffairsRecipe(BasicNewsRecipe):
    """News recipe for Foreign Affairs.

    The feed list is built from the issue page at ``INDEX``; logging in is
    optional and only attempted when both a username and a password are set.
    """

    title = u'Foreign Affairs'
    __author__ = 'Kovid Goyal'
    language = 'en'
    publisher = u'Council on Foreign Relations'
    category = u'USA, Foreign Affairs'
    description = u'The leading forum for serious discussion of American foreign policy and international affairs.'

    no_stylesheets = True
    remove_javascript = True
    needs_subscription = 'optional'

    # Page that is read by parse_index() to identify the issue.
    INDEX = 'https://www.foreignaffairs.com/magazine'

    keep_only_tags = [
        classes('article-header article-body article-lead-image article-body-text'),
    ]
    remove_tags = [
        classes('print-hidden loading-indicator paywall article-footer')
    ]

    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
                          'publisher': publisher}

    def parse_index(self):
        """Read the ``INDEX`` page and return the issue's feeds.

        Side effects: sets ``self.title`` and ``self.timefmt`` from the part
        of the page title before the first ``' | '`` separator, and
        ``self.cover_url`` from the ``og:image:secure_url`` meta tag.

        Returns whatever ``get_issue_data`` returns for the issue found.
        """
        soup = self.index_to_soup(self.INDEX)
        # get dates
        date = re.split(r'\s\|\s', self.tag_to_string(
            soup.head.title.string))[0]
        self.title = "Foreign Affairs ({})".format(date)
        self.timefmt = u' [%s]' % date
        # The last three path components of the revision link are taken as
        # year, volume number and issue number, in that order.
        link = soup.find('link', rel='revision', href=True)['href']
        year, volnum, issue_vol = link.split('/')[-3:]
        self.cover_url = soup.find('meta', property="og:image:secure_url")['content']

        # The class attribute may come back as a list or as a plain string;
        # normalise to a string before searching for the page-node-N marker.
        cls = soup.find('body')['class']
        if isinstance(cls, (list, tuple)):
            cls = ' '.join(cls)
        node_id = re.search(r'\bpage-node-(\d+)\b', cls).group(1)
        br = self.cloned_browser
        feeds = get_issue_data(br, self.log, node_id, year, volnum, issue_vol)
        return feeds

    def clean_fa_html(self, root):
        """Remove every ``svg`` and ``meta`` element (any namespace) from ``root``.

        Text that directly follows a removed element (its ``tail``) is kept:
        it is appended to the previous sibling's tail, or to the parent's
        text when the element is the first child.  Elements without a parent
        are left alone.  Returns ``root``.
        """
        def discard(elem):
            parent = elem.getparent()
            if parent is None:
                return
            # parent.remove() would drop elem.tail along with the element,
            # silently losing document text, so move the tail first.
            if elem.tail:
                previous = elem.getprevious()
                if previous is None:
                    parent.text = (parent.text or '') + elem.tail
                else:
                    previous.tail = (previous.tail or '') + elem.tail
            parent.remove(elem)

        for tag in ('{*}svg', '{*}meta'):
            # Snapshot the matches: the tree is modified while we walk them.
            for elem in tuple(root.iter(tag)):
                discard(elem)
        return root

    def preprocess_raw_html(self, raw_html, url):
        """Re-parse ``raw_html`` with html5lib, clean it and serialise it back.

        ``url`` is accepted but not used.  Returns the cleaned markup as a
        unicode string.
        """
        root = html5lib.parse(raw_html, treebuilder='lxml',
                              namespaceHTMLElements=False).getroot()
        self.clean_fa_html(root)
        return html.tostring(root, encoding='unicode')

    def preprocess_html(self, soup):
        """Copy alternate image-source attributes onto ``src`` and return ``soup``.

        The attributes are applied in the order ``ng-src``, ``data-blazy``,
        ``data-src``, so when an image carries several of them the last one
        listed wins.
        """
        for attr in ('ng-src', 'data-blazy', 'data-src'):
            for img in soup.findAll('img', attrs={attr: True}):
                img['src'] = img[attr]
        return soup

    def get_browser(self):
        """Return a browser, logged in when credentials are configured.

        The login form is the one whose ``id`` is ``fa-user-login-form``; it
        is submitted only when both ``self.username`` and ``self.password``
        are not ``None``.
        """

        def select_form(form):
            return form.attrs.get('id', None) == 'fa-user-login-form'

        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.foreignaffairs.com/user/login')
            br.select_form(predicate=select_form)
            br.form['name'] = self.username
            br.form['pass'] = self.password
            br.submit()
        return br
|