mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Work on Foreign Affairs
This commit is contained in:
parent
1f203ba7b6
commit
848934643e
@ -9,6 +9,12 @@ def select_form(form):
|
|||||||
return form.attrs.get('id', None) == 'user-login'
|
return form.attrs.get('id', None) == 'user-login'
|
||||||
|
|
||||||
|
|
||||||
|
def classes(classes):
|
||||||
|
q = frozenset(classes.split(' '))
|
||||||
|
return dict(attrs={
|
||||||
|
'class': lambda x: x and frozenset(x.split()).intersection(q)})
|
||||||
|
|
||||||
|
|
||||||
class ForeignAffairsRecipe(BasicNewsRecipe):
|
class ForeignAffairsRecipe(BasicNewsRecipe):
|
||||||
|
|
||||||
''' there are three modifications:
|
''' there are three modifications:
|
||||||
@ -37,7 +43,7 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
|
|||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
needs_subscription = True
|
needs_subscription = True
|
||||||
|
|
||||||
INDEX = 'http://www.foreignaffairs.com'
|
INDEX = 'https://www.foreignaffairs.com'
|
||||||
FRONTPAGE = INDEX + '/magazine'
|
FRONTPAGE = INDEX + '/magazine'
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
@ -50,36 +56,39 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
|
|||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
answer = []
|
answer = []
|
||||||
soup = self.index_to_soup(html.tostring(self.clean_fa_html(
|
soup = self.index_to_soup(self.FRONTPAGE)
|
||||||
self.index_to_soup(self.FRONTPAGE, as_tree=True))))
|
|
||||||
div = soup.find(
|
div = soup.find(
|
||||||
'div', attrs={'class': 'magazine-hero__image image_auto_width'})
|
'div', attrs={'class': 'magazine-actions'})
|
||||||
self.cover_url = div.find('img')['src']
|
self.cover_url = div.find('img')['ng-src']
|
||||||
# get dates
|
# get dates
|
||||||
date = re.split('\s\|\s', self.tag_to_string(
|
date = re.split(r'\s\|\s', self.tag_to_string(
|
||||||
soup.head.title.string))[0]
|
soup.head.title.string))[0]
|
||||||
self.title = "Foreign Affairs ({})".format(date)
|
self.title = "Foreign Affairs ({})".format(date)
|
||||||
self.timefmt = u' [%s]' % date
|
self.timefmt = u' [%s]' % date
|
||||||
|
|
||||||
for section in soup.findAll(attrs={'class':lambda x: x and 'magazine-list' in x.split()}):
|
# Fetching article list does not work as site uses javascript
|
||||||
|
# to load articles dynamically
|
||||||
|
for section in soup.findAll('section', attrs={'class':lambda x: x and 'magazine-list' in x.split()}):
|
||||||
articles = []
|
articles = []
|
||||||
section_title = self.tag_to_string(section.find('h1'))
|
section_title = self.tag_to_string(section.find('h2'))
|
||||||
for h2 in section.findAll('h2'):
|
if 'special_section.title' in section_title:
|
||||||
a = h2.parent
|
section_title = 'Special'
|
||||||
if a.get('href'):
|
self.log('\nSection:', section_title)
|
||||||
title = self.tag_to_string(h2)
|
for h3 in section.findAll(attrs={'class': lambda x: x and 'magazine-title' in x.split()}):
|
||||||
url = a['href']
|
a = h3.findParent('a', href=True)
|
||||||
atr = a.findNextSibling(attrs={'class':'author'})
|
title = self.tag_to_string(h3)
|
||||||
author = self.tag_to_string(atr) if atr else ''
|
url = a['href']
|
||||||
desc = a.findNextSibling(attrs={'class': 'deck'})
|
atr = a.findNextSibling(attrs={'class':'author'})
|
||||||
if desc is not None:
|
author = self.tag_to_string(atr) if atr else ''
|
||||||
description = self.tag_to_string(desc)
|
desc = a.findNextSibling(attrs={'class': 'deck'})
|
||||||
else:
|
if desc is not None:
|
||||||
description = ''
|
description = self.tag_to_string(desc)
|
||||||
articles.append({'title': title, 'url': url,
|
else:
|
||||||
'description': description, 'author': author})
|
description = ''
|
||||||
self.log(title)
|
articles.append({'title': title, 'url': url,
|
||||||
self.log('\t' + url)
|
'description': description, 'author': author})
|
||||||
|
self.log(title)
|
||||||
|
self.log('\t' + url)
|
||||||
if articles:
|
if articles:
|
||||||
answer.append((section_title, articles))
|
answer.append((section_title, articles))
|
||||||
return answer
|
return answer
|
||||||
@ -98,10 +107,8 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
|
|||||||
return html.tostring(root)
|
return html.tostring(root)
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for img in soup.findAll('img', attrs={'src': True}):
|
for img in soup.findAll('img', attrs={'ng-src': True}):
|
||||||
if not img['src'].startswith('http'):
|
img['src'] = img['ng-src']
|
||||||
img['src'] = self.INDEX + img['src']
|
|
||||||
|
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
def get_browser(self):
|
def get_browser(self):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user