Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Commit 3d3651ebce
Merge branch 'economist' of https://github.com/xxyzz/calibre
@@ -9,6 +9,7 @@ except ImportError:
 import json
 from html5_parser import parse
 from lxml import etree
+from collections import defaultdict
 
 from calibre import replace_entities
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
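The new import supports the rewritten parse_index in the next hunk, which groups articles by print-section name. A defaultdict(list) creates a section's list on first append, so no existence check is needed. A minimal sketch of that pattern in isolation (the section and article values here are made up for illustration):

    from collections import defaultdict

    feeds_dict = defaultdict(list)
    # First access to a missing key creates an empty list automatically.
    feeds_dict['Leaders'].append({'title': 'First article'})
    feeds_dict['Leaders'].append({'title': 'Second article'})
    feeds_dict['Asia'].append({'title': 'Third article'})
    # Items come back grouped, matching calibre's (section, articles) feed shape.
    print(list(feeds_dict.items()))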
@@ -304,31 +305,20 @@ class Economist(BasicNewsRecipe):
         script_tag = soup.find("script", id="__NEXT_DATA__")
         if script_tag is not None:
             data = json.loads(script_tag.string)
-            self.cover_url = data['props']['pageProps']['content']['image']['main']['url']['canonical']
+            self.cover_url = safe_dict(data, "props", "pageProps", "content", "image", "main", "url", "canonical")
             self.log('Got cover:', self.cover_url)
-        feeds = []
-        for section in soup.findAll(**classes('layout-weekly-edition-section')):
-            h2 = section.find('h2')
-            secname = self.tag_to_string(h2)
-            self.log(secname)
-            articles = []
-            for a in section.findAll('a', href=True, **classes('headline-link weekly-edition-wtw__link')):
-                spans = a.findAll('span')
-                if len(spans) == 2:
-                    title = u'{}: {}'.format(*map(self.tag_to_string, spans))
-                else:
-                    title = self.tag_to_string(a)
-                desc = ''
-                desc_parent = a.findParent('div')
-                if desc_parent is not None:
-                    p = desc_parent.find(itemprop='description')
-                    if p is not None:
-                        desc = self.tag_to_string(p)
-                articles.append({'title': title, 'url': process_url(a['href']), 'description': desc})
-                self.log(' ', title, articles[-1]['url'], '\n ', desc)
-            if articles:
-                feeds.append((secname, articles))
-        return feeds
+            feeds_dict = defaultdict(list)
+            for part in safe_dict(data, "props", "pageProps", "content", "hasPart", "parts"):
+                section = safe_dict(part, "print", "section", "headline")
+                title = safe_dict(part, "print", "headline")
+                url = safe_dict(part, "url", "canonical")
+                desc = safe_dict(part, "print", "description")
+                feeds_dict[section].append({"title": title, "url": url, "description": desc})
+                self.log(' ', title, url, '\n ', desc)
+            return [(section, articles) for section, articles in feeds_dict.items()]
+        else:
+            return []
 
     def eco_find_image_tables(self, soup):
         for x in soup.findAll('table', align=['right', 'center']):
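safe_dict is not part of this hunk; it is a helper defined elsewhere in the recipe. Judging from its call sites above (a dict followed by a series of key names), a plausible minimal sketch is a nested lookup that degrades to an empty dict instead of raising, assuming that is indeed its contract:

    def safe_dict(data, *names):
        # Walk nested dicts one key at a time; a missing key yields {},
        # so the chain never raises KeyError and a missing leaf is falsy.
        ans = data
        for name in names:
            ans = ans.get(name, {})
        return ans

Under that assumption the change is defensive: the old chained indexing data['props']['pageProps']... raised KeyError on any missing level of the __NEXT_DATA__ payload, while safe_dict returns {} there, and iterating safe_dict(data, ..., "parts") over a missing branch simply yields no articles.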