Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00
Update Economist recipes for index page markup change

Fixes #1882131: [Fetch "The Economist" News fails as they have totally revamped their website format](https://bugs.launchpad.net/calibre/+bug/1882131)

commit 8f8a7b89c1 (parent 1c66024316)
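The diff below reworks economist_parse_index(): the cover is now taken from the srcset of the <img> inside the weekly-edition-header__image div (last, widest candidate) instead of rewriting a '200-width' path, and sections/articles are found via the new layout-weekly-edition-section and headline-link / weekly-edition-wtw__link classes. A minimal sketch of the new cover-URL logic, using bs4 directly rather than calibre's recipe machinery, and with purely illustrative markup (not real economist.com HTML):

# Minimal sketch of the new cover-URL logic from the diff below, outside calibre.
# The HTML snippet is illustrative only; real economist.com markup may differ.
from bs4 import BeautifulSoup

sample = '''
<div class="weekly-edition-header__image">
  <img srcset="https://example.com/cover-200.jpg 200w,
               https://example.com/cover-640.jpg 640w"
       src="https://example.com/cover-200.jpg">
</div>
'''

soup = BeautifulSoup(sample, 'html.parser')
div = soup.find(attrs={'class': 'weekly-edition-header__image'})
if div is not None:
    img = div.find('img', srcset=True)
    # As in the updated recipe: take the last srcset candidate (assumed to be
    # the widest rendition) and keep only its URL, dropping the width descriptor.
    cover_url = img['srcset'].split(',')[-1].split()[0]
    print('Got cover:', cover_url)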
@@ -6,7 +6,6 @@ try:
 except ImportError:
     from cookielib import Cookie
 import json
-from collections import OrderedDict
 
 from html5_parser import parse
 from lxml import etree
@@ -250,76 +249,29 @@ class Economist(BasicNewsRecipe):
         return ans
 
     def economist_parse_index(self, soup):
-        img = soup.find(attrs={'srcset': True, 'class': lambda x: x and 'print-edition__cover-widget__image' in x.split()})
-        if img is not None:
-            for part in img['srcset'].split():
-                if part.startswith('/'):
-                    part = part.replace('200-width', '640-width')
-                    self.cover_url = 'https://www.economist.com' + part
-                    self.log('Got cover:', self.cover_url)
-                    break
+        div = soup.find(attrs={'class': 'weekly-edition-header__image'})
+        if div is not None:
+            img = div.find('img', srcset=True)
+            self.cover_url = img['srcset'].split(',')[-1].split()[0]
+            self.log('Got cover:', self.cover_url)
 
-        sections = soup.findAll('div', attrs={'class': 'list__title'})
-        if sections:
-            feeds = []
-            for section in sections:
-                articles = []
-                secname = self.tag_to_string(section)
-                self.log(secname)
-                for a in section.findNextSiblings('a', href=True):
-                    spans = a.findAll('span')
-                    if len(spans) == 2:
-                        title = u'{}: {}'.format(*map(self.tag_to_string, spans))
-                    else:
-                        title = self.tag_to_string(a)
-                    articles.append({'title': title, 'url': process_url(a['href'])})
-                    self.log(' ', title, articles[-1]['url'])
-                if articles:
-                    feeds.append((secname, articles))
-            return feeds
-        return self.economist_parse_old_index(soup)
-
-    def economist_parse_old_index(self, soup):
-        feeds = OrderedDict()
-        for section in soup.findAll(attrs={'class': lambda x: x and 'section' in x}):
-            h4 = section.find('h4')
-            if h4 is None:
-                continue
-            section_title = self.tag_to_string(h4).strip()
-            if not section_title:
-                continue
-            self.log('Found section: %s' % section_title)
+        feeds = []
+        for section in soup.findAll('div', **classes('layout-weekly-edition-section')):
+            h2 = section.find('h2')
+            secname = self.tag_to_string(h2)
+            self.log(secname)
             articles = []
-            subsection = ''
-            for node in section.findAll(attrs={'class': 'article'}):
-                subsec = node.findPreviousSibling('h5')
-                if subsec is not None:
-                    subsection = self.tag_to_string(subsec)
-                prefix = (subsection + ': ') if subsection else ''
-                a = node.find('a', href=True)
-                if a is not None:
-                    url = a['href']
-                    if url.startswith('/'):
-                        url = 'https://www.economist.com' + url
-                    url += '/print'
+            for a in section.findAll('a', href=True, **classes('headline-link weekly-edition-wtw__link')):
+                spans = a.findAll('span')
+                if len(spans) == 2:
+                    title = u'{}: {}'.format(*map(self.tag_to_string, spans))
+                else:
                     title = self.tag_to_string(a)
-                    if title:
-                        title = prefix + title
-                        self.log('\tFound article:', title)
-                        articles.append({
-                            'title': title,
-                            'url': url,
-                            'description': '',
-                            'date': ''
-                        })
-
+                articles.append({'title': title, 'url': process_url(a['href'])})
+                self.log(' ', title, articles[-1]['url'])
             if articles:
-                if section_title not in feeds:
-                    feeds[section_title] = []
-                feeds[section_title] += articles
-
-        ans = [(key, val) for key, val in feeds.items()]
-        return ans
+                feeds.append((secname, articles))
+        return feeds
 
     def eco_find_image_tables(self, soup):
         for x in soup.findAll('table', align=['right', 'center']):
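The new section walk can likewise be tried outside calibre. The sketch below mirrors the loop added above, substituting bs4 CSS selectors for calibre's classes() helper (which matches any of the space-separated class names); the markup is illustrative, and the real recipe additionally passes each href through its process_url() helper:

# Standalone sketch of the new index parsing, using bs4/soupsieve selectors
# instead of calibre's classes() helper. Illustrative markup only.
from bs4 import BeautifulSoup

sample = '''
<div class="layout-weekly-edition-section">
  <h2>Leaders</h2>
  <a class="headline-link" href="/leaders/2020/06/04/example-article">
    <span>Example flytitle</span><span>Example headline</span>
  </a>
</div>
'''

soup = BeautifulSoup(sample, 'html.parser')
feeds = []
for section in soup.find_all('div', class_='layout-weekly-edition-section'):
    secname = section.h2.get_text(strip=True)
    articles = []
    # Either link class is accepted, mirroring
    # classes('headline-link weekly-edition-wtw__link') in the recipe.
    for a in section.select('a.headline-link, a.weekly-edition-wtw__link'):
        spans = a.find_all('span')
        if len(spans) == 2:
            # Two spans: flytitle and headline, joined as "flytitle: headline".
            title = '{}: {}'.format(*(s.get_text(strip=True) for s in spans))
        else:
            title = a.get_text(strip=True)
        # The recipe runs the href through process_url(); here it is kept as-is.
        articles.append({'title': title, 'url': a['href']})
    if articles:
        feeds.append((secname, articles))
print(feeds)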