mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Reuters by Kovid Goyal
This commit is contained in:
parent
27249c915d
commit
3fa778aefa
77
recipes/reuters.recipe
Normal file
77
recipes/reuters.recipe
Normal file
@ -0,0 +1,77 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=utf-8
|
||||
# License: GPLv3 Copyright: 2020, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||
|
||||
# Edition to download; used as the lookup key into country_defs.
country = 'us'

# Maps an edition code to a (hostname, sections) pair, where sections maps a
# section title to the URL path of that section's page on the host.
country_defs = {
    'us': (
        'www.reuters.com',
        dict([
            ('Business', 'finance'),
            ('Markets', 'finance/markets'),
            ('World', 'news/world'),
            ('Politics', 'politics'),
            ('Tech', 'news/technology'),
            ('Wealth', 'finance/wealth'),
        ]),
    ),
}
|
||||
|
||||
|
||||
class Reuters(BasicNewsRecipe):
    """Recipe that downloads Reuters articles for the edition named by ``country``.

    ``parse_index`` builds the list of sections from ``country_defs``, and
    ``preprocess_html`` points lazily loaded images at the URL stored in
    their placeholder's inline style.
    """

    title = 'Reuters'
    description = 'News from all over'
    __author__ = 'Kovid Goyal'
    language = 'en'

    # Only elements carrying one of these CSS classes are kept in an article.
    keep_only_tags = [
        classes('ArticleHeader_content-container StandardArticleBody_body')
    ]
    # Elements dropped from the kept content.
    remove_tags = [
        classes('Image_expand-button RelatedCoverage_related-coverage-module'),
        dict(name='link'),
    ]

    # Captures the argument of a CSS ``url(...)`` expression in a style
    # attribute. Compiled once here rather than on every article.
    _lazy_image_url_pat = re.compile(r'url\((.+?)\)')

    def parse_index(self):
        """Return the index as a list of ``(section title, articles)`` tuples.

        Sections are visited in alphabetical order of their titles, and a
        section that yields no articles is omitted. When ``self.test`` is
        set, collection stops once ``self.test[0]`` sections are gathered.
        """
        base, sections = country_defs[country]
        ans = []

        for section_title in sorted(sections):
            slug = sections[section_title]
            self.log(section_title)
            articles = list(self.parse_reuters_section(base, slug))
            if articles:
                ans.append((section_title, articles))
                if self.test and len(ans) >= self.test[0]:
                    break
        return ans

    def parse_reuters_section(self, base, slug):
        """Yield ``{'title': ..., 'url': ...}`` dicts for one section page.

        :param base: Hostname of the site, e.g. ``'www.reuters.com'``.
        :param slug: Path of the section page relative to ``base``.

        Every ``story-title`` element inside each ``news-headline-list``
        container is reported, not only the first one. Stories whose title
        is not wrapped in an element with an ``href`` are skipped, and a URL
        is yielded at most once per section.
        """
        section_url = 'https://' + base + '/' + slug
        soup = self.index_to_soup(section_url)
        seen = set()
        for div in soup.findAll(**classes('news-headline-list')):
            for h3 in div.findAll(**classes('story-title')):
                # The link is expected to be the element wrapping the title.
                a = h3.parent
                href = a.get('href') if a is not None else None
                if not href:
                    continue
                if href.startswith('//'):
                    # Protocol-relative link: only the scheme is missing.
                    url = 'https:' + href
                elif href.startswith(('http://', 'https://')):
                    # Already absolute; prefixing the host would corrupt it.
                    url = href
                else:
                    url = 'https://{}{}'.format(base, href)
                if url in seen:
                    continue
                seen.add(url)
                title = self.tag_to_string(h3)
                self.log('\t', title, url)
                yield {'title': title, 'url': url}

    def preprocess_html(self, soup):
        """Give lazily loaded images a real ``src`` and return ``soup``.

        For each ``LazyImage_image`` element that has an inline style, the
        ``url(...)`` found in that style is written to the ``src`` and
        ``data-modified`` attributes of the closest preceding sibling
        ``<img>``. Placeholders with no URL or no such ``<img>`` are logged
        and left untouched.
        """
        for div in soup.findAll(style=True, **classes('LazyImage_image')):
            m = self._lazy_image_url_pat.search(div['style'])
            if m is None:
                self.log.warn('Failed to find lazy image url in:', div['style'])
                continue
            # CSS permits url("...") and url('...'); drop quotes and padding
            # so they do not end up inside the src attribute.
            url = m.group(1).strip().strip('\'"')
            if url.startswith('//'):
                url = 'https:' + url
            # NOTE(review): '&w=20' presumably requests a tiny placeholder
            # rendition; removing it should fetch the full image - confirm.
            url = url.replace('&w=20', '')
            img = div.findPreviousSibling('img')
            if img is None:
                self.log.warn('No img tag found for lazy image url:', url)
                continue
            img['src'] = url
            img['data-modified'] = url
        return soup
|
Loading…
x
Reference in New Issue
Block a user