calibre/recipes/miami_herald.recipe
Kovid Goyal 0c678a1dc4
Update Miami Herald
Fixes #1906178 [[Enhancement] Fetch news: 30+ McClatchy newspapers](https://bugs.launchpad.net/calibre/+bug/1906178)
2020-12-27 13:38:22 +05:30

73 lines
3.6 KiB
Plaintext

__license__ = 'GPL v3'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
miamiherald.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes):
    """Build a tag-matching spec that selects elements by CSS class.

    ``classes`` is a space-separated string of class names. The result is a
    dict of the form ``{'attrs': {'class': predicate}}``. The predicate takes
    an element's ``class`` attribute value and returns the (truthy) set of
    names it shares with ``classes``, an empty set when nothing matches, or
    the value itself unchanged when it is falsy (``None`` or ``''``).
    """
    wanted = frozenset(classes.split(' '))

    def shares_wanted_class(value):
        # Elements without a class attribute are passed through as-is
        # (falsy), which the caller treats as "no match".
        if not value:
            return value
        return wanted.intersection(value.split())

    return {'attrs': {'class': shares_wanted_class}}
class TheMiamiHerald(BasicNewsRecipe):
    """Recipe that builds The Miami Herald from its per-section RSS feeds.

    Articles are reduced to their ``story-body`` element, social-media
    widgets are removed, and ``<picture>`` elements are flattened so that the
    contained ``<img>`` points at a concrete image URL.
    """

    title = 'The Miami Herald'
    __author__ = 'Kovid Goyal'
    description = "Miami-Dade and Broward's source for the latest breaking local news on sports, weather, business, jobs, real estate, shopping, health, travel, entertainment, & more."  # noqa
    # Per the BasicNewsRecipe API, oldest_article is measured in days.
    oldest_article = 1
    max_articles_per_feed = 100
    publisher = u'The Miami Herald'
    language = 'en'
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    remove_javascript = True

    conversion_options = {
        'comment': description, 'publisher': publisher, 'language': language
    }

    # Keep only the article body ...
    keep_only_tags = [
        classes('story-body')
    ]

    # ... and drop the social-media widgets found inside it.
    remove_tags = [
        classes('social-network-macro social-media')
    ]

    # (section title, RSS feed URL) pairs.
    feeds = [
        (u'News', u'https://www.miamiherald.com/news/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
        (u'Miami-Dade', u'https://www.miamiherald.com/news/local/community/miami-dade/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
        (u'Broward', u'https://www.miamiherald.com/news/local/community/broward/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
        (u'Florida Keys', u'https://www.miamiherald.com/news/local/community/florida-keys/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
        (u'Florida', u'https://www.miamiherald.com/news/state/florida/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
        (u'National', u'https://www.miamiherald.com/news/nation-world/national/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
        (u'World', u'https://www.miamiherald.com/news/nation-world/world/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
        (u'Americas', u'https://www.miamiherald.com/news/nation-world/world/americas/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
        (u'Cuba', u'https://www.miamiherald.com/news/nation-world/world/americas/cuba/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
        (u'Haiti', u'https://www.miamiherald.com/news/nation-world/world/americas/haiti/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
        (u'Politics', u'https://www.miamiherald.com/news/politics-government/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
        (u'Education', u'https://www.miamiherald.com/news/local/education/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
        (u'Environment', u'https://www.miamiherald.com/news/local/environment/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
        (u'Sports', u'https://www.miamiherald.com/sports/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
    ]

    def get_browser(self, *a, **kw):
        """Return the base-class browser with its User-Agent header removed.

        All positional and keyword arguments are forwarded unchanged to
        ``BasicNewsRecipe.get_browser``.
        """
        # McClatchy servers don't like the user-agent header, they hang
        # forever when it is present
        br = BasicNewsRecipe.get_browser(self, *a, **kw)
        br.addheaders = [x for x in br.addheaders if x[0].lower() != 'user-agent']
        return br

    def preprocess_html(self, soup):
        """Flatten every ``<picture>`` in ``soup`` and return ``soup``.

        For each ``<picture>``, the first URL listed in the first
        ``<source>``'s ``srcset`` is copied into the ``<img>``'s ``src``, and
        all ``<source>`` children are removed. Pictures that lack an
        ``<img>``, or whose first ``<source>`` has no usable ``srcset``, keep
        whatever ``src`` they already had instead of raising; their
        ``<source>`` children are still removed.
        """
        for picture in soup.findAll('picture'):
            img = picture.find('img')
            for i, source in enumerate(picture.findAll('source')):
                if i == 0 and img is not None:
                    # srcset is "url [descriptor], url [descriptor], ...";
                    # the first whitespace-separated token is the first URL.
                    candidates = (source.get('srcset') or '').split()
                    if candidates:
                        img['src'] = candidates[0]
                source.extract()
        return soup