mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
New recipe for Mother Jones by kwetal
This commit is contained in:
parent
4378c69bcd
commit
96ae3c2086
BIN
resources/images/news/motherjones.png
Normal file
BIN
resources/images/news/motherjones.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 326 B |
103
resources/recipes/motherjones.recipe
Normal file
103
resources/recipes/motherjones.recipe
Normal file
@ -0,0 +1,103 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import Tag
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
|
||||
class MotherJonesRecipe(BasicNewsRecipe):
    '''
    Recipe for Mother Jones (motherjones.com).

    Articles are fetched through the site's print pages (see
    get_obfuscated_article) and reduced to the headline, dek, byline and
    print content before a short support notice is appended.
    '''
    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en'
    version = 1

    title = u'Mother Jones'
    publisher = u'Mother Jones'
    category = u'News, Investigative journalism'
    description = u'Independent investigative, political, and social justice reporting. Takes no prisoners, cleaves to no dogma, and tells it like it is.'

    oldest_article = 14
    max_articles_per_feed = 100
    use_embedded_content = False

    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True
    simultaneous_downloads = 3

    # Only these elements of the print page are kept.
    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', attrs={'class': 'dek'}),
        dict(name='p', attrs={'class': 'submitted'}),
        dict(name='div', attrs={'class': 'print-content'}),
    ]

    remove_tags = [
        dict(name='base'),
    ]

    remove_attributes = ['style']

    # feeds from http://motherjones.com/about/rss
    feeds = [
        (u'Latest News', u'http://feeds.feedburner.com/motherjones/main?format=xml'),
        # NOTE(review): '&format=xml' without a preceding '?' looks odd, but
        # the URL is kept exactly as the author supplied it -- confirm.
        (u'Politics & Current Affairs', u'http://motherjones.com/rss/sections/Politics/feed&format=xml'),
        (u'Environment & Health', u'http://motherjones.com/rss/sections/Environment/feed'),
        (u'Media & Culture', u'http://motherjones.com/rss/sections/Media/feed'),
        (u'Blog: Kevin Drum', u'http://motherjones.com/rss/blogs/Kevin+Drum/feed'),
        (u'Blog: MoJo Blog', u'http://motherjones.com/rss/blogs/mojo/feed'),
        (u'Blog: Blue Marble', u'http://motherjones.com/rss/blogs/Blue+Marble/feed'),
        (u'Blog: The Riff', u'http://motherjones.com/rss/blogs/Riff/feed'),
    ]

    # The mj_support colour was '#0666666' (seven hex digits), which is not a
    # valid CSS colour; '#666666' is used instead.
    extra_css = '''
                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
                img {float: left; margin-right: 0.5em;}
                div.dek {font-style: italic;}
                p.submitted {font-size: x-small; color: #696969;}
                div.mj_support {font-size: x-small; color: #666666; border: 1px solid black; padding: 0.5em}
                a, a[href] {text-decoration: none; color: blue;}
                '''

    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
                          'publisher': publisher}

    # Every temporary file written by get_obfuscated_article is recorded here.
    temp_files = []
    articles_are_obfuscated = True

    def get_obfuscated_article(self, url):
        '''
        Download the print version of the article at url into a temporary
        file and return that file's name.

        The print version is sort of hard to get. I think they look at the
        referer header, and if it is not right they serve the original. This
        method works around that by opening the article first and then
        following its print link with the same browser.
        '''
        br = self.get_browser()
        br.open(url)

        response = br.follow_link(url_regex=r'/print/[0-9]+', nr=0)
        html = response.read()

        pt = PersistentTemporaryFile('_motherjones.html')
        self.temp_files.append(pt)
        try:
            pt.write(html)
        finally:
            # Close the handle even when the write fails.
            pt.close()

        return pt.name

    def get_article_url(self, article):
        '''
        Return the URL to download for article.

        Some of the feeds are served by feedburner (grr). Then the workaround
        to get their print version doesn't work anymore, so the original link
        recorded by feedburner is preferred whenever the entry has one.
        '''
        if hasattr(article, 'feedburner_origlink'):
            return article.feedburner_origlink
        return article.link

    def preprocess_html(self, soup):
        '''
        Make image URLs absolute and append the Mother Jones support notice.

        Returns the modified soup.
        '''
        for img in soup.findAll('img', attrs={'src': True}):
            src = img['src']
            if src.startswith(('http://', 'https://')):
                # Already absolute; prefixing the host would corrupt it.
                continue
            if src.startswith('//'):
                # Protocol-relative URL: only the scheme is missing.
                img['src'] = 'http:' + src
            else:
                # Site-relative path, with or without a leading slash.
                img['src'] = 'http://motherjones.com/' + src.lstrip('/')

        body = soup.body
        if body is not None:
            div = Tag(soup, 'div', [('class', 'mj_support')])
            div.append('''Your tax-deductible gifts help keep Mother Jones independent and uncompromised.
                          To make a contribution, visit MotherJones.com or call 877-GIV-MOJO.
                       ''')
            body.append(div)

        return soup
|
Loading…
x
Reference in New Issue
Block a user