calibre/recipes/google_news.recipe
2024-03-30 13:03:29 +05:30

93 lines
3.5 KiB
Python

#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
import json
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
# Search topics. Each entry below is inserted into a Google News search URL
# and becomes its own feed, so change these to anything Google News should
# be searched for.
terms_to_search_for = (
'computer',
'books',
)
class google_news_de(BasicNewsRecipe):
    """Google News recipe localised via an IP geolocation lookup.

    Builds a top-news feed for the detected country, a local feed for the
    detected city (when one is reported) and one search feed per entry in
    ``terms_to_search_for``. Feed items link to Google redirect pages, so
    every article is resolved through ``get_obfuscated_article``.
    """

    # Title of the recipe - this is a sample
    title = 'Google News'
    cover_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/da/Google_News_icon.svg/500px-Google_News_icon.svg.png'
    # Author
    __author__ = 'Volker Heggemann, VoHe, unkn0wn'
    # Oldest article to download (in days) ---- can be edited by the user
    oldest_article = 1.25
    # Maximum number of articles per feed ---- can be edited by the user
    max_articles_per_feed = 200
    # Parallel downloads; raise with care on fast machines (the original
    # author tested up to 20) ---- can be edited by the user
    simultaneous_downloads = 10
    # Description; some readers show this on the title page
    description = u'Google News filter by your own recipe. Please read it in calibre software!'
    # Content category
    category = u'NEWS'
    # Feed entries are not used as article content ---- can be edited by the user
    use_embedded_content = False
    remove_javascript = True
    # Drop feeds that ended up without any articles
    remove_empty_feeds = True
    # Strip the clutter from the downloaded pages (in the ebook)
    auto_cleanup = True
    # Article URLs in the feeds are indirect; see get_obfuscated_article()
    articles_are_obfuscated = True

    def get_obfuscated_article(self, url):
        """Resolve a feed link to the real article and download it.

        :param url: Article URL as found in the Google News feed.
        :return: Path of a persistent temporary ``.html`` file holding the
            downloaded article page.

        The article is aborted through ``self.abort_article`` when the
        intermediate page contains no link, or when the link points to one
        of the skipped (video / media / podcast) sections. Errors raised
        while opening ``url`` that carry no redirect target are re-raised
        unchanged.
        """
        br = self.get_browser()
        try:
            br.open(url)
        except Exception as e:
            # The open is expected to fail with an HTTP error whose headers
            # carry the redirect target. Anything without a usable
            # 'location' header is a genuine failure and must not be masked
            # by an AttributeError or by continuing with url = None.
            hdrs = getattr(e, 'hdrs', None)
            location = hdrs.get('location') if hdrs is not None else None
            if not location:
                raise
            url = location

        soup = self.index_to_soup(url)
        link = soup.find('a', href=True)
        if link is None:
            # NOTE(review): abort_article() is calibre's documented way to
            # skip the current article; confirm it raises in the targeted
            # calibre version.
            self.abort_article('no article link found')
        href = link['href']

        skip_sections = [  # add sections you want to skip
            '/video/', '/videos/', '/media/', 'podcast-'
        ]
        if any(section in href for section in skip_sections):
            self.log('Aborting Article ', href)
            self.abort_article('skipping video links')
        self.log('Found link: ', href)

        html = br.open(href).read()
        pt = PersistentTemporaryFile('.html')
        try:
            pt.write(html)
        finally:
            # Always release the file handle, even if the write fails.
            pt.close()
        return pt.name

    # The feed descriptions and URLs follow.
    # Feel free to add or remove what you need ---- can be edited by the user
    #
    def get_feeds(self):
        """Build ``self.feeds`` from the geolocation result and search terms.

        Queries geolocation-db.com for the caller's country code and city,
        then registers the top-news feed, an optional local feed and one
        search feed per entry in ``terms_to_search_for``.

        :return: Whatever ``BasicNewsRecipe.get_feeds`` returns for the
            populated ``self.feeds``.
        """
        # Function-scope import keeps the recipe usable on Python 2 and 3.
        try:
            from urllib.parse import quote, quote_plus
        except ImportError:  # Python 2
            from urllib import quote, quote_plus

        raw = self.index_to_soup('https://geolocation-db.com/json', raw=True)
        data = json.loads(raw)
        country_code = str(data['country_code']).lower()  # e.g. 'de'
        # The city is optional, so a reply without the key must not fail.
        city = data.get('city')

        self.feeds = [
            ('Google news Topnews for ' + country_code,
             'https://news.google.com/news?pz=1&cf=all&ned=' + country_code +
             '&hl=' + country_code + '&output=rss'),
        ]

        if city:
            location = '{},{}'.format(city, country_code)
            # Percent-encode the path segment (spaces, non-ASCII city names)
            # but keep the separating comma so plain names are unchanged.
            geo_path = quote(location.encode('utf-8'), safe=',')
            self.feeds.append(
                ('Google news for ' + location,
                 'https://news.google.' + country_code +
                 '/news/rss/headlines/section/geo/' + geo_path),
            )

        for searchfor in terms_to_search_for:
            # Encode the term so spaces, '&', '#', etc. cannot break the
            # query string. '+' stays as-is so terms already written as
            # 'two+words' keep working.
            query = quote_plus(searchfor.encode('utf-8'), safe='+')
            # NOTE(review): the '+' directly after the hl value looks
            # unintentional; it is kept so the generated URLs stay the same.
            self.feeds.append(
                ('Google news interested in ' + searchfor,
                 'https://news.google.com/news?cf=all&hl=' + country_code +
                 '+&pz=1&ned=' + country_code + '&q=' + query +
                 '&output=rss'))
        return BasicNewsRecipe.get_feeds(self)