calibre/recipes/google_news.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals

import json

from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe

# a serarch topic, filled into the string below. You can change that to anything google news should be searched for...
terms_to_search_for = (
    'computer',
    'books',
)


class google_news_de(BasicNewsRecipe):
    # Title of the Recipe - this is a sample
    title = 'Google News'
    cover_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/da/Google_News_icon.svg/500px-Google_News_icon.svg.png'
    # Author
    __author__ = 'Volker Heggemann, VoHe, unkn0wn'
    # oldest article to download (in days) 									---- can be edit by user
    oldest_article = 1.25
    # describes itself, 						 							---- can be edit by user
    max_articles_per_feed = 200
    # speed up the download on fast computers be careful (I test max.20)
    # ---- can be edit by user
    simultaneous_downloads = 10
    # description, some Reader show this in titlepage
    description = u'Google News filter by your own recipe. Please read it in calibre software!'
    # What is the content of?
    category = u'NEWS'
    # describes itself, 						 							---- can be edit by user
    use_embedded_content = False
    remove_javascript = True
    # Removes empty feeds - why keep them!?
    remove_empty_feeds = True

    # remove the rubbish (in ebook)
    auto_cleanup = True

    articles_are_obfuscated = True

    def get_obfuscated_article(self, url):
        br = self.get_browser()
        try:
            br.open(url)
        except Exception as e:
            url = e.hdrs.get('location')
        soup = self.index_to_soup(url)
        link = soup.find('a', href=True)
        skip_sections =[ # add sections you want to skip
            '/video/', '/videos/', '/media/', 'podcast-'
        ]
        if any(x in link['href'] for x in skip_sections):
            self.log('Aborting Article ', link['href'])
            self.abort_article('skipping video links')

        self.log('Found link: ', link['href'])
        html = br.open(link['href']).read()
        pt = PersistentTemporaryFile('.html')
        pt.write(html)
        pt.close()
        return pt.name

    # now the content description and URL follows
    # feel free to add, wipe out what you need	 							---- can be edit by user
    #
    def get_feeds(self):
        url = "https://geolocation-db.com/json"
        data = self.index_to_soup(url, raw=True)
        data = json.loads(data)
        country_code = str(data['country_code']).lower()  # for me this is de
        city = data['city']
        self.feeds = [
            ('Google news Topnews for ' + country_code,
             'https://news.google.com/news?pz=1&cf=all&ned=' + country_code +
             '&hl=' + country_code + '&output=rss'),
        ]
        if city:
            location = '{},{}'.format(city, country_code)
            self.feeds.append(
                ('Google news for ' + location, 'https://news.google.' +
                country_code + '/news/rss/headlines/section/geo/' + location),
            )
        for searchfor in terms_to_search_for:
            self.feeds.append(
                ('Google news interested in ' + searchfor,
                 'https://news.google.com/news?cf=all&hl=' + country_code +
                 '+&pz=1&ned=' + country_code + '&q=' + searchfor + '&output=rss'))
        return BasicNewsRecipe.get_feeds(self)