calibre/recipes/ekathemerini.recipe
2019-12-29 22:02:16 +05:30

60 lines
2.1 KiB
Plaintext

from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from lxml import etree
class Ekathimerini(BasicNewsRecipe):
title = 'ekathimerini'
__author__ = 'Thomas Scholl'
description = 'News from Greece, English edition'
masthead_url = 'http://wwk.kathimerini.gr/webadmin/EnglishNew/gifs/logo.gif'
max_articles_per_feed = 100
oldest_article = 100
publisher = 'Kathimerini'
category = 'news, GR'
language = 'en_GR'
encoding = 'windows-1253'
conversion_options = {'linearize_tables': True}
no_stylesheets = True
delay = 1
keep_only_tags = [dict(name='td', attrs={'class': 'news'})]
rss_url = 'http://ws.kathimerini.gr/xml_files/latestnews.xml'
def find_articles(self, idx, category):
for article in idx.findAll('item'):
cat = u''
cat_elem = article.find('subcat')
if cat_elem:
cat = self.tag_to_string(cat_elem)
if cat == category:
desc_html = self.tag_to_string(article.find('description'))
description = self.tag_to_string(BeautifulSoup(desc_html))
a = {
'title': self.tag_to_string(article.find('title')),
'url': self.tag_to_string(article.find('link')),
'description': description,
'date': self.tag_to_string(article.find('pubdate')),
}
yield a
def parse_index(self):
idx_contents = self.browser.open(self.rss_url).read()
idx = etree.fromstring(idx_contents, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
cats = sorted({self.tag_to_string(subcat)
for subcat in idx.xpath('//*[local-name()="subcat"]')})
feeds = [(u'News', list(self.find_articles(idx, u'')))]
for cat in cats:
feeds.append((cat.capitalize(), list(
self.find_articles(idx, cat))))
return feeds
def print_version(self, url):
return url.replace('http://www.ekathimerini.com/4dcgi/', 'http://www.ekathimerini.com/4Dcgi/4dcgi/')