mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
60 lines
2.1 KiB
Plaintext
60 lines
2.1 KiB
Plaintext
from calibre.web.feeds.recipes import BasicNewsRecipe
|
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
|
from lxml import etree
|
|
|
|
|
|
class Ekathimerini(BasicNewsRecipe):
    '''Recipe for ekathimerini: news from Greece, English edition.

    The article index is built from the XML feed at ``rss_url``. Feed items
    are grouped by the text of their ``subcat`` element: items without a
    sub-category go into a feed titled "News", and every distinct
    sub-category gets a feed of its own.
    '''

    # Recipe settings consumed by the BasicNewsRecipe base class.
    title = 'ekathimerini'
    __author__ = 'Thomas Scholl'
    description = 'News from Greece, English edition'
    masthead_url = 'http://wwk.kathimerini.gr/webadmin/EnglishNew/gifs/logo.gif'
    max_articles_per_feed = 100
    oldest_article = 100
    publisher = 'Kathimerini'
    category = 'news, GR'
    language = 'en_GR'
    encoding = 'windows-1253'
    conversion_options = {'linearize_tables': True}
    no_stylesheets = True
    delay = 1
    keep_only_tags = [dict(name='td', attrs={'class': 'news'})]

    # XML feed that parse_index() downloads to build the article list.
    rss_url = 'http://ws.kathimerini.gr/xml_files/latestnews.xml'

    @staticmethod
    def _iter_named(root, name):
        '''Yield the descendant elements of ``root`` whose tag is ``name``.

        ``root`` is an lxml element. Tag names are compared without their
        XML namespace and without regard to letter case, so that a lookup
        for ``'pubdate'`` also matches a ``pubDate`` element.
        '''
        wanted = name.lower()
        for elem in root.iterdescendants():
            tag = elem.tag
            # Comments and processing instructions have a non-string tag.
            if isinstance(tag, str) and tag.rpartition('}')[2].lower() == wanted:
                yield elem

    def _first_named(self, root, name):
        '''Return the first descendant of ``root`` named ``name``, or None.'''
        return next(self._iter_named(root, name), None)

    def find_articles(self, idx, category):
        '''Yield an article dict for every feed item in ``category``.

        :param idx: Root lxml element of the parsed feed, as produced by
            :meth:`parse_index`.
        :param category: Sub-category text to select. The empty string
            selects the items that have no (or an empty) ``subcat`` element.
        :return: Generator of dicts with the keys ``title``, ``url``,
            ``description`` and ``date``.
        '''
        for item in self._iter_named(idx, 'item'):
            subcat = self._first_named(item, 'subcat')
            # Test against None explicitly: the truth value of an lxml
            # element reflects whether it has children, not whether it exists.
            item_category = '' if subcat is None else self.tag_to_string(subcat)
            if item_category != category:
                continue

            # The description text is itself HTML; parse it once more and
            # flatten it so that only the plain text remains.
            desc_html = self.tag_to_string(self._first_named(item, 'description'))
            description = self.tag_to_string(BeautifulSoup(desc_html))

            yield {
                'title': self.tag_to_string(self._first_named(item, 'title')),
                'url': self.tag_to_string(self._first_named(item, 'link')),
                'description': description,
                'date': self.tag_to_string(self._first_named(item, 'pubdate')),
            }

    def parse_index(self):
        '''Download the feed and group its items into per-category feeds.

        :return: List of ``(feed title, list of article dicts)`` tuples. The
            first entry is the "News" feed holding the uncategorised items;
            it is followed by one entry per sub-category, sorted by name.
        '''
        raw = self.browser.open(self.rss_url).read()
        idx = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))

        # Collect sub-categories with the same name matching find_articles()
        # uses, so every collected category can actually be selected.
        cats = sorted({self.tag_to_string(subcat)
                       for subcat in self._iter_named(idx, 'subcat')})

        feeds = [('News', list(self.find_articles(idx, '')))]
        # An empty sub-category is skipped: those items are already in "News".
        feeds.extend(
            (cat.capitalize(), list(self.find_articles(idx, cat)))
            for cat in cats if cat
        )
        return feeds

    def print_version(self, url):
        '''Return ``url`` with the ``/4dcgi/`` prefix rewritten to ``/4Dcgi/4dcgi/``.

        NOTE(review): presumably this maps an article link to the page
        variant the recipe should download; confirm against the live site.
        '''
        return url.replace('http://www.ekathimerini.com/4dcgi/', 'http://www.ekathimerini.com/4Dcgi/4dcgi/')