from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from lxml import etree


class Ekathimerini(BasicNewsRecipe):
    """Recipe that builds feeds from the Kathimerini English-edition XML index.

    The document at ``rss_url`` is parsed with lxml. Its ``item`` elements are
    grouped by the text of their ``subcat`` element; items without a
    sub-category are collected in a feed named ``News``.
    """

    title = 'ekathimerini'
    __author__ = 'Thomas Scholl'
    description = 'News from Greece, English edition'
    masthead_url = 'http://wwk.kathimerini.gr/webadmin/EnglishNew/gifs/logo.gif'
    max_articles_per_feed = 100
    oldest_article = 100
    publisher = 'Kathimerini'
    category = 'news, GR'
    language = 'en_GR'
    encoding = 'windows-1253'
    conversion_options = {'linearize_tables': True}
    no_stylesheets = True
    delay = 1
    keep_only_tags = [dict(name='td', attrs={'class': 'news'})]

    # XML index from which parse_index() builds every feed.
    rss_url = 'http://ws.kathimerini.gr/xml_files/latestnews.xml'

    @staticmethod
    def _has_name(elem, name):
        """Return True if *elem* is an element whose local tag name is *name*.

        Namespaces and letter case are ignored; *name* must be lower case.
        NOTE(review): the feed's actual tag casing is not visible from this
        file, and the lookups below use lower-case names such as 'pubdate',
        hence the case-insensitive comparison.
        """
        tag = elem.tag
        # Comments, processing instructions and unresolved entity nodes
        # expose a non-string ``tag``; they are never a match.
        if not isinstance(tag, str):
            return False
        return tag.rpartition('}')[2].lower() == name

    def _find_all(self, root, name):
        """Return every element named *name* at or below *root*, in document order.

        *root* may be None (a recovering parser may yield no root element for
        unusable input -- TODO confirm), in which case the result is empty.
        """
        if root is None:
            return []
        return [elem for elem in root.iter() if self._has_name(elem, name)]

    def _child_text(self, parent, name):
        """Return the text of the first descendant of *parent* named *name*.

        Returns an empty string when no such descendant exists. Presence is
        tested explicitly: the truth value of an lxml element reflects its
        number of children, so a text-only element would count as false.
        """
        for elem in parent.iterdescendants():
            if self._has_name(elem, name):
                # tag_to_string is inherited from BasicNewsRecipe; it is
                # already handed lxml elements by the original parse_index.
                return self.tag_to_string(elem)
        return u''

    def find_articles(self, idx, category):
        """Yield an article dict for each ``item`` whose sub-category is *category*.

        :param idx: root element of the parsed index (may be None).
        :param category: sub-category text to select; ``u''`` selects the
            items that have no (or an empty) ``subcat`` element.
        :return: generator of dicts with the keys ``title``, ``url``,
            ``description`` and ``date``.
        """
        for item in self._find_all(idx, 'item'):
            if self._child_text(item, 'subcat').strip() != category:
                continue

            # The description text is itself treated as HTML: it is parsed
            # again and reduced to plain text.
            desc_html = self._child_text(item, 'description')
            yield {
                'title': self._child_text(item, 'title'),
                'url': self._child_text(item, 'link'),
                'description': self.tag_to_string(BeautifulSoup(desc_html)),
                'date': self._child_text(item, 'pubdate'),
            }

    def parse_index(self):
        """Download the index and return ``[(feed_title, [article, ...]), ...]``.

        The first feed, ``News``, holds the items without a sub-category. One
        further feed follows per distinct sub-category, sorted by name and
        titled with the capitalized name.
        """
        idx_contents = self.browser.open(self.rss_url).read()
        parser = etree.XMLParser(
            recover=True, no_network=True, resolve_entities=False)
        idx = etree.fromstring(idx_contents, parser=parser)

        cats = {
            self.tag_to_string(subcat).strip()
            for subcat in self._find_all(idx, 'subcat')
        }
        # An empty <subcat/> denotes "no category"; those items are already
        # in the News feed, so it must not create a second, untitled feed.
        cats.discard(u'')

        feeds = [(u'News', list(self.find_articles(idx, u'')))]
        feeds.extend(
            (cat.capitalize(), list(self.find_articles(idx, cat)))
            for cat in sorted(cats)
        )
        return feeds

    def print_version(self, url):
        """Return *url* with the ``/4dcgi/`` prefix replaced by ``/4Dcgi/4dcgi/``."""
        return url.replace('http://www.ekathimerini.com/4dcgi/', 'http://www.ekathimerini.com/4Dcgi/4dcgi/')