from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup


class Ekathimerini(BasicNewsRecipe):
    """Recipe for the English edition of Kathimerini.

    The feed list is built by hand in parse_index() from a single RSS
    document: its items are grouped into one feed per distinct <subcat>
    value, plus a 'News' feed for the items without a sub-category.
    """

    title = 'ekathimerini'
    __author__ = 'Thomas Scholl'
    description = 'News from Greece, English edition'
    masthead_url = 'http://wwk.kathimerini.gr/webadmin/EnglishNew/gifs/logo.gif'
    max_articles_per_feed = 100
    oldest_article = 100
    publisher = 'Kathimerini'
    category = 'news, GR'
    language = 'en_GR'
    encoding = 'windows-1253'
    conversion_options = {'linearize_tables': True}
    no_stylesheets = True
    delay = 1
    keep_only_tags = [dict(name='td', attrs={'class': 'news'})]

    # The one RSS document that parse_index() splits into per-category feeds.
    rss_url = 'http://ws.kathimerini.gr/xml_files/latestnews.xml'

    def find_articles(self, idx, category):
        """Yield one article dict per <item> of ``idx`` in ``category``.

        ``idx`` is the parsed RSS soup. ``category`` is compared against the
        text of each item's <subcat> element; pass u'' to select the items
        that have no <subcat> (or an empty one).

        Every yielded dict has the keys 'title', 'url', 'description' and
        'date'.
        """
        for article in idx.findAll('item'):
            cat_elem = article.find('subcat')
            cat = self.tag_to_string(cat_elem) if cat_elem is not None else u''
            if cat != category:
                continue

            # The description text is itself treated as HTML markup: pull it
            # out of the feed, parse it, and flatten it to plain text.
            desc_html = self.tag_to_string(article.find('description'))
            yield {
                'title': self.tag_to_string(article.find('title')),
                'url': self.tag_to_string(article.find('link')),
                'description': self.tag_to_string(BeautifulSoup(desc_html)),
                'date': self.tag_to_string(article.find('pubdate')),
            }

    def parse_index(self):
        """Download the RSS document and return the list of feeds.

        Returns a list of ``(feed_title, articles)`` tuples: first the
        'News' feed with the uncategorised items, then one feed per distinct
        sub-category in sorted order, titled with the capitalized category.
        """
        idx_contents = self.browser.open(self.rss_url).read()
        idx = BeautifulStoneSoup(
            idx_contents, convertEntities=BeautifulStoneSoup.XML_ENTITIES)

        cats = set(self.tag_to_string(subcat)
                   for subcat in idx.findAll('subcat'))
        # An empty <subcat> maps to the u'' bucket, which is already published
        # as 'News' below; drop it so those articles are not emitted a second
        # time under a feed with an empty title.
        cats.discard(u'')

        feeds = [(u'News', list(self.find_articles(idx, u'')))]
        for cat in sorted(cats):
            feeds.append((cat.capitalize(),
                          list(self.find_articles(idx, cat))))
        return feeds

    def print_version(self, url):
        """Return ``url`` with the '4dcgi/' path prefix rewritten.

        URLs that do not start with the expected prefix are returned
        unchanged, since ``str.replace`` finds nothing to substitute.
        """
        return url.replace('http://www.ekathimerini.com/4dcgi/',
                           'http://www.ekathimerini.com/4Dcgi/4dcgi/')