# Reconstructed from git patch dee5c064c977 ("New recipes for Cyprus News and
# Kleine Zeitung by kwetal", Kovid Goyal, 2009-12-26): this is the post-patch
# state of resources/recipes/bwmagazine.recipe. The same patch also lists
# "Cyprus News Live" and "Kleine Zeitung" (author: kwetal) under "new recipes"
# in Changelog.yaml.

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe


class BWmagazine(BasicNewsRecipe):
    """Recipe that scrapes the BusinessWeek magazine index page.

    Articles are collected from the ``<h3>`` headings inside the
    ``div#column2`` element of ``INDEX`` and fetched in their print version.
    """

    title = 'BusinessWeek Magazine'
    __author__ = 'Darko Miletic'
    description = 'Stay up to date with BusinessWeek magazine articles. Read news on international business, personal finances & the economy in the BusinessWeek online magazine.'
    publisher = 'Bloomberg L.P.'
    category = 'news, International Business News, current news in international business,international business articles, personal business, business week magazine, business week magazine articles, business week magazine online, business week online magazine'
    oldest_article = 10
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'en'
    INDEX = 'http://www.businessweek.com/magazine/news/articles/business_news.htm'
    cover_url = 'http://images.businessweek.com/mz/covers/current_120x160.jpg'

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # List form, as documented for BasicNewsRecipe.keep_only_tags.
    keep_only_tags = [dict(name='div', attrs={'id': 'storyBody'})]

    def parse_index(self):
        """Build the single feed of this recipe from the index page.

        Returns a one-element list ``[(feed_title, articles)]`` where each
        article is a dict with the keys ``title``, ``date``, ``url`` and
        ``description``. The feed title is the page's ``<title>`` text, or
        the recipe title when the page has none.
        """
        soup = self.index_to_soup(self.INDEX)
        # The index carries no per-article dates; every entry is stamped
        # with the download time, computed once.
        today = strftime(self.timefmt)

        articles = []
        column = soup.find('div', attrs={'id': 'column2'})
        headings = column.findAll('h3') if column is not None else []
        for heading in headings:
            link = heading.find('a')
            href = link.get('href') if link is not None else None
            if not href:
                # A heading without a usable link cannot become an article.
                continue
            articles.append({
                'title': self.tag_to_string(link),
                'date': today,
                # Resolving against INDEX turns '../../x' into
                # '.../magazine/x' and also copes with absolute or
                # differently-nested links instead of producing a bare
                # '.../magazine/' URL.
                'url': urljoin(self.INDEX, href),
                'description': '',
            })

        return [(self._feed_title(soup), articles)]

    def _feed_title(self, soup):
        """Return the text of the page's <title>, or the recipe title."""
        head = soup.find('head')
        title_tag = head.find('title') if head is not None else None
        if title_tag is not None and title_tag.string:
            return title_tag.string
        return self.title

    def print_version(self, url):
        """Map an article URL to its print URL.

        Everything from the last ``?`` onwards is dropped (the URL is kept
        whole when it has no query string), then ``.com/magazine/`` is
        rewritten to ``.com/print/magazine/``.
        """
        base = url.rpartition('?')[0] or url
        return base.replace('.com/magazine/', '.com/print/magazine/')
# Reconstructed from the same git patch: the new files
# resources/recipes/cynewslive.recipe and resources/recipes/kleinezeitung.recipe.

import re
from datetime import datetime, timedelta

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class CyNewsLiveRecipe(BasicNewsRecipe):
    """Recipe for cynewslive.com that scrapes the section listing pages.

    Each entry in ``feeds`` is a (section title, listing page URL) pair; the
    listing pages are parsed by ``parse_index``/``find_articles``.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en_CY'
    version = 1

    title = u'Cyprus News Live'
    publisher = u'The Cyprus Weekly'
    category = u'News, Newspaper'
    description = u'News from Cyprus'

    use_embedded_content = False
    remove_empty_feeds = True
    oldest_article = 7
    max_articles_per_feed = 100

    no_stylesheets = True
    remove_javascript = True

    # Scratch state shared between parse_index() and find_articles():
    # timestamp of the entry being examined, the cut-off time, and the number
    # of articles accepted so far for the current section.
    pubTime = None
    minTime = None
    articleCount = 0

    INDEX = 'http://www.cynewslive.com'

    # Month names as they appear in the listing timestamps.
    MONTHS = {
        'January': 1, 'February': 2, 'March': 3, 'April': 4,
        'May': 5, 'June': 6, 'July': 7, 'August': 8,
        'September': 9, 'October': 10, 'November': 11, 'December': 12,
    }

    feeds = [
        ('News: Cyprus', 'http://www.cynewslive.com/main/92,0,0,0-CYPRUS.aspx'),
        ('News: World', 'http://www.cynewslive.com/main/78,0,0,0-UKWORLD.aspx'),
        ('Sport: Football', 'http://www.cynewslive.com/main/82,0,0,0-FOOTBALL.aspx'),
        ('Sport: Rugby', 'http://www.cynewslive.com/main/83,0,0,0-RUGBY.aspx'),
        ('Sport: Cricket', 'http://www.cynewslive.com/main/85,0,0,0-CRICKET.aspx'),
        ('Sport: Tennis', 'http://www.cynewslive.com/main/84,0,0,0-TENNIS.aspx'),
        ('Sport: Other', 'http://www.cynewslive.com/main/86,0,0,0-OTHER.aspx'),
        ('Business: Local', 'http://www.cynewslive.com/main/100,0,0,0-LOCAL.aspx'),
        ('Business: Foreign', 'http://www.cynewslive.com/main/101,0,0,0-FOREIGN.aspx'),
        ('Environment', 'http://www.cynewslive.com/main/93,0,0,0-ENVIRONMENT.aspx'),
        ('Culture', 'http://www.cynewslive.com/main/208,0,0,0-CULTURE.aspx'),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class': 'ArticleCategories'}),
    ]

    extra_css = '''
                   body{font-family:verdana,arial,helvetica,geneva,sans-serif ;}
                '''

    def parse_index(self):
        """Return ``[(section_title, articles), ...]`` for every entry in ``feeds``.

        A section whose page lacks the ``ctl00_cp_ctl01_listp`` table is
        still returned, with an empty article list.
        """
        answer = []
        for section_title, section_url in self.feeds:
            self.articleCount = 0
            articles = []
            soup = self.index_to_soup(section_url)

            table = soup.find('table', attrs={'id': 'ctl00_cp_ctl01_listp'})
            if table is not None:
                # Entries without a readable timestamp inherit the most
                # recently seen one, starting from "now".
                self.pubTime = datetime.now()
                self.minTime = self.pubTime - timedelta(days=self.oldest_article)
                self.find_articles(table, articles)

            answer.append((section_title, articles))

        return answer

    def postprocess_html(self, soup, first):
        """Strip inline styles and turn ``<font>`` tags into plain ``<div>``s."""
        for el in soup.findAll(attrs={'style': True}):
            del el['style']

        for el in soup.findAll('font'):
            el.name = 'div'
            # Iterating a tag yields its children, not its attributes, so the
            # attribute names are read from ``attrs``. Snapshot them first
            # because deleting mutates ``attrs``. BeautifulSoup 3 stores a
            # list of (name, value) pairs; a dict is handled as well.
            attrs = el.attrs
            if hasattr(attrs, 'keys'):
                names = list(attrs.keys())
            else:
                names = [name for name, _ in attrs]
            for name in names:
                del el[name]

        return soup

    def find_articles(self, table, articles):
        """Append the article entries found in ``table`` to ``articles``.

        ``table`` is the listing table of one section page; ``articles`` is
        extended in place with dicts holding ``title``, ``date``, ``url`` and
        ``description``. Reads ``self.minTime`` and updates ``self.pubTime``
        and ``self.articleCount``, so ``parse_index`` must have initialised
        them.

        Scanning stops at the first entry older than ``self.minTime`` or once
        ``max_articles_per_feed`` entries were accepted.
        NOTE(review): the early stop presumably relies on the listing being
        ordered newest first -- confirm against the site.
        """
        for div in table.findAll('div', attrs={'class': 'ListArticle'}):
            heading = div.find('div', attrs={'class': 'ListArticle_T'})
            link = heading.find('a') if heading is not None else None
            href = link.get('href') if link is not None else None
            if not href:
                # No title link: nothing to download for this entry.
                continue

            title = self.tag_to_string(link)
            url = self.INDEX + href

            summary = div.find('div', attrs={'class': 'ListArticle_BODY300'})
            description = self.tag_to_string(summary) if summary is not None else ''

            stamp = div.find('div', attrs={'class': 'ListArticle_D'})
            if stamp is not None:
                parsed = self._parse_timestamp(self.tag_to_string(stamp))
                if parsed is not None:
                    self.pubTime = parsed
                # Otherwise keep the previous entry's time, exactly as for an
                # entry that has no date element at all.

            if self.pubTime < self.minTime or self.articleCount >= self.max_articles_per_feed:
                return

            # NOTE(review): 'date' is a datetime here, while the BusinessWeek
            # recipe from the same patch passes a formatted string -- confirm
            # which form calibre expects.
            articles.append({'title': title, 'date': self.pubTime, 'url': url,
                             'description': description})
            self.articleCount += 1

    def _parse_timestamp(self, text):
        """Parse a 'DD MonthName YYYY HH:MM' string; return None if it does not fit."""
        parts = text.split()
        try:
            hour, minute = parts[3].split(':')[:2]
            return datetime(year=int(parts[2]), month=self.MONTHS[parts[1]],
                            day=int(parts[0]), hour=int(hour), minute=int(minute))
        except (IndexError, KeyError, ValueError):
            return None


class KleineZeitungRecipe(BasicNewsRecipe):
    """Recipe for the Austrian newspaper Kleine Zeitung, driven by its RSS feeds."""

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'de_AT'
    version = 1

    title = u'Kleine Zeitung'
    publisher = u'Kleine Zeitung GmbH & Co KG'
    category = u'News, Newspaper'
    description = u'Nachrichten aus \u00D6sterreich'

    use_embedded_content = False
    remove_empty_feeds = True
    oldest_article = 2
    max_articles_per_feed = 100

    no_stylesheets = True
    remove_javascript = True

    # Feeds from http://www.kleinezeitung.at/allgemein/multimedia/102434/wichtige-news-immer-sofort-ueber-rss-feed-abrufen.story
    feeds = [
        (u'Chronik', u'http://www.kleinezeitung.at/klon/rss/nachrichten'),
        (u'Wirtschaft', u'http://www.kleinezeitung.at/klon/rss/wirtschaft'),
        (u'Leute', u'http://www.kleinezeitung.at/klon/rss/leute'),
        (u'Sport', u'http://www.kleinezeitung.at/klon/rss/sport'),
        (u'Nachrichten aus der Steiermark', u'http://www.kleinezeitung.at/klon/rss/steiermark'),
        (u'Nachrichten aus Kaernten', u'http://www.kleinezeitung.at/klon/rss/kaernten'),
        (u'Multimedia-News', u'http://www.kleinezeitung.at/klon/rss/multimedia'),
        (u'Kino, Events & Tickets', u'http://www.kleinezeitung.at/klon/rss/events'),
    ]

    # Only the article body is kept ...
    keep_only_tags = [
        dict(name='div', attrs={'class': 'article_body'}),
    ]

    # ... and these elements are then removed from it.
    remove_tags = [
        dict(name='a', attrs={'id': 'comment_count'}),
        dict(name='div', attrs={'class': re.compile('adv[0-9]+')}),
        dict(name='div', attrs={'class': 'art_info'}),
        dict(name='div', attrs={'id': re.compile('grafikoverlay_.*')}),
        dict(name='a', attrs={'class': 'zoom'}),
    ]

    extra_css = '''
                   body {font-family:verdana,arial,helvetica,geneva,sans-serif ;}
                   h1 {text-align: left;}
                   span {margin-left: 0.1em; margin-right: 0.1em;}
                   span.update {font-size: x-small; color: #666666}
                   span.update strong {font-weight: normal;}
                   p.intro {font-size: large;}
                   div.art_foto_big, div.art_foto {font-size: xx-small; color: #696969; margin-bottom: 0.5em;}
                   div.art_foto_big span.src {float: right;}
                '''