from calibre.web.feeds.news import BasicNewsRecipe
from datetime import datetime, timedelta


class CyNewsLiveRecipe(BasicNewsRecipe):
    """Recipe for the Cyprus Weekly that scrapes section listing pages.

    The index is built in ``parse_index`` by fetching every page listed in
    ``feeds`` and collecting the entries found in its article table, rather
    than by reading RSS feeds.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en_CY'
    version = 1

    title = u'Cyprus Weekly'
    publisher = u'The Cyprus Weekly'
    category = u'News, Newspaper'
    description = u'News from Cyprus'

    use_embedded_content = False
    remove_empty_feeds = True
    # Interpreted as a number of days when computing ``minTime``.
    oldest_article = 7
    # Upper bound on the articles collected per section by ``find_articles``.
    max_articles_per_feed = 100

    no_stylesheets = True
    remove_javascript = True

    # Scratch state shared between parse_index() and find_articles():
    # timestamp of the entry being processed, oldest timestamp still
    # accepted, and the number of articles collected for the current section.
    pubTime = None
    minTime = None
    articleCount = 0

    INDEX = 'http://www.cyprusweekly.com.cy/main/default.aspx'

    # Month names as they appear in listing dates, mapped to month numbers.
    _MONTH_NUMBERS = {
        'January': 1, 'February': 2, 'March': 3, 'April': 4,
        'May': 5, 'June': 6, 'July': 7, 'August': 8,
        'September': 9, 'October': 10, 'November': 11, 'December': 12,
    }

    # (section title, listing page URL) pairs scraped by parse_index().
    feeds = [
        ('News: Cyprus', 'http://www.cyprusweekly.com.cy/main/92,0,0,0-CYPRUS.aspx'),
        ('News: World', 'http://www.cyprusweekly.com.cy/main/78,0,0,0-UKWORLD.aspx'),
        ('Sport: Football', 'http://www.cyprusweekly.com.cy/main/82,0,0,0-FOOTBALL.aspx'),
        ('Sport: Rugby', 'http://www.cyprusweekly.com.cy/main/83,0,0,0-RUGBY.aspx'),
        ('Sport: Cricket', 'http://www.cyprusweekly.com.cy/main/85,0,0,0-CRICKET.aspx'),
        ('Sport: Tennis', 'http://www.cyprusweekly.com.cy/main/84,0,0,0-TENNIS.aspx'),
        ('Sport: Other', 'http://www.cyprusweekly.com.cy/main/86,0,0,0-OTHER.aspx'),
        ('Business: Local', 'http://www.cyprusweekly.com.cy/main/100,0,0,0-LOCAL.aspx'),
        ('Business: Foreign', 'http://www.cyprusweekly.com.cy/main/101,0,0,0-FOREIGN.aspx'),
        ('Whats On: Places of Interest',
         'http://www.cyprusweekly.com.cy/main/123,0,0,0-PLACES-OF-INTEREST.aspx'),
        ('Whats On: Going Out', 'http://www.cyprusweekly.com.cy/main/153,0,0,0-GOING-OUT.aspx'),
        ('Whats On: Arts & Entertainment',
         'http://www.cyprusweekly.com.cy/main/135,0,0,0-ARTS--and-ENTERTAINMENT.aspx'),
        ('Whats On: Things To Do',
         'http://www.cyprusweekly.com.cy/main/136,0,0,0-THINGS-TO-DO.aspx'),
        ('Whats On: Shopping Guide',
         'http://www.cyprusweekly.com.cy/main/142,0,0,0-SHOPPING-GUIDE.aspx'),
        ('Culture', 'http://www.cyprusweekly.com.cy/main/208,0,0,0-CULTURE.aspx'),
        ('Environment', 'http://www.cyprusweekly.com.cy/main/93,0,0,0-ENVIRONMENT.aspx'),
        ('Info', 'http://www.cyprusweekly.com.cy/main/91,0,0,0-INFO.aspx'),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class': 'ArticleCategories'}),
    ]

    extra_css = ''' body{font-family:verdana,arial,helvetica,geneva,sans-serif ;} '''

    def parse_index(self):
        """Build the index by scraping every section page in ``feeds``.

        Returns:
            A list of ``(section title, articles)`` tuples, one per entry in
            ``feeds`` and in the same order. ``articles`` is the list filled
            by ``find_articles`` and stays empty when the listing table is
            not present on the page.
        """
        answer = []
        for section_title, section_url in self.feeds:
            self.articleCount = 0
            articles = []
            soup = self.index_to_soup(section_url)

            table = soup.find('table', attrs={'id': 'ctl00_cp_ctl01_listp'})
            if table is not None:
                # Entries without a usable date inherit the most recent
                # timestamp seen, starting from "now" for each section.
                self.pubTime = datetime.now()
                self.minTime = self.pubTime - timedelta(days=self.oldest_article)
                self.find_articles(table, articles)

            answer.append((section_title, articles))

        return answer

    def postprocess_html(self, soup, first):
        """Strip inline styling and neutralise ``<font>`` tags.

        Every ``style`` attribute is removed, and each ``<font>`` element is
        renamed to ``<div>`` with all of its attributes dropped.

        Args:
            soup: Parsed document to clean; it is modified in place.
            first: Unused by this implementation.

        Returns:
            The same ``soup`` object.
        """
        for el in soup.findAll(attrs={'style': True}):
            del el['style']

        for el in soup.findAll('font'):
            el.name = 'div'
            # Iterating the tag itself walks its children, not its
            # attributes, so take a snapshot of the attribute names instead.
            # dict() accepts both a mapping and a list of (name, value) pairs.
            for attr in list(dict(el.attrs)):
                del el[attr]

        return soup

    def _parse_pub_time(self, text):
        """Parse a listing date such as ``'12 March 2010 14:30'``.

        Args:
            text: Date string laid out as day, English month name, year and
                ``HH:MM`` time, separated by whitespace.

        Returns:
            The corresponding naive ``datetime``, or ``None`` when ``text``
            does not follow that layout.
        """
        parts = text.split()
        try:
            hour, minute = parts[3].split(':')[:2]
            return datetime(year=int(parts[2]),
                            month=self._MONTH_NUMBERS[parts[1]],
                            day=int(parts[0]),
                            hour=int(hour),
                            minute=int(minute))
        except (IndexError, KeyError, ValueError):
            return None

    def find_articles(self, table, articles):
        """Append the recent articles listed in ``table`` to ``articles``.

        Scanning stops at the first entry older than ``self.minTime`` or as
        soon as ``max_articles_per_feed`` articles have been collected.

        Args:
            table: Listing table element containing ``div.ListArticle``
                entries.
            articles: List extended in place with dicts holding the keys
                ``title``, ``date``, ``url`` and ``description``.
        """
        for div in table.findAll('div', attrs={'class': 'ListArticle'}):
            title_div = div.find('div', attrs={'class': 'ListArticle_T'})
            if title_div is None or title_div.a is None:
                # Entry without a linked title: nothing to download.
                continue

            link = title_div.a
            title = self.tag_to_string(link)
            # NOTE(review): plain concatenation assumes the href is meant to
            # be appended to the default.aspx address - confirm against the
            # site's markup.
            url = self.INDEX + link['href']

            description = self.tag_to_string(
                div.find('div', attrs={'class': 'ListArticle_BODY300'}))

            date_div = div.find('div', attrs={'class': 'ListArticle_D'})
            if date_div is not None:
                parsed = self._parse_pub_time(self.tag_to_string(date_div))
                if parsed is not None:
                    self.pubTime = parsed
                # Otherwise the previous timestamp is kept, exactly as when
                # the date element is missing altogether.

            # The first stale entry ends the scan; presumably the listing is
            # ordered newest first - TODO confirm.
            if (self.pubTime < self.minTime
                    or self.articleCount >= self.max_articles_per_feed):
                return

            articles.append({'title': title,
                             'date': self.pubTime,
                             'url': url,
                             'description': description})
            self.articleCount += 1