from calibre.web.feeds.news import BasicNewsRecipe


class TheNewsRecipe(BasicNewsRecipe):
    """Recipe that fetches the RSS feeds of 'The News' (thenews.com.pk).

    Article URLs are rewritten to the site's print pages and the fetched
    HTML is lightly cleaned up before conversion.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en_PK'
    version = 1

    # Publication metadata; also reused in conversion_options below.
    title = u'The News'
    publisher = u'Jang Group'
    category = u'News, Pakistan'
    description = u'English Newspaper from Pakistan'

    # Fetch settings.
    use_embedded_content = False
    remove_empty_feeds = True
    oldest_article = 2
    max_articles_per_feed = 100

    # Clean-up settings.
    no_stylesheets = True
    remove_javascript = True
    encoding = 'iso-8859-1'

    # Images stripped from every article, matched by their src attribute.
    remove_tags = [
        dict(name='img', attrs={'src': 'images/thenews.gif'}),
        dict(name='img', attrs={'src': 'images/shim.gif'}),
    ]

    # Feeds from http://thenews.com.pk/rss.asp
    feeds = [
        (u'Latest Stories',
         u'http://www.thenews.com.pk/rss/thenews_updates.xml'),
        (u'Top Stories',
         u'http://www.thenews.com.pk/rss/thenews_topstories.xml'),
        (u'World News',
         u'http://www.thenews.com.pk/rss/thenews_world.xml'),
        (u'National News',
         u'http://www.thenews.com.pk/rss/thenews_national.xml'),
        (u'Business News',
         u'http://www.thenews.com.pk/rss/thenews_business.xml'),
        (u'Karachi News',
         u'http://www.thenews.com.pk/rss/thenews_karachi.xml'),
        (u'Lahore News',
         u'http://www.thenews.com.pk/rss/thenews_lahore.xml'),
        (u'Islamabad News',
         u'http://www.thenews.com.pk/rss/thenews_islamabad.xml'),
        (u'Peshawar News',
         u'http://www.thenews.com.pk/rss/thenews_peshawar.xml'),
        (u'Editorial',
         u'http://www.thenews.com.pk/rss/thenews_editorial.xml'),
        (u'Opinion',
         u'http://www.thenews.com.pk/rss/thenews_opinion.xml'),
        (u'Sports News',
         u'http://www.thenews.com.pk/rss/thenews_sports.xml'),
        (u'Newspost',
         u'http://www.thenews.com.pk/rss/thenews_newspost.xml'),
    ]

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': 'en',
        'publisher': publisher,
        'linearize_tables': True,
    }

    # Built from adjacent literals so the stylesheet text stays exactly as
    # it was, including the leading blank line and the trailing space.
    extra_css = (
        ' \n'
        'body{font-family:verdana,arial,helvetica,geneva,sans-serif;} '
        '.heading_txt {font-size: x-large; font-weight: bold; text-align: left;} '
        '.small_txt {text-align: left;} '
        '.dateline {font-size: x-small; color: #696969; '
        'margin-top: 1em; margin-bottom: 1em} '
    )

    def print_version(self, url):
        """Return the print-page URL for an article URL, or None.

        The text after the last '/' in *url* decides which print page is
        used. URLs whose last segment starts with none of the known page
        names yield None.
        """
        # (article page, print page) pairs, checked in this order.
        page_map = (
            ('updates.asp', 'print.asp'),
            ('top_story_detail.asp', 'print3.asp'),
            ('daily_detail.asp', 'print1.asp'),
        )
        last_segment = url.rpartition('/')[2]
        for article_page, print_page in page_map:
            if last_segment.startswith(article_page):
                # The replacement is applied to the whole URL, not only to
                # the last segment.
                return url.replace(article_page, print_page)
        return None

    def preprocess_html(self, soup):
        """Remove row background colours and restyle one table cell.

        Every <tr> carrying a bgcolor attribute loses it. The first <td>
        with class 'small_txt' and height '20' loses its height attribute
        and has its class changed to 'dateline', which extra_css styles.
        The same soup object is returned.
        """
        for row in soup.findAll('tr', attrs={'bgcolor': True}):
            del row['bgcolor']

        cell = soup.find('td', attrs={'class': 'small_txt', 'height': '20'})
        if not cell:
            return soup

        del cell['height']
        cell['class'] = 'dateline'
        return soup