diff --git a/resources/recipes/chowk.recipe b/resources/recipes/chowk.recipe
new file mode 100644
index 0000000000..3c8ea48607
--- /dev/null
+++ b/resources/recipes/chowk.recipe
@@ -0,0 +1,52 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ChowkRecipe(BasicNewsRecipe):
+    """Recipe for Chowk (chowk.com), read from its RSS feed."""
+
+    __license__ = 'GPL v3'
+    __author__ = 'kwetal'
+    language = 'en_IN'
+    version = 1
+
+    title = u'Chowk'
+    publisher = u'chowk.com'
+    category = u'Opinion, South Asia'
+    description = u'Ideas & Identities of South Asia'
+
+    use_embedded_content = False
+    remove_empty_feeds = True
+    oldest_article = 30
+    max_articles_per_feed = 100
+
+    #no_stylesheets = True
+    remove_javascript = True
+    encoding = 'utf-8'
+
+    feeds = [('Chowk Articles', 'http://www.chowk.com/rss')]
+
+    # Only the div with id 'content' is kept from each downloaded page.
+    keep_only_tags = [dict(name = 'div', attrs = {'id': 'content'})]
+
+    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
+                          'publisher': publisher}
+
+    extra_css = '''
+        body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+        a {text-decoration: none; color: blue;}
+        div.pgtitle {font-size: x-large; font-weight: bold;}
+        div.wname, div.date {font-size: x-small; color: #696969;}
+        div.wname {margin-top: 1em;}
+        div.date {margin-bottom: 1em;}
+        div.title {font-weight: bold;}
+        '''
+
+    def print_version(self, url):
+        """Return the address of the print page for the article at url.
+
+        '/print/' is inserted in front of the last path segment, so
+        '<base>/<name>' becomes '<base>/print/<name>'.
+        """
+        # Not called 'id': that would shadow the builtin of the same name.
+        main, sep, article_id = url.rpartition('/')
+
+        return main + '/print/' + article_id
diff --git a/resources/recipes/dawn.recipe b/resources/recipes/dawn.recipe
new file mode 100644
index 0000000000..3d4ab42a07
--- /dev/null
+++ b/resources/recipes/dawn.recipe
@@ -0,0 +1,101 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag
+
+class DawnRecipe(BasicNewsRecipe):
+    """Recipe for Dawn, read from the RSS feeds listed in feeds."""
+
+    __license__ = 'GPL v3'
+    __author__ = 'kwetal'
+    language = 'en_PK'
+    version = 1
+
+    title = u'Dawn'
+    publisher = u'Dawn Media Group'
+    category = u'News, Pakistan'
+    description = u'Leading English Newspaper of Pakistan covering national & international news'
+
+    use_embedded_content = False
+    remove_empty_feeds = True
+    oldest_article = 2
+    max_articles_per_feed = 100
+
+    no_stylesheets = True
+    remove_javascript = True
+    encoding = 'utf-8'
+
+    # Feeds from http://www.dawn.com/wps/wcm/connect/dawn-content-library/dawn/services/rss
+    feeds = [
+        (u'Latest News', u'http://feedproxy.google.com/Dawn-All-News'),
+        (u'Pakistan News', u'http://feeds2.feedburner.com/dawn/news/pakistan'),
+        (u'World News', u'http://feeds2.feedburner.com/dawn/news/world'),
+        (u'Business News', u'http://feeds2.feedburner.com/dawn/news/business'),
+        (u'Sport News', u'http://feeds2.feedburner.com/dawn/news/sport'),
+        (u'Cricket News', u'http://feeds2.feedburner.com/dawn/news/cricket'),
+        (u'Sci-tech News', u'http://feeds2.feedburner.com/dawn/news/technology'),
+        (u'Entertainment News', u'http://feeds2.feedburner.com/dawn/news/entertainment'),
+        (u'Columnists', u'http://feeds2.feedburner.com/dawn/news/columnists'),
+    ]
+
+    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
+                          'publisher': publisher}
+
+    extra_css = '''
+        body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+        center {font-size: xx-small; color: #666666;}
+        strong {font-size: small; font-weight: bold;}
+        span.news_headline {font-size: xx-large; font-weight: bold; margin: 0em; padding: 0em}
+        span.news_byline {font-size: x-small; color: #696969; margin-top: 1em;}
+        '''
+
+    def print_version(self, url):
+        """Return url with the query string that selects the print page."""
+        return url + '?pagedesign=Dawn_PrintlyFriendlyPage'
+
+    def preprocess_html(self, soup):
+        """Replace the page body with just the title spans and the story.
+
+        The 'page_title', 'news_headline' and 'news_byline' spans are moved
+        into a new body, followed by every 'news_story' cell of the table
+        with id 'body table' (each turned into a div). Returns soup.
+        """
+        newBody = Tag(soup, 'body')
+
+        for cl in ['page_title', 'news_headline', 'news_byline']:
+            tag = soup.find('span', attrs = {'class': cl})
+            if tag:
+                # They like their <br> tags; I don't: does not work well on small screens.
+                if cl == 'news_byline':
+                    for br in tag.findAll('br'):
+                        br.extract()
+
+                newBody.append(tag)
+
+        table = soup.find('table', attrs = {'id': 'body table'})
+        if table:
+            for td in table.findAll('td', attrs = {'class': 'news_story'}):
+                for tag in td.findAll(True):
+                    # Scripts are tested together with the banner so that a
+                    # script carrying a style attribute is removed as well.
+                    if tag.get('id') == 'banner-img_slide' or tag.name == 'script':
+                        tag.extract()
+                    elif tag.get('style') is not None:
+                        del tag['style']
+
+                # They like their <br> tags; I don't: does not work well on small screens.
+                center = td.find('center')
+                if center:
+                    for br in center.findNextSiblings('br'):
+                        br.extract()
+                    for br in center.findPreviousSiblings('br'):
+                        br.extract()
+
+                for attr in ['align', 'valign']:
+                    if td.get(attr) is not None:
+                        del td[attr]
+
+                td.name = 'div'
+                newBody.append(td)
+
+        soup.body.replaceWith(newBody)
+
+        return soup
diff --git a/resources/recipes/thenews.recipe b/resources/recipes/thenews.recipe
new file mode 100644
index 0000000000..7137b4538b
--- /dev/null
+++ b/resources/recipes/thenews.recipe
@@ -0,0 +1,89 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TheNewsRecipe(BasicNewsRecipe):
+    """Recipe for The News, read from the RSS feeds listed in feeds."""
+
+    __license__ = 'GPL v3'
+    __author__ = 'kwetal'
+    language = 'en_PK'
+    version = 1
+
+    title = u'The News'
+    publisher = u'Jang Group'
+    category = u'News, Pakistan'
+    description = u'English Newspaper from Pakistan'
+
+    use_embedded_content = False
+    remove_empty_feeds = True
+    oldest_article = 2
+    max_articles_per_feed = 100
+
+    no_stylesheets = True
+    remove_javascript = True
+    encoding = 'iso-8859-1'
+
+    remove_tags = [
+        dict(name = 'img', attrs = {'src': 'images/thenews.gif'}),
+        dict(name = 'img', attrs = {'src': 'images/shim.gif'}),
+    ]
+
+    # Feeds from http://thenews.com.pk/rss.asp
+    feeds = [
+        (u'Latest Stories', u'http://www.thenews.com.pk/rss/thenews_updates.xml'),
+        (u'Top Stories', u'http://www.thenews.com.pk/rss/thenews_topstories.xml'),
+        (u'World News', u'http://www.thenews.com.pk/rss/thenews_world.xml'),
+        (u'National News', u'http://www.thenews.com.pk/rss/thenews_national.xml'),
+        (u'Business News', u'http://www.thenews.com.pk/rss/thenews_business.xml'),
+        (u'Karachi News', u'http://www.thenews.com.pk/rss/thenews_karachi.xml'),
+        (u'Lahore News', u'http://www.thenews.com.pk/rss/thenews_lahore.xml'),
+        (u'Islamabad News', u'http://www.thenews.com.pk/rss/thenews_islamabad.xml'),
+        (u'Peshawar News', u'http://www.thenews.com.pk/rss/thenews_peshawar.xml'),
+        (u'Editorial', u'http://www.thenews.com.pk/rss/thenews_editorial.xml'),
+        (u'Opinion', u'http://www.thenews.com.pk/rss/thenews_opinion.xml'),
+        (u'Sports News', u'http://www.thenews.com.pk/rss/thenews_sports.xml'),
+        (u'Newspost', u'http://www.thenews.com.pk/rss/thenews_newspost.xml'),
+    ]
+
+    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
+                          'publisher': publisher, 'linearize_tables': True}
+
+    extra_css = '''
+        body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+        .heading_txt {font-size: x-large; font-weight: bold; text-align: left;}
+        .small_txt {text-align: left;}
+        .dateline {font-size: x-small; color: #696969; margin-top: 1em; margin-bottom: 1em}
+        '''
+
+    def print_version(self, url):
+        """Return the print-page address for url, or None if there is none.
+
+        Only pages named 'updates.asp', 'top_story_detail.asp' and
+        'daily_detail.asp' are mapped to a print page.
+        """
+        base, sep, main = url.rpartition('/')
+
+        for page, print_page in (('updates.asp', 'print.asp'),
+                                 ('top_story_detail.asp', 'print3.asp'),
+                                 ('daily_detail.asp', 'print1.asp')):
+            if main.startswith(page):
+                # Swap only the leading page name; url.replace() would also
+                # rewrite any later occurrence, e.g. inside the query string.
+                return base + sep + print_page + main[len(page):]
+
+        return None
+
+    def preprocess_html(self, soup):
+        """Strip row background colours and restyle the dateline cell.
+
+        The first 'small_txt' cell with height 20 gets the class 'dateline',
+        which extra_css styles. Returns soup.
+        """
+        for tr in soup.findAll('tr', attrs = {'bgcolor': True}):
+            del tr['bgcolor']
+
+        td = soup.find('td', attrs = {'class': 'small_txt', 'height': '20'})
+        if td:
+            del td['height']
+            td['class'] = 'dateline'
+
+        return soup
diff --git a/src/calibre/utils/localization.py b/src/calibre/utils/localization.py
index 64dadb89e4..14b7b32c8c 100644
--- a/src/calibre/utils/localization.py
+++ b/src/calibre/utils/localization.py
@@ -101,6 +101,7 @@ _extra_lang_codes = {
         'en_IN' : _('English (IND)'),
         'en_TH' : _('English (TH)'),
         'en_CY' : _('English (CY)'),
+        'en_PK' : _('English (PK)'),
         'de_AT' : _('German (AT)'),
         'nl' : _('Dutch (NL)'),
         'nl_BE' : _('Dutch (BE)'),