From 4b66da2fe628d5dcf58a3723ba2ea23a79c8b404 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 30 Dec 2009 22:56:07 -0700
Subject: [PATCH] New recipe for India Today and improved recipe for The Independent

---
Notes (ignored by git am; not part of the commit message):
 * independent.recipe: the recipe class becomes TheIndependent, the feed
   list gains People, Health and Families and Obituaries, and the old
   keep_only_tags filter is replaced by preprocess_html(), which moves the
   div with id "mainColumn" into a fresh, otherwise empty document.
   The language code is en_GB (the value the replaced recipe used) and the
   stylesheet switch is spelled no_stylesheets, matching the India Today
   recipe below.
 * india_today.recipe: new recipe. it_get_index() follows the first link on
   the archive page whose href contains "issueId=" and takes the cover URL
   from the img inside that link; parse_index() turns h1 tags into sections
   and links whose href contains "Story" into articles.
 * news.py: when `cu` is a readable local path (os.access) it is copied to
   cover.<ext> directly; otherwise it is fetched through self.browser as
   before. Both branches close their file handles via nested().

 resources/recipes/independent.recipe | 69 +++++++++++++++++--------
 resources/recipes/india_today.recipe | 76 ++++++++++++++++++++++++++++
 src/calibre/web/feeds/news.py        | 10 ++--
 3 files changed, 132 insertions(+), 23 deletions(-)
 create mode 100644 resources/recipes/india_today.recipe

diff --git a/resources/recipes/independent.recipe b/resources/recipes/independent.recipe
index 4d677b58a6..e9e15e2ba9 100644
--- a/resources/recipes/independent.recipe
+++ b/resources/recipes/independent.recipe
@@ -1,26 +1,55 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
-class Independent(BasicNewsRecipe):
+class TheIndependent(BasicNewsRecipe):
     title = u'The Independent'
-    oldest_article = 1
-    language = 'en_GB'
-    __author__ = 'Jimmy Patrick'
-    max_articles_per_feed = 100
+    language = 'en_GB'
+    __author__ = 'Krittika Goyal'
+    oldest_article = 1 #days
+    max_articles_per_feed = 25
+    encoding = 'latin1'
 
-    feeds = [(u'UK', u'http://www.independent.co.uk/news/uk/rss'),
-             (u'World', u'http://www.independent.co.uk/news/world/rss'),
-             (u'Sport', u'http://www.independent.co.uk/sport/rss'),
-             (u'Arts & Entertainment', u'http://www.independent.co.uk/arts-entertainment/rss'),
-             (u'Life & Style',u'http://www.independent.co.uk/life-style/fashion/news/rss'),
-             (u'Business',u'http://www.independent.co.uk/news/business/rss'),
-             (u'Science',u'http://www.independent.co.uk/news/science/rss'),
-             (u'Media',u'http://www.independent.co.uk/news/media/rss')
-             ]
+    no_stylesheets = True
+    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
+    #remove_tags_after = dict(name='td', attrs={'class':'newptool1'})
+    remove_tags = [
+        dict(name='iframe'),
+        dict(name='div', attrs={'class':'related-articles'}),
+        dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}),
+        dict(name='ul', attrs={'class':'article-tools'}),
+        dict(name='ul', attrs={'class':'articleTools'}),
+    ]
 
-    keep_only_tags = [dict(id=['article'])]
-    remove_tags = [dict(name='div', attrs={'class':'share-links'}),
-                   dict(name='ul', attrs={'class':'article-tools'}),
-                   dict(name='div', attrs={'class':'related-articles'})
-                   ]
+    feeds = [
+('UK',
+ 'http://www.independent.co.uk/news/uk/rss'),
+('World',
+ 'http://www.independent.co.uk/news/world/rss'),
+('Sport',
+ 'http://www.independent.co.uk/sport/rss'),
+('Arts and Entertainment',
+ 'http://www.independent.co.uk/arts-entertainment/rss'),
+('Business',
+ 'http://www.independent.co.uk/news/business/rss'),
+('Life and Style',
+ 'http://www.independent.co.uk/life-style/gadgets-and-tech/news/rss'),
+('Science',
+ 'http://www.independent.co.uk/news/science/rss'),
+('People',
+ 'http://www.independent.co.uk/news/people/rss'),
+('Media',
+ 'http://www.independent.co.uk/news/media/rss'),
+('Health and Families',
+ 'http://www.independent.co.uk/life-style/health-and-families/rss'),
+('Obituaries',
+ 'http://www.independent.co.uk/news/obituaries/rss'),
+]
 
-    extra_css = "body{color:black;}"
+    def preprocess_html(self, soup):
+        story = soup.find(name='div', attrs={'id':'mainColumn'})
+        #td = heading.findParent(name='td')
+        #td.extract()
+        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
+        body = soup.find(name='body')
+        body.insert(0, story)
+        return soup
diff --git a/resources/recipes/india_today.recipe b/resources/recipes/india_today.recipe
new file mode 100644
index 0000000000..604a7f57ad
--- /dev/null
+++ b/resources/recipes/india_today.recipe
@@ -0,0 +1,76 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class IndiaToday(BasicNewsRecipe):
+
+    title = 'India Today'
+    __author__ = 'Kovid Goyal'
+    language = 'en_IN'
+    timefmt = ' [%d %m, %Y]'
+
+    oldest_article = 700
+    max_articles_per_feed = 10
+
+    no_stylesheets = True
+
+    remove_tags_before = dict(id='content_story_title')
+    remove_tags_after = dict(id='rightblockdiv')
+    remove_tags = [dict(id=['rightblockdiv', 'share_links'])]
+
+    extra_css = '#content_story_title { font-size: 170%; font-weight: bold;}'
+    conversion_options = { 'linearize_tables': True }
+
+    def it_get_index(self):
+        soup = self.index_to_soup('http://indiatoday.intoday.in/site/archive')
+        a = soup.find('a', href=lambda x: x and 'issueId=' in x)
+        url = 'http://indiatoday.intoday.in/site/'+a.get('href')
+        img = a.find('img')
+        self.cover_url = img.get('src')
+        return self.index_to_soup(url)
+
+    def parse_index(self):
+        soup = self.it_get_index()
+        feeds, current_section, current_articles = [], None, []
+        for x in soup.findAll(name=['h1', 'a']):
+            if x.name == 'h1':
+                if current_section and current_articles:
+                    feeds.append((current_section, current_articles))
+                current_section = self.tag_to_string(x)
+                current_articles = []
+                self.log('\tFound section:', current_section)
+            elif x.name == 'a' and 'Story' in x.get('href', ''):
+                title = self.tag_to_string(x)
+                url = x.get('href')
+                url = url.replace(' ', '%20')
+                if not url.startswith('/'):
+                    url = 'http://indiatoday.intoday.in/site/' + url
+                if title and url:
+                    url += '?complete=1'
+                    self.log('\tFound article:', title)
+                    self.log('\t\t', url)
+                    desc = ''
+                    h3 = x.parent.findNextSibling('h3')
+                    if h3 is not None:
+                        desc = 'By ' + self.tag_to_string(h3)
+                        h4 = h3.findNextSibling('h4')
+                        if h4 is not None:
+                            desc = self.tag_to_string(h4) + ' ' + desc
+                    if desc:
+                        self.log('\t\t', desc)
+                    current_articles.append({'title':title, 'description':desc,
+                        'url':url, 'date':''})
+
+        if current_section and current_articles:
+            feeds.append((current_section, current_articles))
+
+        return feeds
+
+    def postprocess_html(self, soup, first):
+        a = soup.find(text='Print')
+        if a is not None:
+            tr = a.findParent('tr')
+            if tr is not None:
+                tr.extract()
+        return soup
+
+
+
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 4ade7fa73c..2778c1c5e9 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -819,10 +819,14 @@ class BasicNewsRecipe(Recipe):
             if '?' in ext:
                 ext = ''
             ext = ext.lower() if ext else 'jpg'
-            self.report_progress(1, _('Downloading cover from %s')%cu)
             cpath = os.path.join(self.output_dir, 'cover.'+ext)
-            with nested(open(cpath, 'wb'), closing(self.browser.open(cu))) as (cfile, r):
-                cfile.write(r.read())
+            if os.access(cu, os.R_OK):
+                with nested(open(cpath, 'wb'), open(cu, 'rb')) as (cfile, src):
+                    cfile.write(src.read())
+            else:
+                self.report_progress(1, _('Downloading cover from %s')%cu)
+                with nested(open(cpath, 'wb'), closing(self.browser.open(cu))) as (cfile, r):
+                    cfile.write(r.read())
             if ext.lower() == 'pdf':
                 from calibre.ebook.metadata.pdf import get_metadata
                 stream = open(cpath, 'rb')