diff --git a/resources/recipes/smh.recipe b/resources/recipes/smh.recipe
index 0ea953170d..1a6f35a272 100644
--- a/resources/recipes/smh.recipe
+++ b/resources/recipes/smh.recipe
@@ -6,51 +6,86 @@ __docformat__ = 'restructuredtext en'
 '''
 smh.com.au
 '''
-from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
 
 class SMH(BasicNewsRecipe):
 
     title = 'Sydney Morning Herald'
     description = 'Business News, World News and Breaking News in Australia'
-    __author__ = 'Kovid Goyal'
+    __author__ = 'Kovid Goyal and Sujata Raman'
     language = 'en_AU'
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    no_javascript = True
+
+    timefmt = ' [%A, %d %B, %Y]'
+    encoding = 'utf-8'
+
+    keep_only_tags = [dict(name='div', attrs ={'id':'content'})]
+    remove_tags = [
+        dict(name='div', attrs={'align' :'right'}),
+        dict(name='p', attrs={'class' :'comments'}),
+        dict(name='a', attrs={'class' :['more-photos','performerpromo']}),
+        dict(name='img', attrs={'alt' :'aap'}),
+        dict(name='div', attrs ={'id':['googleAds','moreGoogleAds','comments','footer','sidebar','austereopuff','adSpotIsland']}),
+        dict(name='div', attrs ={'class':['article-links','wof','articleTools top','cN-multimediaGroup cfix','articleTools bottom']}),
+        dict(name='div', attrs ={'class':['clear','adSpot-textboxgr1','adSpot-textBox','articleTools-c3 cfix','articleExtras-bottom','span-16 last']}),
+        dict(name='div', attrs ={'class':[ 'sidebar span-5','cT-socialCommenting','cN-linkList','cN-topicSelector','cT-storyTools cfix','cT-imageMultimedia']}) ,
+        dict(name='iframe'),
+    ]
+
+    extra_css = '''
+                h1{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large;}
+                .cT-storyDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;}
+                .articleBody{font-family:Arial,Helvetica,sans-serif; color:black;font-size:small;}
+                .cT-imageLandscape{font-family:Arial,Helvetica,sans-serif; color:#333333 ;font-size:x-small;}
+                .source{font-family:Arial,Helvetica,sans-serif; color:#333333 ;font-size:xx-small;}
+                #content{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
+                .pageprint{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+                #bylineDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;}
+                .featurePic-wide{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
+                #idfeaturepic{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
+                h3{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
+                h2{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
+                h4{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
+                h5{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
+                body{font-family:Arial,Helvetica,sans-serif; font-size:x-small;}
+                '''
+
+
+    feeds = [
+        ('Top Stories', 'http://feeds.smh.com.au/rssheadlines/top.xml'),
+        ('National', 'http://feeds.smh.com.au/rssheadlines/national.xml'),
+        ('World', 'http://feeds.smh.com.au/rssheadlines/world.xml'),
+        ('Business', 'http://www.smh.com.au/rssheadlines/business.xml'),
+        ('National Times', 'http://www.smh.com.au/rssheadlines/opinion/article/rss.xml'),
+        ('Entertainment', 'http://feeds.smh.com.au/rssheadlines/entertainment.xml'),
+        ('Technology', 'http://feeds.smh.com.au/rssheadlines/technology.xml'),
+        ('Sport', 'http://feeds.smh.com.au/rssheadlines/sport.xml'),
+    ]
+
+    def preprocess_html(self, soup):
+        bod = soup.find('bod')
+        if bod is not None:
+            bod.tag = 'div'
+            p = soup.find(id='content')
+            bod.extract()
+            p.insert(len(p), bod)
+        return soup
 
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         br.set_handle_refresh(False)
         return br
 
-    def parse_index(self):
-
-        soup = BeautifulSoup(self.browser.open('http://www.smh.com.au/text/').read())
-
-        feeds, articles = [], []
-        feed = None
+    def get_article_url(self, article):
+        url = article.link
+        if 'media' in url:
+            url = ''
+        return url
 
 
-        for tag in soup.findAll(['h3', 'a']):
-            if tag.name == 'h3':
-                if articles:
-                    feeds.append((feed, articles))
-                    articles = []
-                feed = self.tag_to_string(tag)
-            elif feed is not None and tag.has_key('href') and tag['href'].strip():
-                url = tag['href'].strip()
-                if url.startswith('/'):
-                    url = 'http://www.smh.com.au' + url
-                title = self.tag_to_string(tag)
-                articles.append({
-                    'title': title,
-                    'url' : url,
-                    'date' : strftime('%a, %d %b'),
-                    'description' : '',
-                    'content' : '',
-                })
-
-        return feeds
-
 