From 8d3c9d93b348a326c965a27f792d02020616492e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 17 Apr 2012 20:58:20 +0530 Subject: [PATCH] Fix Calgary Herald --- recipes/calgary_herald.recipe | 255 +++++----------------------------- 1 file changed, 35 insertions(+), 220 deletions(-) diff --git a/recipes/calgary_herald.recipe b/recipes/calgary_herald.recipe index dc919a76f8..12134bc9a4 100644 --- a/recipes/calgary_herald.recipe +++ b/recipes/calgary_herald.recipe @@ -1,220 +1,35 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -__license__ = 'GPL v3' - -''' -www.canada.com -''' - -import re -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup - - -class CanWestPaper(BasicNewsRecipe): - - # un-comment the following four lines for the Victoria Times Colonist -## title = u'Victoria Times Colonist' -## url_prefix = 'http://www.timescolonist.com' -## description = u'News from Victoria, BC' -## fp_tag = 'CAN_TC' - - # un-comment the following four lines for the Vancouver Province -## title = u'Vancouver Province' -## url_prefix = 'http://www.theprovince.com' -## description = u'News from Vancouver, BC' -## fp_tag = 'CAN_VP' - - # un-comment the following four lines for the Vancouver Sun -## title = u'Vancouver Sun' -## url_prefix = 'http://www.vancouversun.com' -## description = u'News from Vancouver, BC' -## fp_tag = 'CAN_VS' - - # un-comment the following four lines for the Edmonton Journal -## title = u'Edmonton Journal' -## url_prefix = 'http://www.edmontonjournal.com' -## description = u'News from Edmonton, AB' -## fp_tag = 'CAN_EJ' - - # un-comment the following four lines for the Calgary Herald - title = u'Calgary Herald' - url_prefix = 'http://www.calgaryherald.com' - description = u'News from Calgary, AB' - fp_tag = 'CAN_CH' - - # un-comment the following four lines for the Regina Leader-Post -## title = u'Regina Leader-Post' -## url_prefix = 'http://www.leaderpost.com' -## description = u'News 
from Regina, SK' -## fp_tag = '' - - # un-comment the following four lines for the Saskatoon Star-Phoenix -## title = u'Saskatoon Star-Phoenix' -## url_prefix = 'http://www.thestarphoenix.com' -## description = u'News from Saskatoon, SK' -## fp_tag = '' - - # un-comment the following four lines for the Windsor Star -## title = u'Windsor Star' -## url_prefix = 'http://www.windsorstar.com' -## description = u'News from Windsor, ON' -## fp_tag = 'CAN_' - - # un-comment the following four lines for the Ottawa Citizen -## title = u'Ottawa Citizen' -## url_prefix = 'http://www.ottawacitizen.com' -## description = u'News from Ottawa, ON' -## fp_tag = 'CAN_OC' - - # un-comment the following four lines for the Montreal Gazette -## title = u'Montreal Gazette' -## url_prefix = 'http://www.montrealgazette.com' -## description = u'News from Montreal, QC' -## fp_tag = 'CAN_MG' - - - language = 'en_CA' - __author__ = 'Nick Redding' - no_stylesheets = True - timefmt = ' [%b %d]' - extra_css = ''' - .timestamp { font-size:xx-small; display: block; } - #storyheader { font-size: medium; } - #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } - .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' - keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] - remove_tags = [{'class':'comments'}, - dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), - dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), - dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), - dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), - dict(name='div', attrs={'class':'rule_grey_solid'}), - dict(name='li', attrs={'class':'print'}),dict(name='li', 
attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] - - def get_cover_url(self): - from datetime import timedelta, date - if self.fp_tag=='': - return None - cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() - daysback=1 - try: - br.open(cover) - except: - while daysback<7: - cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() - try: - br.open(cover) - except: - daysback = daysback+1 - continue - break - if daysback==7: - self.log("\nCover unavailable") - cover = None - return cover - - def fixChars(self,string): - # Replace lsquo (\x91) - fixed = re.sub("\x91","‘",string) - # Replace rsquo (\x92) - fixed = re.sub("\x92","’",fixed) - # Replace ldquo (\x93) - fixed = re.sub("\x93","“",fixed) - # Replace rdquo (\x94) - fixed = re.sub("\x94","”",fixed) - # Replace ndash (\x96) - fixed = re.sub("\x96","–",fixed) - # Replace mdash (\x97) - fixed = re.sub("\x97","—",fixed) - fixed = re.sub("’","’",fixed) - return fixed - - def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&","&", massaged) - return self.fixChars(massaged) - else: - return description - - def populate_article_metadata(self, article, soup, first): - if first: - picdiv = soup.find('body').find('img') - if picdiv is not None: - self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src'])) - xtitle = article.text_summary.strip() - if len(xtitle) == 0: - desc = soup.find('meta',attrs={'property':'og:description'}) - if desc is not None: - article.summary = article.text_summary = desc['content'] - - def strip_anchors(self,soup): - paras = 
soup.findAll(True) - for para in paras: - aTags = para.findAll('a') - for a in aTags: - if a.img is None: - a.replaceWith(a.renderContents().decode('cp1252','replace')) - return soup - - def preprocess_html(self, soup): - return self.strip_anchors(soup) - - - - def parse_index(self): - soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') - - articles = {} - key = 'News' - ans = ['News'] - - # Find each instance of class="sectiontitle", class="featurecontent" - for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): - #self.log(" div class = %s" % divtag['class']) - if divtag['class'].startswith('section_title'): - # div contains section title - if not divtag.h3: - continue - key = self.tag_to_string(divtag.h3,False) - ans.append(key) - self.log("Section name %s" % key) - continue - # div contains article data - h1tag = divtag.find('h1') - if not h1tag: - continue - atag = h1tag.find('a',href=True) - if not atag: - continue - url = self.url_prefix+'/news/todays-paper/'+atag['href'] - #self.log("Section %s" % key) - #self.log("url %s" % url) - title = self.tag_to_string(atag,False) - #self.log("title %s" % title) - pubdate = '' - description = '' - ptag = divtag.find('p'); - if ptag: - description = self.tag_to_string(ptag,False) - #self.log("description %s" % description) - author = '' - autag = divtag.find('h4') - if autag: - author = self.tag_to_string(autag,False) - #self.log("author %s" % author) - if not articles.has_key(key): - articles[key] = [] - articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) - - ans = [(key, articles[key]) for key in ans if articles.has_key(key)] - return ans +from calibre.web.feeds.news import BasicNewsRecipe + +class CalgaryHerald(BasicNewsRecipe): + title = u'Calgary Herald' + oldest_article = 3 + max_articles_per_feed = 100 + + feeds = [ + (u'News', u'http://rss.canada.com/get/?F233'), + (u'Calgary', 
u'http://www.calgaryherald.com/scripts/sp6query.aspx?catalog=cahr&tags=keyword|calgary&output=rss?link=http%3a%2f%2fwww.calgaryherald'), + (u'Alberta', u'http://www.calgaryherald.com/scripts/Sp6Query.aspx?catalog=CAHR&tags=Keyword|Alberta&output=rss?link=http%3A%2F%2Fwww.calgaryherald.com%2Fnews%2Falberta%2Findex.html'), + (u'Politics', u'http://rss.canada.com/get/?F7551'), + (u'National', u'http://rss.canada.com/get/?F7552'), + (u'World', u'http://rss.canada.com/get/?F7553'), + ] + __author__ = 'rty' + publisher = 'Calgary Herald' + description = 'Calgary, Alberta, Canada' + category = 'News, Calgary, Alberta, Canada' + + + remove_javascript = True + use_embedded_content = False + no_stylesheets = True + language = 'en_CA' + encoding = 'utf-8' + conversion_options = {'linearize_tables':True} + ##masthead_url = 'http://www.calgaryherald.com/index.html' + keep_only_tags = [ + dict(name='div', attrs={'id':'storyheader'}), + dict(name='div', attrs={'id':'storycontent'}) + + ] + remove_tags_after = {'class':"story_tool_hr"} +