#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
'''
www.canada.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup


class CanWestPaper(BasicNewsRecipe):

    # un-comment the following four lines for the Victoria Times Colonist
    # title = u'Victoria Times Colonist'
    # url_prefix = 'http://www.timescolonist.com'
    # description = u'News from Victoria, BC'
    # fp_tag = 'CAN_TC'

    # un-comment the following four lines for the Vancouver Province
    # title = u'Vancouver Province'
    # url_prefix = 'http://www.theprovince.com'
    # description = u'News from Vancouver, BC'
    # fp_tag = 'CAN_VP'

    # un-comment the following four lines for the Vancouver Sun
    # title = u'Vancouver Sun'
    # url_prefix = 'http://www.vancouversun.com'
    # description = u'News from Vancouver, BC'
    # fp_tag = 'CAN_VS'

    # un-comment the following four lines for the Edmonton Journal
    # title = u'Edmonton Journal'
    # url_prefix = 'http://www.edmontonjournal.com'
    # description = u'News from Edmonton, AB'
    # fp_tag = 'CAN_EJ'

    # un-comment the following four lines for the Calgary Herald
    # title = u'Calgary Herald'
    # url_prefix = 'http://www.calgaryherald.com'
    # description = u'News from Calgary, AB'
    # fp_tag = 'CAN_CH'

    # un-comment the following four lines for the Regina Leader-Post
    # title = u'Regina Leader-Post'
    # url_prefix = 'http://www.leaderpost.com'
    # description = u'News from Regina, SK'
    # fp_tag = ''

    # un-comment the following four lines for the Saskatoon Star-Phoenix
    title = u'Saskatoon Star-Phoenix'
    url_prefix = 'http://www.thestarphoenix.com'
    description = u'News from Saskatoon, SK'
    fp_tag = ''

    # un-comment the following four lines for the Windsor Star
    # title = u'Windsor Star'
    # url_prefix = 'http://www.windsorstar.com'
    # description = u'News from Windsor, ON'
    # fp_tag = 'CAN_'

    # un-comment the following four lines for the Ottawa Citizen
    # title = u'Ottawa Citizen'
    # url_prefix = 'http://www.ottawacitizen.com'
    # description = u'News from Ottawa, ON'
    # fp_tag = 'CAN_OC'

    # un-comment the following four lines for the Montreal Gazette
    # title = u'Montreal Gazette'
    # url_prefix = 'http://www.montrealgazette.com'
    # description = u'News from Montreal, QC'
    # fp_tag = 'CAN_MG'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
        .timestamp { font-size: xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size: xx-small; }
        #photocaption { font-size: small; font-style: italic; }
        #photocredit { font-size: xx-small; }'''

    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]

    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

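    # Fetch the Newseum front-page image for this paper's fp_tag, if one is set.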
    def get_cover_url(self):
        from datetime import timedelta, date
        if self.fp_tag == '':
            return None
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + \
            str(date.today().day) + '/lg/' + self.fp_tag + '.jpg'
        br = BasicNewsRecipe.get_browser(self)
        daysback = 1
        try:
            br.open(cover)
        except:
            # Today's cover is missing; walk back up to a week looking for one
            while daysback < 7:
                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + \
                    str((date.today() - timedelta(days=daysback)).day) + \
                    '/lg/' + self.fp_tag + '.jpg'
                br = BasicNewsRecipe.get_browser(self)
                try:
                    br.open(cover)
                except:
                    daysback = daysback + 1
                    continue
                break
        if daysback == 7:
            self.log("\nCover unavailable")
            cover = None
        return cover

    def fixChars(self, string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91", "‘", string)
        # Replace rsquo (\x92)
        fixed = re.sub("\x92", "’", fixed)
        # Replace ldquo (\x93)
        fixed = re.sub("\x93", "“", fixed)
        # Replace rdquo (\x94)
        fixed = re.sub("\x94", "”", fixed)
        # Replace ndash (\x96)
        fixed = re.sub("\x96", "–", fixed)
        # Replace mdash (\x97)
        fixed = re.sub("\x97", "—", fixed)
        fixed = re.sub("&#x2019;", "’", fixed)
        return fixed

    def massageNCXText(self, description):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(
                description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&' with '&#38;'
            massaged = re.sub("&", "&#38;", massaged)
            return self.fixChars(massaged)
        else:
            return description

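    # Use the first image in the article as the TOC thumbnail and fall back to
    # the og:description meta tag when the text summary is empty.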
    def populate_article_metadata(self, article, soup, first):
        if first:
            picdiv = soup.find('body').find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, re.sub(
                    r'links\\link\d+\\', '', picdiv['src']))
        xtitle = article.text_summary.strip()
        if len(xtitle) == 0:
            desc = soup.find('meta', attrs={'property': 'og:description'})
            if desc is not None:
                article.summary = article.text_summary = desc['content']

    def strip_anchors(self, soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode(
                        'cp1252', 'replace'))
        return soup

    def preprocess_html(self, soup):
        return self.strip_anchors(soup)

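    # Build the section -> article map from the paper's "today's paper" index page.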
    def parse_index(self):
        soup = self.index_to_soup(
            self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="sectiontitle", class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            if divtag['class'].startswith('section_title'):
                # div contains section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            title = self.tag_to_string(atag, False)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
            if not articles.has_key(key):  # noqa
                articles[key] = []
            articles[key].append(dict(title=title, url=url, date=pubdate,
                                      description=description, author=author, content=''))

        ans = [(k, articles[k]) for k in ans if articles.has_key(k)]  # noqa
        return ans