calibre/recipes/regina_leader_post.recipe

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
'''
www.canada.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):

    # un-comment the following four lines for the Victoria Times Colonist
    # # title = u'Victoria Times Colonist'
    # # url_prefix = 'http://www.timescolonist.com'
    # # description = u'News from Victoria, BC'
    # # fp_tag = 'CAN_TC'
    #
    # un-comment the following four lines for the Vancouver Province
    # # title = u'Vancouver Province'
    # # url_prefix = 'http://www.theprovince.com'
    # # description = u'News from Vancouver, BC'
    # # fp_tag = 'CAN_VP'
    #
    # un-comment the following four lines for the Vancouver Sun
    # # title = u'Vancouver Sun'
    # # url_prefix = 'http://www.vancouversun.com'
    # # description = u'News from Vancouver, BC'
    # # fp_tag = 'CAN_VS'
    #
    # un-comment the following four lines for the Edmonton Journal
    # # title = u'Edmonton Journal'
    # # url_prefix = 'http://www.edmontonjournal.com'
    # # description = u'News from Edmonton, AB'
    # # fp_tag = 'CAN_EJ'
    #
    # un-comment the following four lines for the Calgary Herald
    # # title = u'Calgary Herald'
    # # url_prefix = 'http://www.calgaryherald.com'
    # # description = u'News from Calgary, AB'
    # # fp_tag = 'CAN_CH'
    #
    # un-comment the following four lines for the Regina Leader-Post
    title = u'Regina Leader-Post'
    url_prefix = 'http://www.leaderpost.com'
    description = u'News from Regina, SK'
    fp_tag = ''
    # un-comment the following four lines for the Saskatoon Star-Phoenix
    # # title = u'Saskatoon Star-Phoenix'
    # # url_prefix = 'http://www.thestarphoenix.com'
    # # description = u'News from Saskatoon, SK'
    # # fp_tag = ''
    #
    # un-comment the following four lines for the Windsor Star
    # # title = u'Windsor Star'
    # # url_prefix = 'http://www.windsorstar.com'
    # # description = u'News from Windsor, ON'
    # # fp_tag = 'CAN_'
    #
    # un-comment the following four lines for the Ottawa Citizen
    # # title = u'Ottawa Citizen'
    # # url_prefix = 'http://www.ottawacitizen.com'
    # # description = u'News from Ottawa, ON'
    # # fp_tag = 'CAN_OC'
    #
    # un-comment the following four lines for the Montreal Gazette
    # # title = u'Montreal Gazette'
    # # url_prefix = 'http://www.montrealgazette.com'
    # # description = u'News from Montreal, QC'
    # # fp_tag = 'CAN_MG'
    #

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
        .timestamp { font-size:xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size:xx-small; }
        #photocaption { font-size: small; font-style: italic }
        #photocredit { font-size: xx-small; }'''
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def get_cover_url(self):
        from datetime import timedelta, date
        if self.fp_tag == '':
            return None
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + \
            str(date.today().day) + '/lg/' + self.fp_tag + '.jpg'
        br = BasicNewsRecipe.get_browser(self)
        daysback = 1
        try:
            br.open(cover)
        except:
            while daysback < 7:
                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + \
                    str((date.today() - timedelta(days=daysback)).day) + \
                    '/lg/' + self.fp_tag + '.jpg'
                br = BasicNewsRecipe.get_browser(self)
                try:
                    br.open(cover)
                except:
                    daysback = daysback + 1
                    continue
                break
        if daysback == 7:
            self.log("\nCover unavailable")
            cover = None
        return cover

    def fixChars(self, string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91", "‘", string)
        # Replace rsquo (\x92)
        fixed = re.sub("\x92", "’", fixed)
        # Replace ldquo (\x93)
        fixed = re.sub("\x93", "“", fixed)
        # Replace rdquo (\x94)
        fixed = re.sub("\x94", "”", fixed)
        # Replace ndash (\x96)
        fixed = re.sub("\x96", "–", fixed)
        # Replace mdash (\x97)
        fixed = re.sub("\x97", "—", fixed)
        fixed = re.sub("&#x2019;", "’", fixed)
        return fixed

    def massageNCXText(self, description):
        return description

    def populate_article_metadata(self, article, soup, first):
        if first:
            picdiv = soup.find('body').find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(
                    article, re.sub(r'links\\link\d+\\', '', picdiv['src']))
        xtitle = article.text_summary.strip()
        if len(xtitle) == 0:
            desc = soup.find('meta', attrs={'property': 'og:description'})
            if desc is not None:
                article.summary = article.text_summary = desc['content']

    def strip_anchors(self, soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252', 'replace'))
        return soup

    def preprocess_html(self, soup):
        return self.strip_anchors(soup)

    def parse_index(self):
        soup = self.index_to_soup(
            self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="sectiontitle", class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            if 'section_title' in ''.join(divtag['class']):
                # div contains section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            title = self.tag_to_string(atag, False)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
            if key not in articles:
                articles[key] = []
            articles[key].append(
                dict(title=title, url=url, date=pubdate,
                     description=description, author=author, content=''))

        ans = [(keyl, articles[keyl]) for keyl in ans if keyl in articles]
        return ans