mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
216 lines
8.1 KiB
Python
216 lines
8.1 KiB
Python
#!/usr/bin/env python
|
||
# -*- coding: utf-8 -*-
|
||
|
||
__license__ = 'GPL v3'
|
||
|
||
'''
|
||
www.canada.com
|
||
'''
|
||
|
||
import re
|
||
from calibre.web.feeds.news import BasicNewsRecipe
|
||
|
||
|
||
class CanWestPaper(BasicNewsRecipe):
    """Recipe that builds an issue from a CanWest paper's "today's paper" page.

    The same recipe serves several newspapers; the four class attributes
    ``title``, ``url_prefix``, ``description`` and ``fp_tag`` select which
    one. Exactly one group below should be un-commented at a time.
    """

    # un-comment the following four lines for the Victoria Times Colonist
    # # title = u'Victoria Times Colonist'
    # # url_prefix = 'http://www.timescolonist.com'
    # # description = u'News from Victoria, BC'
    # # fp_tag = 'CAN_TC'
    #
    # un-comment the following four lines for the Vancouver Province
    # # title = u'Vancouver Province'
    # # url_prefix = 'http://www.theprovince.com'
    # # description = u'News from Vancouver, BC'
    # # fp_tag = 'CAN_VP'
    #
    # un-comment the following four lines for the Vancouver Sun
    # # title = u'Vancouver Sun'
    # # url_prefix = 'http://www.vancouversun.com'
    # # description = u'News from Vancouver, BC'
    # # fp_tag = 'CAN_VS'
    #
    # un-comment the following four lines for the Edmonton Journal
    # # title = u'Edmonton Journal'
    # # url_prefix = 'http://www.edmontonjournal.com'
    # # description = u'News from Edmonton, AB'
    # # fp_tag = 'CAN_EJ'
    #
    # un-comment the following four lines for the Calgary Herald
    # # title = u'Calgary Herald'
    # # url_prefix = 'http://www.calgaryherald.com'
    # # description = u'News from Calgary, AB'
    # # fp_tag = 'CAN_CH'
    #
    # un-comment the following four lines for the Regina Leader-Post
    # # title = u'Regina Leader-Post'
    # # url_prefix = 'http://www.leaderpost.com'
    # # description = u'News from Regina, SK'
    # # fp_tag = ''
    #
    # un-comment the following four lines for the Saskatoon Star-Phoenix
    # (this is the currently active configuration)
    title = u'Saskatoon Star-Phoenix'
    url_prefix = 'http://www.thestarphoenix.com'
    description = u'News from Saskatoon, SK'
    # An empty fp_tag makes get_cover_url() return None without any request.
    fp_tag = ''

    # un-comment the following four lines for the Windsor Star
    # # title = u'Windsor Star'
    # # url_prefix = 'http://www.windsorstar.com'
    # # description = u'News from Windsor, ON'
    # # fp_tag = 'CAN_'
    #
    # un-comment the following four lines for the Ottawa Citizen
    # # title = u'Ottawa Citizen'
    # # url_prefix = 'http://www.ottawacitizen.com'
    # # description = u'News from Ottawa, ON'
    # # fp_tag = 'CAN_OC'
    #
    # un-comment the following four lines for the Montreal Gazette
    # # title = u'Montreal Gazette'
    # # url_prefix = 'http://www.montrealgazette.com'
    # # description = u'News from Montreal, QC'
    # # fp_tag = 'CAN_MG'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp { font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large; font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''

    # Only the story header and story body divs are kept from each page.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]

    # Page furniture removed from the kept divs.
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def get_cover_url(self):
        """Return the URL of a front-page image, or None if none can be opened.

        The URL is built from the day of the month and ``fp_tag``. Today's
        image is tried first, then each of the previous six days; the first
        URL that opens without raising is returned. When ``fp_tag`` is empty
        no request is made. If all seven attempts fail, "Cover unavailable"
        is logged and None is returned.
        """
        from datetime import date, timedelta

        if not self.fp_tag:
            return None

        today = date.today()  # read once so every attempt uses the same base date
        br = BasicNewsRecipe.get_browser(self)
        for daysback in range(7):
            day = (today - timedelta(days=daysback)).day
            cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' +
                     str(day) + '/lg/' + self.fp_tag + '.jpg')
            try:
                br.open(cover)
            except Exception:
                # Best effort: any failure to open just means "try an older
                # day". Exception (not a bare except) lets KeyboardInterrupt
                # and SystemExit propagate.
                continue
            return cover

        self.log("\nCover unavailable")
        return None

    def fixChars(self, string):
        """Return ``string`` with cp1252 punctuation code points replaced.

        The characters \\x91-\\x94, \\x96 and \\x97 are replaced by the
        corresponding typographic quotes and dashes.
        """
        replacements = (
            ("\x91", "‘"),  # lsquo
            ("\x92", "’"),  # rsquo
            ("\x93", "“"),  # ldquo
            ("\x94", "”"),  # rdquo
            ("\x96", "–"),  # ndash
            ("\x97", "—"),  # mdash
            # NOTE(review): as found, this entry replaced "’" with itself (a
            # no-op), presumably because a character entity was decoded in
            # transit. Restored as the entity form - confirm against upstream.
            ("&#x2019;", "’"),
        )
        fixed = string
        for old, new in replacements:
            fixed = fixed.replace(old, new)
        return fixed

    def massageNCXText(self, description):
        """Return ``description`` unchanged."""
        return description

    def populate_article_metadata(self, article, soup, first):
        """Attach a TOC thumbnail and a fallback summary to ``article``.

        For the first page of an article (``first`` true), the first <img>
        inside <body> is registered through ``add_toc_thumbnail``. When the
        article's text summary is blank, the page's ``og:description`` meta
        content, if present, is used as the summary instead.
        """
        if first:
            body = soup.find('body')
            picdiv = body.find('img') if body is not None else None
            src = picdiv.get('src') if picdiv is not None else None
            if src:
                # Drop a leading "links\linkN\" path component from the src.
                self.add_toc_thumbnail(
                    article, re.sub(r'links\\link\d+\\', '', src))

        if not article.text_summary.strip():
            desc = soup.find('meta', attrs={'property': 'og:description'})
            content = desc.get('content') if desc is not None else None
            if content is not None:
                article.summary = article.text_summary = content

    def strip_anchors(self, soup):
        """Replace every <a> that contains no <img> with its rendered contents.

        Anchors wrapping an image are left in place. Returns the same
        ``soup`` object, modified in place.
        """
        # One document-wide search; searching under every tag in turn would
        # revisit the same anchors repeatedly.
        for a in soup.findAll('a'):
            if a.img is None:
                # NOTE(review): the rendered bytes are decoded as cp1252 -
                # confirm this matches the encoding renderContents() emits.
                a.replaceWith(a.renderContents().decode('cp1252', 'replace'))
        return soup

    def preprocess_html(self, soup):
        """Return ``soup`` after stripping text-only anchors."""
        return self.strip_anchors(soup)

    def parse_index(self):
        """Build the section/article index from the "today's paper" page.

        Returns a list of ``(section_name, articles)`` tuples in the order
        sections first appear, omitting sections with no articles. Each
        article is a dict with the keys ``title``, ``url``, ``date``,
        ``description``, ``author`` and ``content``. Articles seen before
        any section heading are filed under 'News'.
        """
        soup = self.index_to_soup(
            self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02", class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            classes = divtag['class']
            if not isinstance(classes, (list, tuple)):
                # class may be a plain space-separated string rather than a list
                classes = classes.split()
            if any(c.startswith('section_title') for c in classes):
                # div contains section title
                if divtag.h3 is None:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                if key not in ans:
                    # a repeated heading must not emit the section twice
                    ans.append(key)
                self.log("Section name %s" % key)
                continue

            # div contains article data
            h1tag = divtag.find('h1')
            if h1tag is None:
                continue
            atag = h1tag.find('a', href=True)
            if atag is None:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            title = self.tag_to_string(atag, False)

            ptag = divtag.find('p')
            description = '' if ptag is None else self.tag_to_string(ptag, False)
            autag = divtag.find('h4')
            author = '' if autag is None else self.tag_to_string(autag, False)

            articles.setdefault(key, []).append(
                dict(title=title, url=url, date='',
                     description=description, author=author, content=''))

        return [(k, articles[k]) for k in ans if k in articles]
|