calibre/recipes/windows_star.recipe
Kovid Goyal 29cd8d64ea
Change shebangs to python from python2
Also remove a few other miscellaneous references to python2
2020-08-22 18:47:51 +05:30

106 lines
4.1 KiB
Python

#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
    """Recipe that builds its feeds from a newspaper's "today's paper" index.

    The index page at ``url_prefix + '/news/todays-paper/index.html'`` is
    scanned for section headings and article teasers; every article is
    filed under the most recent section heading that preceded it.

    The publication is selected by the ``title`` / ``url_prefix`` /
    ``description`` triple below.
    """

    # Active publication: the Windsor Star. To build one of the other papers,
    # comment out these three lines and un-comment the matching three below.
    title = u'Windsor Star'
    url_prefix = 'http://www.windsorstar.com'
    description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    # title = u'Ottawa Citizen'
    # url_prefix = 'http://www.ottawacitizen.com'
    # description = u'News from Ottawa, ON'
    #
    # un-comment the following three lines for the Montreal Gazette
    # title = u'Montreal Gazette'
    # url_prefix = 'http://www.montrealgazette.com'
    # description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''

    # Only the story header and story body are kept from each article page.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]

    # Page furniture (navigation, sharing tools, rules, ...) stripped from
    # whatever survives keep_only_tags.
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def preprocess_html(self, soup):
        """Remove empty ``id`` attributes from ``<div>`` tags and return soup.

        Empty ids screw up the generated TOC for unknown reasons, so they
        are deleted before further processing.
        """
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']
        return soup

    def parse_index(self):
        """Scrape the "today's paper" index page into a list of feeds.

        Returns a list of ``(section_title, articles)`` tuples in the order
        the sections were first seen on the page. Each article is a dict
        with the keys ``title``, ``url``, ``date``, ``description``,
        ``author`` and ``content``. Sections that ended up with no articles
        are omitted.
        """
        soup = self.index_to_soup(
            self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        # Articles that appear before any section heading are filed under
        # the default 'News' section.
        key = 'News'
        ans = ['News']

        # Walk section headings (class "section_title02") and article
        # teasers (class "featurecontent") in document order.
        for divtag in soup.findAll(
                'div', attrs={'class': ["section_title02", "featurecontent"]}):
            # Joining handles 'class' being either a string or a list of
            # class names.
            if 'section_title' in ''.join(divtag['class']):
                # div contains a section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                # Record each section name once so that a repeated heading
                # (or one literally called 'News') is not emitted twice.
                if key not in ans:
                    ans.append(key)
                self.log("Section name %s" % key)
                continue

            # div contains article data; skip it unless it has a linked
            # headline.
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue

            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            title = self.tag_to_string(atag, False)

            ptag = divtag.find('p')
            description = self.tag_to_string(ptag, False) if ptag else ''

            autag = divtag.find('h4')
            author = self.tag_to_string(autag, False) if autag else ''

            articles.setdefault(key, []).append(dict(
                title=title, url=url, date='',
                description=description, author=author, content=''))

        # Pair every section with ITS OWN article list. (Indexing with the
        # scan loop's leftover `key` here would hand the last section's
        # articles to every section.)
        return [(section, articles[section])
                for section in ans if section in articles]