mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
314 lines
13 KiB
Python
314 lines
13 KiB
Python
#!/usr/bin/env python
|
||
# -*- coding: utf-8 -*-
|
||
from __future__ import print_function
|
||
__license__ = 'GPL v3'
|
||
|
||
'''
|
||
www.canada.com
|
||
'''
|
||
import re
|
||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||
from calibre.ebooks.BeautifulSoup import Tag
|
||
|
||
|
||
def new_tag(soup, name, attrs=()):
    '''
    Create a new tag called ``name`` belonging to ``soup``.

    If ``soup`` exposes a ``new_tag`` factory it is used, and ``attrs`` is
    handed to it as a dict. Otherwise the tag is built directly with the
    imported ``Tag`` class.

    :param soup: the parsed document the tag will belong to
    :param name: tag name, e.g. ``'div'``
    :param attrs: attributes as a mapping or an iterable of ``(key, value)``
        pairs; ``None`` or empty means no attributes
    :return: the newly created (not yet inserted) tag
    '''
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        # ``attrs or ()`` keeps dict() from raising TypeError on attrs=None.
        return impl(name, attrs=dict(attrs or ()))
    return Tag(soup, name, attrs=attrs or None)
|
||
|
||
|
||
class CanWestPaper(BasicNewsRecipe):
    '''
    Recipe that builds an e-book from the section index pages of one
    newspaper web site.

    The paper is selected by the group of six attributes ``title``,
    ``url_prefix``, ``description``, ``std_logo_url``, ``logo_url`` and
    ``fp_tag``; exactly one such group should be un-commented below.
    '''

    # (section title, path relative to url_prefix) for every index page
    # that parse_index() scrapes. A title may occur more than once; the
    # articles of such pages are merged into a single section.
    postmedia_index_pages = [
        (u'Headlines', u'/index.html'),
        (u'Ottawa & Area', u'/news/ottawa/index.html'),
        (u'Vancouver', u'/news/vancouver/index.html'),
        (u'Calgary', u'/news/calgary/index.html'),
        (u'Edmonton', u'/news/edmonton/index.html'),
        (u'Montreal', u'/news/montreal/index.html'),
        (u'Fraser Valley', u'/news/fraser-valley/index.html'),
        (u'British Columbia', u'/news/bc/index.html'),
        (u'Alberta', u'/news/alberta/index.html'),
        (u'Canada', u'/news/canada/index.html'),
        (u'National', u'/news/national/index.html'),
        (u'Politics', u'/news/politics/index.html'),
        (u'Insight', u'/news/insight/index.html'),
        (u'Special Reports', u'/news/specialreports/index.html'),
        (u'Gangs', u'/news/gangs/index.html'),
        (u'Education', u'/news/education/index.html'),
        (u'Health', u'/news/health/index.html'),
        (u'Environment', u'/news/environment/index.html'),
        (u'World', u'/news/world/index.html'),
        (u'Police Blotter', u'/news/crime-and-justice/index.html'),
        (u'Crime', u'/news/blotter/index.html'),
        (u'Around Town', u'/news/topic.html?t=keyword&q=Around+Town'),
        (u'Diplomatica', u'/news/diplomatica/index.html'),
        (u'Opinion', u'/opinion/index.html'),
        (u'Columnists', u'/columnists/index.html'),
        (u'Editorials', u'/opinion/editorials/index.html'),
        (u'Letters', u'/opinion/letters/index.html'),
        (u'Business', u'/business/index.html'),
        (u'Sports', u'/sports/index.html'),
        (u'Arts', u'/entertainment/index.html'),
        (u'Life', u'/life/index.html'),
        (u'Technology', u'/technology/index.html'),
        (u'Travel', u'/travel/index.html'),
        (u'Health', u'/health/index.html')
    ]

    # un-comment the following six lines for the Vancouver Province
    # title = u'Vancouver Province'
    # url_prefix = 'http://www.theprovince.com'
    # description = u'News from Vancouver, BC'
    # std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
    # logo_url = 'vplogo.jpg'
    # fp_tag = 'CAN_TP'

    # un-comment the following six lines for the Vancouver Sun
    # title = u'Vancouver Sun'
    # url_prefix = 'http://www.vancouversun.com'
    # description = u'News from Vancouver, BC'
    # std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
    # logo_url = 'vslogo.jpg'
    # fp_tag = 'CAN_VS'

    # the following six lines select the Calgary Herald (currently active)
    title = u'Calgary Herald'
    url_prefix = 'http://www.calgaryherald.com'
    description = u'News from Calgary, AB'
    std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
    logo_url = 'chlogo.jpg'
    fp_tag = 'CAN_CH'

    # un-comment the following six lines for the Edmonton Journal
    # title = u'Edmonton Journal'
    # url_prefix = 'http://www.edmontonjournal.com'
    # description = u'News from Edmonton, AB'
    # std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
    # logo_url = 'ejlogo.jpg'
    # fp_tag = 'CAN_EJ'

    # un-comment the following six lines for the Ottawa Citizen
    # title = u'Ottawa Citizen'
    # url_prefix = 'http://www.ottawacitizen.com'
    # description = u'News from Ottawa, ON'
    # std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg'
    # logo_url = 'oclogo.jpg'
    # fp_tag = 'CAN_OC'

    # un-comment the following six lines for the Montreal Gazette
    # title = u'Montreal Gazette'
    # url_prefix = 'http://www.montrealgazette.com'
    # description = u'News from Montreal, QC'
    # std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg'
    # logo_url = 'mglogo.jpg'
    # fp_tag = 'CAN_MG'

    Kindle_Fire = False
    masthead_url = std_logo_url

    # URLs already accepted by parse_index(); rebound per instance there so
    # that this class-level list is never mutated.
    url_list = []
    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    encoding = 'utf-8'
    extra_css = '''
                .timestamp {  font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: small;  font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
                .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
                #photocredit { font-size: xx-small; font-weight: normal; }'''

    # Keep only <div> elements whose id matches 'story'.
    keep_only_tags = [dict(name='div', attrs={'id': re.compile('story')})]

    # Page furniture (comments, share bars, navigation, tool lists, ...)
    # removed from every article.
    remove_tags = [
        {'class': 'comments'},
        {'class': 'comment-intro'},
        {'class': 'storytab'},
        dict(name='div', attrs={'class': 'section_title'}),
        dict(name='div', attrs={'class': 'sharebar'}),
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='h2', attrs={'id': 'photocredit'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='div', attrs={'id': 'soundoff'}),
        dict(name='div', attrs={'id': re.compile('flyer')}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def get_cover_url(self):
        '''
        Return the URL of a front-page image for ``fp_tag``.

        The URL is built from the day of the month. Today is tried first,
        then each of the previous six days; the first URL that can be
        opened is returned. If none opens, a message is logged and ``None``
        is returned.
        '''
        from datetime import timedelta, date
        br = BasicNewsRecipe.get_browser(self)
        today = date.today()
        for daysback in range(7):
            day = (today - timedelta(days=daysback)).day
            cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + \
                str(day) + '/lg/' + self.fp_tag + '.jpg'
            try:
                br.open(cover)
            except Exception:
                # Not available for that day; fall back one more day.
                continue
            return cover
        self.log("\nCover unavailable")
        return None

    def fixChars(self, string):
        '''
        Replace C1 control characters \\x91-\\x97 (quotes and dashes as they
        appear in mis-decoded Windows-1252 text) with the corresponding
        Unicode punctuation and return the result.
        '''
        replacements = (
            ("\x91", "‘"),  # lsquo
            ("\x92", "’"),  # rsquo
            ("\x93", "“"),  # ldquo
            ("\x94", "”"),  # rdquo
            ("\x96", "–"),  # ndash
            ("\x97", "—"),  # mdash
            # NOTE(review): this line previously replaced a right single
            # quote with itself, which is a no-op. Presumably the numeric
            # HTML entity was the intended pattern -- confirm.
            ("&#x2019;", "’"),
        )
        fixed = string
        for old, new in replacements:
            fixed = fixed.replace(old, new)
        return fixed

    def massageNCXText(self, description):
        '''Return ``description`` unchanged.'''
        return description

    def populate_article_metadata(self, article, soup, first):
        '''
        Fill in article metadata from the downloaded page.

        For the first page of an article, the first ``<img>`` inside
        ``<body>`` is registered as the TOC thumbnail. If the article has
        no (non-blank) text summary, the ``og:description`` meta tag is
        used for both ``summary`` and ``text_summary``.
        '''
        if first:
            body = soup.find('body')
            picdiv = body.find('img') if body is not None else None
            src = picdiv.get('src') if picdiv is not None else None
            if src:
                self.add_toc_thumbnail(
                    article, re.sub(r'links\\link\d+\\', '', src))
        if not article.text_summary.strip():
            desc = soup.find('meta', attrs={'property': 'og:description'})
            content = desc.get('content') if desc is not None else None
            if content is not None:
                article.summary = article.text_summary = content

    def strip_anchors(self, soup):
        '''
        Replace every ``<a>`` that does not contain an image with its
        rendered contents, and return ``soup``.
        '''
        # One pass over the document instead of re-searching under every tag.
        for a in soup.findAll('a'):
            if a.img is None:
                a.replaceWith(a.renderContents().decode('cp1252', 'replace'))
        return soup

    def preprocess_html(self, soup):
        '''
        Clean up an article page before conversion and return the soup.

        * empty ``id`` attributes on ``<div>`` are deleted;
        * when the page has a ``storyphoto`` div but no ``storycontent``
          div, the ``storyphoto`` div is replaced by a div holding one
          image plus caption per thumbnail link in ``relatedthumbs``;
        * any remaining ``storyphoto`` divs are removed;
        * text-only anchors are flattened via strip_anchors().
        '''
        # delete empty id attributes--they screw up the TOC for unknown reasons
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']

        pgall = soup.find('div', attrs={'id': 'storyphoto'})
        if pgall is not None:  # photo gallery perhaps
            if soup.find('div', attrs={'id': 'storycontent'}) is None:
                allpics = new_tag(soup, 'div')
                first_img = pgall.find('div', 'storyimage')
                if first_img is not None:
                    first_img.extract()
                tlist = pgall.find('div', attrs={'id': 'relatedthumbs'})
                if tlist is not None:
                    for atag in tlist.findAll('a'):
                        thumb = atag.img
                        src = thumb.get('src') if thumb is not None else None
                        if not src:
                            # Link without a usable thumbnail: nothing to show.
                            continue
                        img = new_tag(soup, 'img')
                        # Drop the query string from the thumbnail URL.
                        img['src'] = src.partition('?')[0]
                        pdesc = new_tag(soup, 'p')
                        pdesc.insert(0, thumb.get('alt', ''))
                        pdesc['class'] = 'photocaption'
                        div = new_tag(soup, 'div')
                        div.insert(0, pdesc)
                        div.insert(0, img)
                        allpics.append(div)
                pgall.replaceWith(allpics)

        for pg in soup.findAll('div', attrs={'id': 'storyphoto'}):
            pg.extract()
        return self.strip_anchors(soup)

    def parse_index(self):
        '''
        Scrape every page in ``postmedia_index_pages`` and return a list of
        ``(section title, [article dict, ...])`` tuples, in page order,
        containing only sections for which at least one article was found.
        Each section title appears at most once.
        '''
        articles = {}
        ans = []
        # Rebind per instance so the class-level list is not shared/mutated.
        self.url_list = []

        def handle_article(adiv, key):
            # Locate the heading (h1, else h3) that carries the article link.
            if adiv.name == 'h1' or adiv.name == 'h3':
                h1tag = adiv
            else:
                h1tag = adiv.h1
                if h1tag is None:
                    h1tag = adiv.h3
            if h1tag is None:
                return
            atag = h1tag.a
            if atag is None:
                return
            url = atag.get('href')
            if not url:
                return
            if url.startswith('/'):
                url = self.url_prefix + url
            if not url.startswith(self.url_prefix):
                print("Rejected " + url)
                return
            if url in self.url_list:
                print("Rejected dup " + url)
                return
            self.url_list.append(url)
            title = self.tag_to_string(atag, False)
            upper_title = title.upper()
            if any(word in upper_title for word in ('VIDEO', 'GALLERY', 'PHOTOS')):
                return
            dtag = adiv.find('div', 'content')
            description = ''
            print("URL " + url)
            print("TITLE " + title)
            if dtag is not None:
                stag = dtag.span
                if stag is not None:
                    # 'class' may be a string or a list of strings, or be
                    # absent altogether; join handles the first two.
                    if ''.join(stag.get('class') or ()) != 'timestamp':
                        description = self.tag_to_string(stag, False)
                else:
                    description = self.tag_to_string(dtag, False)
                print("DESCRIPTION: " + description)
            articles.setdefault(key, []).append(dict(
                title=title, url=url, date='', description=description, author='', content=''))

        def parse_web_index(key, keyurl):
            print("Section: " + key + ': ' + self.url_prefix + keyurl)
            try:
                soup = self.index_to_soup(self.url_prefix + keyurl)
            except Exception:
                print("Section: " + key + ' NOT FOUND')
                return
            # The same title can be listed for several pages; record it once
            # so the section is not emitted twice in the result.
            if key not in ans:
                ans.append(key)
            mainsoup = soup.find('div', 'bodywrapper')
            if mainsoup is None:
                # Page layout not recognised: nothing to scrape here.
                return
            footer = mainsoup.find(attrs={'id': 'footerfeature'})
            if footer is not None:
                footer.extract()
            for wdiv in mainsoup.findAll(attrs={'class': ['genericfeature']}):
                wdiv.extract()
            for wdiv in mainsoup.findAll(attrs={'class': ['headline', 'featurecontent']}):
                handle_article(wdiv, key)

        for (k, url) in self.postmedia_index_pages:
            parse_web_index(k, url)
        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans
|