mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix Calgary Herald
This commit is contained in:
parent
d9b9961c77
commit
8d3c9d93b3
@ -1,220 +1,35 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
|
|
||||||
'''
|
|
||||||
www.canada.com
|
|
||||||
'''
|
|
||||||
|
|
||||||
import re
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
|
||||||
|
|
||||||
|
class CalgaryHerald(BasicNewsRecipe):
    '''Fetch news from the Calgary Herald (canada.com) via its RSS feeds.

    Simple RSS-driven recipe: the article lists come from the feed URLs
    below, and each fetched article page is trimmed down to its headline
    and story-body divs.
    '''

    title = u'Calgary Herald'
    __author__ = 'rty'
    # fix: was misspelled 'pubisher', so calibre's publisher metadata
    # field was never actually set.
    publisher = 'Calgary Herald'
    description = 'Calgary, Alberta, Canada'
    category = 'News, Calgary, Alberta, Canada'
    language = 'en_CA'
    encoding = 'utf-8'

    # Crawl limits: skip articles older than 3 days, cap each feed at 100.
    oldest_article = 3
    max_articles_per_feed = 100

    # Feeds carry only summaries, so article pages are fetched directly;
    # strip scripts/styles and flatten layout tables for e-book output.
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True
    conversion_options = {'linearize_tables': True}
    # masthead_url = 'http://www.calgaryherald.com/index.html'

    feeds = [
        (u'News', u'http://rss.canada.com/get/?F233'),
        (u'Calgary', u'http://www.calgaryherald.com/scripts/sp6query.aspx?catalog=cahr&tags=keyword|calgary&output=rss?link=http%3a%2f%2fwww.calgaryherald'),
        (u'Alberta', u'http://www.calgaryherald.com/scripts/Sp6Query.aspx?catalog=CAHR&tags=Keyword|Alberta&output=rss?link=http%3A%2F%2Fwww.calgaryherald.com%2Fnews%2Falberta%2Findex.html'),
        (u'Politics', u'http://rss.canada.com/get/?F7551'),
        (u'National', u'http://rss.canada.com/get/?F7552'),
        (u'World', u'http://rss.canada.com/get/?F7553'),
    ]

    # Keep only the headline and story body; everything after the
    # story-tool horizontal rule (comments, share links, ...) is dropped.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]
    remove_tags_after = {'class': 'story_tool_hr'}
Loading…
x
Reference in New Issue
Block a user