#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'

'''
www.timescolonist.com
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup

class TimesColonist(BasicNewsRecipe):

    # Customization -- remove sections you don't want.
    # If your e-reader is an e-ink Kindle and your output profile is
    # set properly, this recipe will not include images, because the
    # resulting file would be too large. If you have one of these
    # devices and want images, set kindle_omit_images = False and
    # remove sections (typically the e-ink Kindles will work with
    # about a dozen of these, but your mileage may vary); see the
    # example after section_list below.

    kindle_omit_images = True

    section_list = [
        ('', 'Web Front Page'),
        ('news/', 'News Headlines'),
        ('news/b-c/', 'BC News'),
        ('news/national/', 'National News'),
        ('news/world/', 'World News'),
        ('opinion/', 'Opinion'),
        ('opinion/letters/', 'Letters'),
        ('business/', 'Business'),
        ('business/money/', 'Money'),
        ('business/technology/', 'Technology'),
        ('business/working/', 'Working'),
        ('sports/', 'Sports'),
        ('sports/hockey/', 'Hockey'),
        ('sports/football/', 'Football'),
        ('sports/basketball/', 'Basketball'),
        ('sports/golf/', 'Golf'),
        ('entertainment/', 'Entertainment'),
        ('entertainment/go/', 'Go!'),
        ('entertainment/music/', 'Music'),
        ('entertainment/books/', 'Books'),
        ('entertainment/Movies/', 'Movies'),
        ('entertainment/television/', 'Television'),
        ('life/', 'Life'),
        ('life/health/', 'Health'),
        ('life/travel/', 'Travel'),
        ('life/driving/', 'Driving'),
        ('life/homes/', 'Homes'),
        ('life/food-drink/', 'Food & Drink'),
    ]

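    # For example, a trimmed-down build for an e-ink Kindle might keep
    # only a handful of sections, e.g.:
    #     section_list = [('', 'Web Front Page'), ('news/b-c/', 'BC News')]
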
    title = u'Victoria Times Colonist'
    url_prefix = 'http://www.timescolonist.com'
    description = u'News from Victoria, BC'
    fp_tag = 'CAN_TC'

    masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'

    url_list = []
    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    encoding = 'utf-8'
    extra_css = '''
        .byline { font-size:xx-small; font-weight: bold;}
        h3 { margin-bottom: 6px; }
        .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
    '''
    keep_only_tags = [dict(name='div', attrs={'class': re.compile('main.content')})]

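    # remove_tags is assembled at run time rather than at class level so
    # that image stripping can depend on the selected output profile.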
    def __init__(self, options, log, progress_reporter):
        self.remove_tags = [
            {'class': 'comments'},
            {'id': 'photocredit'},
            dict(name='div', attrs={'class': re.compile('top.controls')}),
            dict(name='div', attrs={'class': re.compile('^comments')}),
            dict(name='div', attrs={'class': re.compile('social')}),
            dict(name='div', attrs={'class': re.compile('tools')}),
            dict(name='div', attrs={'class': re.compile('bottom.tools')}),
            dict(name='div', attrs={'class': re.compile('window')}),
            dict(name='div', attrs={'class': re.compile('related.news.element')}),
        ]
        print("PROFILE NAME = " + options.output_profile.short_name)
        if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
            self.remove_tags.append(dict(name='div', attrs={'class': re.compile('image-container')}))
        BasicNewsRecipe.__init__(self, options, log, progress_reporter)

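    # Take the cover from the Newseum front-page image archive; if
    # today's image is missing, look back up to six days for one.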
    def get_cover_url(self):
        from datetime import timedelta, date
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + str(date.today().day) + '/lg/' + self.fp_tag + '.jpg'
        br = BasicNewsRecipe.get_browser(self)
        daysback = 1
        try:
            br.open(cover)
        except:
            while daysback < 7:
                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + str((date.today() - timedelta(days=daysback)).day) + '/lg/' + self.fp_tag + '.jpg'
                br = BasicNewsRecipe.get_browser(self)
                try:
                    br.open(cover)
                except:
                    daysback = daysback + 1
                    continue
                break
        if daysback == 7:
            self.log("\nCover unavailable")
            cover = None
        return cover

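    # On the Kindle Fire the masthead is composited onto a fresh canvas,
    # which flattens any transparency; other profiles use the default.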
    def prepare_masthead_image(self, path_to_image, out_path):
        # Kindle_Fire is never set by this recipe, so treat it as False
        # when absent instead of raising AttributeError.
        if getattr(self, 'Kindle_Fire', False):
            from calibre.utils.magick import Image, create_canvas
            img = Image()
            img.open(path_to_image)
            width, height = img.size
            img2 = create_canvas(width, height)
            img2.compose(img)
            img2.save(out_path)
        else:
            BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)

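    # The site serves cp1252 "smart" punctuation in the \x91-\x97 control
    # range; map those bytes to their proper Unicode equivalents.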
    def fixChars(self, string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91", "‘", string)
        # Replace rsquo (\x92)
        fixed = re.sub("\x92", "’", fixed)
        # Replace ldquo (\x93)
        fixed = re.sub("\x93", "“", fixed)
        # Replace rdquo (\x94)
        fixed = re.sub("\x94", "”", fixed)
        # Replace ndash (\x96)
        fixed = re.sub("\x96", "–", fixed)
        # Replace mdash (\x97)
        fixed = re.sub("\x97", "—", fixed)
        # Replace rsquo mojibake ("’" is "’" in UTF-8 misread as cp1252)
        fixed = re.sub("’", "’", fixed)
        return fixed

    def massageNCXText(self, description):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&' with '&amp;'
            massaged = re.sub("&", "&amp;", massaged)
            return self.fixChars(massaged)
        else:
            return description

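    # Use the article's first image as the TOC thumbnail; if the text
    # summary is empty, fall back to the og:description meta tag.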
    def populate_article_metadata(self, article, soup, first):
        if first:
            picdiv = soup.find('body').find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\', '', picdiv['src']))
            xtitle = article.text_summary.strip()
            if len(xtitle) == 0:
                desc = soup.find('meta', attrs={'property': 'og:description'})
                if desc is not None:
                    article.summary = article.text_summary = desc['content']

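    # Replace text-only links with their plain text; links that wrap an
    # image are left intact.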
    def strip_anchors(self, soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252', 'replace'))
        return soup

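    # Normalize bylines and photo captions into simple styled divs and
    # drop empty paragraphs before stripping anchors.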
    def preprocess_html(self, soup):
        byline = soup.find('p', attrs={'class': re.compile('ancillary')})
        if byline is not None:
            authstr = self.tag_to_string(byline, False)
            authstr = re.sub('/ *Times Colonist', '/', authstr, flags=re.IGNORECASE)
            authstr = re.sub('BY */', '', authstr, flags=re.IGNORECASE)
            newdiv = Tag(soup, 'div')
            newdiv.insert(0, authstr)
            newdiv['class'] = 'byline'
            byline.replaceWith(newdiv)
        for caption in soup.findAll('p', attrs={'class': re.compile('caption')}):
            capstr = self.tag_to_string(caption, False)
            capstr = re.sub('Photograph by.*$', '', capstr, flags=re.IGNORECASE)
            newdiv = Tag(soup, 'div')
            newdiv.insert(0, capstr)
            newdiv['class'] = 'caption'
            caption.replaceWith(newdiv)
        for ptag in soup.findAll('p'):
            ptext = self.tag_to_string(ptag, use_alt=False, normalize_whitespace=True)
            ptext = re.sub(r'\s+', '', ptext)
            # all whitespace was removed above, so a blank paragraph is ''
            if ptext == '':
                ptag.extract()
        return self.strip_anchors(soup)

    # Only the first Raeside cartoon found is included.
    raeside = False

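    # Collect one article from a headline tag, skipping videos,
    # galleries, photo features and previously seen URLs.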
    def handle_articles(self, htag, article_list, sectitle):
        atag = htag.a
        if atag is not None:
            url = atag['href'].strip()
            # print("Checking >>" + url + '<<\n\r')
            if url.startswith('/'):
                url = self.url_prefix + url
            if url in self.url_list:
                return
            self.url_list.append(url)
            title = self.tag_to_string(atag, False)
            if 'VIDEO' in title.upper():
                return
            if 'GALLERY' in title.upper():
                return
            if 'PHOTOS' in title.upper():
                return
            if 'RAESIDE' in title.upper():
                if self.raeside:
                    return
                self.raeside = True
            dtag = htag.findNext('p')
            description = ''
            if dtag is not None:
                description = self.tag_to_string(dtag, False)
            article_list.append(dict(title=title, url=url, date='', description=description, author='', content=''))
            print(sectitle + title + ": description = " + description + " URL=" + url + '\n\r')

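    # Fetch one section page and append its featured and leading
    # articles to the index under sectitle.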
    def add_section_index(self, ans, securl, sectitle):
        print("Add section url=" + self.url_prefix + '/' + securl + '\n\r')
        try:
            soup = self.index_to_soup(self.url_prefix + '/' + securl)
        except:
            return ans
        mainsoup = soup.find('div', attrs={'class': re.compile('main.content')})
        article_list = []
        for wdiv in mainsoup.findAll('div', attrs={'id': re.compile('featured.story')}):
            for htag in wdiv.findAll('h3'):
                self.handle_articles(htag, article_list, sectitle)
        for ladiv in mainsoup.findAll(attrs={'class': re.compile('leading.articles')}):
            # confine the search to this leading-articles block
            for wdiv in ladiv.findAll('div', attrs={'class': re.compile('article.row')}):
                for htag in wdiv.findAll('h2'):
                    self.handle_articles(htag, article_list, sectitle)
        ans.append((sectitle, article_list))
        return ans

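    # The index is simply every section in section_list, in order.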
    def parse_index(self):
        ans = []
        for (url, title) in self.section_list:
            ans = self.add_section_index(ans, url, title)
        return ans