mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Victoria Times
This commit is contained in:
parent
9a1d1c4fee
commit
fa47afe5a6
@ -6,17 +6,62 @@ __license__ = 'GPL v3'
|
|||||||
www.canada.com
|
www.canada.com
|
||||||
'''
|
'''
|
||||||
import re
|
import re
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
|
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
|
||||||
|
|
||||||
|
|
||||||
class TimesColonist(BasicNewsRecipe):
|
class TimesColonist(BasicNewsRecipe):
|
||||||
|
|
||||||
|
# Customization -- remove sections you don't want.
|
||||||
|
# If your e-reader is an e-ink Kindle and your output profile is
|
||||||
|
# set properly this recipe will not include images because the
|
||||||
|
# resulting file is too large. If you have one of these and want
|
||||||
|
# images you can set kindle_omit_images = False
|
||||||
|
# and remove sections (typically the e-ink Kindles will
|
||||||
|
# work with about a dozen of these, but your mileage may vary).
|
||||||
|
|
||||||
|
kindle_omit_images = True
|
||||||
|
|
||||||
|
section_list = [
|
||||||
|
('','Web Front Page'),
|
||||||
|
('news/','News Headlines'),
|
||||||
|
('news/b-c/','BC News'),
|
||||||
|
('news/national/','National News'),
|
||||||
|
('news/world/','World News'),
|
||||||
|
('opinion/','Opinion'),
|
||||||
|
('opinion/letters/','Letters'),
|
||||||
|
('business/','Business'),
|
||||||
|
('business/money/','Money'),
|
||||||
|
('business/technology/','Technology'),
|
||||||
|
('business/working/','Working'),
|
||||||
|
('sports/','Sports'),
|
||||||
|
('sports/hockey/','Hockey'),
|
||||||
|
('sports/football/','Football'),
|
||||||
|
('sports/basketball/','Basketball'),
|
||||||
|
('sports/golf/','Golf'),
|
||||||
|
('entertainment/','entertainment'),
|
||||||
|
('entertainment/go/','Go!'),
|
||||||
|
('entertainment/music/','Music'),
|
||||||
|
('entertainment/books/','Books'),
|
||||||
|
('entertainment/Movies/','Movies'),
|
||||||
|
('entertainment/television/','Television'),
|
||||||
|
('life/','Life'),
|
||||||
|
('life/health/','Health'),
|
||||||
|
('life/travel/','Travel'),
|
||||||
|
('life/driving/','Driving'),
|
||||||
|
('life/homes/','Homes'),
|
||||||
|
('life/food-drink/','Food & Drink')
|
||||||
|
]
|
||||||
|
|
||||||
title = u'Victoria Times Colonist'
|
title = u'Victoria Times Colonist'
|
||||||
url_prefix = 'http://www.timescolonist.com'
|
url_prefix = 'http://www.timescolonist.com'
|
||||||
description = u'News from Victoria, BC'
|
description = u'News from Victoria, BC'
|
||||||
fp_tag = 'CAN_TC'
|
fp_tag = 'CAN_TC'
|
||||||
|
|
||||||
|
masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'
|
||||||
|
|
||||||
|
|
||||||
url_list = []
|
url_list = []
|
||||||
language = 'en_CA'
|
language = 'en_CA'
|
||||||
__author__ = 'Nick Redding'
|
__author__ = 'Nick Redding'
|
||||||
@ -29,15 +74,21 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||||
'''
|
'''
|
||||||
keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]
|
keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]
|
||||||
remove_tags = [{'class':'comments'},
|
|
||||||
|
def __init__(self, options, log, progress_reporter):
|
||||||
|
self.remove_tags = [{'class':'comments'},
|
||||||
{'id':'photocredit'},
|
{'id':'photocredit'},
|
||||||
dict(name='div', attrs={'class':re.compile('top.controls')}),
|
dict(name='div', attrs={'class':re.compile('top.controls')}),
|
||||||
|
dict(name='div', attrs={'class':re.compile('^comments')}),
|
||||||
dict(name='div', attrs={'class':re.compile('social')}),
|
dict(name='div', attrs={'class':re.compile('social')}),
|
||||||
dict(name='div', attrs={'class':re.compile('tools')}),
|
dict(name='div', attrs={'class':re.compile('tools')}),
|
||||||
dict(name='div', attrs={'class':re.compile('bottom.tools')}),
|
dict(name='div', attrs={'class':re.compile('bottom.tools')}),
|
||||||
dict(name='div', attrs={'class':re.compile('window')}),
|
dict(name='div', attrs={'class':re.compile('window')}),
|
||||||
dict(name='div', attrs={'class':re.compile('related.news.element')})]
|
dict(name='div', attrs={'class':re.compile('related.news.element')})]
|
||||||
|
print("PROFILE NAME = "+options.output_profile.short_name)
|
||||||
|
if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
|
||||||
|
self.remove_tags.append(dict(name='div', attrs={'class':re.compile('image-container')}))
|
||||||
|
BasicNewsRecipe.__init__(self, options, log, progress_reporter)
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
from datetime import timedelta, date
|
from datetime import timedelta, date
|
||||||
@ -122,7 +173,6 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
def preprocess_html(self,soup):
|
def preprocess_html(self,soup):
|
||||||
byline = soup.find('p',attrs={'class':re.compile('ancillary')})
|
byline = soup.find('p',attrs={'class':re.compile('ancillary')})
|
||||||
if byline is not None:
|
if byline is not None:
|
||||||
byline.find('a')
|
|
||||||
authstr = self.tag_to_string(byline,False)
|
authstr = self.tag_to_string(byline,False)
|
||||||
authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
|
authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
|
||||||
authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
|
authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
|
||||||
@ -149,9 +199,10 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
atag = htag.a
|
atag = htag.a
|
||||||
if atag is not None:
|
if atag is not None:
|
||||||
url = atag['href']
|
url = atag['href']
|
||||||
#print("Checking "+url)
|
url = url.strip()
|
||||||
if atag['href'].startswith('/'):
|
# print("Checking >>"+url+'<<\n\r')
|
||||||
url = self.url_prefix+atag['href']
|
if url.startswith('/'):
|
||||||
|
url = self.url_prefix+url
|
||||||
if url in self.url_list:
|
if url in self.url_list:
|
||||||
return
|
return
|
||||||
self.url_list.append(url)
|
self.url_list.append(url)
|
||||||
@ -171,10 +222,10 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
if dtag is not None:
|
if dtag is not None:
|
||||||
description = self.tag_to_string(dtag,False)
|
description = self.tag_to_string(dtag,False)
|
||||||
article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))
|
article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))
|
||||||
#print(sectitle+title+": description = "+description+" URL="+url)
|
print(sectitle+title+": description = "+description+" URL="+url+'\n\r')
|
||||||
|
|
||||||
def add_section_index(self,ans,securl,sectitle):
|
def add_section_index(self,ans,securl,sectitle):
|
||||||
print("Add section url="+self.url_prefix+'/'+securl)
|
print("Add section url="+self.url_prefix+'/'+securl+'\n\r')
|
||||||
try:
|
try:
|
||||||
soup = self.index_to_soup(self.url_prefix+'/'+securl)
|
soup = self.index_to_soup(self.url_prefix+'/'+securl)
|
||||||
except:
|
except:
|
||||||
@ -193,33 +244,7 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
ans = []
|
ans = []
|
||||||
ans = self.add_section_index(ans,'','Web Front Page')
|
for (url,title) in self.section_list:
|
||||||
ans = self.add_section_index(ans,'news/','News Headlines')
|
ans = self.add_section_index(ans,url,title)
|
||||||
ans = self.add_section_index(ans,'news/b-c/','BC News')
|
|
||||||
ans = self.add_section_index(ans,'news/national/','Natioanl News')
|
|
||||||
ans = self.add_section_index(ans,'news/world/','World News')
|
|
||||||
ans = self.add_section_index(ans,'opinion/','Opinion')
|
|
||||||
ans = self.add_section_index(ans,'opinion/letters/','Letters')
|
|
||||||
ans = self.add_section_index(ans,'business/','Business')
|
|
||||||
ans = self.add_section_index(ans,'business/money/','Money')
|
|
||||||
ans = self.add_section_index(ans,'business/technology/','Technology')
|
|
||||||
ans = self.add_section_index(ans,'business/working/','Working')
|
|
||||||
ans = self.add_section_index(ans,'sports/','Sports')
|
|
||||||
ans = self.add_section_index(ans,'sports/hockey/','Hockey')
|
|
||||||
ans = self.add_section_index(ans,'sports/football/','Football')
|
|
||||||
ans = self.add_section_index(ans,'sports/basketball/','Basketball')
|
|
||||||
ans = self.add_section_index(ans,'sports/golf/','Golf')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/','entertainment')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/go/','Go!')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/music/','Music')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/books/','Books')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/Movies/','movies')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/television/','Television')
|
|
||||||
ans = self.add_section_index(ans,'life/','Life')
|
|
||||||
ans = self.add_section_index(ans,'life/health/','Health')
|
|
||||||
ans = self.add_section_index(ans,'life/travel/','Travel')
|
|
||||||
ans = self.add_section_index(ans,'life/driving/','Driving')
|
|
||||||
ans = self.add_section_index(ans,'life/homes/','Homes')
|
|
||||||
ans = self.add_section_index(ans,'life/food-drink/','Food & Drink')
|
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user