calibre/recipes/vic_times.recipe
Kovid Goyal 29cd8d64ea
Change shebangs to python from python2
Also remove a few other miscellaneous references to python2
2020-08-22 18:47:51 +05:30

258 lines
9.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
__license__ = 'GPL v3'
'''
www.canada.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()):
impl = getattr(soup, 'new_tag', None)
if impl is not None:
return impl(name, attrs=dict(attrs))
return Tag(soup, name, attrs=attrs or None)
class TimesColonist(BasicNewsRecipe):
# Customization -- remove sections you don't want.
# If your e-reader is an e-ink Kindle and your output profile is
# set properly this recipe will not include images because the
# resulting file is too large. If you have one of these and want
# images you can set kindle_omit_images = False
# and remove sections (typically the e-ink Kindles will
# work with about a dozen of these, but your mileage may vary).
kindle_omit_images = True
section_list = [
('', 'Web Front Page'),
('news/', 'News Headlines'),
('news/b-c/', 'BC News'),
('news/national/', 'National News'),
('news/world/', 'World News'),
('opinion/', 'Opinion'),
('opinion/letters/', 'Letters'),
('business/', 'Business'),
('business/money/', 'Money'),
('business/technology/', 'Technology'),
('business/working/', 'Working'),
('sports/', 'Sports'),
('sports/hockey/', 'Hockey'),
('sports/football/', 'Football'),
('sports/basketball/', 'Basketball'),
('sports/golf/', 'Golf'),
('entertainment/', 'entertainment'),
('entertainment/go/', 'Go!'),
('entertainment/music/', 'Music'),
('entertainment/books/', 'Books'),
('entertainment/Movies/', 'Movies'),
('entertainment/television/', 'Television'),
('life/', 'Life'),
('life/health/', 'Health'),
('life/travel/', 'Travel'),
('life/driving/', 'Driving'),
('life/homes/', 'Homes'),
('life/food-drink/', 'Food & Drink')
]
title = u'Victoria Times Colonist'
url_prefix = 'http://www.timescolonist.com'
description = u'News from Victoria, BC'
fp_tag = 'CAN_TC'
masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'
url_list = []
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
encoding = 'utf-8'
extra_css = '''
.byline { font-size:xx-small; font-weight: bold;}
h3 { margin-bottom: 6px; }
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
'''
keep_only_tags = [
dict(name='div', attrs={'class': re.compile('main.content')})]
def __init__(self, options, log, progress_reporter):
self.remove_tags = [{'class': 'comments'},
{'id': 'photocredit'},
dict(name='div', attrs={
'class': re.compile('top.controls')}),
dict(name='div', attrs={
'class': re.compile('^comments')}),
dict(name='div', attrs={
'class': re.compile('social')}),
dict(name='div', attrs={
'class': re.compile('tools')}),
dict(name='div', attrs={
'class': re.compile('bottom.tools')}),
dict(name='div', attrs={
'class': re.compile('window')}),
dict(name='div', attrs={'class': re.compile('related.news.element')})]
print("PROFILE NAME = " + options.output_profile.short_name)
if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
self.remove_tags.append(
dict(name='div', attrs={'class': re.compile('image-container')}))
BasicNewsRecipe.__init__(self, options, log, progress_reporter)
def get_cover_url(self):
from datetime import timedelta, date
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + \
str(date.today().day) + '/lg/' + self.fp_tag + '.jpg'
br = BasicNewsRecipe.get_browser(self)
daysback = 1
try:
br.open(cover)
except:
while daysback < 7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + \
str((date.today() - timedelta(days=daysback)).day) + \
'/lg/' + self.fp_tag + '.jpg'
br = BasicNewsRecipe.get_browser(self)
try:
br.open(cover)
except:
daysback = daysback + 1
continue
break
if daysback == 7:
self.log("\nCover unavailable")
cover = None
return cover
def fixChars(self, string):
# Replace lsquo (\x91)
fixed = re.sub("\x91", "", string)
# Replace rsquo (\x92)
fixed = re.sub("\x92", "", fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93", "", fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94", "", fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96", "", fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97", "", fixed)
fixed = re.sub("&#x2019;", "", fixed)
return fixed
def massageNCXText(self, description):
return description
def populate_article_metadata(self, article, soup, first):
if first:
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article, re.sub(
r'links\\link\d+\\', '', picdiv['src']))
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta', attrs={'property': 'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
def strip_anchors(self, soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode(
'cp1252', 'replace'))
return soup
def preprocess_html(self, soup):
byline = soup.find('p', attrs={'class': re.compile('ancillary')})
if byline is not None:
authstr = self.tag_to_string(byline, False)
authstr = re.sub('/ *Times Colonist', '/',
authstr, flags=re.IGNORECASE)
authstr = re.sub('BY */', '', authstr, flags=re.IGNORECASE)
newdiv = new_tag(soup, 'div')
newdiv.insert(0, authstr)
newdiv['class'] = 'byline'
byline.replaceWith(newdiv)
for caption in soup.findAll('p', attrs={'class': re.compile('caption')}):
capstr = self.tag_to_string(caption, False)
capstr = re.sub('Photograph by.*$', '',
capstr, flags=re.IGNORECASE)
newdiv = new_tag(soup, 'div')
newdiv.insert(0, capstr)
newdiv['class'] = 'caption'
caption.replaceWith(newdiv)
for ptag in soup.findAll('p'):
ptext = self.tag_to_string(
ptag, use_alt=False, normalize_whitespace=True)
ptext = re.sub(r'\s+', '', ptext)
if (ptext == '') or (ptext == '&nbsp;'):
ptag.extract()
return self.strip_anchors(soup)
raeside = False
def handle_articles(self, htag, article_list, sectitle):
atag = htag.a
if atag is not None:
url = atag['href']
url = url.strip()
# print("Checking >>"+url+'<<\n\r')
if url.startswith('/'):
url = self.url_prefix + url
if url in self.url_list:
return
self.url_list.append(url)
title = self.tag_to_string(atag, False)
if 'VIDEO' in title.upper():
return
if 'GALLERY' in title.upper():
return
if 'PHOTOS' in title.upper():
return
if 'RAESIDE' in title.upper():
if self.raeside:
return
self.raeside = True
dtag = htag.findNext('p')
description = ''
if dtag is not None:
description = self.tag_to_string(dtag, False)
article_list.append(dict(
title=title, url=url, date='', description=description, author='', content=''))
print(sectitle + title + ": description = " +
description + " URL=" + url + '\n\r')
def add_section_index(self, ans, securl, sectitle):
print("Add section url=" + self.url_prefix + '/' + securl + '\n\r')
try:
soup = self.index_to_soup(self.url_prefix + '/' + securl)
except:
return ans
mainsoup = soup.find(
'div', attrs={'class': re.compile('main.content')})
article_list = []
for wdiv in mainsoup.findAll('div', attrs={'id': re.compile('featured.story')}):
for htag in wdiv.findAll('h3'):
self.handle_articles(htag, article_list, sectitle)
for ladiv in mainsoup.findAll(attrs={'class': re.compile('leading.articles')}):
for wdiv in mainsoup.findAll('div', attrs={'class': re.compile('article.row')}):
for htag in wdiv.findAll('h2'):
self.handle_articles(htag, article_list, sectitle)
ans.append((sectitle, article_list))
return ans
def parse_index(self):
ans = []
for (url, title) in self.section_list:
ans = self.add_section_index(ans, url, title)
return ans