mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Merge from trunk
This commit is contained in:
commit
3fe0662ea7
@ -1,105 +1,46 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
|
||||
'''
|
||||
www.canada.com
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
|
||||
|
||||
|
||||
class CanWestPaper(BasicNewsRecipe):
|
||||
class TimesColonist(BasicNewsRecipe):
|
||||
|
||||
# un-comment the following four lines for the Victoria Times Colonist
|
||||
title = u'Victoria Times Colonist'
|
||||
url_prefix = 'http://www.timescolonist.com'
|
||||
description = u'News from Victoria, BC'
|
||||
fp_tag = 'CAN_TC'
|
||||
|
||||
# un-comment the following four lines for the Vancouver Province
|
||||
## title = u'Vancouver Province'
|
||||
## url_prefix = 'http://www.theprovince.com'
|
||||
## description = u'News from Vancouver, BC'
|
||||
## fp_tag = 'CAN_VP'
|
||||
|
||||
# un-comment the following four lines for the Vancouver Sun
|
||||
## title = u'Vancouver Sun'
|
||||
## url_prefix = 'http://www.vancouversun.com'
|
||||
## description = u'News from Vancouver, BC'
|
||||
## fp_tag = 'CAN_VS'
|
||||
|
||||
# un-comment the following four lines for the Edmonton Journal
|
||||
## title = u'Edmonton Journal'
|
||||
## url_prefix = 'http://www.edmontonjournal.com'
|
||||
## description = u'News from Edmonton, AB'
|
||||
## fp_tag = 'CAN_EJ'
|
||||
|
||||
# un-comment the following four lines for the Calgary Herald
|
||||
## title = u'Calgary Herald'
|
||||
## url_prefix = 'http://www.calgaryherald.com'
|
||||
## description = u'News from Calgary, AB'
|
||||
## fp_tag = 'CAN_CH'
|
||||
|
||||
# un-comment the following four lines for the Regina Leader-Post
|
||||
## title = u'Regina Leader-Post'
|
||||
## url_prefix = 'http://www.leaderpost.com'
|
||||
## description = u'News from Regina, SK'
|
||||
## fp_tag = ''
|
||||
|
||||
# un-comment the following four lines for the Saskatoon Star-Phoenix
|
||||
## title = u'Saskatoon Star-Phoenix'
|
||||
## url_prefix = 'http://www.thestarphoenix.com'
|
||||
## description = u'News from Saskatoon, SK'
|
||||
## fp_tag = ''
|
||||
|
||||
# un-comment the following four lines for the Windsor Star
|
||||
## title = u'Windsor Star'
|
||||
## url_prefix = 'http://www.windsorstar.com'
|
||||
## description = u'News from Windsor, ON'
|
||||
## fp_tag = 'CAN_'
|
||||
|
||||
# un-comment the following four lines for the Ottawa Citizen
|
||||
## title = u'Ottawa Citizen'
|
||||
## url_prefix = 'http://www.ottawacitizen.com'
|
||||
## description = u'News from Ottawa, ON'
|
||||
## fp_tag = 'CAN_OC'
|
||||
|
||||
# un-comment the following four lines for the Montreal Gazette
|
||||
## title = u'Montreal Gazette'
|
||||
## url_prefix = 'http://www.montrealgazette.com'
|
||||
## description = u'News from Montreal, QC'
|
||||
## fp_tag = 'CAN_MG'
|
||||
|
||||
|
||||
url_list = []
|
||||
language = 'en_CA'
|
||||
__author__ = 'Nick Redding'
|
||||
no_stylesheets = True
|
||||
timefmt = ' [%b %d]'
|
||||
encoding = 'utf-8'
|
||||
extra_css = '''
|
||||
.timestamp { font-size:xx-small; display: block; }
|
||||
#storyheader { font-size: medium; }
|
||||
#storyheader h1 { font-size: x-large; }
|
||||
#storyheader h2 { font-size: large; font-style: italic; }
|
||||
.byline { font-size:xx-small; }
|
||||
#photocaption { font-size: small; font-style: italic }
|
||||
#photocredit { font-size: xx-small; }'''
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
|
||||
.byline { font-size:xx-small; font-weight: bold;}
|
||||
h3 { margin-bottom: 6px; }
|
||||
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||
'''
|
||||
keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]
|
||||
remove_tags = [{'class':'comments'},
|
||||
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
|
||||
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
|
||||
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
|
||||
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
|
||||
dict(name='div', attrs={'class':'rule_grey_solid'}),
|
||||
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
|
||||
{'id':'photocredit'},
|
||||
dict(name='div', attrs={'class':re.compile('top.controls')}),
|
||||
dict(name='div', attrs={'class':re.compile('social')}),
|
||||
dict(name='div', attrs={'class':re.compile('tools')}),
|
||||
dict(name='div', attrs={'class':re.compile('bottom.tools')}),
|
||||
dict(name='div', attrs={'class':re.compile('window')}),
|
||||
dict(name='div', attrs={'class':re.compile('related.news.element')})]
|
||||
|
||||
|
||||
def get_cover_url(self):
|
||||
from datetime import timedelta, date
|
||||
if self.fp_tag=='':
|
||||
return None
|
||||
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
|
||||
br = BasicNewsRecipe.get_browser(self)
|
||||
daysback=1
|
||||
@ -120,6 +61,18 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
cover = None
|
||||
return cover
|
||||
|
||||
def prepare_masthead_image(self, path_to_image, out_path):
|
||||
if self.Kindle_Fire:
|
||||
from calibre.utils.magick import Image, create_canvas
|
||||
img = Image()
|
||||
img.open(path_to_image)
|
||||
width, height = img.size
|
||||
img2 = create_canvas(width, height)
|
||||
img2.compose(img)
|
||||
img2.save(out_path)
|
||||
else:
|
||||
BasicNewsRecipe.prepare_masthead_image(path_to_image, out_path)
|
||||
|
||||
def fixChars(self,string):
|
||||
# Replace lsquo (\x91)
|
||||
fixed = re.sub("\x91","‘",string)
|
||||
@ -166,55 +119,107 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
a.replaceWith(a.renderContents().decode('cp1252','replace'))
|
||||
return soup
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
def preprocess_html(self,soup):
|
||||
byline = soup.find('p',attrs={'class':re.compile('ancillary')})
|
||||
if byline is not None:
|
||||
byline.find('a')
|
||||
authstr = self.tag_to_string(byline,False)
|
||||
authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
|
||||
authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
|
||||
newdiv = Tag(soup,'div')
|
||||
newdiv.insert(0,authstr)
|
||||
newdiv['class']='byline'
|
||||
byline.replaceWith(newdiv)
|
||||
for caption in soup.findAll('p',attrs={'class':re.compile('caption')}):
|
||||
capstr = self.tag_to_string(caption,False)
|
||||
capstr = re.sub('Photograph by.*$','',capstr, flags=re.IGNORECASE)
|
||||
newdiv = Tag(soup,'div')
|
||||
newdiv.insert(0,capstr)
|
||||
newdiv['class']='caption'
|
||||
caption.replaceWith(newdiv)
|
||||
for ptag in soup.findAll('p'):
|
||||
ptext = self.tag_to_string(ptag,use_alt=False, normalize_whitespace=True)
|
||||
ptext = re.sub(r'\s+','', ptext)
|
||||
if (ptext=='') or (ptext==' '):
|
||||
ptag.extract()
|
||||
return self.strip_anchors(soup)
|
||||
|
||||
raeside = False
|
||||
def handle_articles(self,htag,article_list,sectitle):
|
||||
atag = htag.a
|
||||
if atag is not None:
|
||||
url = atag['href']
|
||||
#print("Checking "+url)
|
||||
if atag['href'].startswith('/'):
|
||||
url = self.url_prefix+atag['href']
|
||||
if url in self.url_list:
|
||||
return
|
||||
self.url_list.append(url)
|
||||
title = self.tag_to_string(atag,False)
|
||||
if 'VIDEO' in title.upper():
|
||||
return
|
||||
if 'GALLERY' in title.upper():
|
||||
return
|
||||
if 'PHOTOS' in title.upper():
|
||||
return
|
||||
if 'RAESIDE' in title.upper():
|
||||
if self.raeside:
|
||||
return
|
||||
self.raeside = True
|
||||
dtag = htag.findNext('p')
|
||||
description=''
|
||||
if dtag is not None:
|
||||
description = self.tag_to_string(dtag,False)
|
||||
article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))
|
||||
#print(sectitle+title+": description = "+description+" URL="+url)
|
||||
|
||||
def add_section_index(self,ans,securl,sectitle):
|
||||
print("Add section url="+self.url_prefix+'/'+securl)
|
||||
try:
|
||||
soup = self.index_to_soup(self.url_prefix+'/'+securl)
|
||||
except:
|
||||
return ans
|
||||
mainsoup = soup.find('div',attrs={'class':re.compile('main.content')})
|
||||
article_list = []
|
||||
for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('featured.story')}):
|
||||
for htag in wdiv.findAll('h3'):
|
||||
self.handle_articles(htag,article_list,sectitle)
|
||||
for ladiv in mainsoup.findAll(attrs={'class':re.compile('leading.articles')}):
|
||||
for wdiv in mainsoup.findAll('div',attrs={'class':re.compile('article.row')}):
|
||||
for htag in wdiv.findAll('h2'):
|
||||
self.handle_articles(htag,article_list,sectitle)
|
||||
ans.append((sectitle,article_list))
|
||||
return ans
|
||||
|
||||
def parse_index(self):
|
||||
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
|
||||
|
||||
articles = {}
|
||||
key = 'News'
|
||||
ans = ['News']
|
||||
|
||||
# Find each instance of class="sectiontitle", class="featurecontent"
|
||||
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
|
||||
#self.log(" div class = %s" % divtag['class'])
|
||||
if divtag['class'].startswith('section_title'):
|
||||
# div contains section title
|
||||
if not divtag.h3:
|
||||
continue
|
||||
key = self.tag_to_string(divtag.h3,False)
|
||||
ans.append(key)
|
||||
self.log("Section name %s" % key)
|
||||
continue
|
||||
# div contains article data
|
||||
h1tag = divtag.find('h1')
|
||||
if not h1tag:
|
||||
continue
|
||||
atag = h1tag.find('a',href=True)
|
||||
if not atag:
|
||||
continue
|
||||
url = self.url_prefix+'/news/todays-paper/'+atag['href']
|
||||
#self.log("Section %s" % key)
|
||||
#self.log("url %s" % url)
|
||||
title = self.tag_to_string(atag,False)
|
||||
#self.log("title %s" % title)
|
||||
pubdate = ''
|
||||
description = ''
|
||||
ptag = divtag.find('p');
|
||||
if ptag:
|
||||
description = self.tag_to_string(ptag,False)
|
||||
#self.log("description %s" % description)
|
||||
author = ''
|
||||
autag = divtag.find('h4')
|
||||
if autag:
|
||||
author = self.tag_to_string(autag,False)
|
||||
#self.log("author %s" % author)
|
||||
if not articles.has_key(key):
|
||||
articles[key] = []
|
||||
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
|
||||
|
||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
||||
ans = []
|
||||
ans = self.add_section_index(ans,'','Web Front Page')
|
||||
ans = self.add_section_index(ans,'news/','News Headlines')
|
||||
ans = self.add_section_index(ans,'news/b-c/','BC News')
|
||||
ans = self.add_section_index(ans,'news/national/','Natioanl News')
|
||||
ans = self.add_section_index(ans,'news/world/','World News')
|
||||
ans = self.add_section_index(ans,'opinion/','Opinion')
|
||||
ans = self.add_section_index(ans,'opinion/letters/','Letters')
|
||||
ans = self.add_section_index(ans,'business/','Business')
|
||||
ans = self.add_section_index(ans,'business/money/','Money')
|
||||
ans = self.add_section_index(ans,'business/technology/','Technology')
|
||||
ans = self.add_section_index(ans,'business/working/','Working')
|
||||
ans = self.add_section_index(ans,'sports/','Sports')
|
||||
ans = self.add_section_index(ans,'sports/hockey/','Hockey')
|
||||
ans = self.add_section_index(ans,'sports/football/','Football')
|
||||
ans = self.add_section_index(ans,'sports/basketball/','Basketball')
|
||||
ans = self.add_section_index(ans,'sports/golf/','Golf')
|
||||
ans = self.add_section_index(ans,'entertainment/','entertainment')
|
||||
ans = self.add_section_index(ans,'entertainment/go/','Go!')
|
||||
ans = self.add_section_index(ans,'entertainment/music/','Music')
|
||||
ans = self.add_section_index(ans,'entertainment/books/','Books')
|
||||
ans = self.add_section_index(ans,'entertainment/Movies/','movies')
|
||||
ans = self.add_section_index(ans,'entertainment/television/','Television')
|
||||
ans = self.add_section_index(ans,'life/','Life')
|
||||
ans = self.add_section_index(ans,'life/health/','Health')
|
||||
ans = self.add_section_index(ans,'life/travel/','Travel')
|
||||
ans = self.add_section_index(ans,'life/driving/','Driving')
|
||||
ans = self.add_section_index(ans,'life/homes/','Homes')
|
||||
ans = self.add_section_index(ans,'life/food-drink/','Food & Drink')
|
||||
return ans
|
||||
|
||||
|
@ -614,10 +614,14 @@ class Amazon(Source):
|
||||
return domain
|
||||
|
||||
def clean_downloaded_metadata(self, mi):
|
||||
if mi.title and self.domain in ('com', 'uk'):
|
||||
docase = (
|
||||
mi.language == 'eng' or
|
||||
(mi.is_null('language') and self.domain in {'com', 'uk'})
|
||||
)
|
||||
if mi.title and docase:
|
||||
mi.title = fixcase(mi.title)
|
||||
mi.authors = fixauthors(mi.authors)
|
||||
if self.domain in ('com', 'uk'):
|
||||
if mi.tags and docase:
|
||||
mi.tags = list(map(fixcase, mi.tags))
|
||||
mi.isbn = check_isbn(mi.isbn)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user