Merge from trunk

Charles Haley 2013-01-29 07:27:18 +01:00
commit 3fe0662ea7
2 changed files with 135 additions and 126 deletions

View File

@@ -1,105 +1,46 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 __license__ = 'GPL v3'
 '''
 www.canada.com
 '''
 import re
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
-class CanWestPaper(BasicNewsRecipe):
-    # un-comment the following four lines for the Victoria Times Colonist
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
+class TimesColonist(BasicNewsRecipe):
     title = u'Victoria Times Colonist'
     url_prefix = 'http://www.timescolonist.com'
     description = u'News from Victoria, BC'
     fp_tag = 'CAN_TC'
-    # un-comment the following four lines for the Vancouver Province
-##    title = u'Vancouver Province'
-##    url_prefix = 'http://www.theprovince.com'
-##    description = u'News from Vancouver, BC'
-##    fp_tag = 'CAN_VP'
-    # un-comment the following four lines for the Vancouver Sun
-##    title = u'Vancouver Sun'
-##    url_prefix = 'http://www.vancouversun.com'
-##    description = u'News from Vancouver, BC'
-##    fp_tag = 'CAN_VS'
-    # un-comment the following four lines for the Edmonton Journal
-##    title = u'Edmonton Journal'
-##    url_prefix = 'http://www.edmontonjournal.com'
-##    description = u'News from Edmonton, AB'
-##    fp_tag = 'CAN_EJ'
-    # un-comment the following four lines for the Calgary Herald
-##    title = u'Calgary Herald'
-##    url_prefix = 'http://www.calgaryherald.com'
-##    description = u'News from Calgary, AB'
-##    fp_tag = 'CAN_CH'
-    # un-comment the following four lines for the Regina Leader-Post
-##    title = u'Regina Leader-Post'
-##    url_prefix = 'http://www.leaderpost.com'
-##    description = u'News from Regina, SK'
-##    fp_tag = ''
-    # un-comment the following four lines for the Saskatoon Star-Phoenix
-##    title = u'Saskatoon Star-Phoenix'
-##    url_prefix = 'http://www.thestarphoenix.com'
-##    description = u'News from Saskatoon, SK'
-##    fp_tag = ''
-    # un-comment the following four lines for the Windsor Star
-##    title = u'Windsor Star'
-##    url_prefix = 'http://www.windsorstar.com'
-##    description = u'News from Windsor, ON'
-##    fp_tag = 'CAN_'
-    # un-comment the following four lines for the Ottawa Citizen
-##    title = u'Ottawa Citizen'
-##    url_prefix = 'http://www.ottawacitizen.com'
-##    description = u'News from Ottawa, ON'
-##    fp_tag = 'CAN_OC'
-    # un-comment the following four lines for the Montreal Gazette
-##    title = u'Montreal Gazette'
-##    url_prefix = 'http://www.montrealgazette.com'
-##    description = u'News from Montreal, QC'
-##    fp_tag = 'CAN_MG'
+    url_list = []
     language = 'en_CA'
     __author__ = 'Nick Redding'
     no_stylesheets = True
     timefmt = ' [%b %d]'
+    encoding = 'utf-8'
     extra_css = '''
-                .timestamp { font-size:xx-small; display: block; }
-                #storyheader { font-size: medium; }
-                #storyheader h1 { font-size: x-large; }
-                #storyheader h2 { font-size: large; font-style: italic; }
-                .byline { font-size:xx-small; }
-                #photocaption { font-size: small; font-style: italic }
-                #photocredit { font-size: xx-small; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
+                .byline { font-size:xx-small; font-weight: bold;}
+                h3 { margin-bottom: 6px; }
+                .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
+                '''
+    keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]
     remove_tags = [{'class':'comments'},
-        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
-        dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
-        dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
-        dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
-        dict(name='div', attrs={'class':'rule_grey_solid'}),
-        dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+                   {'id':'photocredit'},
+                   dict(name='div', attrs={'class':re.compile('top.controls')}),
+                   dict(name='div', attrs={'class':re.compile('social')}),
+                   dict(name='div', attrs={'class':re.compile('tools')}),
+                   dict(name='div', attrs={'class':re.compile('bottom.tools')}),
+                   dict(name='div', attrs={'class':re.compile('window')}),
+                   dict(name='div', attrs={'class':re.compile('related.news.element')})]
     def get_cover_url(self):
         from datetime import timedelta, date
-        if self.fp_tag=='':
-            return None
         cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
         br = BasicNewsRecipe.get_browser(self)
         daysback=1
@@ -120,6 +61,18 @@ class CanWestPaper(BasicNewsRecipe):
             cover = None
         return cover
+    def prepare_masthead_image(self, path_to_image, out_path):
+        if self.Kindle_Fire:
+            from calibre.utils.magick import Image, create_canvas
+            img = Image()
+            img.open(path_to_image)
+            width, height = img.size
+            img2 = create_canvas(width, height)
+            img2.compose(img)
+            img2.save(out_path)
+        else:
+            BasicNewsRecipe.prepare_masthead_image(path_to_image, out_path)
     def fixChars(self,string):
         # Replace lsquo (\x91)
         fixed = re.sub("\x91","",string)
@@ -166,55 +119,107 @@ class CanWestPaper(BasicNewsRecipe):
                     a.replaceWith(a.renderContents().decode('cp1252','replace'))
         return soup
-    def preprocess_html(self, soup):
+    def preprocess_html(self,soup):
+        byline = soup.find('p',attrs={'class':re.compile('ancillary')})
+        if byline is not None:
+            byline.find('a')
+            authstr = self.tag_to_string(byline,False)
+            authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
+            authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
+            newdiv = Tag(soup,'div')
+            newdiv.insert(0,authstr)
+            newdiv['class']='byline'
+            byline.replaceWith(newdiv)
+        for caption in soup.findAll('p',attrs={'class':re.compile('caption')}):
+            capstr = self.tag_to_string(caption,False)
+            capstr = re.sub('Photograph by.*$','',capstr, flags=re.IGNORECASE)
+            newdiv = Tag(soup,'div')
+            newdiv.insert(0,capstr)
+            newdiv['class']='caption'
+            caption.replaceWith(newdiv)
+        for ptag in soup.findAll('p'):
+            ptext = self.tag_to_string(ptag,use_alt=False, normalize_whitespace=True)
+            ptext = re.sub(r'\s+','', ptext)
+            if (ptext=='') or (ptext==' '):
+                ptag.extract()
         return self.strip_anchors(soup)
+    raeside = False
+    def handle_articles(self,htag,article_list,sectitle):
+        atag = htag.a
+        if atag is not None:
+            url = atag['href']
+            #print("Checking "+url)
+            if atag['href'].startswith('/'):
+                url = self.url_prefix+atag['href']
+            if url in self.url_list:
+                return
+            self.url_list.append(url)
+            title = self.tag_to_string(atag,False)
+            if 'VIDEO' in title.upper():
+                return
+            if 'GALLERY' in title.upper():
+                return
+            if 'PHOTOS' in title.upper():
+                return
+            if 'RAESIDE' in title.upper():
+                if self.raeside:
+                    return
+                self.raeside = True
+            dtag = htag.findNext('p')
+            description=''
+            if dtag is not None:
+                description = self.tag_to_string(dtag,False)
+            article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))
+            #print(sectitle+title+": description = "+description+" URL="+url)
+    def add_section_index(self,ans,securl,sectitle):
+        print("Add section url="+self.url_prefix+'/'+securl)
+        try:
+            soup = self.index_to_soup(self.url_prefix+'/'+securl)
+        except:
+            return ans
+        mainsoup = soup.find('div',attrs={'class':re.compile('main.content')})
+        article_list = []
+        for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('featured.story')}):
+            for htag in wdiv.findAll('h3'):
+                self.handle_articles(htag,article_list,sectitle)
+        for ladiv in mainsoup.findAll(attrs={'class':re.compile('leading.articles')}):
+            for wdiv in mainsoup.findAll('div',attrs={'class':re.compile('article.row')}):
+                for htag in wdiv.findAll('h2'):
+                    self.handle_articles(htag,article_list,sectitle)
+        ans.append((sectitle,article_list))
+        return ans
     def parse_index(self):
-        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
-        articles = {}
-        key = 'News'
-        ans = ['News']
-        # Find each instance of class="sectiontitle", class="featurecontent"
-        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
-            #self.log(" div class = %s" % divtag['class'])
-            if divtag['class'].startswith('section_title'):
-                # div contains section title
-                if not divtag.h3:
-                    continue
-                key = self.tag_to_string(divtag.h3,False)
-                ans.append(key)
-                self.log("Section name %s" % key)
-                continue
-            # div contains article data
-            h1tag = divtag.find('h1')
-            if not h1tag:
-                continue
-            atag = h1tag.find('a',href=True)
-            if not atag:
-                continue
-            url = self.url_prefix+'/news/todays-paper/'+atag['href']
-            #self.log("Section %s" % key)
-            #self.log("url %s" % url)
-            title = self.tag_to_string(atag,False)
-            #self.log("title %s" % title)
-            pubdate = ''
-            description = ''
-            ptag = divtag.find('p');
-            if ptag:
-                description = self.tag_to_string(ptag,False)
-                #self.log("description %s" % description)
-            author = ''
-            autag = divtag.find('h4')
-            if autag:
-                author = self.tag_to_string(autag,False)
-                #self.log("author %s" % author)
-            if not articles.has_key(key):
-                articles[key] = []
-            articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        ans = []
+        ans = self.add_section_index(ans,'','Web Front Page')
+        ans = self.add_section_index(ans,'news/','News Headlines')
+        ans = self.add_section_index(ans,'news/b-c/','BC News')
+        ans = self.add_section_index(ans,'news/national/','Natioanl News')
+        ans = self.add_section_index(ans,'news/world/','World News')
+        ans = self.add_section_index(ans,'opinion/','Opinion')
+        ans = self.add_section_index(ans,'opinion/letters/','Letters')
+        ans = self.add_section_index(ans,'business/','Business')
+        ans = self.add_section_index(ans,'business/money/','Money')
+        ans = self.add_section_index(ans,'business/technology/','Technology')
+        ans = self.add_section_index(ans,'business/working/','Working')
+        ans = self.add_section_index(ans,'sports/','Sports')
+        ans = self.add_section_index(ans,'sports/hockey/','Hockey')
+        ans = self.add_section_index(ans,'sports/football/','Football')
+        ans = self.add_section_index(ans,'sports/basketball/','Basketball')
+        ans = self.add_section_index(ans,'sports/golf/','Golf')
+        ans = self.add_section_index(ans,'entertainment/','entertainment')
+        ans = self.add_section_index(ans,'entertainment/go/','Go!')
+        ans = self.add_section_index(ans,'entertainment/music/','Music')
+        ans = self.add_section_index(ans,'entertainment/books/','Books')
+        ans = self.add_section_index(ans,'entertainment/Movies/','movies')
+        ans = self.add_section_index(ans,'entertainment/television/','Television')
+        ans = self.add_section_index(ans,'life/','Life')
+        ans = self.add_section_index(ans,'life/health/','Health')
+        ans = self.add_section_index(ans,'life/travel/','Travel')
+        ans = self.add_section_index(ans,'life/driving/','Driving')
+        ans = self.add_section_index(ans,'life/homes/','Homes')
+        ans = self.add_section_index(ans,'life/food-drink/','Food & Drink')
         return ans

View File

@@ -614,10 +614,14 @@ class Amazon(Source):
         return domain
     def clean_downloaded_metadata(self, mi):
-        if mi.title and self.domain in ('com', 'uk'):
+        docase = (
+            mi.language == 'eng' or
+            (mi.is_null('language') and self.domain in {'com', 'uk'})
+        )
+        if mi.title and docase:
             mi.title = fixcase(mi.title)
         mi.authors = fixauthors(mi.authors)
-        if self.domain in ('com', 'uk'):
+        if mi.tags and docase:
             mi.tags = list(map(fixcase, mi.tags))
         mi.isbn = check_isbn(mi.isbn)