diff --git a/recipes/business_spectator.recipe b/recipes/business_spectator.recipe
index ef58424c6c..9ed3f1f7ac 100644
--- a/recipes/business_spectator.recipe
+++ b/recipes/business_spectator.recipe
@@ -16,6 +16,7 @@ class BusinessSpectator(BasicNewsRecipe):
     oldest_article = 2
     max_articles_per_feed = 100
     no_stylesheets = True
+    auto_cleanup = True
     #delay = 1
     use_embedded_content = False
     encoding = 'utf8'
@@ -32,11 +33,11 @@ class BusinessSpectator(BasicNewsRecipe):
         ,'linearize_tables': False
     }
 
-    keep_only_tags = [dict(id='storyHeader'), dict(id='body-html')]
+    #keep_only_tags = [dict(id='storyHeader'), dict(id='body-html')]
 
-    remove_tags = [dict(attrs={'class':'hql'})]
+    #remove_tags = [dict(attrs={'class':'hql'})]
 
-    remove_attributes = ['width','height','style']
+    #remove_attributes = ['width','height','style']
 
     feeds = [
         ('Top Stories', 'http://www.businessspectator.com.au/top-stories.rss'),
@@ -46,3 +47,4 @@ class BusinessSpectator(BasicNewsRecipe):
         ('Daily Dossier', 'http://www.businessspectator.com.au/bs.nsf/RSS?readform&type=kgb&cat=dossier'),
         ('Australia', 'http://www.businessspectator.com.au/bs.nsf/RSS?readform&type=region&cat=australia'),
     ]
+
diff --git a/recipes/calgary_herald.recipe b/recipes/calgary_herald.recipe
index 12134bc9a4..74ec104463 100644
--- a/recipes/calgary_herald.recipe
+++ b/recipes/calgary_herald.recipe
@@ -1,35 +1,320 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-
-class CalgaryHerald(BasicNewsRecipe):
-    title = u'Calgary Herald'
-    oldest_article = 3
-    max_articles_per_feed = 100
-
-    feeds = [
-        (u'News', u'http://rss.canada.com/get/?F233'),
-        (u'Calgary', u'http://www.calgaryherald.com/scripts/sp6query.aspx?catalog=cahr&tags=keyword|calgary&output=rss?link=http%3a%2f%2fwww.calgaryherald'),
-        (u'Alberta', u'http://www.calgaryherald.com/scripts/Sp6Query.aspx?catalog=CAHR&tags=Keyword|Alberta&output=rss?link=http%3A%2F%2Fwww.calgaryherald.com%2Fnews%2Falberta%2Findex.html'),
-        (u'Politics', u'http://rss.canada.com/get/?F7551'),
-        (u'National', u'http://rss.canada.com/get/?F7552'),
-        (u'World', u'http://rss.canada.com/get/?F7553'),
-    ]
-    __author__ = 'rty'
-    pubisher = 'Calgary Herald'
-    description = 'Calgary, Alberta, Canada'
-    category = 'News, Calgary, Alberta, Canada'
-
-
-    remove_javascript = True
-    use_embedded_content = False
-    no_stylesheets = True
-    language = 'en_CA'
-    encoding = 'utf-8'
-    conversion_options = {'linearize_tables':True}
-    ##masthead_url = 'http://www.calgaryherald.com/index.html'
-    keep_only_tags = [
-        dict(name='div', attrs={'id':'storyheader'}),
-        dict(name='div', attrs={'id':'storycontent'})
-
-    ]
-    remove_tags_after = {'class':"story_tool_hr"}
-
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+__license__ = 'GPL v3'
+
+'''
+www.canada.com
+'''
+import string, re
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag
+
+
+class CanWestPaper(BasicNewsRecipe):
+
+    postmedia_index_pages = [
+        (u'Headlines',u'/index.html'),
+        (u'Ottawa & Area',u'/news/ottawa/index.html'),
+        (u'Vancouver',u'/news/vancouver/index.html'),
+        (u'Calgary',u'/news/calgary/index.html'),
+        (u'Edmonton',u'/news/edmonton/index.html'),
+        (u'Montreal',u'/news/montreal/index.html'),
+        (u'Fraser Valley',u'/news/fraser-valley/index.html'),
+        (u'British Columbia',u'/news/bc/index.html'),
+        (u'Alberta',u'/news/alberta/index.html'),
+        (u'Canada',u'/news/canada/index.html'),
+ (u'National',u'/news/national/index.html'), + (u'Politics',u'/news/politics/index.html'), + (u'Insight',u'/news/insight/index.html'), + (u'Special Reports',u'/news/specialreports/index.html'), + (u'Gangs',u'/news/gangs/index.html'), + (u'Education',u'/news/education/index.html'), + (u'Health',u'/news/health/index.html'), + (u'Environment',u'/news/environment/index.html'), + (u'World',u'/news/world/index.html'), + (u'Police Blotter',u'/news/crime-and-justice/index.html'), + (u'Crime',u'/news/blotter/index.html'), + (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'), + (u'Diplomatica',u'/news/diplomatica/index.html'), + (u'Opinion',u'/opinion/index.html'), + (u'Columnists',u'/columnists/index.html'), + (u'Editorials',u'/opinion/editorials/index.html'), + (u'Letters',u'/opinion/letters/index.html'), + (u'Business',u'/business/index.html'), + (u'Sports',u'/sports/index.html'), + (u'Arts',u'/entertainment/index.html'), + (u'Life',u'/life/index.html'), + (u'Technology',u'/technology/index.html'), + (u'Travel',u'/travel/index.html'), + (u'Health',u'/health/index.html') + ] + + + # un-comment the following six lines for the Vancouver Province +## title = u'Vancouver Province' +## url_prefix = 'http://www.theprovince.com' +## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg' +## logo_url = 'vplogo.jpg' +## fp_tag = 'CAN_TP' + + # un-comment the following six lines for the Vancouver Sun +## title = u'Vancouver Sun' +## url_prefix = 'http://www.vancouversun.com' +## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg' +## logo_url = 'vslogo.jpg' +## fp_tag = 'CAN_VS' + + # un-comment the following six lines for the Calgary Herald + title = u'Calgary Herald' + url_prefix = 'http://www.calgaryherald.com' + description = u'News from Calgary, AB' + std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg' + logo_url = 'chlogo.jpg' + fp_tag = 'CAN_CH' + + # un-comment the following six lines for the Edmonton Journal +## title = u'Edmonton Journal' +## url_prefix = 'http://www.edmontonjournal.com' +## description = u'News from Edmonton, AB' +## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg' +## logo_url = 'ejlogo.jpg' +## fp_tag = 'CAN_EJ' + + # un-comment the following six lines for the Ottawa Citizen +## title = u'Ottawa Citizen' +## url_prefix = 'http://www.ottawacitizen.com' +## description = u'News from Ottawa, ON' +## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' +## logo_url = 'oclogo.jpg' +## fp_tag = 'CAN_OC' + + # un-comment the following six lines for the Montreal Gazette +## title = u'Montreal Gazette' +## url_prefix = 'http://www.montrealgazette.com' +## description = u'News from Montreal, QC' +## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' +## logo_url = 'mglogo.jpg' +## fp_tag = 'CAN_MG' + + Kindle_Fire=False + masthead_url = std_logo_url + + url_list = [] + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + encoding = 'utf-8' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: small; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; 
font-style: italic; font-weight: normal; }
+        #photocredit { font-size: xx-small; font-weight: normal; }'''
+
+    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
+
+    remove_tags = [{'class':'comments'},
+        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+        dict(name='h2', attrs={'id':'photocredit'}),
+        dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
+        dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
+        dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
+        dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+
+
+    def get_cover_url(self):
+        from datetime import timedelta, datetime, date
+        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
+        br = BasicNewsRecipe.get_browser()
+        daysback=1
+        try:
+            br.open(cover)
+        except:
+            while daysback<7:
+                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
+                br = BasicNewsRecipe.get_browser()
+                try:
+                    br.open(cover)
+                except:
+                    daysback = daysback+1
+                    continue
+                break
+        if daysback==7:
+            self.log("\nCover unavailable")
+            cover = None
+        return cover
+
+    def prepare_masthead_image(self, path_to_image, out_path):
+        if self.Kindle_Fire:
+            from calibre import fit_image
+            from calibre.utils.magick import Image, create_canvas
+            img = Image()
+            img.open(path_to_image)
+            width, height = img.size
+            img2 = create_canvas(width, height)
+            img2.compose(img)
+            img2.save(out_path)
+        else:
+            BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)
+
+    def fixChars(self,string):
+        # Replace lsquo (\x91)
+        fixed = re.sub("\x91","‘",string)
+        # Replace rsquo (\x92)
+        fixed = re.sub("\x92","’",fixed)
+        # Replace ldquo (\x93)
+        fixed = re.sub("\x93","“",fixed)
+        # Replace rdquo (\x94)
+        fixed = re.sub("\x94","”",fixed)
+        # Replace ndash (\x96)
+        fixed = re.sub("\x96","–",fixed)
+        # Replace mdash (\x97)
+        fixed = re.sub("\x97","—",fixed)
+        # Replace the entity &#x2019; (rsquo)
+        fixed = re.sub("&#x2019;","’",fixed)
+        return fixed
+
+    def massageNCXText(self, description):
+        # Kindle TOC descriptions won't render certain characters
+        if description:
+            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+            # Replace '&amp;' with '&'
+            massaged = re.sub("&amp;","&", massaged)
+            return self.fixChars(massaged)
+        else:
+            return description
+
+    def populate_article_metadata(self, article, soup, first):
+        if first:
+            picdiv = soup.find('body').find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+            xtitle = article.text_summary.strip()
+            if len(xtitle) == 0:
+                desc = soup.find('meta',attrs={'property':'og:description'})
+                if desc is not None:
+                    article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup
+
+
+    def preprocess_html(self,soup):
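+        # called by calibre with the soup of each downloaded article page; must return the processed soup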
+        #delete empty id attributes--they screw up the TOC for unknown reasons
+        divtags = soup.findAll('div',attrs={'id':''})
+        if divtags:
+            for div in divtags:
+                del(div['id'])
+
+        pgall = soup.find('div',attrs={'id':'storyphoto'})
+        if pgall is not None: # photo gallery perhaps
+            if (soup.find('div',attrs={'id':'storycontent'}) is None):
+                allpics = Tag(soup,'div')
+                first_img = pgall.find('div','storyimage')
+                if first_img is not None:
+                    first_img.extract()
+                tlist = pgall.find('div',attrs={'id':'relatedthumbs'})
+                if tlist is not None:
+                    for atag in tlist.findAll('a'):
+                        img = Tag(soup,'img')
+                        srcpre, sep, srcpost = atag.img['src'].partition('?')
+                        img['src'] = srcpre
+                        pdesc = Tag(soup,'p')
+                        pdesc.insert(0,atag.img['alt'])
+                        pdesc['class']='photocaption'
+                        div = Tag(soup,'div')
+                        div.insert(0,pdesc)
+                        div.insert(0,img)
+                        allpics.append(div)
+                pgall.replaceWith(allpics)
+
+        for pg in soup.findAll('div',attrs={'id':'storyphoto'}):
+            pg.extract()
+        return self.strip_anchors(soup)
+
+
+
+    def parse_index(self):
+
+        articles = {}
+        ans = []
+
+
+        def handle_article(adiv,key):
+            h1tag = adiv.h1
+            if h1tag is not None:
+                atag = h1tag.a
+                if atag is not None:
+                    url = atag['href']
+                    if atag['href'].startswith('http'):
+                        return
+                    elif atag['href'].startswith('/'):
+                        url = self.url_prefix+atag['href']
+                    else:
+                        url = self.url_prefix+'/'+atag['href']
+                    if url in self.url_list:
+                        return
+                    self.url_list.append(url)
+                    title = self.tag_to_string(atag,False)
+                    if 'VIDEO' in title.upper():
+                        return
+                    if 'GALLERY' in title.upper():
+                        return
+                    if 'PHOTOS' in title.upper():
+                        return
+                    dtag = adiv.find('div','content')
+                    description=''
+                    print("URL "+url)
+                    print("TITLE "+title)
+                    if dtag is not None:
+                        stag = dtag.span
+                        if stag is not None:
+                            if stag['class'] != 'timestamp':
+                                description = self.tag_to_string(stag,False)
+                        else:
+                            description = self.tag_to_string(dtag,False)
+                        print("DESCRIPTION: "+description)
+                    if not articles.has_key(key):
+                        articles[key] = []
+                    articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))
+
+        def parse_web_index(key, keyurl):
+            try:
+                soup = self.index_to_soup(self.url_prefix+keyurl)
+            except:
+                return
+            ans.append(key)
+            mainsoup = soup.find('div','bodywrapper')
+            footer = mainsoup.find(attrs={'id':'footerfeature'})
+            if footer is not None:
+                footer.extract()
+            print("Section: "+key)
+            for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
+                handle_article(wdiv,key)
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
+                for adiv in wdiv.findAll('div','featurecontent'):
+                    handle_article(adiv,key)
+
+        for (k,url) in self.postmedia_index_pages:
+            parse_web_index(k,url)
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return ans
+
diff --git a/recipes/edmonton_journal.recipe b/recipes/edmonton_journal.recipe
index e0c02b7d83..85cc521a81 100644
--- a/recipes/edmonton_journal.recipe
+++ b/recipes/edmonton_journal.recipe
@@ -1,105 +1,141 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-
 __license__ = 'GPL v3'
 
 '''
 www.canada.com
 '''
-
-import re
+import string, re
+from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag
 
 class CanWestPaper(BasicNewsRecipe):
 
-    # un-comment the following four lines for the Victoria Times Colonist
-##    title = u'Victoria Times Colonist'
-##    url_prefix = 'http://www.timescolonist.com'
-##    description = u'News from Victoria, BC'
-##    fp_tag = 'CAN_TC'
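+    # section index pages scanned by parse_index(); each path below is appended to url_prefix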
+    postmedia_index_pages = [
+        (u'Headlines',u'/index.html'),
+        (u'Ottawa & Area',u'/news/ottawa/index.html'),
+        (u'Vancouver',u'/news/vancouver/index.html'),
+        (u'Calgary',u'/news/calgary/index.html'),
+        (u'Edmonton',u'/news/edmonton/index.html'),
+        (u'Montreal',u'/news/montreal/index.html'),
+        (u'Fraser Valley',u'/news/fraser-valley/index.html'),
+        (u'British Columbia',u'/news/bc/index.html'),
+        (u'Alberta',u'/news/alberta/index.html'),
+        (u'Canada',u'/news/canada/index.html'),
+        (u'National',u'/news/national/index.html'),
+        (u'Politics',u'/news/politics/index.html'),
+        (u'Insight',u'/news/insight/index.html'),
+        (u'Special Reports',u'/news/specialreports/index.html'),
+        (u'Gangs',u'/news/gangs/index.html'),
+        (u'Education',u'/news/education/index.html'),
+        (u'Health',u'/news/health/index.html'),
+        (u'Environment',u'/news/environment/index.html'),
+        (u'World',u'/news/world/index.html'),
+        (u'Police Blotter',u'/news/crime-and-justice/index.html'),
+        (u'Crime',u'/news/blotter/index.html'),
+        (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'),
+        (u'Diplomatica',u'/news/diplomatica/index.html'),
+        (u'Opinion',u'/opinion/index.html'),
+        (u'Columnists',u'/columnists/index.html'),
+        (u'Editorials',u'/opinion/editorials/index.html'),
+        (u'Letters',u'/opinion/letters/index.html'),
+        (u'Business',u'/business/index.html'),
+        (u'Sports',u'/sports/index.html'),
+        (u'Arts',u'/entertainment/index.html'),
+        (u'Life',u'/life/index.html'),
+        (u'Technology',u'/technology/index.html'),
+        (u'Travel',u'/travel/index.html'),
+        (u'Health',u'/health/index.html')
+    ]
 
-    # un-comment the following four lines for the Vancouver Province
+
+    # un-comment the following six lines for the Vancouver Province
 ##    title = u'Vancouver Province'
 ##    url_prefix = 'http://www.theprovince.com'
 ##    description = u'News from Vancouver, BC'
-##    fp_tag = 'CAN_VP'
+##    std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
+##    logo_url = 'vplogo.jpg'
+##    fp_tag = 'CAN_TP'
 
-    # un-comment the following four lines for the Vancouver Sun
+    # un-comment the following six lines for the Vancouver Sun
 ##    title = u'Vancouver Sun'
 ##    url_prefix = 'http://www.vancouversun.com'
 ##    description = u'News from Vancouver, BC'
+##    std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
+##    logo_url = 'vslogo.jpg'
 ##    fp_tag = 'CAN_VS'
 
-    # un-comment the following four lines for the Edmonton Journal
-    title = u'Edmonton Journal'
-    url_prefix = 'http://www.edmontonjournal.com'
-    description = u'News from Edmonton, AB'
-    fp_tag = 'CAN_EJ'
-
-    # un-comment the following four lines for the Calgary Herald
+    # un-comment the following six lines for the Calgary Herald
 ##    title = u'Calgary Herald'
 ##    url_prefix = 'http://www.calgaryherald.com'
 ##    description = u'News from Calgary, AB'
+##    std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
+##    logo_url = 'chlogo.jpg'
 ##    fp_tag = 'CAN_CH'
 
-    # un-comment the following four lines for the Regina Leader-Post
-##    title = u'Regina Leader-Post'
-##    url_prefix = 'http://www.leaderpost.com'
-##    description = u'News from Regina, SK'
-##    fp_tag = ''
+    # un-comment the following six lines for the Edmonton Journal
+    title = u'Edmonton Journal'
+    url_prefix = 'http://www.edmontonjournal.com'
+    description = u'News from Edmonton, AB'
+    std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
+    logo_url = 'ejlogo.jpg'
+    fp_tag = 'CAN_EJ'
 
-    # 
un-comment the following four lines for the Saskatoon Star-Phoenix -## title = u'Saskatoon Star-Phoenix' -## url_prefix = 'http://www.thestarphoenix.com' -## description = u'News from Saskatoon, SK' -## fp_tag = '' - - # un-comment the following four lines for the Windsor Star -## title = u'Windsor Star' -## url_prefix = 'http://www.windsorstar.com' -## description = u'News from Windsor, ON' -## fp_tag = 'CAN_' - - # un-comment the following four lines for the Ottawa Citizen + # un-comment the following six lines for the Ottawa Citizen ## title = u'Ottawa Citizen' ## url_prefix = 'http://www.ottawacitizen.com' -## description = u'News from Ottawa, ON' +## description = u'News from Ottawa, ON' +## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' +## logo_url = 'oclogo.jpg' ## fp_tag = 'CAN_OC' - # un-comment the following four lines for the Montreal Gazette + # un-comment the following six lines for the Montreal Gazette ## title = u'Montreal Gazette' ## url_prefix = 'http://www.montrealgazette.com' ## description = u'News from Montreal, QC' +## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' +## logo_url = 'mglogo.jpg' ## fp_tag = 'CAN_MG' + Kindle_Fire=False + masthead_url = std_logo_url + url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True - timefmt = ' [%b %d]' + timefmt = ' [%b %d]' + encoding = 'utf-8' extra_css = ''' .timestamp { font-size:xx-small; display: block; } #storyheader { font-size: medium; } #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } + #storyheader h2 { font-size: small; font-style: italic; } .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' - keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + #photocredit { font-size: xx-small; font-weight: normal; }''' + + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})] + remove_tags = [{'class':'comments'}, dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='h2', attrs={'id':'photocredit'}), dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), dict(name='div', attrs={'class':'rule_grey_solid'}), dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + def get_cover_url(self): - from datetime import timedelta, date - if self.fp_tag=='': - return None + from datetime import timedelta, datetime, date cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' br = BasicNewsRecipe.get_browser() daysback=1 @@ -120,6 +156,19 @@ class CanWestPaper(BasicNewsRecipe): cover = None return cover + def prepare_masthead_image(self, path_to_image, out_path): + if self.Kindle_Fire: + from calibre import fit_image + from calibre.utils.magick import Image, create_canvas + img = Image() + img.open(path_to_image) + 
width, height = img.size + img2 = create_canvas(width, height) + img2.compose(img) + img2.save(out_path) + else: + BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path) + def fixChars(self,string): # Replace lsquo (\x91) fixed = re.sub("\x91","‘",string) @@ -166,55 +215,106 @@ class CanWestPaper(BasicNewsRecipe): a.replaceWith(a.renderContents().decode('cp1252','replace')) return soup - def preprocess_html(self, soup): + + def preprocess_html(self,soup): + #delete empty id attributes--they screw up the TOC for unknown reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + + pgall = soup.find('div',attrs={'id':'storyphoto'}) + if pgall is not None: # photo gallery perhaps + if (soup.find('div',attrs={'id':'storycontent'}) is None): + allpics = Tag(soup,'div') + first_img = pgall.find('div','storyimage') + if first_img is not None: + first_img.extract() + tlist = pgall.find('div',attrs={'id':'relatedthumbs'}) + if tlist is not None: + for atag in tlist.findAll('a'): + img = Tag(soup,'img') + srcpre, sep, srcpost = atag.img['src'].partition('?') + img['src'] = srcpre + pdesc = Tag(soup,'p') + pdesc.insert(0,atag.img['alt']) + pdesc['class']='photocaption' + div = Tag(soup,'div') + div.insert(0,pdesc) + div.insert(0,img) + allpics.append(div) + pgall.replaceWith(allpics) + + for pg in soup.findAll('div',attrs={'id':'storyphoto'}): + pg.extract() return self.strip_anchors(soup) - + def parse_index(self): - soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') articles = {} - key = 'News' - ans = ['News'] + ans = [] - # Find each instance of class="sectiontitle", class="featurecontent" - for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): - #self.log(" div class = %s" % divtag['class']) - if divtag['class'].startswith('section_title'): - # div contains section title - if not divtag.h3: - continue - key = self.tag_to_string(divtag.h3,False) - ans.append(key) - self.log("Section name %s" % key) - continue - # div contains article data - h1tag = divtag.find('h1') - if not h1tag: - continue - atag = h1tag.find('a',href=True) - if not atag: - continue - url = self.url_prefix+'/news/todays-paper/'+atag['href'] - #self.log("Section %s" % key) - #self.log("url %s" % url) - title = self.tag_to_string(atag,False) - #self.log("title %s" % title) - pubdate = '' - description = '' - ptag = divtag.find('p'); - if ptag: - description = self.tag_to_string(ptag,False) - #self.log("description %s" % description) - author = '' - autag = divtag.find('h4') - if autag: - author = self.tag_to_string(autag,False) - #self.log("author %s" % author) - if not articles.has_key(key): - articles[key] = [] - articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + def handle_article(adiv,key): + h1tag = adiv.h1 + if h1tag is not None: + atag = h1tag.a + if atag is not None: + url = atag['href'] + if atag['href'].startswith('http'): + return + elif atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + else: + url = self.url_prefix+'/'+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + dtag = adiv.find('div','content') + description='' + print("URL "+url) + print("TITLE "+title) + if dtag is not None: + stag = dtag.span + if stag is not None: 
+                            if stag['class'] != 'timestamp':
+                                description = self.tag_to_string(stag,False)
+                        else:
+                            description = self.tag_to_string(dtag,False)
+                        print("DESCRIPTION: "+description)
+                    if not articles.has_key(key):
+                        articles[key] = []
+                    articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))
+
+        def parse_web_index(key, keyurl):
+            try:
+                soup = self.index_to_soup(self.url_prefix+keyurl)
+            except:
+                return
+            ans.append(key)
+            mainsoup = soup.find('div','bodywrapper')
+            footer = mainsoup.find(attrs={'id':'footerfeature'})
+            if footer is not None:
+                footer.extract()
+            print("Section: "+key)
+            for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
+                handle_article(wdiv,key)
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
+                for adiv in wdiv.findAll('div','featurecontent'):
+                    handle_article(adiv,key)
+
+        for (k,url) in self.postmedia_index_pages:
+            parse_web_index(k,url)
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
         return ans
+
diff --git a/recipes/montreal_gazette.recipe b/recipes/montreal_gazette.recipe
index 4ebbdbc0a1..0e87322309 100644
--- a/recipes/montreal_gazette.recipe
+++ b/recipes/montreal_gazette.recipe
@@ -1,48 +1,320 @@
 #!/usr/bin/env python
-
+# -*- coding: utf-8 -*-
 __license__ = 'GPL v3'
 
 '''
 www.canada.com
 '''
+import string, re
+from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag
 
 class CanWestPaper(BasicNewsRecipe):
 
-    # un-comment the following three lines for the Montreal Gazette
+    postmedia_index_pages = [
+        (u'Headlines',u'/index.html'),
+        (u'Ottawa & Area',u'/news/ottawa/index.html'),
+        (u'Vancouver',u'/news/vancouver/index.html'),
+        (u'Calgary',u'/news/calgary/index.html'),
+        (u'Edmonton',u'/news/edmonton/index.html'),
+        (u'Montreal',u'/news/montreal/index.html'),
+        (u'Fraser Valley',u'/news/fraser-valley/index.html'),
+        (u'British Columbia',u'/news/bc/index.html'),
+        (u'Alberta',u'/news/alberta/index.html'),
+        (u'Canada',u'/news/canada/index.html'),
+        (u'National',u'/news/national/index.html'),
+        (u'Politics',u'/news/politics/index.html'),
+        (u'Insight',u'/news/insight/index.html'),
+        (u'Special Reports',u'/news/specialreports/index.html'),
+        (u'Gangs',u'/news/gangs/index.html'),
+        (u'Education',u'/news/education/index.html'),
+        (u'Health',u'/news/health/index.html'),
+        (u'Environment',u'/news/environment/index.html'),
+        (u'World',u'/news/world/index.html'),
+        (u'Police Blotter',u'/news/crime-and-justice/index.html'),
+        (u'Crime',u'/news/blotter/index.html'),
+        (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'),
+        (u'Diplomatica',u'/news/diplomatica/index.html'),
+        (u'Opinion',u'/opinion/index.html'),
+        (u'Columnists',u'/columnists/index.html'),
+        (u'Editorials',u'/opinion/editorials/index.html'),
+        (u'Letters',u'/opinion/letters/index.html'),
+        (u'Business',u'/business/index.html'),
+        (u'Sports',u'/sports/index.html'),
+        (u'Arts',u'/entertainment/index.html'),
+        (u'Life',u'/life/index.html'),
+        (u'Technology',u'/technology/index.html'),
+        (u'Travel',u'/travel/index.html'),
+        (u'Health',u'/health/index.html')
+    ]
+
+
+    # un-comment the following six lines for the Vancouver Province
+##    title = u'Vancouver Province'
+##    url_prefix = 'http://www.theprovince.com'
+##    description = u'News from Vancouver, BC'
+## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg' +## logo_url = 'vplogo.jpg' +## fp_tag = 'CAN_TP' + + # un-comment the following six lines for the Vancouver Sun +## title = u'Vancouver Sun' +## url_prefix = 'http://www.vancouversun.com' +## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg' +## logo_url = 'vslogo.jpg' +## fp_tag = 'CAN_VS' + + # un-comment the following six lines for the Calgary Herald +## title = u'Calgary Herald' +## url_prefix = 'http://www.calgaryherald.com' +## description = u'News from Calgary, AB' +## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg' +## logo_url = 'chlogo.jpg' +## fp_tag = 'CAN_CH' + + # un-comment the following six lines for the Edmonton Journal +## title = u'Edmonton Journal' +## url_prefix = 'http://www.edmontonjournal.com' +## description = u'News from Edmonton, AB' +## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg' +## logo_url = 'ejlogo.jpg' +## fp_tag = 'CAN_EJ' + + # un-comment the following six lines for the Ottawa Citizen +## title = u'Ottawa Citizen' +## url_prefix = 'http://www.ottawacitizen.com' +## description = u'News from Ottawa, ON' +## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' +## logo_url = 'oclogo.jpg' +## fp_tag = 'CAN_OC' + + # un-comment the following six lines for the Montreal Gazette title = u'Montreal Gazette' + url_prefix = 'http://www.montrealgazette.com' description = u'News from Montreal, QC' + std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' + logo_url = 'mglogo.jpg' + fp_tag = 'CAN_MG' + Kindle_Fire=False + masthead_url = std_logo_url + url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True - auto_cleanup = True - auto_cleanup_keep = '//*[@id="imageBox"]' - timefmt = ' [%b %d]' + timefmt = ' [%b %d]' + encoding = 'utf-8' extra_css = ''' .timestamp { font-size:xx-small; display: block; } #storyheader { font-size: medium; } #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } + #storyheader h2 { font-size: small; font-style: italic; } .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + #photocredit { font-size: xx-small; font-weight: normal; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})] + + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='h2', attrs={'id':'photocredit'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] - feeds = [ -('News', - 'http://rss.canada.com/get/?F297'), - ('Sports', - 'http://rss.canada.com/get/?F299'), - ('Entertainment', - 
'http://rss.canada.com/get/?F7366'),
-    ('Business',
-     'http://rss.canada.com/get/?F6939'),
-]
+    def get_cover_url(self):
+        from datetime import timedelta, datetime, date
+        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
+        br = BasicNewsRecipe.get_browser()
+        daysback=1
+        try:
+            br.open(cover)
+        except:
+            while daysback<7:
+                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
+                br = BasicNewsRecipe.get_browser()
+                try:
+                    br.open(cover)
+                except:
+                    daysback = daysback+1
+                    continue
+                break
+        if daysback==7:
+            self.log("\nCover unavailable")
+            cover = None
+        return cover
+
+    def prepare_masthead_image(self, path_to_image, out_path):
+        if self.Kindle_Fire:
+            from calibre import fit_image
+            from calibre.utils.magick import Image, create_canvas
+            img = Image()
+            img.open(path_to_image)
+            width, height = img.size
+            img2 = create_canvas(width, height)
+            img2.compose(img)
+            img2.save(out_path)
+        else:
+            BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)
+
+    def fixChars(self,string):
+        # Replace lsquo (\x91)
+        fixed = re.sub("\x91","‘",string)
+        # Replace rsquo (\x92)
+        fixed = re.sub("\x92","’",fixed)
+        # Replace ldquo (\x93)
+        fixed = re.sub("\x93","“",fixed)
+        # Replace rdquo (\x94)
+        fixed = re.sub("\x94","”",fixed)
+        # Replace ndash (\x96)
+        fixed = re.sub("\x96","–",fixed)
+        # Replace mdash (\x97)
+        fixed = re.sub("\x97","—",fixed)
+        # Replace the entity &#x2019; (rsquo)
+        fixed = re.sub("&#x2019;","’",fixed)
+        return fixed
+
+    def massageNCXText(self, description):
+        # Kindle TOC descriptions won't render certain characters
+        if description:
+            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+            # Replace '&amp;' with '&'
+            massaged = re.sub("&amp;","&", massaged)
+            return self.fixChars(massaged)
+        else:
+            return description
+
+    def populate_article_metadata(self, article, soup, first):
+        if first:
+            picdiv = soup.find('body').find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+            xtitle = article.text_summary.strip()
+            if len(xtitle) == 0:
+                desc = soup.find('meta',attrs={'property':'og:description'})
+                if desc is not None:
+                    article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup
-
+    def preprocess_html(self,soup):
+        #delete empty id attributes--they screw up the TOC for unknown reasons
+        divtags = soup.findAll('div',attrs={'id':''})
+        if divtags:
+            for div in divtags:
+                del(div['id'])
+
+        pgall = soup.find('div',attrs={'id':'storyphoto'})
+        if pgall is not None: # photo gallery perhaps
+            if (soup.find('div',attrs={'id':'storycontent'}) is None):
+                allpics = Tag(soup,'div')
+                first_img = pgall.find('div','storyimage')
+                if first_img is not None:
+                    first_img.extract()
+                tlist = pgall.find('div',attrs={'id':'relatedthumbs'})
+                if tlist is not None:
+                    for atag in tlist.findAll('a'):
+                        img = Tag(soup,'img')
+                        srcpre, sep, srcpost = atag.img['src'].partition('?')
+                        img['src'] = srcpre
+                        pdesc = Tag(soup,'p')
+                        pdesc.insert(0,atag.img['alt'])
+                        pdesc['class']='photocaption'
+                        div = Tag(soup,'div')
+                        div.insert(0,pdesc)
+                        div.insert(0,img)
+                        allpics.append(div)
+                pgall.replaceWith(allpics)
+
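+        # extract any photo-gallery divs that remain so their images are not duplicated in the story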
+        for pg in soup.findAll('div',attrs={'id':'storyphoto'}):
+            pg.extract()
+        return self.strip_anchors(soup)
+
+
+
+    def parse_index(self):
+
+        articles = {}
+        ans = []
+
+
+        def handle_article(adiv,key):
+            h1tag = adiv.h1
+            if h1tag is not None:
+                atag = h1tag.a
+                if atag is not None:
+                    url = atag['href']
+                    if atag['href'].startswith('http'):
+                        return
+                    elif atag['href'].startswith('/'):
+                        url = self.url_prefix+atag['href']
+                    else:
+                        url = self.url_prefix+'/'+atag['href']
+                    if url in self.url_list:
+                        return
+                    self.url_list.append(url)
+                    title = self.tag_to_string(atag,False)
+                    if 'VIDEO' in title.upper():
+                        return
+                    if 'GALLERY' in title.upper():
+                        return
+                    if 'PHOTOS' in title.upper():
+                        return
+                    dtag = adiv.find('div','content')
+                    description=''
+                    print("URL "+url)
+                    print("TITLE "+title)
+                    if dtag is not None:
+                        stag = dtag.span
+                        if stag is not None:
+                            if stag['class'] != 'timestamp':
+                                description = self.tag_to_string(stag,False)
+                        else:
+                            description = self.tag_to_string(dtag,False)
+                        print("DESCRIPTION: "+description)
+                    if not articles.has_key(key):
+                        articles[key] = []
+                    articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))
+
+        def parse_web_index(key, keyurl):
+            try:
+                soup = self.index_to_soup(self.url_prefix+keyurl)
+            except:
+                return
+            ans.append(key)
+            mainsoup = soup.find('div','bodywrapper')
+            footer = mainsoup.find(attrs={'id':'footerfeature'})
+            if footer is not None:
+                footer.extract()
+            print("Section: "+key)
+            for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
+                handle_article(wdiv,key)
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
+                for adiv in wdiv.findAll('div','featurecontent'):
+                    handle_article(adiv,key)
+
+        for (k,url) in self.postmedia_index_pages:
+            parse_web_index(k,url)
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return ans
+
diff --git a/recipes/ottawa_citizen.recipe b/recipes/ottawa_citizen.recipe
index 32d5567d6d..5a53bbbab8 100644
--- a/recipes/ottawa_citizen.recipe
+++ b/recipes/ottawa_citizen.recipe
@@ -1,105 +1,141 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-
 __license__ = 'GPL v3'
 
 '''
 www.canada.com
 '''
-
-import re
+import string, re
+from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag
 
 class CanWestPaper(BasicNewsRecipe):
 
-    # un-comment the following four lines for the Victoria Times Colonist
-##    title = u'Victoria Times Colonist'
-##    url_prefix = 'http://www.timescolonist.com'
-##    description = u'News from Victoria, BC'
-##    fp_tag = 'CAN_TC'
+    postmedia_index_pages = [
+        (u'Headlines',u'/index.html'),
+        (u'Ottawa & Area',u'/news/ottawa/index.html'),
+        (u'Vancouver',u'/news/vancouver/index.html'),
+        (u'Calgary',u'/news/calgary/index.html'),
+        (u'Edmonton',u'/news/edmonton/index.html'),
+        (u'Montreal',u'/news/montreal/index.html'),
+        (u'Fraser Valley',u'/news/fraser-valley/index.html'),
+        (u'British Columbia',u'/news/bc/index.html'),
+        (u'Alberta',u'/news/alberta/index.html'),
+        (u'Canada',u'/news/canada/index.html'),
+        (u'National',u'/news/national/index.html'),
+        (u'Politics',u'/news/politics/index.html'),
+        (u'Insight',u'/news/insight/index.html'),
+        (u'Special Reports',u'/news/specialreports/index.html'),
+        
(u'Gangs',u'/news/gangs/index.html'), + (u'Education',u'/news/education/index.html'), + (u'Health',u'/news/health/index.html'), + (u'Environment',u'/news/environment/index.html'), + (u'World',u'/news/world/index.html'), + (u'Police Blotter',u'/news/crime-and-justice/index.html'), + (u'Crime',u'/news/blotter/index.html'), + (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'), + (u'Diplomatica',u'/news/diplomatica/index.html'), + (u'Opinion',u'/opinion/index.html'), + (u'Columnists',u'/columnists/index.html'), + (u'Editorials',u'/opinion/editorials/index.html'), + (u'Letters',u'/opinion/letters/index.html'), + (u'Business',u'/business/index.html'), + (u'Sports',u'/sports/index.html'), + (u'Arts',u'/entertainment/index.html'), + (u'Life',u'/life/index.html'), + (u'Technology',u'/technology/index.html'), + (u'Travel',u'/travel/index.html'), + (u'Health',u'/health/index.html') + ] - # un-comment the following four lines for the Vancouver Province + + # un-comment the following six lines for the Vancouver Province ## title = u'Vancouver Province' ## url_prefix = 'http://www.theprovince.com' ## description = u'News from Vancouver, BC' -## fp_tag = 'CAN_VP' +## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg' +## logo_url = 'vplogo.jpg' +## fp_tag = 'CAN_TP' - # un-comment the following four lines for the Vancouver Sun + # un-comment the following six lines for the Vancouver Sun ## title = u'Vancouver Sun' ## url_prefix = 'http://www.vancouversun.com' ## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg' +## logo_url = 'vslogo.jpg' ## fp_tag = 'CAN_VS' - # un-comment the following four lines for the Edmonton Journal -## title = u'Edmonton Journal' -## url_prefix = 'http://www.edmontonjournal.com' -## description = u'News from Edmonton, AB' -## fp_tag = 'CAN_EJ' - - # un-comment the following four lines for the Calgary Herald + # un-comment the following six lines for the Calgary Herald ## title = u'Calgary Herald' ## url_prefix = 'http://www.calgaryherald.com' ## description = u'News from Calgary, AB' +## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg' +## logo_url = 'chlogo.jpg' ## fp_tag = 'CAN_CH' - # un-comment the following four lines for the Regina Leader-Post -## title = u'Regina Leader-Post' -## url_prefix = 'http://www.leaderpost.com' -## description = u'News from Regina, SK' -## fp_tag = '' + # un-comment the following six lines for the Edmonton Journal +## title = u'Edmonton Journal' +## url_prefix = 'http://www.edmontonjournal.com' +## description = u'News from Edmonton, AB' +## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg' +## logo_url = 'ejlogo.jpg' +## fp_tag = 'CAN_EJ' - # un-comment the following four lines for the Saskatoon Star-Phoenix -## title = u'Saskatoon Star-Phoenix' -## url_prefix = 'http://www.thestarphoenix.com' -## description = u'News from Saskatoon, SK' -## fp_tag = '' - - # un-comment the following four lines for the Windsor Star -## title = u'Windsor Star' -## url_prefix = 'http://www.windsorstar.com' -## description = u'News from Windsor, ON' -## fp_tag = 'CAN_' - - # un-comment the following four lines for the Ottawa Citizen + # un-comment the following six lines for the Ottawa Citizen title = u'Ottawa Citizen' url_prefix = 'http://www.ottawacitizen.com' - description = u'News from Ottawa, ON' + description = u'News from Ottawa, ON' + std_logo_url = 
'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' + logo_url = 'oclogo.jpg' fp_tag = 'CAN_OC' - # un-comment the following four lines for the Montreal Gazette + # un-comment the following six lines for the Montreal Gazette ## title = u'Montreal Gazette' ## url_prefix = 'http://www.montrealgazette.com' ## description = u'News from Montreal, QC' +## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' +## logo_url = 'mglogo.jpg' ## fp_tag = 'CAN_MG' + Kindle_Fire=False + masthead_url = std_logo_url + url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True - timefmt = ' [%b %d]' + timefmt = ' [%b %d]' + encoding = 'utf-8' extra_css = ''' .timestamp { font-size:xx-small; display: block; } #storyheader { font-size: medium; } #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } + #storyheader h2 { font-size: small; font-style: italic; } .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' - keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + #photocredit { font-size: xx-small; font-weight: normal; }''' + + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})] + remove_tags = [{'class':'comments'}, dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='h2', attrs={'id':'photocredit'}), dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), dict(name='div', attrs={'class':'rule_grey_solid'}), dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + def get_cover_url(self): - from datetime import timedelta, date - if self.fp_tag=='': - return None + from datetime import timedelta, datetime, date cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' br = BasicNewsRecipe.get_browser() daysback=1 @@ -120,6 +156,19 @@ class CanWestPaper(BasicNewsRecipe): cover = None return cover + def prepare_masthead_image(self, path_to_image, out_path): + if self.Kindle_Fire: + from calibre import fit_image + from calibre.utils.magick import Image, create_canvas + img = Image() + img.open(path_to_image) + width, height = img.size + img2 = create_canvas(width, height) + img2.compose(img) + img2.save(out_path) + else: + BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path) + def fixChars(self,string): # Replace lsquo (\x91) fixed = re.sub("\x91","‘",string) @@ -166,55 +215,106 @@ class CanWestPaper(BasicNewsRecipe): a.replaceWith(a.renderContents().decode('cp1252','replace')) return soup - def preprocess_html(self, soup): + + def preprocess_html(self,soup): + #delete empty id attributes--they screw up the TOC for unknown reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + + pgall = soup.find('div',attrs={'id':'storyphoto'}) + if pgall is not None: # 
photo gallery perhaps + if (soup.find('div',attrs={'id':'storycontent'}) is None): + allpics = Tag(soup,'div') + first_img = pgall.find('div','storyimage') + if first_img is not None: + first_img.extract() + tlist = pgall.find('div',attrs={'id':'relatedthumbs'}) + if tlist is not None: + for atag in tlist.findAll('a'): + img = Tag(soup,'img') + srcpre, sep, srcpost = atag.img['src'].partition('?') + img['src'] = srcpre + pdesc = Tag(soup,'p') + pdesc.insert(0,atag.img['alt']) + pdesc['class']='photocaption' + div = Tag(soup,'div') + div.insert(0,pdesc) + div.insert(0,img) + allpics.append(div) + pgall.replaceWith(allpics) + + for pg in soup.findAll('div',attrs={'id':'storyphoto'}): + pg.extract() return self.strip_anchors(soup) - + def parse_index(self): - soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') articles = {} - key = 'News' - ans = ['News'] + ans = [] - # Find each instance of class="sectiontitle", class="featurecontent" - for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): - #self.log(" div class = %s" % divtag['class']) - if divtag['class'].startswith('section_title'): - # div contains section title - if not divtag.h3: - continue - key = self.tag_to_string(divtag.h3,False) - ans.append(key) - self.log("Section name %s" % key) - continue - # div contains article data - h1tag = divtag.find('h1') - if not h1tag: - continue - atag = h1tag.find('a',href=True) - if not atag: - continue - url = self.url_prefix+'/news/todays-paper/'+atag['href'] - #self.log("Section %s" % key) - #self.log("url %s" % url) - title = self.tag_to_string(atag,False) - #self.log("title %s" % title) - pubdate = '' - description = '' - ptag = divtag.find('p'); - if ptag: - description = self.tag_to_string(ptag,False) - #self.log("description %s" % description) - author = '' - autag = divtag.find('h4') - if autag: - author = self.tag_to_string(autag,False) - #self.log("author %s" % author) - if not articles.has_key(key): - articles[key] = [] - articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + def handle_article(adiv,key): + h1tag = adiv.h1 + if h1tag is not None: + atag = h1tag.a + if atag is not None: + url = atag['href'] + if atag['href'].startswith('http'): + return + elif atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + else: + url = self.url_prefix+'/'+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + dtag = adiv.find('div','content') + description='' + print("URL "+url) + print("TITLE "+title) + if dtag is not None: + stag = dtag.span + if stag is not None: + if stag['class'] != 'timestamp': + description = self.tag_to_string(stag,False) + else: + description = self.tag_to_string(dtag,False) + print("DESCRIPTION: "+description) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + def parse_web_index(key, keyurl): + try: + soup = self.index_to_soup(self.url_prefix+keyurl) + except: + return + ans.append(key) + mainsoup = soup.find('div','bodywrapper') + footer = mainsoup.find(attrs={'id':'footerfeature'}) + if footer is not None: + footer.extract() + print("Section: "+key) + for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}): 
+                handle_article(wdiv,key)
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
+                for adiv in wdiv.findAll('div','featurecontent'):
+                    handle_article(adiv,key)
+
+        for (k,url) in self.postmedia_index_pages:
+            parse_web_index(k,url)
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
         return ans
+
diff --git a/recipes/vancouver_provice.recipe b/recipes/vancouver_provice.recipe
index 9375670c59..1e41591a79 100644
--- a/recipes/vancouver_provice.recipe
+++ b/recipes/vancouver_provice.recipe
@@ -1,136 +1,320 @@
 #!/usr/bin/env python
-
+# -*- coding: utf-8 -*-
 __license__ = 'GPL v3'
 
 '''
 www.canada.com
 '''
+import string, re
+from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag
 
 class CanWestPaper(BasicNewsRecipe):
 
-    # un-comment the following three lines for the Vancouver Province
+    postmedia_index_pages = [
+        (u'Headlines',u'/index.html'),
+        (u'Ottawa & Area',u'/news/ottawa/index.html'),
+        (u'Vancouver',u'/news/vancouver/index.html'),
+        (u'Calgary',u'/news/calgary/index.html'),
+        (u'Edmonton',u'/news/edmonton/index.html'),
+        (u'Montreal',u'/news/montreal/index.html'),
+        (u'Fraser Valley',u'/news/fraser-valley/index.html'),
+        (u'British Columbia',u'/news/bc/index.html'),
+        (u'Alberta',u'/news/alberta/index.html'),
+        (u'Canada',u'/news/canada/index.html'),
+        (u'National',u'/news/national/index.html'),
+        (u'Politics',u'/news/politics/index.html'),
+        (u'Insight',u'/news/insight/index.html'),
+        (u'Special Reports',u'/news/specialreports/index.html'),
+        (u'Gangs',u'/news/gangs/index.html'),
+        (u'Education',u'/news/education/index.html'),
+        (u'Health',u'/news/health/index.html'),
+        (u'Environment',u'/news/environment/index.html'),
+        (u'World',u'/news/world/index.html'),
+        (u'Police Blotter',u'/news/crime-and-justice/index.html'),
+        (u'Crime',u'/news/blotter/index.html'),
+        (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'),
+        (u'Diplomatica',u'/news/diplomatica/index.html'),
+        (u'Opinion',u'/opinion/index.html'),
+        (u'Columnists',u'/columnists/index.html'),
+        (u'Editorials',u'/opinion/editorials/index.html'),
+        (u'Letters',u'/opinion/letters/index.html'),
+        (u'Business',u'/business/index.html'),
+        (u'Sports',u'/sports/index.html'),
+        (u'Arts',u'/entertainment/index.html'),
+        (u'Life',u'/life/index.html'),
+        (u'Technology',u'/technology/index.html'),
+        (u'Travel',u'/travel/index.html'),
+        (u'Health',u'/health/index.html')
+    ]
+
+
+    # un-comment the following six lines for the Vancouver Province
     title = u'Vancouver Province'
     url_prefix = 'http://www.theprovince.com'
     description = u'News from Vancouver, BC'
+    std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
+    logo_url = 'vplogo.jpg'
+    fp_tag = 'CAN_TP'
 
-    # un-comment the following three lines for the Vancouver Sun
-    #title = u'Vancouver Sun'
-    #url_prefix = 'http://www.vancouversun.com'
-    #description = u'News from Vancouver, BC'
+    # un-comment the following six lines for the Vancouver Sun
+##    title = u'Vancouver Sun'
+##    url_prefix = 'http://www.vancouversun.com'
+##    description = u'News from Vancouver, BC'
+##    std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
+##    logo_url = 'vslogo.jpg'
+##    fp_tag = 'CAN_VS'
 
-    # un-comment the following three lines for the Edmonton Journal
-    #title = u'Edmonton Journal'
-    #url_prefix = 
'http://www.edmontonjournal.com' - #description = u'News from Edmonton, AB' + # un-comment the following six lines for the Calgary Herald +## title = u'Calgary Herald' +## url_prefix = 'http://www.calgaryherald.com' +## description = u'News from Calgary, AB' +## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg' +## logo_url = 'chlogo.jpg' +## fp_tag = 'CAN_CH' - # un-comment the following three lines for the Calgary Herald - #title = u'Calgary Herald' - #url_prefix = 'http://www.calgaryherald.com' - #description = u'News from Calgary, AB' + # un-comment the following six lines for the Edmonton Journal +## title = u'Edmonton Journal' +## url_prefix = 'http://www.edmontonjournal.com' +## description = u'News from Edmonton, AB' +## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg' +## logo_url = 'ejlogo.jpg' +## fp_tag = 'CAN_EJ' - # un-comment the following three lines for the Regina Leader-Post - #title = u'Regina Leader-Post' - #url_prefix = 'http://www.leaderpost.com' - #description = u'News from Regina, SK' + # un-comment the following six lines for the Ottawa Citizen +## title = u'Ottawa Citizen' +## url_prefix = 'http://www.ottawacitizen.com' +## description = u'News from Ottawa, ON' +## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' +## logo_url = 'oclogo.jpg' +## fp_tag = 'CAN_OC' - # un-comment the following three lines for the Saskatoon Star-Phoenix - #title = u'Saskatoon Star-Phoenix' - #url_prefix = 'http://www.thestarphoenix.com' - #description = u'News from Saskatoon, SK' - - # un-comment the following three lines for the Windsor Star - #title = u'Windsor Star' - #url_prefix = 'http://www.windsorstar.com' - #description = u'News from Windsor, ON' - - # un-comment the following three lines for the Ottawa Citizen - #title = u'Ottawa Citizen' - #url_prefix = 'http://www.ottawacitizen.com' - #description = u'News from Ottawa, ON' - - # un-comment the following three lines for the Montreal Gazette - #title = u'Montreal Gazette' - #url_prefix = 'http://www.montrealgazette.com' - #description = u'News from Montreal, QC' + # un-comment the following six lines for the Montreal Gazette +## title = u'Montreal Gazette' +## url_prefix = 'http://www.montrealgazette.com' +## description = u'News from Montreal, QC' +## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' +## logo_url = 'mglogo.jpg' +## fp_tag = 'CAN_MG' + Kindle_Fire=False + masthead_url = std_logo_url + url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True - timefmt = ' [%b %d]' + timefmt = ' [%b %d]' + encoding = 'utf-8' extra_css = ''' .timestamp { font-size:xx-small; display: block; } #storyheader { font-size: medium; } #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } + #storyheader h2 { font-size: small; font-style: italic; } .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' - keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + #photocredit { font-size: xx-small; font-weight: normal; }''' + + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', 
attrs={'id':'storyphoto'})]
+
     remove_tags = [{'class':'comments'},
         dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+        dict(name='h2', attrs={'id':'photocredit'}),
         dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
         dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
         dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
         dict(name='div', attrs={'class':'rule_grey_solid'}),
         dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+
+    def get_cover_url(self):
+        from datetime import timedelta, datetime, date
+        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
+        br = BasicNewsRecipe.get_browser()
+        daysback=1
+        try:
+            br.open(cover)
+        except:
+            while daysback<7:
+                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
+                br = BasicNewsRecipe.get_browser()
+                try:
+                    br.open(cover)
+                except:
+                    daysback = daysback+1
+                    continue
+                break
+        if daysback==7:
+            self.log("\nCover unavailable")
+            cover = None
+        return cover
+
+    def prepare_masthead_image(self, path_to_image, out_path):
+        if self.Kindle_Fire:
+            from calibre import fit_image
+            from calibre.utils.magick import Image, create_canvas
+            img = Image()
+            img.open(path_to_image)
+            width, height = img.size
+            img2 = create_canvas(width, height)
+            img2.compose(img)
+            img2.save(out_path)
+        else:
+            BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)
+
+    def fixChars(self,string):
+        # Replace lsquo (\x91)
+        fixed = re.sub("\x91","‘",string)
+        # Replace rsquo (\x92)
+        fixed = re.sub("\x92","’",fixed)
+        # Replace ldquo (\x93)
+        fixed = re.sub("\x93","“",fixed)
+        # Replace rdquo (\x94)
+        fixed = re.sub("\x94","”",fixed)
+        # Replace ndash (\x96)
+        fixed = re.sub("\x96","–",fixed)
+        # Replace mdash (\x97)
+        fixed = re.sub("\x97","—",fixed)
+        # Replace the entity &#x2019; (rsquo)
+        fixed = re.sub("&#x2019;","’",fixed)
+        return fixed
+
+    def massageNCXText(self, description):
+        # Kindle TOC descriptions won't render certain characters
+        if description:
+            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+            # Replace '&amp;' with '&'
+            massaged = re.sub("&amp;","&", massaged)
+            return self.fixChars(massaged)
+        else:
+            return description
+
+    def populate_article_metadata(self, article, soup, first):
+        if first:
+            picdiv = soup.find('body').find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+            xtitle = article.text_summary.strip()
+            if len(xtitle) == 0:
+                desc = soup.find('meta',attrs={'property':'og:description'})
+                if desc is not None:
+                    article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup
+
     def preprocess_html(self,soup):
-        #delete iempty id attributes--they screw up the TOC for unknow reasons
+        #delete empty id attributes--they screw up the TOC for unknown reasons
         divtags = soup.findAll('div',attrs={'id':''})
         if divtags:
             for div in divtags:
                 del(div['id'])
-        return soup
 
+        pgall = soup.find('div',attrs={'id':'storyphoto'})
+        if pgall is not None: # photo gallery perhaps
+            if 
(soup.find('div',attrs={'id':'storycontent'}) is None): + allpics = Tag(soup,'div') + first_img = pgall.find('div','storyimage') + if first_img is not None: + first_img.extract() + tlist = pgall.find('div',attrs={'id':'relatedthumbs'}) + if tlist is not None: + for atag in tlist.findAll('a'): + img = Tag(soup,'img') + srcpre, sep, srcpost = atag.img['src'].partition('?') + img['src'] = srcpre + pdesc = Tag(soup,'p') + pdesc.insert(0,atag.img['alt']) + pdesc['class']='photocaption' + div = Tag(soup,'div') + div.insert(0,pdesc) + div.insert(0,img) + allpics.append(div) + pgall.replaceWith(allpics) + + for pg in soup.findAll('div',attrs={'id':'storyphoto'}): + pg.extract() + return self.strip_anchors(soup) + + def parse_index(self): - soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') articles = {} - key = 'News' - ans = ['News'] + ans = [] - # Find each instance of class="sectiontitle", class="featurecontent" - for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): - #self.log(" div class = %s" % divtag['class']) - if divtag['class'].startswith('section_title'): - # div contains section title - if not divtag.h3: - continue - key = self.tag_to_string(divtag.h3,False) - ans.append(key) - self.log("Section name %s" % key) - continue - # div contains article data - h1tag = divtag.find('h1') - if not h1tag: - continue - atag = h1tag.find('a',href=True) - if not atag: - continue - url = self.url_prefix+'/news/todays-paper/'+atag['href'] - #self.log("Section %s" % key) - #self.log("url %s" % url) - title = self.tag_to_string(atag,False) - #self.log("title %s" % title) - pubdate = '' - description = '' - ptag = divtag.find('p'); - if ptag: - description = self.tag_to_string(ptag,False) - #self.log("description %s" % description) - author = '' - autag = divtag.find('h4') - if autag: - author = self.tag_to_string(autag,False) - #self.log("author %s" % author) - if not articles.has_key(key): - articles[key] = [] - articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + def handle_article(adiv,key): + h1tag = adiv.h1 + if h1tag is not None: + atag = h1tag.a + if atag is not None: + url = atag['href'] + if atag['href'].startswith('http'): + return + elif atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + else: + url = self.url_prefix+'/'+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + dtag = adiv.find('div','content') + description='' + print("URL "+url) + print("TITLE "+title) + if dtag is not None: + stag = dtag.span + if stag is not None: + if stag['class'] != 'timestamp': + description = self.tag_to_string(stag,False) + else: + description = self.tag_to_string(dtag,False) + print("DESCRIPTION: "+description) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + def parse_web_index(key, keyurl): + try: + soup = self.index_to_soup(self.url_prefix+keyurl) + except: + return + ans.append(key) + mainsoup = soup.find('div','bodywrapper') + footer = mainsoup.find(attrs={'id':'footerfeature'}) + if footer is not None: + footer.extract() + print("Section: "+key) + for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}): + 
handle_article(wdiv,key) + wdiv.extract() + for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}): + for adiv in wdiv.findAll('div','featurecontent'): + handle_article(adiv,key) + + for (k,url) in self.postmedia_index_pages: + parse_web_index(k,url) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] return ans + diff --git a/recipes/vancouver_sun.recipe b/recipes/vancouver_sun.recipe index 98926e4ad8..4cc3c478e4 100644 --- a/recipes/vancouver_sun.recipe +++ b/recipes/vancouver_sun.recipe @@ -1,105 +1,141 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - __license__ = 'GPL v3' ''' www.canada.com ''' - -import re +import string, re +from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup + +import string, re +from calibre import strftime +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class CanWestPaper(BasicNewsRecipe): - # un-comment the following four lines for the Victoria Times Colonist -## title = u'Victoria Times Colonist' -## url_prefix = 'http://www.timescolonist.com' -## description = u'News from Victoria, BC' -## fp_tag = 'CAN_TC' + postmedia_index_pages = [ + (u'Headlines',u'/index.html'), + (u'Ottawa & Area',u'/news/ottawa/index.html'), + (u'Vancouver',u'/news/vancouver/index.html'), + (u'Calgary',u'/news/calgary/index.html'), + (u'Edmonton',u'/news/edmonton/index.html'), + (u'Montreal',u'/news/montreal/index.html'), + (u'Fraser Valley',u'/news/fraser-valley/index.html'), + (u'British Columbia',u'/news/bc/index.html'), + (u'Alberta',u'/news/alberta/index.html'), + (u'Canada',u'/news/canada/index.html'), + (u'National',u'/news/national/index.html'), + (u'Politics',u'/news/politics/index.html'), + (u'Insight',u'/news/insight/index.html'), + (u'Special Reports',u'/news/specialreports/index.html'), + (u'Gangs',u'/news/gangs/index.html'), + (u'Education',u'/news/education/index.html'), + (u'Health',u'/news/health/index.html'), + (u'Environment',u'/news/environment/index.html'), + (u'World',u'/news/world/index.html'), + (u'Police Blotter',u'/news/crime-and-justice/index.html'), + (u'Crime',u'/news/blotter/index.html'), + (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'), + (u'Diplomatica',u'/news/diplomatica/index.html'), + (u'Opinion',u'/opinion/index.html'), + (u'Columnists',u'/columnists/index.html'), + (u'Editorials',u'/opinion/editorials/index.html'), + (u'Letters',u'/opinion/letters/index.html'), + (u'Business',u'/business/index.html'), + (u'Sports',u'/sports/index.html'), + (u'Arts',u'/entertainment/index.html'), + (u'Life',u'/life/index.html'), + (u'Technology',u'/technology/index.html'), + (u'Travel',u'/travel/index.html'), + (u'Health',u'/health/index.html') + ] - # un-comment the following four lines for the Vancouver Province + + # un-comment the following six lines for the Vancouver Province ## title = u'Vancouver Province' ## url_prefix = 'http://www.theprovince.com' ## description = u'News from Vancouver, BC' -## fp_tag = 'CAN_VP' +## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg' +## logo_url = 'vplogo.jpg' +## fp_tag = 'CAN_TP' - # un-comment the following four lines for the Vancouver Sun + # un-comment the following six lines for the Vancouver Sun title = u'Vancouver Sun' url_prefix = 'http://www.vancouversun.com' description = u'News from Vancouver, BC' + std_logo_url = 
'http://www.vancouversun.com/images/logo_vancouversun.jpg' + logo_url = 'vslogo.jpg' fp_tag = 'CAN_VS' - # un-comment the following four lines for the Edmonton Journal -## title = u'Edmonton Journal' -## url_prefix = 'http://www.edmontonjournal.com' -## description = u'News from Edmonton, AB' -## fp_tag = 'CAN_EJ' - - # un-comment the following four lines for the Calgary Herald + # un-comment the following six lines for the Calgary Herald ## title = u'Calgary Herald' ## url_prefix = 'http://www.calgaryherald.com' ## description = u'News from Calgary, AB' +## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg' +## logo_url = 'chlogo.jpg' ## fp_tag = 'CAN_CH' - # un-comment the following four lines for the Regina Leader-Post -## title = u'Regina Leader-Post' -## url_prefix = 'http://www.leaderpost.com' -## description = u'News from Regina, SK' -## fp_tag = '' + # un-comment the following six lines for the Edmonton Journal +## title = u'Edmonton Journal' +## url_prefix = 'http://www.edmontonjournal.com' +## description = u'News from Edmonton, AB' +## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg' +## logo_url = 'ejlogo.jpg' +## fp_tag = 'CAN_EJ' - # un-comment the following four lines for the Saskatoon Star-Phoenix -## title = u'Saskatoon Star-Phoenix' -## url_prefix = 'http://www.thestarphoenix.com' -## description = u'News from Saskatoon, SK' -## fp_tag = '' - - # un-comment the following four lines for the Windsor Star -## title = u'Windsor Star' -## url_prefix = 'http://www.windsorstar.com' -## description = u'News from Windsor, ON' -## fp_tag = 'CAN_' - - # un-comment the following four lines for the Ottawa Citizen + # un-comment the following six lines for the Ottawa Citizen ## title = u'Ottawa Citizen' ## url_prefix = 'http://www.ottawacitizen.com' -## description = u'News from Ottawa, ON' +## description = u'News from Ottawa, ON' +## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' +## logo_url = 'oclogo.jpg' ## fp_tag = 'CAN_OC' - # un-comment the following four lines for the Montreal Gazette + # un-comment the following six lines for the Montreal Gazette ## title = u'Montreal Gazette' ## url_prefix = 'http://www.montrealgazette.com' ## description = u'News from Montreal, QC' +## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' +## logo_url = 'mglogo.jpg' ## fp_tag = 'CAN_MG' + Kindle_Fire=False + masthead_url = std_logo_url + url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True - timefmt = ' [%b %d]' + timefmt = ' [%b %d]' + encoding = 'utf-8' extra_css = ''' .timestamp { font-size:xx-small; display: block; } #storyheader { font-size: medium; } #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } + #storyheader h2 { font-size: small; font-style: italic; } .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' - keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + #photocredit { font-size: xx-small; font-weight: normal; }''' + + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})] + remove_tags = 
[{'class':'comments'}, dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='h2', attrs={'id':'photocredit'}), dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), dict(name='div', attrs={'class':'rule_grey_solid'}), dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + def get_cover_url(self): - from datetime import timedelta, date - if self.fp_tag=='': - return None + from datetime import timedelta, datetime, date cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' br = BasicNewsRecipe.get_browser() daysback=1 @@ -120,6 +156,19 @@ class CanWestPaper(BasicNewsRecipe): cover = None return cover + def prepare_masthead_image(self, path_to_image, out_path): + if self.Kindle_Fire: + from calibre import fit_image + from calibre.utils.magick import Image, create_canvas + img = Image() + img.open(path_to_image) + width, height = img.size + img2 = create_canvas(width, height) + img2.compose(img) + img2.save(out_path) + else: + BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path) + def fixChars(self,string): # Replace lsquo (\x91) fixed = re.sub("\x91","‘",string) @@ -166,55 +215,106 @@ class CanWestPaper(BasicNewsRecipe): a.replaceWith(a.renderContents().decode('cp1252','replace')) return soup - def preprocess_html(self, soup): + + def preprocess_html(self,soup): + #delete empty id attributes--they screw up the TOC for unknown reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + + pgall = soup.find('div',attrs={'id':'storyphoto'}) + if pgall is not None: # photo gallery perhaps + if (soup.find('div',attrs={'id':'storycontent'}) is None): + allpics = Tag(soup,'div') + first_img = pgall.find('div','storyimage') + if first_img is not None: + first_img.extract() + tlist = pgall.find('div',attrs={'id':'relatedthumbs'}) + if tlist is not None: + for atag in tlist.findAll('a'): + img = Tag(soup,'img') + srcpre, sep, srcpost = atag.img['src'].partition('?') + img['src'] = srcpre + pdesc = Tag(soup,'p') + pdesc.insert(0,atag.img['alt']) + pdesc['class']='photocaption' + div = Tag(soup,'div') + div.insert(0,pdesc) + div.insert(0,img) + allpics.append(div) + pgall.replaceWith(allpics) + + for pg in soup.findAll('div',attrs={'id':'storyphoto'}): + pg.extract() return self.strip_anchors(soup) - + def parse_index(self): - soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') articles = {} - key = 'News' - ans = ['News'] + ans = [] - # Find each instance of class="sectiontitle", class="featurecontent" - for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): - #self.log(" div class = %s" % divtag['class']) - if divtag['class'].startswith('section_title'): - # div contains section title - if not divtag.h3: - continue - key = self.tag_to_string(divtag.h3,False) - ans.append(key) - self.log("Section name %s" % key) - continue - # div contains article data - h1tag = divtag.find('h1') - if not h1tag: - continue - atag = h1tag.find('a',href=True) - if not atag: - continue - url = self.url_prefix+'/news/todays-paper/'+atag['href'] - #self.log("Section %s" % key) - #self.log("url %s" % url) - 
title = self.tag_to_string(atag,False) - #self.log("title %s" % title) - pubdate = '' - description = '' - ptag = divtag.find('p'); - if ptag: - description = self.tag_to_string(ptag,False) - #self.log("description %s" % description) - author = '' - autag = divtag.find('h4') - if autag: - author = self.tag_to_string(autag,False) - #self.log("author %s" % author) - if not articles.has_key(key): - articles[key] = [] - articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + def handle_article(adiv,key): + h1tag = adiv.h1 + if h1tag is not None: + atag = h1tag.a + if atag is not None: + url = atag['href'] + if atag['href'].startswith('http'): + return + elif atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + else: + url = self.url_prefix+'/'+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + dtag = adiv.find('div','content') + description='' + print("URL "+url) + print("TITLE "+title) + if dtag is not None: + stag = dtag.span + if stag is not None: + if stag['class'] != 'timestamp': + description = self.tag_to_string(stag,False) + else: + description = self.tag_to_string(dtag,False) + print("DESCRIPTION: "+description) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + def parse_web_index(key, keyurl): + try: + soup = self.index_to_soup(self.url_prefix+keyurl) + except: + return + ans.append(key) + mainsoup = soup.find('div','bodywrapper') + footer = mainsoup.find(attrs={'id':'footerfeature'}) + if footer is not None: + footer.extract() + print("Section: "+key) + for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}): + handle_article(wdiv,key) + wdiv.extract() + for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}): + for adiv in wdiv.findAll('div','featurecontent'): + handle_article(adiv,key) + + for (k,url) in self.postmedia_index_pages: + parse_web_index(k,url) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] return ans + diff --git a/resources/catalog/stylesheet.css b/resources/catalog/stylesheet.css index f14b1aaa29..41a5980ed2 100644 --- a/resources/catalog/stylesheet.css +++ b/resources/catalog/stylesheet.css @@ -1,35 +1,229 @@ body { background-color: white; } +/* +** The following rules apply principally to the line items shown in the +** Authors, Titles, Genres, Series, and Recently Added sections. Rules for the +** Descriptions section are grouped together later in the file. +** ------------------------------------------------------------------------ +*/ + + +/* +**
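For orientation: both Postmedia recipe diffs above converge on the same parse_index() contract. The sketch below shows only the shape of the value parse_index() must return; the section names, URL, and teaser text are illustrative and not taken from the patch.

    def parse_index_shape():
        # calibre expects a list of (section_title, [article_dict, ...]) tuples;
        # each article dict carries title/url/date/description/author/content.
        articles = {
            u'Headlines': [
                dict(title=u'Sample story',
                     url=u'http://www.vancouversun.com/news/sample.html',
                     date='', description=u'One-line teaser', author='', content=''),
            ],
        }
        ans = [u'Headlines', u'Sports']  # section order follows postmedia_index_pages
        # Sections that produced no articles are dropped, just as the recipes'
        # closing line does with articles.has_key(key).
        return [(key, articles[key]) for key in ans if key in articles]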
' +
            _('If checked, use the port number in the "Port" box, otherwise '
              'the driver will pick a random port') + '</p>',
-            _('Port') + ':::<p>' +
+            _('Port number: ') + ':::<p>' +
            _('Enter the port number the driver is to use if the "fixed port" box is checked') + '</p>',
            _('Print extra debug information') + ':::<p>' +
            _('Check this box if requested when reporting problems') + '
', @@ -131,7 +135,13 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): _('. Two special collections are available: %(abt)s:%(abtv)s and %(aba)s:%(abav)s. Add ' 'these values to the list to enable them. The collections will be ' 'given the name provided after the ":" character.')%dict( - abt='abt', abtv=ALL_BY_TITLE, aba='aba', abav=ALL_BY_AUTHOR) + abt='abt', abtv=ALL_BY_TITLE, aba='aba', abav=ALL_BY_AUTHOR), + '', + _('Enable the no-activity timeout') + ':::' + + _('If this box is checked, calibre will automatically disconnect if ' + 'a connected device does nothing for %d minutes. Unchecking this ' + ' box disables this timeout, so calibre will never automatically ' + 'disconnect.')%(DISCONNECT_AFTER_N_SECONDS/60,) + '
', ] EXTRA_CUSTOMIZATION_DEFAULT = [ False, @@ -141,7 +151,9 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): False, '9090', False, '', - '' + '', + '', + True, ] OPT_AUTOSTART = 0 OPT_PASSWORD = 2 @@ -149,6 +161,7 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): OPT_PORT_NUMBER = 5 OPT_EXTRA_DEBUG = 6 OPT_COLLECTIONS = 8 + OPT_AUTODISCONNECT = 10 def __init__(self, path): self.sync_lock = threading.RLock() @@ -165,7 +178,16 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): inspect.stack()[1][3]), end='') for a in args: try: - prints('', a, end='') + if isinstance(a, dict): + printable = {} + for k,v in a.iteritems(): + if isinstance(v, (str, unicode)) and len(v) > 50: + printable[k] = 'too long' + else: + printable[k] = v + prints('', printable, end=''); + else: + prints('', a, end='') except: prints('', 'value too long', end='') print() @@ -339,6 +361,27 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): pos += len(v) return data + def _send_byte_string(self, s): + if not isinstance(s, bytes): + self._debug('given a non-byte string!') + raise PacketError("Internal error: found a string that isn't bytes") + sent_len = 0; + total_len = len(s) + while sent_len < total_len: + try: + if sent_len == 0: + amt_sent = self.device_socket.send(s) + else: + amt_sent = self.device_socket.send(s[sent_len:]) + if amt_sent <= 0: + raise IOError('Bad write on device socket'); + sent_len += amt_sent + except socket.error as e: + self._debug('socket error', e, e.errno) + if e.args[0] != EAGAIN and e.args[0] != EINTR: + raise + time.sleep(0.1) # lets not hammer the OS too hard + def _call_client(self, op, arg, print_debug_info=True): if op != 'NOOP': self.noop_counter = 0 @@ -355,9 +398,9 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): if print_debug_info and extra_debug: self._debug('send string', s) self.device_socket.settimeout(self.MAX_CLIENT_COMM_TIMEOUT) - self.device_socket.sendall(('%d' % len(s))+s) - self.device_socket.settimeout(None) + self._send_byte_string((b'%d' % len(s))+s) v = self._read_string_from_net() + self.device_socket.settimeout(None) if print_debug_info and extra_debug: self._debug('received string', v) if v: @@ -373,13 +416,13 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): except socket.error: self._debug('device went away') self._close_device_socket() - raise ControlError('Device closed the network connection') + raise ControlError(desc='Device closed the network connection') except: self._debug('other exception') traceback.print_exc() self._close_device_socket() raise - raise ControlError('Device responded with incorrect information') + raise ControlError(desc='Device responded with incorrect information') # Write a file as a series of base64-encoded strings. def _put_file(self, infile, lpath, book_metadata, this_book, total_books): @@ -475,7 +518,8 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): self.is_connected = False if self.is_connected: self.noop_counter += 1 - if only_presence and (self.noop_counter % 5) != 1: + if only_presence and ( + self.noop_counter % self.SEND_NOOP_EVERY_NTH_PROBE) != 1: try: ans = select.select((self.device_socket,), (), (), 0) if len(ans[0]) == 0: @@ -486,11 +530,16 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): # This will usually toss an exception if the socket is gone. 
except: pass - try: - if self._call_client('NOOP', dict())[0] is None: - self._close_device_socket() - except: + if (self.settings().extra_customization[self.OPT_AUTODISCONNECT] and + self.noop_counter > self.DISCONNECT_AFTER_N_SECONDS): self._close_device_socket() + self._debug('timeout -- disconnected') + else: + try: + if self._call_client('NOOP', dict())[0] is None: + self._close_device_socket() + except: + self._close_device_socket() return (self.is_connected, self) if getattr(self, 'listen_socket', None) is not None: ans = select.select((self.listen_socket,), (), (), 0) @@ -533,7 +582,7 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): self._debug() if not self.is_connected: # We have been called to retry the connection. Give up immediately - raise ControlError('Attempt to open a closed device') + raise ControlError(desc='Attempt to open a closed device') self.current_library_uuid = library_uuid self.current_library_name = current_library_name() try: @@ -569,6 +618,7 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): self._debug('Protocol error - bogus book packet length') self._close_device_socket() return False + self._debug('CC version #:', result.get('ccVersionNumber', 'unknown')) self.max_book_packet_len = result.get('maxBookContentPacketLen', self.BASE_PACKET_LEN) exts = result.get('acceptedExtensions', None) @@ -689,7 +739,7 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): self._set_known_metadata(book) bl.add_book(book, replace_metadata=True) else: - raise ControlError('book metadata not returned') + raise ControlError(desc='book metadata not returned') return bl @synchronous('sync_lock') @@ -720,7 +770,7 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): print_debug_info=False) if opcode != 'OK': self._debug('protocol error', opcode, i) - raise ControlError('sync_booklists') + raise ControlError(desc='sync_booklists') @synchronous('sync_lock') def eject(self): @@ -748,7 +798,7 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): book = Book(self.PREFIX, lpath, other=mdata) length = self._put_file(infile, lpath, book, i, len(files)) if length < 0: - raise ControlError('Sending book %s to device failed' % lpath) + raise ControlError(desc='Sending book %s to device failed' % lpath) paths.append((lpath, length)) # No need to deal with covers. The client will get the thumbnails # in the mi structure @@ -789,7 +839,7 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): if opcode == 'OK': self._debug('removed book with UUID', result['uuid']) else: - raise ControlError('Protocol error - delete books') + raise ControlError(desc='Protocol error - delete books') @synchronous('sync_lock') def remove_books_from_metadata(self, paths, booklists): @@ -825,7 +875,7 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin): else: eof = True else: - raise ControlError('request for book data failed') + raise ControlError(desc='request for book data failed') @synchronous('sync_lock') def set_plugboards(self, plugboards, pb_func): diff --git a/src/calibre/ebooks/conversion/plugins/mobi_output.py b/src/calibre/ebooks/conversion/plugins/mobi_output.py index f07e01a53c..ab00346be9 100644 --- a/src/calibre/ebooks/conversion/plugins/mobi_output.py +++ b/src/calibre/ebooks/conversion/plugins/mobi_output.py @@ -88,6 +88,15 @@ class MOBIOutput(OutputFormatPlugin): 'formats. This option tells calibre not to do this. 
' 'Useful if your document contains lots of GIF/PNG images that ' 'become very large when converted to JPEG.')), + OptionRecommendation(name='mobi_file_type', choices=['old', 'both', + 'new'], recommended_value='old', + help=_('By default calibre generates MOBI files that contain the ' + 'old MOBI 6 format. This format is compatible with all ' + 'devices. However, by changing this setting, you can tell ' + 'calibre to generate MOBI files that contain both MOBI 6 and ' + 'the new KF8 format, or only the new KF8 format. KF8 has ' + 'more features than MOBI 6, but only works with newer Kindles.')), + ]) def check_for_periodical(self): @@ -165,11 +174,10 @@ class MOBIOutput(OutputFormatPlugin): toc.nodes[0].href = toc.nodes[0].nodes[0].href def convert(self, oeb, output_path, input_plugin, opts, log): - from calibre.utils.config import tweaks from calibre.ebooks.mobi.writer2.resources import Resources self.log, self.opts, self.oeb = log, opts, oeb - mobi_type = tweaks.get('test_mobi_output_type', 'old') + mobi_type = opts.mobi_file_type if self.is_periodical: mobi_type = 'old' # Amazon does not support KF8 periodicals create_kf8 = mobi_type in ('new', 'both') diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index d58b64ac53..72c9dc0d72 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -11,6 +11,7 @@ from collections import defaultdict from lxml import etree import cssutils +from cssutils.css import Property from calibre.ebooks.oeb.base import (XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, namespace, barename, XPath) @@ -276,10 +277,16 @@ class CSSFlattener(object): cssdict['font-family'] = node.attrib['face'] del node.attrib['face'] if 'color' in node.attrib: - cssdict['color'] = node.attrib['color'] + try: + cssdict['color'] = Property('color', node.attrib['color']).value + except ValueError: + pass del node.attrib['color'] if 'bgcolor' in node.attrib: - cssdict['background-color'] = node.attrib['bgcolor'] + try: + cssdict['background-color'] = Property('background-color', node.attrib['bgcolor']).value + except ValueError: + pass del node.attrib['bgcolor'] if cssdict.get('font-weight', '').lower() == 'medium': cssdict['font-weight'] = 'normal' # ADE chokes on font-weight medium diff --git a/src/calibre/gui2/catalog/catalog_epub_mobi.py b/src/calibre/gui2/catalog/catalog_epub_mobi.py index 42143eb506..dad8ea804d 100644 --- a/src/calibre/gui2/catalog/catalog_epub_mobi.py +++ b/src/calibre/gui2/catalog/catalog_epub_mobi.py @@ -15,13 +15,15 @@ from calibre.utils.icu import sort_key from catalog_epub_mobi_ui import Ui_Form from PyQt4.Qt import (Qt, QAbstractItemView, QCheckBox, QComboBox, - QDoubleSpinBox, QIcon, QLineEdit, QRadioButton, QSize, QSizePolicy, - QTableWidget, QTableWidgetItem, QToolButton, QVBoxLayout, QWidget) + QDoubleSpinBox, QIcon, QLineEdit, QObject, QRadioButton, QSize, QSizePolicy, + QTableWidget, QTableWidgetItem, QToolButton, QVBoxLayout, QWidget, + SIGNAL) class PluginWidget(QWidget,Ui_Form): TITLE = _('E-book options') HELP = _('Options specific to')+' AZW3/EPUB/MOBI '+_('output') + DEBUG = False # Output synced to the connected device? 
sync_enabled = True @@ -100,6 +102,39 @@ class PluginWidget(QWidget,Ui_Form): self.OPTION_FIELDS = option_fields + def construct_tw_opts_object(self, c_name, opt_value, opts_dict): + ''' + Build an opts object from the UI settings to pass to the catalog builder + Handles two types of rules sets, with and without ['prefix'] field + Store processed opts object to opt_dict + ''' + rule_set = [] + for stored_rule in opt_value: + rule = copy(stored_rule) + # Skip disabled and incomplete rules + if not rule['enabled']: + continue + elif not rule['field'] or not rule['pattern']: + continue + elif 'prefix' in rule and not rule['prefix']: + continue + else: + if rule['field'] != 'Tags': + # Look up custom column friendly name + rule['field'] = self.eligible_custom_fields[rule['field']]['field'] + if rule['pattern'] in [_('any value'),_('any date')]: + rule_pattern = '.*' + elif rule['pattern'] == _('unspecified'): + rule['pattern'] = 'None' + if 'prefix' in rule: + pr = (rule['name'],rule['field'],rule['pattern'],rule['prefix']) + else: + pr = (rule['name'],rule['field'],rule['pattern']) + rule_set.append(pr) + opt_value = tuple(rule_set) + # Strip off the trailing '_tw' + opts_dict[c_name[:-3]] = opt_value + def fetchEligibleCustomFields(self): self.all_custom_fields = self.db.custom_field_keys() custom_fields = {} @@ -194,11 +229,10 @@ class PluginWidget(QWidget,Ui_Form): def options(self): # Save/return the current options # exclude_genre stores literally - # generate_titles, generate_recently_added store as True/False + # Section switches store as True/False # others store as lists opts_dict = {} - # Save values to gprefs prefix_rules_processed = False exclusion_rules_processed = False @@ -229,56 +263,8 @@ class PluginWidget(QWidget,Ui_Form): gprefs.set(self.name + '_' + c_name, opt_value) # Construct opts object for catalog builder - if c_name == 'prefix_rules_tw': - rule_set = [] - for stored_rule in opt_value: - # Test for empty name/field/pattern/prefix, continue - # If pattern = any or unspecified, convert to regex - rule = copy(stored_rule) - if not rule['enabled']: - continue - elif not rule['field'] or not rule['pattern'] or not rule['prefix']: - continue - else: - if rule['field'] != 'Tags': - # Look up custom column name - #print(self.eligible_custom_fields[rule['field']]['field']) - rule['field'] = self.eligible_custom_fields[rule['field']]['field'] - if rule['pattern'].startswith('any'): - rule['pattern'] = '.*' - elif rule['pattern'] == 'unspecified': - rule['pattern'] = 'None' - - pr = (rule['name'],rule['field'],rule['pattern'],rule['prefix']) - rule_set.append(pr) - opt_value = tuple(rule_set) - opts_dict['prefix_rules'] = opt_value - - elif c_name == 'exclusion_rules_tw': - rule_set = [] - for stored_rule in opt_value: - # Test for empty name/field/pattern/prefix, continue - # If pattern = any or unspecified, convert to regex - rule = copy(stored_rule) - if not rule['enabled']: - continue - elif not rule['field'] or not rule['pattern']: - continue - else: - if rule['field'] != 'Tags': - # Look up custom column name - #print(self.eligible_custom_fields[rule['field']]['field']) - rule['field'] = self.eligible_custom_fields[rule['field']]['field'] - if rule['pattern'].startswith('any'): - rule['pattern'] = '.*' - elif rule['pattern'] == 'unspecified': - rule['pattern'] = 'None' - - pr = (rule['name'],rule['field'],rule['pattern']) - rule_set.append(pr) - opt_value = tuple(rule_set) - opts_dict['exclusion_rules'] = opt_value - + if c_name in 
['exclusion_rules_tw','prefix_rules_tw']: + self.construct_tw_opts_object(c_name, opt_value, opts_dict) else: opts_dict[c_name] = opt_value @@ -299,7 +285,7 @@ class PluginWidget(QWidget,Ui_Form): opts_dict['output_profile'] = [load_defaults('page_setup')['output_profile']] except: opts_dict['output_profile'] = ['default'] - if False: + if self.DEBUG: print "opts_dict" for opt in sorted(opts_dict.keys(), key=sort_key): print " %s: %s" % (opt, repr(opts_dict[opt])) @@ -343,7 +329,6 @@ class PluginWidget(QWidget,Ui_Form): self.header_note_source_fields = custom_fields self.header_note_source_field.currentIndexChanged.connect(self.header_note_source_field_changed) - # Populate the 'Merge with Comments' combo box custom_fields = {} for custom_field in self.all_custom_fields: @@ -450,10 +435,11 @@ class ComboBox(NoWheelComboBox): class GenericRulesTable(QTableWidget): ''' - Generic methods for managing rows - Add QTableWidget, controls to parent QGroupBox - placeholders for basic methods to be overriden + Generic methods for managing rows in a QTableWidget ''' + DEBUG = False + MAXIMUM_TABLE_HEIGHT = 113 + NAME_FIELD_WIDTH = 225 def __init__(self, parent_gb, object_name, rules, eligible_custom_fields, db): self.rules = rules @@ -464,13 +450,12 @@ class GenericRulesTable(QTableWidget): self.layout = parent_gb.layout() # Add ourselves to the layout - #print("verticalHeader: %s" % dir(self.verticalHeader())) sizePolicy = QSizePolicy(QSizePolicy.Expanding, QSizePolicy.Minimum) sizePolicy.setHorizontalStretch(0) sizePolicy.setVerticalStretch(0) #sizePolicy.setHeightForWidth(self.sizePolicy().hasHeightForWidth()) self.setSizePolicy(sizePolicy) - self.setMaximumSize(QSize(16777215, 113)) + self.setMaximumSize(QSize(16777215, self.MAXIMUM_TABLE_HEIGHT)) self.setColumnCount(0) self.setRowCount(0) @@ -481,6 +466,9 @@ class GenericRulesTable(QTableWidget): self._init_controls() + # Hook check_box changes + QObject.connect(self, SIGNAL('cellChanged(int,int)'), self.enabled_state_changed) + def _init_controls(self): # Add the control set vbl = QVBoxLayout() @@ -517,6 +505,8 @@ class GenericRulesTable(QTableWidget): def add_row(self): self.setFocus() row = self.last_row_selected + 1 + if self.DEBUG: + print("%s:add_row(): at row: %d" % (self.objectName(), row)) self.insertRow(row) self.populate_table_row(row, self.create_blank_row_data()) self.select_and_scroll_to_row(row) @@ -524,19 +514,10 @@ class GenericRulesTable(QTableWidget): # In case table was empty self.horizontalHeader().setStretchLastSection(True) - def convert_row_to_data(self): - ''' - override - ''' - pass - - def create_blank_row_data(self): - ''' - override - ''' - pass - def delete_row(self): + if self.DEBUG: + print("%s:delete_row()" % self.objectName()) + self.setFocus() rows = self.last_rows_selected if len(rows) == 0: @@ -545,10 +526,11 @@ class GenericRulesTable(QTableWidget): first = rows[0].row() + 1 last = rows[-1].row() + 1 - message = _('Are you sure you want to delete rule %d?') % first + first_rule_name = unicode(self.cellWidget(first-1,self.COLUMNS['NAME']['ordinal']).text()).strip() + message = _("Are you sure you want to delete '%s'?") % (first_rule_name) if len(rows) > 1: - message = _('Are you sure you want to delete rules %d-%d?') % (first, last) - if not question_dialog(self, _('Are you sure?'), message, show_copy_button=False): + message = _('Are you sure you want to delete rules #%d-%d?') % (first, last) + if not question_dialog(self, _('Delete Rule'), message, show_copy_button=False): return first_sel_row = 
self.currentRow() for selrow in reversed(rows): @@ -558,17 +540,24 @@ class GenericRulesTable(QTableWidget): elif self.rowCount() > 0: self.select_and_scroll_to_row(first_sel_row - 1) + def enabled_state_changed(self, row, col): + if col in [self.COLUMNS['ENABLED']['ordinal']]: + self.select_and_scroll_to_row(row) + if self.DEBUG: + print("%s:enabled_state_changed(): row %d col %d" % + (self.objectName(), row, col)) + + def focusInEvent(self,e): + if self.DEBUG: + print("%s:focusInEvent()" % self.objectName()) + def focusOutEvent(self,e): # Override of QTableWidget method - clear selection when table loses focus self.last_row_selected = self.currentRow() self.last_rows_selected = self.selectionModel().selectedRows() self.clearSelection() - - def get_data(self): - ''' - override - ''' - pass + if self.DEBUG: + print("%s:focusOutEvent(): self.last_row_selected: %d" % (self.objectName(),self.last_row_selected)) def move_row_down(self): self.setFocus() @@ -583,6 +572,8 @@ class GenericRulesTable(QTableWidget): for selrow in reversed(rows): dest_row = selrow.row() + 1 src_row = selrow.row() + if self.DEBUG: + print("%s:move_row_down() %d -> %d" % (self.objectName(),src_row, dest_row)) # Save the contents of the destination row saved_data = self.convert_row_to_data(dest_row) @@ -596,11 +587,9 @@ class GenericRulesTable(QTableWidget): # Populate it with the saved data self.populate_table_row(src_row, saved_data) - self.blockSignals(False) scroll_to_row = last_sel_row + 1 - if scroll_to_row < self.rowCount() - 1: - scroll_to_row = scroll_to_row + 1 self.select_and_scroll_to_row(scroll_to_row) + self.blockSignals(False) def move_row_up(self): self.setFocus() @@ -611,7 +600,11 @@ class GenericRulesTable(QTableWidget): if first_sel_row <= 0: return self.blockSignals(True) + for selrow in rows: + if self.DEBUG: + print("%s:move_row_up() %d -> %d" % (self.objectName(),selrow.row(), selrow.row()-1)) + # Save the row above saved_data = self.convert_row_to_data(selrow.row() - 1) @@ -621,33 +614,92 @@ class GenericRulesTable(QTableWidget): # Delete the row above self.removeRow(selrow.row() - 1) - self.blockSignals(False) - scroll_to_row = first_sel_row - 1 + scroll_to_row = first_sel_row if scroll_to_row > 0: scroll_to_row = scroll_to_row - 1 self.select_and_scroll_to_row(scroll_to_row) + self.blockSignals(False) - def populate_table_row(self): - ''' - override - ''' - pass + def populate_table(self): + # Format of rules list is different if default values vs retrieved JSON + # Hack to normalize list style + rules = self.rules + if rules and type(rules[0]) is list: + rules = rules[0] + self.setFocus() + rules = sorted(rules, key=lambda k: k['ordinal']) + for row, rule in enumerate(rules): + self.insertRow(row) + self.select_and_scroll_to_row(row) + self.populate_table_row(row, rule) + self.selectRow(0) - def resize_name(self, scale): - #current_width = self.columnWidth(1) - #self.setColumnWidth(1, min(225,int(current_width * scale))) - self.setColumnWidth(1, 225) + def resize_name(self): + self.setColumnWidth(1, self.NAME_FIELD_WIDTH) def rule_name_edited(self): + if self.DEBUG: + print("%s:rule_name_edited()" % self.objectName()) + current_row = self.currentRow() self.cellWidget(current_row,1).home(False) - self.setFocus() self.select_and_scroll_to_row(current_row) def select_and_scroll_to_row(self, row): + self.setFocus() self.selectRow(row) self.scrollToItem(self.currentItem()) + self.last_row_selected = self.currentRow() + self.last_rows_selected = self.selectionModel().selectedRows() + + def 
_source_index_changed(self, combo): + # Figure out which row we're in + for row in range(self.rowCount()): + if self.cellWidget(row, self.COLUMNS['FIELD']['ordinal']) is combo: + break + + if self.DEBUG: + print("%s:_source_index_changed(): calling source_index_changed with row: %d " % + (self.objectName(), row)) + + self.source_index_changed(combo, row) + + def source_index_changed(self, combo, row, pattern=''): + # Populate the Pattern field based upon the Source field + + source_field = str(combo.currentText()) + if source_field == '': + values = [] + elif source_field == 'Tags': + values = sorted(self.db.all_tags(), key=sort_key) + else: + if self.eligible_custom_fields[unicode(source_field)]['datatype'] in ['enumeration', 'text']: + values = self.db.all_custom(self.db.field_metadata.key_to_label( + self.eligible_custom_fields[unicode(source_field)]['field'])) + values = sorted(values, key=sort_key) + elif self.eligible_custom_fields[unicode(source_field)]['datatype'] in ['bool']: + values = [_('True'),_('False'),_('unspecified')] + elif self.eligible_custom_fields[unicode(source_field)]['datatype'] in ['composite']: + values = [_('any value'),_('unspecified')] + elif self.eligible_custom_fields[unicode(source_field)]['datatype'] in ['datetime']: + values = [_('any date'),_('unspecified')] + + values_combo = ComboBox(self, values, pattern) + values_combo.currentIndexChanged.connect(partial(self.values_index_changed, values_combo)) + self.setCellWidget(row, self.COLUMNS['PATTERN']['ordinal'], values_combo) + self.select_and_scroll_to_row(row) + + def values_index_changed(self, combo): + # After edit, select row + for row in range(self.rowCount()): + if self.cellWidget(row, self.COLUMNS['PATTERN']['ordinal']) is combo: + self.select_and_scroll_to_row(row) + break + + if self.DEBUG: + print("%s:values_index_changed(): row %d " % + (self.objectName(), row)) class ExclusionRules(GenericRulesTable): @@ -658,12 +710,13 @@ class ExclusionRules(GenericRulesTable): def __init__(self, parent_gb_hl, object_name, rules, eligible_custom_fields, db): super(ExclusionRules, self).__init__(parent_gb_hl, object_name, rules, eligible_custom_fields, db) + self.setObjectName("exclusion_rules_table") self._init_table_widget() self._initialize() def _init_table_widget(self): header_labels = [self.COLUMNS[index]['name'] \ - for index in sorted(self.COLUMNS.keys(), key=lambda c: self.COLUMNS[c]['ordinal'])] + for index in sorted(self.COLUMNS.keys(), key=lambda c: self.COLUMNS[c]['ordinal'])] self.setColumnCount(len(header_labels)) self.setHorizontalHeaderLabels(header_labels) self.setSortingEnabled(False) @@ -672,7 +725,7 @@ class ExclusionRules(GenericRulesTable): def _initialize(self): self.populate_table() self.resizeColumnsToContents() - self.resize_name(1.5) + self.resize_name() self.horizontalHeader().setStretchLastSection(True) self.clearSelection() @@ -706,20 +759,6 @@ class ExclusionRules(GenericRulesTable): 'pattern':data['pattern']}) return data_items - def populate_table(self): - # Format of rules list is different if default values vs retrieved JSON - # Hack to normalize list style - rules = self.rules - if rules and type(rules[0]) is list: - rules = rules[0] - self.setFocus() - rules = sorted(rules, key=lambda k: k['ordinal']) - for row, rule in enumerate(rules): - self.insertRow(row) - self.select_and_scroll_to_row(row) - self.populate_table_row(row, rule) - self.selectRow(0) - def populate_table_row(self, row, data): def set_rule_name_in_row(row, col, name=''): @@ -730,7 +769,7 @@ class 
ExclusionRules(GenericRulesTable): def set_source_field_in_row(row, col, field=''): source_combo = ComboBox(self, sorted(self.eligible_custom_fields.keys(), key=sort_key), field) - source_combo.currentIndexChanged.connect(partial(self.source_index_changed, source_combo, row)) + source_combo.currentIndexChanged.connect(partial(self._source_index_changed, source_combo)) self.setCellWidget(row, col, source_combo) return source_combo @@ -738,7 +777,8 @@ class ExclusionRules(GenericRulesTable): self.blockSignals(True) # Enabled - self.setItem(row, self.COLUMNS['ENABLED']['ordinal'], CheckableTableWidgetItem(data['enabled'])) + check_box = CheckableTableWidgetItem(data['enabled']) + self.setItem(row, self.COLUMNS['ENABLED']['ordinal'], check_box) # Rule name set_rule_name_in_row(row, self.COLUMNS['NAME']['ordinal'], name=data['name']) @@ -748,32 +788,10 @@ class ExclusionRules(GenericRulesTable): # Pattern # The contents of the Pattern field is driven by the Source field - self.source_index_changed(source_combo, row, self.COLUMNS['PATTERN']['ordinal'], pattern=data['pattern']) + self.source_index_changed(source_combo, row, pattern=data['pattern']) self.blockSignals(False) - def source_index_changed(self, combo, row, col, pattern=''): - # Populate the Pattern field based upon the Source field - source_field = str(combo.currentText()) - if source_field == '': - values = [] - elif source_field == 'Tags': - values = sorted(self.db.all_tags(), key=sort_key) - else: - if self.eligible_custom_fields[unicode(source_field)]['datatype'] in ['enumeration', 'text']: - values = self.db.all_custom(self.db.field_metadata.key_to_label( - self.eligible_custom_fields[unicode(source_field)]['field'])) - values = sorted(values, key=sort_key) - elif self.eligible_custom_fields[unicode(source_field)]['datatype'] in ['bool']: - values = ['True','False','unspecified'] - elif self.eligible_custom_fields[unicode(source_field)]['datatype'] in ['composite']: - values = ['any value','unspecified'] - elif self.eligible_custom_fields[unicode(source_field)]['datatype'] in ['datetime']: - values = ['any date','unspecified'] - - values_combo = ComboBox(self, values, pattern) - self.setCellWidget(row, self.COLUMNS['PATTERN']['ordinal'], values_combo) - class PrefixRules(GenericRulesTable): COLUMNS = { 'ENABLED':{'ordinal': 0, 'name': ''}, @@ -784,12 +802,13 @@ class PrefixRules(GenericRulesTable): def __init__(self, parent_gb_hl, object_name, rules, eligible_custom_fields, db): super(PrefixRules, self).__init__(parent_gb_hl, object_name, rules, eligible_custom_fields, db) + self.setObjectName("prefix_rules_table") self._init_table_widget() self._initialize() def _init_table_widget(self): header_labels = [self.COLUMNS[index]['name'] \ - for index in sorted(self.COLUMNS.keys(), key=lambda c: self.COLUMNS[c]['ordinal'])] + for index in sorted(self.COLUMNS.keys(), key=lambda c: self.COLUMNS[c]['ordinal'])] self.setColumnCount(len(header_labels)) self.setHorizontalHeaderLabels(header_labels) self.setSortingEnabled(False) @@ -799,14 +818,14 @@ class PrefixRules(GenericRulesTable): self.generate_prefix_list() self.populate_table() self.resizeColumnsToContents() - self.resize_name(1.5) + self.resize_name() self.horizontalHeader().setStretchLastSection(True) self.clearSelection() def convert_row_to_data(self, row): data = self.create_blank_row_data() data['ordinal'] = row - data['enabled'] = self.item(row,0).checkState() == Qt.Checked + data['enabled'] = self.item(row,self.COLUMNS['ENABLED']['ordinal']).checkState() == Qt.Checked 
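        # Note: the COLUMNS[...]['ordinal'] lookups introduced here replace
        # hard-coded column indexes such as self.item(row,0), so this row-to-data
        # conversion keeps working if the table's columns are ever reordered.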
data['name'] = unicode(self.cellWidget(row,self.COLUMNS['NAME']['ordinal']).text()).strip() data['prefix'] = unicode(self.cellWidget(row,self.COLUMNS['PREFIX']['ordinal']).currentText()).strip() data['field'] = unicode(self.cellWidget(row,self.COLUMNS['FIELD']['ordinal']).currentText()).strip() @@ -970,20 +989,6 @@ class PrefixRules(GenericRulesTable): 'prefix':data['prefix']}) return data_items - def populate_table(self): - # Format of rules list is different if default values vs retrieved JSON - # Hack to normalize list style - rules = self.rules - if rules and type(rules[0]) is list: - rules = rules[0] - self.setFocus() - rules = sorted(rules, key=lambda k: k['ordinal']) - for row, rule in enumerate(rules): - self.insertRow(row) - self.select_and_scroll_to_row(row) - self.populate_table_row(row, rule) - self.selectRow(0) - def populate_table_row(self, row, data): def set_prefix_field_in_row(row, col, field=''): @@ -998,14 +1003,12 @@ class PrefixRules(GenericRulesTable): def set_source_field_in_row(row, col, field=''): source_combo = ComboBox(self, sorted(self.eligible_custom_fields.keys(), key=sort_key), field) - source_combo.currentIndexChanged.connect(partial(self.source_index_changed, source_combo, row)) + source_combo.currentIndexChanged.connect(partial(self._source_index_changed, source_combo)) self.setCellWidget(row, col, source_combo) return source_combo - # Entry point self.blockSignals(True) - #print("prefix_rules_populate_table_row processing rule:\n%s\n" % data) # Enabled self.setItem(row, self.COLUMNS['ENABLED']['ordinal'], CheckableTableWidgetItem(data['enabled'])) @@ -1021,31 +1024,7 @@ class PrefixRules(GenericRulesTable): # Pattern # The contents of the Pattern field is driven by the Source field - self.source_index_changed(source_combo, row, self.COLUMNS['PATTERN']['ordinal'], pattern=data['pattern']) + self.source_index_changed(source_combo, row, pattern=data['pattern']) self.blockSignals(False) - def source_index_changed(self, combo, row, col, pattern=''): - # Populate the Pattern field based upon the Source field - # row, col are the control that changed - - source_field = str(combo.currentText()) - if source_field == '': - values = [] - elif source_field == 'Tags': - values = sorted(self.db.all_tags(), key=sort_key) - else: - if self.eligible_custom_fields[unicode(source_field)]['datatype'] in ['enumeration', 'text']: - values = self.db.all_custom(self.db.field_metadata.key_to_label( - self.eligible_custom_fields[unicode(source_field)]['field'])) - values = sorted(values, key=sort_key) - elif self.eligible_custom_fields[unicode(source_field)]['datatype'] in ['bool']: - values = ['True','False','unspecified'] - elif self.eligible_custom_fields[unicode(source_field)]['datatype'] in ['composite']: - values = ['any value','unspecified'] - elif self.eligible_custom_fields[unicode(source_field)]['datatype'] in ['datetime']: - values = ['any date','unspecified'] - - values_combo = ComboBox(self, values, pattern) - self.setCellWidget(row, self.COLUMNS['PATTERN']['ordinal'], values_combo) - diff --git a/src/calibre/gui2/convert/mobi_output.py b/src/calibre/gui2/convert/mobi_output.py index 50b67008d9..ac2bf15164 100644 --- a/src/calibre/gui2/convert/mobi_output.py +++ b/src/calibre/gui2/convert/mobi_output.py @@ -25,7 +25,7 @@ class PluginWidget(Widget, Ui_Form): 'mobi_keep_original_images', 'mobi_ignore_margins', 'mobi_toc_at_start', 'dont_compress', 'no_inline_toc', 'share_not_sync', - 'personal_doc']#, 'mobi_navpoints_only_deepest'] + 'personal_doc', 'mobi_file_type'] ) 
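        # Note (assumed from calibre's conversion-widget conventions, not stated in
        # this patch): each name handed to Widget.__init__ above must match a .ui
        # widget named opt_<name>, so the new 'mobi_file_type' entry pairs with the
        # opt_mobi_file_type combo that is populated later in this __init__.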
self.db, self.book_id = db, book_id @@ -48,6 +48,7 @@ class PluginWidget(Widget, Ui_Form): self.font_family_model = font_family_model self.opt_masthead_font.setModel(self.font_family_model) ''' + self.opt_mobi_file_type.addItems(['old', 'both', 'new']) self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/mobi_output.ui b/src/calibre/gui2/convert/mobi_output.ui index 2c62b8c27a..8c1c107620 100644 --- a/src/calibre/gui2/convert/mobi_output.ui +++ b/src/calibre/gui2/convert/mobi_output.ui @@ -14,80 +14,10 @@
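The net effect of the new MOBI output option, distilled from the MOBIOutput.convert() hunk earlier in this patch; the function name is invented for illustration, but the branching mirrors the patched code.

    def kf8_decision(mobi_file_type, is_periodical):
        # mobi_file_type is the new conversion option: 'old', 'both' or 'new'
        mobi_type = mobi_file_type
        if is_periodical:
            mobi_type = 'old'  # Amazon does not support KF8 periodicals
        create_kf8 = mobi_type in ('new', 'both')  # KF8 only works on newer Kindles
        return mobi_type, create_kf8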