From 142c3c4feb60f023927dff856c580f8835cae5cd Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 29 Jan 2013 09:01:00 +0530
Subject: [PATCH 1/2] Update Victoria Times Colonist

---
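Notes: get_cover_url below builds a Newseum front-page image URL from the
day of the month plus the paper's fp_tag, and the (partially elided) retry
loop walks back one day at a time when the image is missing. A minimal
standalone sketch of that pattern, assuming plain urllib2 in place of the
recipe's mechanize browser; newseum_cover and max_daysback are illustrative
names, not part of the recipe:

    import urllib2
    from datetime import date, timedelta

    def newseum_cover(fp_tag, max_daysback=6):
        # Probe today's front-page image; fall back one day per miss.
        day = date.today()
        for _ in range(max_daysback):
            url = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                   + str(day.day) + '/lg/' + fp_tag + '.jpg')
            try:
                urllib2.urlopen(url)  # a missing image raises HTTPError
                return url
            except urllib2.HTTPError:
                day -= timedelta(days=1)
        return None  # no cover found; the recipe likewise returns None
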
 recipes/vic_times.recipe | 253 ++++++++++++++++++++-------------------
 1 file changed, 129 insertions(+), 124 deletions(-)

diff --git a/recipes/vic_times.recipe b/recipes/vic_times.recipe
index 391cf5eff4..48fb9038aa 100644
--- a/recipes/vic_times.recipe
+++ b/recipes/vic_times.recipe
@@ -1,105 +1,46 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-
 __license__ = 'GPL v3'
 
 '''
 www.canada.com
 '''
-
 import re
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
 
-class CanWestPaper(BasicNewsRecipe):
+class TimesColonist(BasicNewsRecipe):
 
-    # un-comment the following four lines for the Victoria Times Colonist
     title = u'Victoria Times Colonist'
     url_prefix = 'http://www.timescolonist.com'
     description = u'News from Victoria, BC'
     fp_tag = 'CAN_TC'
 
-    # un-comment the following four lines for the Vancouver Province
-##    title = u'Vancouver Province'
-##    url_prefix = 'http://www.theprovince.com'
-##    description = u'News from Vancouver, BC'
-##    fp_tag = 'CAN_VP'
-
-    # un-comment the following four lines for the Vancouver Sun
-##    title = u'Vancouver Sun'
-##    url_prefix = 'http://www.vancouversun.com'
-##    description = u'News from Vancouver, BC'
-##    fp_tag = 'CAN_VS'
-
-    # un-comment the following four lines for the Edmonton Journal
-##    title = u'Edmonton Journal'
-##    url_prefix = 'http://www.edmontonjournal.com'
-##    description = u'News from Edmonton, AB'
-##    fp_tag = 'CAN_EJ'
-
-    # un-comment the following four lines for the Calgary Herald
-##    title = u'Calgary Herald'
-##    url_prefix = 'http://www.calgaryherald.com'
-##    description = u'News from Calgary, AB'
-##    fp_tag = 'CAN_CH'
-
-    # un-comment the following four lines for the Regina Leader-Post
-##    title = u'Regina Leader-Post'
-##    url_prefix = 'http://www.leaderpost.com'
-##    description = u'News from Regina, SK'
-##    fp_tag = ''
-
-    # un-comment the following four lines for the Saskatoon Star-Phoenix
-##    title = u'Saskatoon Star-Phoenix'
-##    url_prefix = 'http://www.thestarphoenix.com'
-##    description = u'News from Saskatoon, SK'
-##    fp_tag = ''
-
-    # un-comment the following four lines for the Windsor Star
-##    title = u'Windsor Star'
-##    url_prefix = 'http://www.windsorstar.com'
-##    description = u'News from Windsor, ON'
-##    fp_tag = 'CAN_'
-
-    # un-comment the following four lines for the Ottawa Citizen
-##    title = u'Ottawa Citizen'
-##    url_prefix = 'http://www.ottawacitizen.com'
-##    description = u'News from Ottawa, ON'
-##    fp_tag = 'CAN_OC'
-
-    # un-comment the following four lines for the Montreal Gazette
-##    title = u'Montreal Gazette'
-##    url_prefix = 'http://www.montrealgazette.com'
-##    description = u'News from Montreal, QC'
-##    fp_tag = 'CAN_MG'
-
-
+    url_list = []
     language = 'en_CA'
     __author__ = 'Nick Redding'
     no_stylesheets = True
-    timefmt = ' [%b %d]'
+    timefmt =  ' [%b %d]'
+    encoding = 'utf-8'
     extra_css = '''
-                .timestamp { font-size:xx-small; display: block; }
-                #storyheader { font-size: medium; }
-                #storyheader h1 { font-size: x-large; }
-                #storyheader h2 { font-size: large; font-style: italic; }
-                .byline { font-size:xx-small; }
-                #photocaption { font-size: small; font-style: italic }
-                #photocredit { font-size: xx-small; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
+                .byline { font-size:xx-small; font-weight: bold;}
+                h3 { margin-bottom: 6px; }
+                .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
+                '''
+    keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]
     remove_tags = [{'class':'comments'},
-                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
-                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
-                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
-                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
-                   dict(name='div', attrs={'class':'rule_grey_solid'}),
-                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+                   {'id':'photocredit'},
+                   dict(name='div', attrs={'class':re.compile('top.controls')}),
+                   dict(name='div', attrs={'class':re.compile('social')}),
+                   dict(name='div', attrs={'class':re.compile('tools')}),
+                   dict(name='div', attrs={'class':re.compile('bottom.tools')}),
+                   dict(name='div', attrs={'class':re.compile('window')}),
+                   dict(name='div', attrs={'class':re.compile('related.news.element')})]
+
     def get_cover_url(self):
         from datetime import timedelta, date
-        if self.fp_tag=='':
-            return None
         cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
         br = BasicNewsRecipe.get_browser(self)
         daysback=1
@@ -120,6 +61,18 @@ class CanWestPaper(BasicNewsRecipe):
             cover = None
         return cover
 
+    def prepare_masthead_image(self, path_to_image, out_path):
+        if getattr(self, 'Kindle_Fire', False):
+            from calibre.utils.magick import Image, create_canvas
+            img = Image()
+            img.open(path_to_image)
+            width, height = img.size
+            img2 = create_canvas(width, height)
+            img2.compose(img)
+            img2.save(out_path)
+        else:
+            BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)
+
     def fixChars(self,string):
         # Replace lsquo (\x91)
         fixed = re.sub("\x91","‘",string)
@@ -166,55 +119,107 @@ class CanWestPaper(BasicNewsRecipe):
             a.replaceWith(a.renderContents().decode('cp1252','replace'))
         return soup
 
-    def preprocess_html(self, soup):
+    def preprocess_html(self,soup):
+        byline = soup.find('p',attrs={'class':re.compile('ancillary')})
+        if byline is not None:
+            byline.find('a')
+            authstr = self.tag_to_string(byline,False)
+            authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
+            authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
+            newdiv = Tag(soup,'div')
+            newdiv.insert(0,authstr)
+            newdiv['class']='byline'
+            byline.replaceWith(newdiv)
+        for caption in soup.findAll('p',attrs={'class':re.compile('caption')}):
+            capstr = self.tag_to_string(caption,False)
+            capstr = re.sub('Photograph by.*$','',capstr, flags=re.IGNORECASE)
+            newdiv = Tag(soup,'div')
+            newdiv.insert(0,capstr)
+            newdiv['class']='caption'
+            caption.replaceWith(newdiv)
+        for ptag in soup.findAll('p'):
+            ptext = self.tag_to_string(ptag,use_alt=False, normalize_whitespace=True)
+            ptext = re.sub(r'\s+','', ptext)
+            if (ptext=='') or (ptext==' '):
+                ptag.extract()
         return self.strip_anchors(soup)
 
+    raeside = False
+    def handle_articles(self,htag,article_list,sectitle):
+        atag = htag.a
+        if atag is not None:
+            url = atag['href']
+            #print("Checking "+url)
+            if atag['href'].startswith('/'):
+                url = self.url_prefix+atag['href']
+            if url in self.url_list:
+                return
+            self.url_list.append(url)
+            title = self.tag_to_string(atag,False)
+            if 'VIDEO' in title.upper():
+                return
+            if 'GALLERY' in title.upper():
+                return
+            if 'PHOTOS' in title.upper():
+                return
+            if 'RAESIDE' in title.upper():
+                if self.raeside:
+                    return
+                self.raeside = True
+            dtag = htag.findNext('p')
+            description=''
+            if dtag is not None:
+                description = self.tag_to_string(dtag,False)
+            article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))
+            #print(sectitle+title+": description = "+description+" URL="+url)
+
+    def add_section_index(self,ans,securl,sectitle):
+        print("Add section url="+self.url_prefix+'/'+securl)
+        try:
+            soup = self.index_to_soup(self.url_prefix+'/'+securl)
+        except:
+            return ans
+        mainsoup = soup.find('div',attrs={'class':re.compile('main.content')})
+        article_list = []
+        for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('featured.story')}):
+            for htag in wdiv.findAll('h3'):
+                self.handle_articles(htag,article_list,sectitle)
+        for ladiv in mainsoup.findAll(attrs={'class':re.compile('leading.articles')}):
+            for wdiv in ladiv.findAll('div',attrs={'class':re.compile('article.row')}):
+                for htag in wdiv.findAll('h2'):
+                    self.handle_articles(htag,article_list,sectitle)
+        ans.append((sectitle,article_list))
+        return ans
 
     def parse_index(self):
-        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
-
-        articles = {}
-        key = 'News'
-        ans = ['News']
-
-        # Find each instance of class="sectiontitle", class="featurecontent"
-        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
-            #self.log(" div class = %s" % divtag['class'])
-            if divtag['class'].startswith('section_title'):
-                # div contains section title
-                if not divtag.h3:
-                    continue
-                key = self.tag_to_string(divtag.h3,False)
-                ans.append(key)
-                self.log("Section name %s" % key)
-                continue
-            # div contains article data
-            h1tag = divtag.find('h1')
-            if not h1tag:
-                continue
-            atag = h1tag.find('a',href=True)
-            if not atag:
-                continue
-            url = self.url_prefix+'/news/todays-paper/'+atag['href']
-            #self.log("Section %s" % key)
-            #self.log("url %s" % url)
-            title = self.tag_to_string(atag,False)
-            #self.log("title %s" % title)
-            pubdate = ''
-            description = ''
-            ptag = divtag.find('p')
-            if ptag:
-                description = self.tag_to_string(ptag,False)
-                #self.log("description %s" % description)
-            author = ''
-            autag = divtag.find('h4')
-            if autag:
-                author = self.tag_to_string(autag,False)
-                #self.log("author %s" % author)
-            if not articles.has_key(key):
-                articles[key] = []
-            articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
-
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        ans = []
+        ans = self.add_section_index(ans,'','Web Front Page')
+        ans = self.add_section_index(ans,'news/','News Headlines')
+        ans = self.add_section_index(ans,'news/b-c/','BC News')
+        ans = self.add_section_index(ans,'news/national/','National News')
+        ans = self.add_section_index(ans,'news/world/','World News')
+        ans = self.add_section_index(ans,'opinion/','Opinion')
+        ans = self.add_section_index(ans,'opinion/letters/','Letters')
+        ans = self.add_section_index(ans,'business/','Business')
+        ans = self.add_section_index(ans,'business/money/','Money')
+        ans = self.add_section_index(ans,'business/technology/','Technology')
+        ans = self.add_section_index(ans,'business/working/','Working')
+        ans = self.add_section_index(ans,'sports/','Sports')
+        ans = self.add_section_index(ans,'sports/hockey/','Hockey')
+        ans = self.add_section_index(ans,'sports/football/','Football')
+        ans = self.add_section_index(ans,'sports/basketball/','Basketball')
+        ans = self.add_section_index(ans,'sports/golf/','Golf')
+        ans = self.add_section_index(ans,'entertainment/','Entertainment')
+        ans = self.add_section_index(ans,'entertainment/go/','Go!')
+        ans = self.add_section_index(ans,'entertainment/music/','Music')
+        ans = self.add_section_index(ans,'entertainment/books/','Books')
+        ans = self.add_section_index(ans,'entertainment/Movies/','Movies')
+        ans = self.add_section_index(ans,'entertainment/television/','Television')
+        ans = self.add_section_index(ans,'life/','Life')
+        ans = self.add_section_index(ans,'life/health/','Health')
+        ans = self.add_section_index(ans,'life/travel/','Travel')
+        ans = self.add_section_index(ans,'life/driving/','Driving')
+        ans = self.add_section_index(ans,'life/homes/','Homes')
+        ans = self.add_section_index(ans,'life/food-drink/','Food & Drink')
         return ans
+
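Note: parse_index must return data in calibre's standard BasicNewsRecipe
form, a list of (section title, article list) pairs, where each article is
a dict with the keys handle_articles above fills in. An illustration with
placeholder values (the headline, URL path, and teaser are invented):

    # Shape of parse_index's return value; only the keys match the recipe.
    sections = [
        ('News Headlines', [
            {'title': 'Example headline',
             'url': 'http://www.timescolonist.com/news/example-story',
             'date': '', 'description': 'Teaser text from the index page',
             'author': '', 'content': ''},
        ]),
    ]
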
From 2bb7ed5442b3f375c98b802a42baa99cb17704ad Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 29 Jan 2013 09:37:55 +0530
Subject: [PATCH 2/2] When downloading metadata from Amazon, do not
 auto-correct the title case if the book's language is not English

---
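Notes: the new docase flag below gates calibre's fixcase title-casing on the
book's language. A self-contained restatement of that rule; should_fixcase
is an illustrative name, not calibre API, and None stands in for
mi.is_null('language'):

    def should_fixcase(language, domain):
        # Title-case only English books, or books of unknown language
        # fetched from the amazon.com / amazon.co.uk domains.
        return language == 'eng' or (language is None and domain in {'com', 'uk'})

    assert should_fixcase('eng', 'de')        # English book, German store
    assert not should_fixcase('deu', 'com')   # German book left untouched
    assert should_fixcase(None, 'uk')         # unknown language, UK store
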
 src/calibre/ebooks/metadata/sources/amazon.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index 63783ba8eb..7df57a5586 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -614,10 +614,14 @@ class Amazon(Source):
         return domain
 
     def clean_downloaded_metadata(self, mi):
-        if mi.title and self.domain in ('com', 'uk'):
+        docase = (
+            mi.language == 'eng' or
+            (mi.is_null('language') and self.domain in {'com', 'uk'})
+        )
+        if mi.title and docase:
             mi.title = fixcase(mi.title)
         mi.authors = fixauthors(mi.authors)
-        if self.domain in ('com', 'uk'):
+        if mi.tags and docase:
             mi.tags = list(map(fixcase, mi.tags))
         mi.isbn = check_isbn(mi.isbn)
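
A quick demonstration of why the gate matters, using Python's built-in
str.title as a rough stand-in for calibre's fixcase (an assumption about its
effect, not its exact rules): English-style title-casing mangles correctly
cased non-English titles.

    title_de = u'Der Besuch der alten Dame'  # correct German capitalization
    print(title_de.title())  # u'Der Besuch Der Alten Dame' -- wrong for German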