Merge from trunk

2025-11-13 01:57:00 -05:00 · 2013-01-29 07:27:18 +01:00 · 2013-01-29 07:27:18 +01:00 · 3fe0662ea7
commit 3fe0662ea7
parent 351f25293c 2bb7ed5442
2 changed files with 135 additions and 126 deletions
--- a/recipes/vic_times.recipe
+++ b/recipes/vic_times.recipe
@ -1,105 +1,46 @@
 #!/usr/bin/env  python
 # -*- coding: utf-8 -*-
-
 __license__   = 'GPL v3'

 '''
 www.canada.com
 '''
-
 import re
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup


-class CanWestPaper(BasicNewsRecipe):
+class TimesColonist(BasicNewsRecipe):

-    # un-comment the following four lines for the Victoria Times Colonist
    title = u'Victoria Times Colonist'
    url_prefix = 'http://www.timescolonist.com'
    description = u'News from Victoria, BC'
    fp_tag = 'CAN_TC'

-    # un-comment the following four lines for the Vancouver Province
-##    title = u'Vancouver Province'
-##    url_prefix = 'http://www.theprovince.com'
-##    description = u'News from Vancouver, BC'
-##    fp_tag = 'CAN_VP'
-
-    # un-comment the following four lines for the Vancouver Sun
-##    title = u'Vancouver Sun'
-##    url_prefix = 'http://www.vancouversun.com'
-##    description = u'News from Vancouver, BC'
-##    fp_tag = 'CAN_VS'
-
-    # un-comment the following four lines for the Edmonton Journal
-##    title = u'Edmonton Journal'
-##    url_prefix = 'http://www.edmontonjournal.com'
-##    description = u'News from Edmonton, AB'
-##    fp_tag = 'CAN_EJ'
-
-    # un-comment the following four lines for the Calgary Herald
-##    title = u'Calgary Herald'
-##    url_prefix = 'http://www.calgaryherald.com'
-##    description = u'News from Calgary, AB'
-##    fp_tag = 'CAN_CH'
-
-    # un-comment the following four lines for the Regina Leader-Post
-##    title = u'Regina Leader-Post'
-##    url_prefix = 'http://www.leaderpost.com'
-##    description = u'News from Regina, SK'
-##    fp_tag = ''
-
-    # un-comment the following four lines for the Saskatoon Star-Phoenix
-##    title = u'Saskatoon Star-Phoenix'
-##    url_prefix = 'http://www.thestarphoenix.com'
-##    description = u'News from Saskatoon, SK'
-##    fp_tag = ''
-
-    # un-comment the following four lines for the Windsor Star
-##    title = u'Windsor Star'
-##    url_prefix = 'http://www.windsorstar.com'
-##    description = u'News from Windsor, ON'
-##    fp_tag = 'CAN_'
-
-    # un-comment the following four lines for the Ottawa Citizen
-##    title = u'Ottawa Citizen'
-##    url_prefix = 'http://www.ottawacitizen.com'
-##    description = u'News from Ottawa, ON'
-##    fp_tag = 'CAN_OC'
-
-    # un-comment the following four lines for the Montreal Gazette
-##    title = u'Montreal Gazette'
-##    url_prefix = 'http://www.montrealgazette.com'
-##    description = u'News from Montreal, QC'
-##    fp_tag = 'CAN_MG'
-
-
+    url_list = []
    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt =  ' [%b %d]'
+    encoding = 'utf-8'
    extra_css = '''
-                .timestamp {  font-size:xx-small; display: block; }
-                #storyheader { font-size: medium; }
-                #storyheader h1 { font-size: x-large; }
-                #storyheader h2 { font-size: large;  font-style: italic; }
-                .byline { font-size:xx-small; }
-                #photocaption { font-size: small; font-style: italic }
-                #photocredit { font-size: xx-small; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
+                .byline { font-size:xx-small; font-weight: bold;}
+                h3 { margin-bottom: 6px; }
+                .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
+                '''
+    keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]
    remove_tags = [{'class':'comments'},
-                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
-                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
-                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
-                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
-                   dict(name='div', attrs={'class':'rule_grey_solid'}),
-                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+                   {'id':'photocredit'},
+                   dict(name='div', attrs={'class':re.compile('top.controls')}),
+                   dict(name='div', attrs={'class':re.compile('social')}),
+                   dict(name='div', attrs={'class':re.compile('tools')}),
+                   dict(name='div', attrs={'class':re.compile('bottom.tools')}),
+                   dict(name='div', attrs={'class':re.compile('window')}),
+                   dict(name='div', attrs={'class':re.compile('related.news.element')})]
+

    def get_cover_url(self):
        from datetime import timedelta, date
-        if self.fp_tag=='':
-            return None
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
        br = BasicNewsRecipe.get_browser(self)
        daysback=1
@ -120,6 +61,18 @@ class CanWestPaper(BasicNewsRecipe):
            cover = None
        return cover

+    def prepare_masthead_image(self, path_to_image, out_path):
+        '''
+        Write the masthead image found at path_to_image to out_path.
+
+        When the recipe has a true Kindle_Fire attribute, the image is
+        composed onto a freshly created canvas of the same width and height
+        and that canvas is saved. Otherwise the work is delegated to
+        BasicNewsRecipe.prepare_masthead_image.
+        '''
+        # Kindle_Fire is not defined in the part of the class visible here,
+        # so read it defensively instead of risking an AttributeError.
+        if not getattr(self, 'Kindle_Fire', False):
+            # The base implementation is reached through the class, so the
+            # instance has to be passed explicitly as the first argument.
+            BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)
+            return
+        from calibre.utils.magick import Image, create_canvas
+        img = Image()
+        img.open(path_to_image)
+        width, height = img.size
+        img2 = create_canvas(width, height)
+        img2.compose(img)
+        img2.save(out_path)
+
+
    def fixChars(self,string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91","‘",string)
@ -167,54 +120,106 @@ class CanWestPaper(BasicNewsRecipe):
        return soup

    def preprocess_html(self,soup):
+        '''
+        Tidy an article page before conversion and return the result of
+        self.strip_anchors(soup).
+
+        - The first <p> whose class matches 'ancillary' is replaced by a
+          <div class="byline"> holding its text, with '/ Times Colonist'
+          reduced to '/' and a leading 'BY /' removed (case-insensitive).
+        - Every <p> whose class matches 'caption' is replaced by a
+          <div class="caption"> holding its text with the 'Photograph by...'
+          tail removed.
+        - Every <p> whose text is empty once whitespace is removed, or is
+          just '&nbsp;', is taken out of the tree.
+        '''
+        byline = soup.find('p',attrs={'class':re.compile('ancillary')})
+        if byline is not None:
+            authstr = self.tag_to_string(byline,False)
+            authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
+            authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
+            newdiv = Tag(soup,'div')
+            newdiv.insert(0,authstr)
+            newdiv['class']='byline'
+            byline.replaceWith(newdiv)
+        for caption in soup.findAll('p',attrs={'class':re.compile('caption')}):
+            capstr = self.tag_to_string(caption,False)
+            capstr = re.sub('Photograph by.*$','',capstr, flags=re.IGNORECASE)
+            newdiv = Tag(soup,'div')
+            newdiv.insert(0,capstr)
+            newdiv['class']='caption'
+            caption.replaceWith(newdiv)
+        # NOTE(review): a paragraph holding only non-text content (e.g. an
+        # image) presumably also yields empty text and is removed here;
+        # confirm that this is intended.
+        for ptag in soup.findAll('p'):
+            ptext = self.tag_to_string(ptag,use_alt=False, normalize_whitespace=True)
+            ptext = re.sub(r'\s+','', ptext)
+            if ptext in ('', '&nbsp;'):
+                ptag.extract()
        return self.strip_anchors(soup)

+    # Becomes True once the first article with RAESIDE in its title has been
+    # accepted; any later RAESIDE article is skipped.
+    raeside = False
+    def handle_articles(self,htag,article_list,sectitle):
+        '''
+        Append the article linked from the heading tag htag to article_list.
+
+        htag: heading tag; the link is taken from htag.a.
+        article_list: list that receives one dict per accepted article, with
+            the keys title, url, date, description, author and content.
+        sectitle: title of the section being indexed (not used here).
+
+        Nothing is appended when the heading has no link with an href, when
+        the URL is already recorded in self.url_list, when the title contains
+        VIDEO, GALLERY or PHOTOS (any case), or for a second RAESIDE title.
+        '''
+        atag = htag.a
+        if atag is None:
+            return
+        # An anchor without an href is not an article link; indexing it with
+        # atag['href'] would fail instead of skipping it.
+        href = atag.get('href')
+        if not href:
+            return
+        url = href
+        if href.startswith('/'):
+            # Site-relative link: make it absolute.
+            url = self.url_prefix+href
+        if url in self.url_list:
+            return
+        # The URL is recorded before the title filters run, so a filtered
+        # article is not reconsidered when another section links to it.
+        self.url_list.append(url)
+        title = self.tag_to_string(atag,False)
+        upper_title = title.upper()
+        for unwanted in ('VIDEO', 'GALLERY', 'PHOTOS'):
+            if unwanted in upper_title:
+                return
+        if 'RAESIDE' in upper_title:
+            if self.raeside:
+                return
+            self.raeside = True
+        # The description is the text of the next <p> after the heading.
+        dtag = htag.findNext('p')
+        description=''
+        if dtag is not None:
+            description = self.tag_to_string(dtag,False)
+        article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))

+    def add_section_index(self,ans,securl,sectitle):
+        '''
+        Index one section page and append (sectitle, article_list) to ans.
+
+        ans: list of (section title, article list) tuples built so far.
+        securl: section path relative to self.url_prefix ('' for the front
+            page).
+        sectitle: title to give the section.
+
+        Returns ans. The section is skipped, and ans returned unchanged, when
+        the page cannot be fetched or has no main content div.
+        '''
+        print("Add section url="+self.url_prefix+'/'+securl)
+        try:
+            soup = self.index_to_soup(self.url_prefix+'/'+securl)
+        except Exception:
+            # Best effort: one unreachable section must not abort the index.
+            return ans
+        mainsoup = soup.find('div',attrs={'class':re.compile('main.content')})
+        if mainsoup is None:
+            # Unexpected page layout; skip the section instead of failing
+            # on the findAll calls below.
+            return ans
+        article_list = []
+        for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('featured.story')}):
+            for htag in wdiv.findAll('h3'):
+                self.handle_articles(htag,article_list,sectitle)
+        # Article rows are only indexed when the page has a leading-articles
+        # element. One pass over the rows is enough: handle_articles ignores
+        # URLs it has already seen, so repeating the scan adds nothing.
+        if mainsoup.find(attrs={'class':re.compile('leading.articles')}) is not None:
+            for wdiv in mainsoup.findAll('div',attrs={'class':re.compile('article.row')}):
+                for htag in wdiv.findAll('h2'):
+                    self.handle_articles(htag,article_list,sectitle)
+        ans.append((sectitle,article_list))
+        return ans

    def parse_index(self):
-        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
-
-        articles = {}
-        key = 'News'
-        ans = ['News']
-
-        # Find each instance of class="sectiontitle", class="featurecontent"
-        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
-                #self.log(" div class = %s" % divtag['class'])
-                if divtag['class'].startswith('section_title'):
-                    # div contains section title
-                    if not divtag.h3:
-                        continue
-                    key = self.tag_to_string(divtag.h3,False)
-                    ans.append(key)
-                    self.log("Section name %s" % key)
-                    continue
-                # div contains article data
-                h1tag = divtag.find('h1')
-                if not h1tag:
-                    continue
-                atag = h1tag.find('a',href=True)
-                if not atag:
-                    continue
-                url = self.url_prefix+'/news/todays-paper/'+atag['href']
-                #self.log("Section %s" % key)
-                #self.log("url %s" % url)
-                title = self.tag_to_string(atag,False)
-                #self.log("title %s" % title)
-                pubdate = ''
-                description = ''
-                ptag = divtag.find('p');
-                if ptag:
-                    description = self.tag_to_string(ptag,False)
-                    #self.log("description %s" % description)
-                author = ''
-                autag = divtag.find('h4')
-                if autag:
-                    author = self.tag_to_string(autag,False)
-                    #self.log("author %s" % author)
-                if not articles.has_key(key):
-                    articles[key] = []
-                articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
-
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        '''
+        Build the index as a list of (section title, article list) tuples,
+        one section page at a time, in the order listed below.
+        '''
+        # (path relative to url_prefix, section title)
+        sections = [
+            ('','Web Front Page'),
+            ('news/','News Headlines'),
+            ('news/b-c/','BC News'),
+            ('news/national/','National News'),
+            ('news/world/','World News'),
+            ('opinion/','Opinion'),
+            ('opinion/letters/','Letters'),
+            ('business/','Business'),
+            ('business/money/','Money'),
+            ('business/technology/','Technology'),
+            ('business/working/','Working'),
+            ('sports/','Sports'),
+            ('sports/hockey/','Hockey'),
+            ('sports/football/','Football'),
+            ('sports/basketball/','Basketball'),
+            ('sports/golf/','Golf'),
+            ('entertainment/','entertainment'),
+            ('entertainment/go/','Go!'),
+            ('entertainment/music/','Music'),
+            ('entertainment/books/','Books'),
+            ('entertainment/Movies/','movies'),
+            ('entertainment/television/','Television'),
+            ('life/','Life'),
+            ('life/health/','Health'),
+            ('life/travel/','Travel'),
+            ('life/driving/','Driving'),
+            ('life/homes/','Homes'),
+            ('life/food-drink/','Food & Drink'),
+        ]
+        ans = []
+        for securl, sectitle in sections:
+            ans = self.add_section_index(ans,securl,sectitle)
        return ans
+
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@ -614,10 +614,14 @@ class Amazon(Source):
        return domain

    def clean_downloaded_metadata(self, mi):
-        if mi.title and self.domain in ('com', 'uk'):
+        docase = (
+            mi.language == 'eng' or
+            (mi.is_null('language') and self.domain in {'com', 'uk'})
+        )
+        if mi.title and docase:
            mi.title = fixcase(mi.title)
        mi.authors = fixauthors(mi.authors)
-        if self.domain in ('com', 'uk'):
+        if mi.tags and docase:
            mi.tags = list(map(fixcase, mi.tags))
        mi.isbn = check_isbn(mi.isbn)