From ff081d1515b95f8299ead074dc22d83bb66477ed Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 26 Jan 2010 01:59:10 -0700
Subject: [PATCH] New recipe for Macleans Magazine by Nick Redding and
 improved recipe for Raleigh News and Observer

---
 resources/images/news/observer.png |  Bin 0 -> 835 bytes
 resources/recipes/macleans.recipe  |  239 +++++++++++++++++++++++++++++
 resources/recipes/observer.recipe  |   37 +++--
 3 files changed, 262 insertions(+), 14 deletions(-)
 create mode 100644 resources/images/news/observer.png
 create mode 100644 resources/recipes/macleans.recipe

diff --git a/resources/images/news/observer.png b/resources/images/news/observer.png
new file mode 100644
index 0000000000000000000000000000000000000000..5fbb7a6ccc5d89e65aab9f7263ea81dab39db99b
GIT binary patch
literal 835
zcmV-J1HAl+P)BcB5d@K5{K079Ttgw|cV*s*6X&8~~c{W)*0
zoH|a^%oJWMhO{h<)(9yZ9b*~`Fb2FNv4IRbhmK$!*w%JVPmI#)#*vnVkdjE;C7szz
zYySYsaqw4H5K?ZIWg)djZEP^VwuV~1Jy^=#~`PuOcH+3O|f9
`Sx3c>+6k^^!i^cO;4ekBMvH&MAgsS`SK7==`>4U4m0=Z$2@uZEUC;OPFtG7wXeWv
grzWAQ$?Xv#6~&}wo=rpRcy}#1o`n%vZLSdloaSNq>|6!mrH;EK-xA2kTULK#XZW4
3uGt8SiCw67;d?^;ZZ=C8BL@sOC7c7ca0j
H(NKTK|w=3!fKWI&pu)IiPQAGdX)Bk&ttSkI!@#BUQaKoDMnaXZ#dIB*xhk1UdW?_
;QsXqjPK+1^bz0DNmII=^2|-V?-ii3=)CqI^jdT
qAe|i)hc17!hc~ftkewQOGBswuN~#{Y?k7U?~$b<;$d)>mkA9hApk?ofUGkVYX>o^xk8ywICP5IQwf-OT!$7)QxAMp{&(YD<&EFK*D%r8rnxX&z
N002ovPDHLkV1hsDiQoVL

literal 0
HcmV?d00001

diff --git a/resources/recipes/macleans.recipe b/resources/recipes/macleans.recipe
new file mode 100644
index 0000000000..296a56f5f3
--- /dev/null
+++ b/resources/recipes/macleans.recipe
@@ -0,0 +1,239 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+'''
+macleans.ca
+'''
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag
+from datetime import timedelta, date
+
+class Macleans(BasicNewsRecipe):
+    title = u'Macleans Magazine'
+    __author__ = 'Nick Redding'
+    language = 'en_CA'
+    description = ('Macleans Magazine')
+
+    no_stylesheets = True
+    timefmt = ' [%b %d]'
+
+    # customization notes: delete sections you are not interested in
+    # set oldest_article to the maximum number of days back from today to include articles
+    sectionlist = [
+        ['http://www2.macleans.ca/','Front Page'],
+        ['http://www2.macleans.ca/category/canada/','Canada'],
+        ['http://www2.macleans.ca/category/world-from-the-magazine/','World'],
+        ['http://www2.macleans.ca/category/business','Business'],
+        ['http://www2.macleans.ca/category/arts-culture/','Culture'],
+        ['http://www2.macleans.ca/category/opinion','Opinion'],
+        ['http://www2.macleans.ca/category/health-from-the-magazine/','Health'],
+        ['http://www2.macleans.ca/category/environment-from-the-magazine/','Environment'],
+        ['http://www2.macleans.ca/category/education/','On Campus'],
+        ['http://www2.macleans.ca/category/travel-from-the-magazine/','Travel']
+    ]
+    oldest_article = 7
+
+    # formatting for print version of articles
+    extra_css = '''h2{font-family:Times,serif; font-size:large;}
+                   small {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
+                '''
+
+    # tag handling for print version of articles
+    keep_only_tags = [dict(id='tw-print')]
+    remove_tags = [dict({'class':'postmetadata'})]
+
+
+    def preprocess_html(self,soup):
+        for img_tag in soup.findAll('img'):
+            parent_tag = img_tag.parent
+            if parent_tag.name == 'a':
+                new_tag = Tag(soup,'p')
+                new_tag.insert(0,img_tag)
+                parent_tag.replaceWith(new_tag)
+            elif parent_tag.name == 'p':
+                if not self.tag_to_string(parent_tag) == '':
+                    new_div = Tag(soup,'div')
+                    new_tag = Tag(soup,'p')
+                    new_tag.insert(0,img_tag)
+                    parent_tag.replaceWith(new_div)
+                    new_div.insert(0,new_tag)
+                    new_div.insert(1,parent_tag)
+        return soup
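
The preprocess_html hook above re-parents images because Macleans either wraps them in links or embeds them inside text paragraphs; moving each <img> into its own block-level tag keeps the converted pages readable. A minimal sketch of the first branch (a linked image), using the same BeautifulSoup 3 API the recipe imports; the sample markup is made up:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

    soup = BeautifulSoup('<div><a href="/story"><img src="cover.jpg" /></a></div>')
    img = soup.find('img')
    anchor = img.parent              # the <a> wrapping the image
    block = Tag(soup, 'p')           # new block-level container
    block.insert(0, img)             # insert() extracts the <img> from the <a> first
    anchor.replaceWith(block)        # the link is dropped, the image survives
    print soup                       # <div><p><img src="cover.jpg" /></p></div>

The elif branch handles the harder case of an image inside a non-empty <p>, wrapping both the image and the original paragraph in a <div> so neither is lost.
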
+
+    def parse_index(self):
+
+        articles = {}
+        key = None
+        ans = []
+
+        def parse_index_page(page_url,page_title):
+
+            def decode_date(datestr):
+                dmysplit = datestr.strip().lower().split(',')
+                mdsplit = dmysplit[1].split()
+                m = ['january','february','march','april','may','june','july','august','september','october','november','december'].index(mdsplit[0])+1
+                d = int(mdsplit[1])
+                y = int(dmysplit[2].split()[0])
+                return date(y,m,d)
+
+            def article_title(tag):
+                atag = tag.find('a',href=True)
+                if not atag:
+                    return ''
+                return self.tag_to_string(atag)
+
+            def article_url(tag):
+                atag = tag.find('a',href=True)
+                if not atag:
+                    return ''
+                return atag['href']+'print/'
+
+            def article_description(tag):
+                for p_tag in tag.findAll('p'):
+                    d = self.tag_to_string(p_tag,False)
+                    if not d == '':
+                        return d
+                return ''
+
+            def compound_h4_h3_title(tag):
+                if tag.h4:
+                    if tag.h3:
+                        return self.tag_to_string(tag.h4,False)+u'\u2014'+self.tag_to_string(tag.h3,False)
+                    else:
+                        return self.tag_to_string(tag.h4,False)
+                elif tag.h3:
+                    return self.tag_to_string(tag.h3,False)
+                else:
+                    return ''
+
+            def compound_h2_h4_title(tag):
+                if tag.h2:
+                    if tag.h4:
+                        return self.tag_to_string(tag.h2,False)+u'\u2014'+self.tag_to_string(tag.h4,False)
+                    else:
+                        return self.tag_to_string(tag.h2,False)
+                elif tag.h4:
+                    return self.tag_to_string(tag.h4,False)
+                else:
+                    return ''
+
+            def handle_article(header_tag, outer_tag):
+                if header_tag:
+                    url = article_url(header_tag)
+                    title = article_title(header_tag)
+                    author_date_tag = outer_tag.h4
+                    if author_date_tag:
+                        author_date = self.tag_to_string(author_date_tag,False).split(' - ')
+                        author = author_date[0].strip()
+                        article_date = decode_date(author_date[1])
+                        earliest_date = date.today() - timedelta(days=self.oldest_article)
+                        if article_date < earliest_date:
+                            self.log("Skipping article dated %s" % author_date[1])
+                        else:
+                            excerpt_div = outer_tag.find('div','excerpt')
+                            if excerpt_div:
+                                description = article_description(excerpt_div)
+                            else:
+                                description = ''
+                            if not articles.has_key(page_title):
+                                articles[page_title] = []
+                            articles[page_title].append(dict(title=title,url=url,date=author_date[1],description=description,author=author,content=''))
+
+            def handle_category_article(cat, header_tag, outer_tag):
+                url = article_url(header_tag)
+                title = article_title(header_tag)
+                if not title == '':
+                    title = cat+u'\u2014'+title
+                a_tag = outer_tag.find('span','authorLink')
+                if a_tag:
+                    author = self.tag_to_string(a_tag,False)
+                    a_tag.parent.extract()
+                else:
+                    author = ''
+                description = article_description(outer_tag)
+                if not articles.has_key(page_title):
+                    articles[page_title] = []
+                articles[page_title].append(dict(title=title,url=url,date='',description=description,author=author,content=''))
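
handle_article above is where oldest_article takes effect: decode_date turns the date portion of the byline (split off after ' - ') into a datetime.date, which is compared against a cutoff computed from today. A standalone sketch of that logic; the byline string here is invented:

    from datetime import date, timedelta

    MONTHS = ['january','february','march','april','may','june',
              'july','august','september','october','november','december']

    def decode_date(datestr):
        # "Tuesday, January 26, 2010" -> date(2010, 1, 26)
        dmy = datestr.strip().lower().split(',')
        month_day = dmy[1].split()
        return date(int(dmy[2].split()[0]),
                    MONTHS.index(month_day[0]) + 1,
                    int(month_day[1]))

    oldest_article = 7
    cutoff = date.today() - timedelta(days=oldest_article)
    # True once the article is more than oldest_article days old
    print decode_date('Tuesday, January 26, 2010') < cutoff
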
+
+            soup = self.index_to_soup(page_url)
+
+            if page_title == 'Front Page':
+                # special processing for the front page
+                top_stories = soup.find('div',{ "id" : "macleansFeatured" })
+                if top_stories:
+                    for div_slide in top_stories.findAll('div','slide'):
+                        url = article_url(div_slide)
+                        div_title = div_slide.find('div','header')
+                        if div_title:
+                            title = self.tag_to_string(div_title,False)
+                        else:
+                            title = ''
+                        description = article_description(div_slide)
+                        if not articles.has_key(page_title):
+                            articles[page_title] = []
+                        articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content=''))
+
+                from_macleans = soup.find('div',{ "id" : "fromMacleans" })
+                if from_macleans:
+                    for li_tag in from_macleans.findAll('li','fromMacleansArticle'):
+                        title = compound_h4_h3_title(li_tag)
+                        url = article_url(li_tag)
+                        description = article_description(li_tag)
+                        if not articles.has_key(page_title):
+                            articles[page_title] = []
+                        articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content=''))
+
+                blog_central = soup.find('div',{ "id" : "bloglist" })
+                if blog_central:
+                    for li_tag in blog_central.findAll('li'):
+                        title = compound_h2_h4_title(li_tag)
+                        if li_tag.h4:
+                            url = article_url(li_tag.h4)
+                            if not articles.has_key(page_title):
+                                articles[page_title] = []
+                            articles[page_title].append(dict(title=title,url=url,date='',description='',author='',content=''))
+
+#                need_to_know = soup.find('div',{ "id" : "needToKnow" })
+#                if need_to_know:
+#                    for div_tag in need_to_know('div',attrs={'class' : re.compile("^needToKnowArticle")}):
+#                        title = compound_h4_h3_title(div_tag)
+#                        url = article_url(div_tag)
+#                        description = article_description(div_tag)
+#                        if not articles.has_key(page_title):
+#                            articles[page_title] = []
+#                        articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content=''))
+
+                for news_category in soup.findAll('div','newsCategory'):
+                    news_cat = self.tag_to_string(news_category.h4,False)
+                    handle_category_article(news_cat, news_category.find('h2'), news_category.find('div'))
+                    for news_item in news_category.findAll('li'):
+                        handle_category_article(news_cat,news_item.h3,news_item)
+
+                return
+
+            # find the div containing the highlight article
+            div_post = soup.find('div','post')
+            if div_post:
+                h1_tag = div_post.h1
+                handle_article(h1_tag,div_post)
+
+            # find the divs containing the rest of the articles
+            div_other = div_post.find('div', { "id" : "categoryOtherPosts" })
+            if div_other:
+                for div_entry in div_other.findAll('div','entry'):
+                    h2_tag = div_entry.h2
+                    handle_article(h2_tag,div_entry)
+
+        for page_name,page_title in self.sectionlist:
+            parse_index_page(page_name,page_title)
+            ans.append(page_title)
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return ans
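
All of the scraping above exists to build the one value calibre cares about: parse_index must return a list of (section title, article list) pairs, where each article is a dict with title, url, date, description, author and content keys, exactly the dicts the recipe appends. A hypothetical, hard-coded return value showing only the shape (the titles and URLs are invented):

    index = [
        ('Front Page', [
            dict(title='Sample story',
                 url='http://www2.macleans.ca/sample-story/print/',
                 date='', description='One-sentence teaser.',
                 author='', content=''),
        ]),
        ('Canada', [
            dict(title='Another sample story',
                 url='http://www2.macleans.ca/another-story/print/',
                 date='January 26, 2010', description='',
                 author='Nick Redding', content=''),
        ]),
    ]

Sections appear in the book in list order, which is why the recipe appends page_title to ans as each section page is parsed and then filters ans down to the sections that actually produced articles.
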
diff --git a/resources/recipes/observer.recipe b/resources/recipes/observer.recipe
index 139d1ff7d4..dec9da8f37 100644
--- a/resources/recipes/observer.recipe
+++ b/resources/recipes/observer.recipe
@@ -1,31 +1,40 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class NewsandObserver(BasicNewsRecipe):
-    title = u'News and Observer'
+    title = u'Raleigh News & Observer'
     description = 'News from Raleigh, North Carolina'
     language = 'en'
-    __author__ = 'Krittika Goyal'
-    oldest_article = 5 #days
+    __author__ = 'Krittika Goyal updated by Walt Anthony'
+    oldest_article = 3 #days
     max_articles_per_feed = 25
+    summary_length = 150
+
+    no_stylesheets = True
+    remove_javascript = True
 
-    remove_stylesheets = True
     remove_tags_before = dict(name='h1', attrs={'id':'story_headline'})
-    remove_tags_after = dict(name='div', attrs={'id':'story_text_remaining'})
+    remove_tags_after = dict(name='div', attrs={'id':'story_text_remaining'})
+
+
     remove_tags = [
        dict(name='iframe'),
-       dict(name='div', attrs={'id':['right-rail', 'story_tools']}),
+       dict(name='div', attrs={'id':['right-rail', 'story_tools', 'toolbox', 'toolbar', 'tool', 'shirttail', 'comment_widget', 'story_keywords', 'txtResizeTool']}),
+       dict(name='div', attrs={'class':['Buy-It-Now', 'story_link_share']}),
        dict(name='ul', attrs={'class':'bold_tabs_nav'}),
+
     ]
 
+
     feeds = [
-    ('Cover', 'http://www.newsobserver.com/100/index.rss'),
-    ('News', 'http://www.newsobserver.com/102/index.rss'),
-    ('Politics', 'http://www.newsobserver.com/105/index.rss'),
-    ('Business', 'http://www.newsobserver.com/104/index.rss'),
-    ('Sports', 'http://www.newsobserver.com/103/index.rss'),
-    ('College Sports', 'http://www.newsobserver.com/119/index.rss'),
-    ('Lifestyles', 'http://www.newsobserver.com/106/index.rss'),
-    ('Editorials', 'http://www.newsobserver.com/158/index.rss')]
+    ('Cover', 'http://www.newsobserver.com/100/index.rss'),
+    ('News', 'http://www.newsobserver.com/102/index.rss'),
+    ('Politics', 'http://www.newsobserver.com/105/index.rss'),
+    ('Business', 'http://www.newsobserver.com/104/index.rss'),
+    ('Sports', 'http://www.newsobserver.com/103/index.rss'),
+    ('College Sports', 'http://www.newsobserver.com/119/index.rss'),
+    ('Lifestyles', 'http://www.newsobserver.com/106/index.rss'),
+    ('Editorials', 'http://www.newsobserver.com/158/index.rss')
+    ]
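
The observer changes are mostly about stripping page furniture: judging by their names, the ids and classes added to remove_tags ('toolbox', 'toolbar', 'comment_widget', 'txtResizeTool', 'Buy-It-Now', and so on) all name chrome widgets on newsobserver.com story pages. Conceptually, each remove_tags entry is a findAll filter whose matches are extracted from the page; a simplified sketch of that idea, not calibre's actual implementation:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    def apply_remove_tags(soup, remove_tags):
        # Simplified: the real pipeline also applies keep_only_tags and
        # remove_tags_before/after, but the core is findAll plus extract.
        for spec in remove_tags:
            for tag in soup.findAll(**spec):
                tag.extract()        # drop the tag and everything inside it
        return soup

    soup = BeautifulSoup('<div id="story"><p>text</p><div id="toolbox">junk</div></div>')
    apply_remove_tags(soup, [dict(name='div', attrs={'id': 'toolbox'})])
    print soup                       # the toolbox div is gone, the story text stays
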