Improve Philadelphia Inquirer and Macleans Magazine

2025-08-11 09:13:57 -04:00 · 2011-06-17 10:38:37 -06:00 · 2011-06-17 10:38:37 -06:00 · 0a70d18f14
commit 0a70d18f14
parent 420f806a0b
2 changed files with 56 additions and 307 deletions
--- a/recipes/macleans.recipe
+++ b/recipes/macleans.recipe
@@ -1,239 +1,28 @@
 #!/usr/bin/env  python
+from calibre.web.feeds.news import BasicNewsRecipe

-__license__   = 'GPL v3'
-
-'''
-macleans.ca
-'''
-from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag
-from datetime import timedelta, date
-
-class Macleans(BasicNewsRecipe):
+class AdvancedUserRecipe1308306308(BasicNewsRecipe):
    title          = u'Macleans Magazine'
-    __author__     = 'Nick Redding'
    language = 'en_CA'
-    description = ('Macleans Magazine')
+    __author__ = 'sexymax15'
+    oldest_article = 30
+    max_articles_per_feed = 12

+    use_embedded_content = False
+
+    remove_empty_feeds = True
    no_stylesheets = True
-    timefmt = ' [%b %d]'
+    remove_javascript = True
+    remove_tags = [dict(name ='img'),dict (id='header'),{'class':'postmetadata'}]
+    remove_tags_after = {'class':'postmetadata'}

-    # customization notes: delete sections you are not interested in
-    # set oldest_article to the maximum number of days back from today to include articles
-    sectionlist = [
-                        ['http://www2.macleans.ca/','Front Page'],
-                        ['http://www2.macleans.ca/category/canada/','Canada'],
-                        ['http://www2.macleans.ca/category/world-from-the-magazine/','World'],
-                        ['http://www2.macleans.ca/category/business','Business'],
-                        ['http://www2.macleans.ca/category/arts-culture/','Culture'],
-                        ['http://www2.macleans.ca/category/opinion','Opinion'],
-                        ['http://www2.macleans.ca/category/health-from-the-magazine/','Health'],
-                        ['http://www2.macleans.ca/category/environment-from-the-magazine/','Environment'],
-                        ['http://www2.macleans.ca/category/education/','On Campus'],
-                        ['http://www2.macleans.ca/category/travel-from-the-magazine/','Travel']
-                    ]
-    oldest_article = 7
-
-    # formatting for print version of articles
-    extra_css   =   '''h2{font-family:Times,serif; font-size:large;}
-                        small {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
-                    '''
-
-    # tag handling for print version of articles
-    keep_only_tags = [dict(id='tw-print')]
-    remove_tags =   [dict({'class':'postmetadata'})]
-
-
-    def preprocess_html(self,soup):
-        for img_tag in soup.findAll('img'):
-            parent_tag = img_tag.parent
-            if parent_tag.name == 'a':
-                new_tag = Tag(soup,'p')
-                new_tag.insert(0,img_tag)
-                parent_tag.replaceWith(new_tag)
-            elif parent_tag.name == 'p':
-                if not self.tag_to_string(parent_tag) == '':
-                    new_div = Tag(soup,'div')
-                    new_tag = Tag(soup,'p')
-                    new_tag.insert(0,img_tag)
-                    parent_tag.replaceWith(new_div)
-                    new_div.insert(0,new_tag)
-                    new_div.insert(1,parent_tag)
-        return soup
-
-    def parse_index(self):
-
-
-
-        articles = {}
-        key = None
-        ans = []
-
-        def parse_index_page(page_url,page_title):
-
-            def decode_date(datestr):
-                dmysplit = datestr.strip().lower().split(',')
-                mdsplit = dmysplit[1].split()
-                m = ['january','february','march','april','may','june','july','august','september','october','november','december'].index(mdsplit[0])+1
-                d = int(mdsplit[1])
-                y = int(dmysplit[2].split()[0])
-                return date(y,m,d)
-
-            def article_title(tag):
-                atag = tag.find('a',href=True)
-                if not atag:
-                    return ''
-                return self.tag_to_string(atag)
-
-            def article_url(tag):
-                atag = tag.find('a',href=True)
-                if not atag:
-                    return ''
-                return atag['href']+'print/'
-
-            def article_description(tag):
-                for p_tag in tag.findAll('p'):
-                    d = self.tag_to_string(p_tag,False)
-                    if not d == '':
-                        return d
-                return ''
-
-            def compound_h4_h3_title(tag):
-                if tag.h4:
-                    if tag.h3:
-                        return self.tag_to_string(tag.h4,False)+u'\u2014'+self.tag_to_string(tag.h3,False)
-                    else:
-                        return self.tag_to_string(tag.h4,False)
-                elif tag.h3:
-                    return self.tag_to_string(tag.h3,False)
-                else:
-                    return ''
-
-            def compound_h2_h4_title(tag):
-                if tag.h2:
-                    if tag.h4:
-                        return self.tag_to_string(tag.h2,False)+u'\u2014'+self.tag_to_string(tag.h4,False)
-                    else:
-                        return self.tag_to_string(tag.h2,False)
-                elif tag.h4:
-                    return self.tag_to_string(tag.h4,False)
-                else:
-                    return ''
-
-
-            def handle_article(header_tag, outer_tag):
-                if header_tag:
-                    url = article_url(header_tag)
-                    title = article_title(header_tag)
-                    author_date_tag = outer_tag.h4
-                    if author_date_tag:
-                        author_date = self.tag_to_string(author_date_tag,False).split(' - ')
-                        author = author_date[0].strip()
-                        article_date = decode_date(author_date[1])
-                        earliest_date = date.today() - timedelta(days=self.oldest_article)
-                        if article_date < earliest_date:
-                            self.log("Skipping article dated %s" % author_date[1])
-                        else:
-                            excerpt_div = outer_tag.find('div','excerpt')
-                            if excerpt_div:
-                                description = article_description(excerpt_div)
-                            else:
-                                description = ''
-                            if not articles.has_key(page_title):
-                                articles[page_title] = []
-                            articles[page_title].append(dict(title=title,url=url,date=author_date[1],description=description,author=author,content=''))
-
-            def handle_category_article(cat, header_tag, outer_tag):
-                url = article_url(header_tag)
-                title = article_title(header_tag)
-                if not title == '':
-                    title = cat+u'\u2014'+title
-                a_tag = outer_tag.find('span','authorLink')
-                if a_tag:
-                    author = self.tag_to_string(a_tag,False)
-                    a_tag.parent.extract()
-                else:
-                    author = ''
-                description = article_description(outer_tag)
-                if not articles.has_key(page_title):
-                    articles[page_title] = []
-                articles[page_title].append(dict(title=title,url=url,date='',description=description,author=author,content=''))
-
-
-            soup = self.index_to_soup(page_url)
-
-            if page_title == 'Front Page':
-                # special processing for the front page
-                top_stories = soup.find('div',{ "id" : "macleansFeatured" })
-                if top_stories:
-                    for div_slide in top_stories.findAll('div','slide'):
-                        url = article_url(div_slide)
-                        div_title = div_slide.find('div','header')
-                        if div_title:
-                            title = self.tag_to_string(div_title,False)
-                        else:
-                            title = ''
-                        description = article_description(div_slide)
-                        if not articles.has_key(page_title):
-                             articles[page_title] = []
-                        articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content=''))
-
-                from_macleans = soup.find('div',{ "id" : "fromMacleans" })
-                if from_macleans:
-                    for li_tag in from_macleans.findAll('li','fromMacleansArticle'):
-                        title = compound_h4_h3_title(li_tag)
-                        url = article_url(li_tag)
-                        description = article_description(li_tag)
-                        if not articles.has_key(page_title):
-                            articles[page_title] = []
-                        articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content=''))
-
-                blog_central = soup.find('div',{ "id" : "bloglist" })
-                if blog_central:
-                    for li_tag in blog_central.findAll('li'):
-                        title = compound_h2_h4_title(li_tag)
-                        if li_tag.h4:
-                            url = article_url(li_tag.h4)
-                            if not articles.has_key(page_title):
-                                articles[page_title] = []
-                            articles[page_title].append(dict(title=title,url=url,date='',description='',author='',content=''))
-
-#                need_to_know = soup.find('div',{ "id" : "needToKnow" })
-#                if need_to_know:
-#                    for div_tag in need_to_know('div',attrs={'class' : re.compile("^needToKnowArticle")}):
-#                        title = compound_h4_h3_title(div_tag)
-#                        url = article_url(div_tag)
-#                        description = article_description(div_tag)
-#                        if not articles.has_key(page_title):
-#                            articles[page_title] = []
-#                        articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content=''))
-
-                for news_category in soup.findAll('div','newsCategory'):
-                    news_cat = self.tag_to_string(news_category.h4,False)
-                    handle_category_article(news_cat, news_category.find('h2'), news_category.find('div'))
-                    for news_item in news_category.findAll('li'):
-                        handle_category_article(news_cat,news_item.h3,news_item)
-
-                return
-
-            # find the div containing the highlight article
-            div_post = soup.find('div','post')
-            if div_post:
-                h1_tag = div_post.h1
-                handle_article(h1_tag,div_post)
-
-            # find the divs containing the rest of the articles
-            div_other = div_post.find('div', { "id" : "categoryOtherPosts" })
-            if div_other:
-                for div_entry in div_other.findAll('div','entry'):
-                    h2_tag = div_entry.h2
-                    handle_article(h2_tag,div_entry)
-
-
-
-        for page_name,page_title in self.sectionlist:
-            parse_index_page(page_name,page_title)
-            ans.append(page_title)
-
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return ans
+    feeds          = [(u'Blog Central', u'http://www2.macleans.ca/category/blog-central/feed/'),
+ (u'Canada', u'http://www2.macleans.ca/category/canada/feed/'),
+(u'World', u'http://www2.macleans.ca/category/world-from-the-magazine/feed/'),
+(u'Business', u'http://www2.macleans.ca/category/business/feed/'),
+(u'Arts & Culture', u'http://www2.macleans.ca/category/arts-culture/feed/'),
+(u'Opinion', u'http://www2.macleans.ca/category/opinion/feed/'),
+(u'Health', u'http://www2.macleans.ca/category/health-from-the-magazine/feed/'),
+ (u'Environment', u'http://www2.macleans.ca/category/environment-from-the-magazine/feed/')]
+    def print_version(self, url):
+        return url + 'print/'
--- a/recipes/philly.recipe
+++ b/recipes/philly.recipe
@@ -1,85 +1,45 @@
 #!/usr/bin/env  python
-__license__   = 'GPL v3'
-'''
-philly.com/inquirer/
-'''
-from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe

-class Philly(BasicNewsRecipe):
-
-    title       = 'Philadelphia Inquirer'
-    __author__  = 'RadikalDissent and Sujata Raman'
+class AdvancedUserRecipe1308312288(BasicNewsRecipe):
+    title          = u'Philadelphia Inquirer'
+    __author__ = 'sexymax15'
    language = 'en'
    description = 'Daily news from the Philadelphia Inquirer'
-    no_stylesheets        = True
+    oldest_article = 15
+    max_articles_per_feed = 20
    use_embedded_content = False
-    oldest_article = 1
-    max_articles_per_feed = 25
+    remove_empty_feeds = True
+    no_stylesheets = True
+    remove_javascript = True

-    extra_css = '''
-        h1{font-family:verdana,arial,helvetica,sans-serif; font-size: large;}
-        h2{font-family:verdana,arial,helvetica,sans-serif; font-size: small;}
-        .body-content{font-family:verdana,arial,helvetica,sans-serif; font-size: small;}
-        .byline {font-size: small; color: #666666; font-style:italic; }
-        .lastline {font-size: small; color: #666666; font-style:italic;}
-        .contact {font-size: small; color: #666666;}
-        .contact p {font-size: small; color: #666666;}
-        #photoCaption { font-family:verdana,arial,helvetica,sans-serif; font-size:x-small;}
-        .photoCaption { font-family:verdana,arial,helvetica,sans-serif; font-size:x-small;}
-        #photoCredit{ font-family:verdana,arial,helvetica,sans-serif; font-size:x-small; color:#666666;}
-        .photoCredit{ font-family:verdana,arial,helvetica,sans-serif; font-size:x-small; color:#666666;}
-        .article_timestamp{font-size:x-small; color:#666666;}
-        a {font-family:verdana,arial,helvetica,sans-serif; font-size: x-small;}
-                '''
+   # remove_tags_before = {'class':'article_timestamp'}
+    #remove_tags_after = {'class':'graylabel'}
+    keep_only_tags= [dict(name=['h1','p'])]
+    remove_tags = [dict(name=['hr','dl','dt','img','meta','iframe','link','script','form','input','label']),
+dict(id=['toggleConfirmEmailDiv','toggleTOS','toggleUsernameMsgDiv','toggleConfirmYear','navT1_philly','secondaryNav','navPlacement','globalPrimaryNav'
+,'ugc-footer-philly','bv_footer_include','footer','header',
+'container_rag_bottom','section_rectangle','contentrightside'])
+,{'class':['megamenu3 megamenu','container misc','container_inner misc_inner'
+,'misccontainer_left_32','headlineonly','misccontainer_middle_32'
+,'misccontainer_right_32','headline formBegin',
+'post_balloon','relatedlist','linkssubhead','b_sq','dotted-rule-above'
+,'container','headlines-digest','graylabel','container_inner'
+,'rlinks_colorbar1','rlinks_colorbar2','supercontainer','container_5col_left','container_image_left',
+'digest-headline2','digest-lead','container_5col_leftmiddle',
+'container_5col_middlemiddle','container_5col_rightmiddle'
+,'container_5col_right','divclear','supercontainer_outer force-width',
+'supercontainer','containertitle  kicker-title',
+'pollquestion','pollchoice','photomore','pollbutton','container rssbox','containertitle video ',
+'containertitle_image ','container_tabtwo','selected'
+,'shadetabs','selected','tabcontentstyle','tabcontent','inner_container'
+,'arrow','container_ad','containertitlespacer','adUnit','tracking','sitemsg_911 clearfix']}]

-    keep_only_tags = [
-               dict(name='div', attrs={'class':'story-content'}),
-               dict(name='div', attrs={'id': 'contentinside'})
-                    ]
+    extra_css             = """
+                               h1{font-family: Georgia,serif; font-size: xx-large}

-    remove_tags = [
-         dict(name='div', attrs={'class':['linkssubhead','post_balloon','relatedlist','pollquestion','b_sq']}),
-         dict(name='dl', attrs={'class':'relatedlist'}),
-        dict(name='div', attrs={'id':['photoNav','sidebar_adholder']}),
-        dict(name='a', attrs={'class': ['headlineonly','bl']}),
-         dict(name='img', attrs={'class':'img_noborder'})
-    ]
-   # def print_version(self, url):
-   #     return url + '?viewAll=y'
+                            """


-    feeds = [
-        ('Front Page', 'http://www.philly.com/inquirer_front_page.rss'),
-        ('Business', 'http://www.philly.com/inq_business.rss'),
-        #('News', 'http://www.philly.com/inquirer/news/index.rss'),
-        ('Nation', 'http://www.philly.com/inq_news_world_us.rss'),
-        ('Local', 'http://www.philly.com/inquirer_local.rss'),
-        ('Health', 'http://www.philly.com/inquirer_health_science.rss'),
-        ('Education', 'http://www.philly.com/inquirer_education.rss'),
-        ('Editorial and opinion', 'http://www.philly.com/inq_news_editorial.rss'),
-        ('Sports', 'http://www.philly.com/inquirer_sports.rss')
-        ]
+    feeds          = [(u'News', u'http://www.philly.com/philly_news.rss')]

-    def get_article_url(self, article):
-        ans = article.link
-
-        try:
-            self.log('Looking for full story link in', ans)
-            soup = self.index_to_soup(ans)
-            x = soup.find(text="View All")
-
-            if x is not None:
-                ans = ans + '?viewAll=y'
-                self.log('Found full story link', ans)
-        except:
-            pass
-        return ans
-
-    def postprocess_html(self, soup,first):
-
-         for tag in soup.findAll(name='div',attrs={'class':"container_ate_qandatitle"}):
-                tag.extract()
-         for tag in soup.findAll(name='br'):
-                tag.extract()
-
-         return soup