diff --git a/recipes/macleans.recipe b/recipes/macleans.recipe
index 296a56f5f3..22f94638d9 100644
--- a/recipes/macleans.recipe
+++ b/recipes/macleans.recipe
@@ -1,239 +1,28 @@
 #!/usr/bin/env python
+from calibre.web.feeds.news import BasicNewsRecipe
 
-__license__ = 'GPL v3'
-
-'''
-macleans.ca
-'''
-from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag
-from datetime import timedelta, date
-
-class Macleans(BasicNewsRecipe):
+class AdvancedUserRecipe1308306308(BasicNewsRecipe):
     title = u'Macleans Magazine'
-    __author__ = 'Nick Redding'
     language = 'en_CA'
-    description = ('Macleans Magazine')
+    __author__ = 'sexymax15'
+    oldest_article = 30
+    max_articles_per_feed = 12
+    use_embedded_content = False
+
+    remove_empty_feeds = True
     no_stylesheets = True
-    timefmt = ' [%b %d]'
+    remove_javascript = True
+    remove_tags = [dict(name='img'), dict(id='header'), {'class':'postmetadata'}]
+    remove_tags_after = {'class':'postmetadata'}
 
-    # customization notes: delete sections you are not interested in
-    # set oldest_article to the maximum number of days back from today to include articles
-    sectionlist = [
-        ['http://www2.macleans.ca/','Front Page'],
-        ['http://www2.macleans.ca/category/canada/','Canada'],
-        ['http://www2.macleans.ca/category/world-from-the-magazine/','World'],
-        ['http://www2.macleans.ca/category/business','Business'],
-        ['http://www2.macleans.ca/category/arts-culture/','Culture'],
-        ['http://www2.macleans.ca/category/opinion','Opinion'],
-        ['http://www2.macleans.ca/category/health-from-the-magazine/','Health'],
-        ['http://www2.macleans.ca/category/environment-from-the-magazine/','Environment'],
-        ['http://www2.macleans.ca/category/education/','On Campus'],
-        ['http://www2.macleans.ca/category/travel-from-the-magazine/','Travel']
-    ]
-    oldest_article = 7
-
-    # formatting for print version of articles
-    extra_css = '''h2{font-family:Times,serif; font-size:large;}
-                small {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
-                '''
-
-    # tag handling for print version of articles
-    keep_only_tags = [dict(id='tw-print')]
-    remove_tags = [dict({'class':'postmetadata'})]
-
-
-    def preprocess_html(self,soup):
-        for img_tag in soup.findAll('img'):
-            parent_tag = img_tag.parent
-            if parent_tag.name == 'a':
-                new_tag = Tag(soup,'p')
-                new_tag.insert(0,img_tag)
-                parent_tag.replaceWith(new_tag)
-            elif parent_tag.name == 'p':
-                if not self.tag_to_string(parent_tag) == '':
-                    new_div = Tag(soup,'div')
-                    new_tag = Tag(soup,'p')
-                    new_tag.insert(0,img_tag)
-                    parent_tag.replaceWith(new_div)
-                    new_div.insert(0,new_tag)
-                    new_div.insert(1,parent_tag)
-        return soup
-
-    def parse_index(self):
-
-
-
-        articles = {}
-        key = None
-        ans = []
-
-        def parse_index_page(page_url,page_title):
-
-            def decode_date(datestr):
-                dmysplit = datestr.strip().lower().split(',')
-                mdsplit = dmysplit[1].split()
-                m = ['january','february','march','april','may','june','july','august','september','october','november','december'].index(mdsplit[0])+1
-                d = int(mdsplit[1])
-                y = int(dmysplit[2].split()[0])
-                return date(y,m,d)
-
-            def article_title(tag):
-                atag = tag.find('a',href=True)
-                if not atag:
-                    return ''
-                return self.tag_to_string(atag)
-
-            def article_url(tag):
-                atag = tag.find('a',href=True)
-                if not atag:
-                    return ''
-                return atag['href']+'print/'
-
-            def article_description(tag):
-                for p_tag in tag.findAll('p'):
-                    d = self.tag_to_string(p_tag,False)
-                    if not d == '':
-                        return d
-                return ''
-
-            def compound_h4_h3_title(tag):
-                if tag.h4:
-                    if tag.h3:
-                        return self.tag_to_string(tag.h4,False)+u'\u2014'+self.tag_to_string(tag.h3,False)
-                    else:
-                        return self.tag_to_string(tag.h4,False)
-                elif tag.h3:
-                    return self.tag_to_string(tag.h3,False)
-                else:
-                    return ''
-
-            def compound_h2_h4_title(tag):
-                if tag.h2:
-                    if tag.h4:
-                        return self.tag_to_string(tag.h2,False)+u'\u2014'+self.tag_to_string(tag.h4,False)
-                    else:
-                        return self.tag_to_string(tag.h2,False)
-                elif tag.h4:
-                    return self.tag_to_string(tag.h4,False)
-                else:
-                    return ''
-
-
-            def handle_article(header_tag, outer_tag):
-                if header_tag:
-                    url = article_url(header_tag)
-                    title = article_title(header_tag)
-                    author_date_tag = outer_tag.h4
-                    if author_date_tag:
-                        author_date = self.tag_to_string(author_date_tag,False).split(' - ')
-                        author = author_date[0].strip()
-                        article_date = decode_date(author_date[1])
-                        earliest_date = date.today() - timedelta(days=self.oldest_article)
-                        if article_date < earliest_date:
-                            self.log("Skipping article dated %s" % author_date[1])
-                        else:
-                            excerpt_div = outer_tag.find('div','excerpt')
-                            if excerpt_div:
-                                description = article_description(excerpt_div)
-                            else:
-                                description = ''
-                            if not articles.has_key(page_title):
-                                articles[page_title] = []
-                            articles[page_title].append(dict(title=title,url=url,date=author_date[1],description=description,author=author,content=''))
-
-            def handle_category_article(cat, header_tag, outer_tag):
-                url = article_url(header_tag)
-                title = article_title(header_tag)
-                if not title == '':
-                    title = cat+u'\u2014'+title
-                a_tag = outer_tag.find('span','authorLink')
-                if a_tag:
-                    author = self.tag_to_string(a_tag,False)
-                    a_tag.parent.extract()
-                else:
-                    author = ''
-                description = article_description(outer_tag)
-                if not articles.has_key(page_title):
-                    articles[page_title] = []
-                articles[page_title].append(dict(title=title,url=url,date='',description=description,author=author,content=''))
-
-
-            soup = self.index_to_soup(page_url)
-
-            if page_title == 'Front Page':
-                # special processing for the front page
-                top_stories = soup.find('div',{ "id" : "macleansFeatured" })
-                if top_stories:
-                    for div_slide in top_stories.findAll('div','slide'):
-                        url = article_url(div_slide)
-                        div_title = div_slide.find('div','header')
-                        if div_title:
-                            title = self.tag_to_string(div_title,False)
-                        else:
-                            title = ''
-                        description = article_description(div_slide)
-                        if not articles.has_key(page_title):
-                            articles[page_title] = []
-                        articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content=''))
-
-                from_macleans = soup.find('div',{ "id" : "fromMacleans" })
-                if from_macleans:
-                    for li_tag in from_macleans.findAll('li','fromMacleansArticle'):
-                        title = compound_h4_h3_title(li_tag)
-                        url = article_url(li_tag)
-                        description = article_description(li_tag)
-                        if not articles.has_key(page_title):
-                            articles[page_title] = []
-                        articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content=''))
-
-                blog_central = soup.find('div',{ "id" : "bloglist" })
-                if blog_central:
-                    for li_tag in blog_central.findAll('li'):
-                        title = compound_h2_h4_title(li_tag)
-                        if li_tag.h4:
-                            url = article_url(li_tag.h4)
-                            if not articles.has_key(page_title):
-                                articles[page_title] = []
-                            articles[page_title].append(dict(title=title,url=url,date='',description='',author='',content=''))
-
-#                need_to_know = soup.find('div',{ "id" : "needToKnow" })
-#                if need_to_know:
-#                    for div_tag in need_to_know('div',attrs={'class' : re.compile("^needToKnowArticle")}):
-#                        title = compound_h4_h3_title(div_tag)
-#                        url = article_url(div_tag)
-#                        description = article_description(div_tag)
-#                        if not articles.has_key(page_title):
-#                            articles[page_title] = []
-#                        articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content=''))
-
-                for news_category in soup.findAll('div','newsCategory'):
-                    news_cat = self.tag_to_string(news_category.h4,False)
-                    handle_category_article(news_cat, news_category.find('h2'), news_category.find('div'))
-                    for news_item in news_category.findAll('li'):
-                        handle_category_article(news_cat,news_item.h3,news_item)
-
-                return
-
-            # find the div containing the highlight article
-            div_post = soup.find('div','post')
-            if div_post:
-                h1_tag = div_post.h1
-                handle_article(h1_tag,div_post)
-
-                # find the divs containing the rest of the articles
-                div_other = div_post.find('div', { "id" : "categoryOtherPosts" })
-                if div_other:
-                    for div_entry in div_other.findAll('div','entry'):
-                        h2_tag = div_entry.h2
-                        handle_article(h2_tag,div_entry)
-
-
-
-        for page_name,page_title in self.sectionlist:
-            parse_index_page(page_name,page_title)
-            ans.append(page_title)
-
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return ans
+    feeds = [(u'Blog Central', u'http://www2.macleans.ca/category/blog-central/feed/'),
+             (u'Canada', u'http://www2.macleans.ca/category/canada/feed/'),
+             (u'World', u'http://www2.macleans.ca/category/world-from-the-magazine/feed/'),
+             (u'Business', u'http://www2.macleans.ca/category/business/feed/'),
+             (u'Arts & Culture', u'http://www2.macleans.ca/category/arts-culture/feed/'),
+             (u'Opinion', u'http://www2.macleans.ca/category/opinion/feed/'),
+             (u'Health', u'http://www2.macleans.ca/category/health-from-the-magazine/feed/'),
+             (u'Environment', u'http://www2.macleans.ca/category/environment-from-the-magazine/feed/')]
+
+    def print_version(self, url):
+        return url + 'print/'
diff --git a/recipes/philly.recipe b/recipes/philly.recipe
index 80de2f3277..c6cad5d174 100644
--- a/recipes/philly.recipe
+++ b/recipes/philly.recipe
@@ -1,85 +1,45 @@
 #!/usr/bin/env python
-__license__ = 'GPL v3'
-'''
-philly.com/inquirer/
-'''
-from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe
 
-class Philly(BasicNewsRecipe):
-
-    title = 'Philadelphia Inquirer'
-    __author__ = 'RadikalDissent and Sujata Raman'
+class AdvancedUserRecipe1308312288(BasicNewsRecipe):
+    title = u'Philadelphia Inquirer'
+    __author__ = 'sexymax15'
     language = 'en'
     description = 'Daily news from the Philadelphia Inquirer'
-    no_stylesheets = True
-    use_embedded_content = False
-    oldest_article = 1
-    max_articles_per_feed = 25
+    oldest_article = 15
+    max_articles_per_feed = 20
+    use_embedded_content = False
+    remove_empty_feeds = True
+    no_stylesheets = True
+    remove_javascript = True
 
-    extra_css = '''
-        h1{font-family:verdana,arial,helvetica,sans-serif; font-size: large;}
-        h2{font-family:verdana,arial,helvetica,sans-serif; font-size: small;}
-        .body-content{font-family:verdana,arial,helvetica,sans-serif; font-size: small;}
-        .byline {font-size: small; color: #666666; font-style:italic; }
-        .lastline {font-size: small; color: #666666; font-style:italic;}
-        .contact {font-size: small; color: #666666;}
-        .contact p {font-size: small; color: #666666;}
-        #photoCaption { font-family:verdana,arial,helvetica,sans-serif; font-size:x-small;}
-        .photoCaption { font-family:verdana,arial,helvetica,sans-serif; font-size:x-small;}
-        #photoCredit{ font-family:verdana,arial,helvetica,sans-serif; font-size:x-small; color:#666666;}
-        .photoCredit{ font-family:verdana,arial,helvetica,sans-serif; font-size:x-small; color:#666666;}
-        .article_timestamp{font-size:x-small; color:#666666;}
-        a {font-family:verdana,arial,helvetica,sans-serif; font-size: x-small;}
-    '''
+    # remove_tags_before = {'class':'article_timestamp'}
+    # remove_tags_after = {'class':'graylabel'}
+    keep_only_tags = [dict(name=['h1','p'])]
+    remove_tags = [dict(name=['hr','dl','dt','img','meta','iframe','link','script','form','input','label']),
+        dict(id=['toggleConfirmEmailDiv','toggleTOS','toggleUsernameMsgDiv','toggleConfirmYear','navT1_philly','secondaryNav','navPlacement','globalPrimaryNav',
+            'ugc-footer-philly','bv_footer_include','footer','header',
+            'container_rag_bottom','section_rectangle','contentrightside']),
+        {'class':['megamenu3 megamenu','container misc','container_inner misc_inner',
+            'misccontainer_left_32','headlineonly','misccontainer_middle_32',
+            'misccontainer_right_32','headline formBegin',
+            'post_balloon','relatedlist','linkssubhead','b_sq','dotted-rule-above',
+            'container','headlines-digest','graylabel','container_inner',
+            'rlinks_colorbar1','rlinks_colorbar2','supercontainer','container_5col_left','container_image_left',
+            'digest-headline2','digest-lead','container_5col_leftmiddle',
+            'container_5col_middlemiddle','container_5col_rightmiddle',
+            'container_5col_right','divclear','supercontainer_outer force-width',
+            'supercontainer','containertitle kicker-title',
+            'pollquestion','pollchoice','photomore','pollbutton','container rssbox','containertitle video ',
+            'containertitle_image ','container_tabtwo','selected',
+            'shadetabs','selected','tabcontentstyle','tabcontent','inner_container',
+            'arrow','container_ad','containertitlespacer','adUnit','tracking','sitemsg_911 clearfix']}]
 
-    keep_only_tags = [
-        dict(name='div', attrs={'class':'story-content'}),
-        dict(name='div', attrs={'id': 'contentinside'})
-    ]
+    extra_css = """
+    h1{font-family: Georgia,serif; font-size: xx-large}
 
-    remove_tags = [
-        dict(name='div', attrs={'class':['linkssubhead','post_balloon','relatedlist','pollquestion','b_sq']}),
-        dict(name='dl', attrs={'class':'relatedlist'}),
-        dict(name='div', attrs={'id':['photoNav','sidebar_adholder']}),
-        dict(name='a', attrs={'class': ['headlineonly','bl']}),
-        dict(name='img', attrs={'class':'img_noborder'})
-    ]
-    # def print_version(self, url):
-    #     return url + '?viewAll=y'
+
+    """
 
-    feeds = [
-        ('Front Page', 'http://www.philly.com/inquirer_front_page.rss'),
-        ('Business', 'http://www.philly.com/inq_business.rss'),
-        #('News', 'http://www.philly.com/inquirer/news/index.rss'),
-        ('Nation', 'http://www.philly.com/inq_news_world_us.rss'),
-        ('Local', 'http://www.philly.com/inquirer_local.rss'),
-        ('Health', 'http://www.philly.com/inquirer_health_science.rss'),
-        ('Education', 'http://www.philly.com/inquirer_education.rss'),
-        ('Editorial and opinion', 'http://www.philly.com/inq_news_editorial.rss'),
-        ('Sports', 'http://www.philly.com/inquirer_sports.rss')
-    ]
+    feeds = [(u'News', u'http://www.philly.com/philly_news.rss')]
 
-    def get_article_url(self, article):
-        ans = article.link
-
-        try:
-            self.log('Looking for full story link in', ans)
-            soup = self.index_to_soup(ans)
-            x = soup.find(text="View All")
-
-            if x is not None:
-                ans = ans + '?viewAll=y'
-                self.log('Found full story link', ans)
-        except:
-            pass
-        return ans
-
-    def postprocess_html(self, soup,first):
-
-        for tag in soup.findAll(name='div',attrs={'class':"container_ate_qandatitle"}):
-            tag.extract()
-        for tag in soup.findAll(name='br'):
-            tag.extract()
-
-        return soup
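
A note on the print_version hook at the end of the new Macleans recipe: calibre invokes it once for each article URL taken from the RSS feed, and url + 'print/' yields the printer-friendly page only because Macleans feed URLs end in a trailing slash. If a feed ever emitted slash-less URLs, a normalizing variant along these lines would cover both forms (a sketch for discussion, not part of this change; the URLs in the comment are illustrative):

    def print_version(self, url):
        # '.../2011/06/17/story' and '.../2011/06/17/story/' both map
        # to '.../2011/06/17/story/print/'
        return url.rstrip('/') + '/print/'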
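
Similarly, the new Philly recipe leans on keep_only_tags = [dict(name=['h1','p'])] to cut each article down to its headline and paragraphs. A rough standalone illustration of what that name filter matches, runnable inside a calibre environment (the HTML sample is invented for the example, and this is not how BasicNewsRecipe applies the spec internally):

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    html = '''<html><body><div id="header">site chrome</div>
    <h1>Headline</h1><p>First paragraph.</p>
    <div class="adUnit">ad</div></body></html>'''

    soup = BeautifulSoup(html)
    # a list as the name filter matches any of the listed tag names
    kept = soup.findAll(name=['h1', 'p'])
    print(''.join(str(tag) for tag in kept))
    # -> <h1>Headline</h1><p>First paragraph.</p>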