diff --git a/resources/recipes/harpers.recipe b/resources/recipes/harpers.recipe
index d4d9eb9987..b2df3c00aa 100644
--- a/resources/recipes/harpers.recipe
+++ b/resources/recipes/harpers.recipe
@@ -29,7 +29,13 @@ class Harpers(BasicNewsRecipe):
     html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"'
-
+    extra_css = '''
+        h1{ font-family:georgia ; color:#111111; font-size:large;}
+        .box-of-helpful{ font-family:arial ; font-size:x-small;}
+        p{font-family:georgia ;}
+        .caption{font-family:Verdana,sans-serif;font-size:x-small;color:#666666;}
+    '''
+
     keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
     remove_tags = [
                      dict(name='table', attrs={'class':['rcnt','rcnt topline']})
@@ -38,6 +44,17 @@ class Harpers(BasicNewsRecipe):

     feeds = [(u"Harper's Magazine", u'http://www.harpers.org/rss/frontpage-rss20.xml')]

+    def get_cover_url(self):
+        cover_url = None
+        index = 'http://harpers.org/'
+        soup = self.index_to_soup(index)
+        link_item = soup.find(name='img', attrs={'class' : "cover"})
+        print link_item
+        if link_item:
+            cover_url = 'http://harpers.org' + link_item['src']
+        print cover_url
+        return cover_url
+
     def preprocess_html(self, soup):
         mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
         soup.head.insert(1,mcharset)
@@ -47,3 +64,5 @@ class Harpers(BasicNewsRecipe):
             del item['xmlns']
         return soup
+
+
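The new get_cover_url() above depends on the harpers.org homepage exposing an img tag with class "cover". A quick standalone sanity check for that selector, assuming a Python 2 environment with the standalone BeautifulSoup 3 package installed (the same parser generation calibre bundled at the time); this is a hypothetical helper script, not part of the recipe:

    # Fetch the homepage and report what the recipe's soup.find() call would return.
    import urllib2
    from BeautifulSoup import BeautifulSoup

    html = urllib2.urlopen('http://harpers.org/').read()
    soup = BeautifulSoup(html)
    img = soup.find(name='img', attrs={'class': 'cover'})
    if img:
        print 'cover url would be:', 'http://harpers.org' + img['src']
    else:
        print 'no img.cover found; get_cover_url() would return None'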
diff --git a/resources/recipes/ncrnext.recipe b/resources/recipes/ncrnext.recipe
new file mode 100644
index 0000000000..d8a51e62c8
--- /dev/null
+++ b/resources/recipes/ncrnext.recipe
@@ -0,0 +1,114 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class NrcNextRecipe(BasicNewsRecipe):
+    __license__ = 'GPL v3'
+    __author__ = 'kwetal'
+    version = 1
+    language = 'nl'
+    description = u'Dutch newsblog from the Dutch daily newspaper nrcnext.'
+    title = u'nrcnext'
+
+    no_stylesheets = True
+    template_css = ''
+
+    # I want to do some special processing on the articles. I could not solve it with the 'extra_css' property, so we do it the hard way.
+    keep_only_tags = [dict(name='div', attrs={'id' : 'main'})]
+    # If that's overkill for you, comment out the previous line and uncomment the next. Then get rid of the preprocess_html() method.
+    #keep_only_tags = [dict(name='div', attrs={'class' : 'post'}), dict(name='div', attrs={'class' : 'vlag'})]
+
+    remove_tags = [dict(name = 'div', attrs = {'class' : 'meta'}),
+                   dict(name = 'div', attrs = {'class' : 'datumlabel'}),
+                   dict(name = 'ul', attrs = {'class' : 'cats single'}),
+                   dict(name = 'ul', attrs = {'class' : 'cats onderwerpen'}),
+                   dict(name = 'ul', attrs = {'class' : 'cats rubrieken'})]
+
+    use_embedded_content = False
+
+    def parse_index(self) :
+        # Use the website as an index. Their RSS feeds can be out of date.
+        feeds = {}
+        feeds[u'columnisten'] = u'http://www.nrcnext.nl/columnisten/'
+        feeds[u'koken'] = u'http://www.nrcnext.nl/koken/'
+        feeds[u'geld & werk'] = u'http://www.nrcnext.nl/geld-en-werk/'
+        feeds[u'vandaag'] = u'http://www.nrcnext.nl'
+        feeds[u'city life in afrika'] = u'http://www.nrcnext.nl/city-life-in-afrika/'
+        answer = []
+        articles = {}
+        indices = []
+
+        for index, feed in feeds.items() :
+            soup = self.index_to_soup(feed)
+
+            for post in soup.findAll(True, attrs={'class' : 'post'}) :
+                # Find the links to the actual articles and remember the location they point to and the title.
+                a = post.find('a', attrs={'rel' : 'bookmark'})
+                href = a['href']
+                title = a.renderContents()
+
+                if index == 'columnisten' :
+                    # In this feed/page articles can be written by more than one author. It is nice to see their names in the titles.
+                    flag = post.find('h2', attrs = {'class' : 'vlag'})
+                    author = flag.contents[0].renderContents()
+                    completeTitle = u''.join([author, u': ', title])
+                else :
+                    completeTitle = title
+
+                # Add the article to a temporary list
+                article = {'title' : completeTitle, 'date' : u'', 'url' : href, 'description' : ''}
+                if not articles.has_key(index) :
+                    articles[index] = []
+                articles[index].append(article)
+
+            # Add the index title to a temporary list
+            indices.append(index)
+
+        # Now, sort the temporary list of feeds in the order they appear on the website
+        indices = self.sort_index_by(indices, {u'columnisten' : 1, u'koken' : 3, u'geld & werk' : 2, u'vandaag' : 0, u'city life in afrika' : 4})
+        # Apply this sort order to the actual list of feeds and articles
+        answer = [(key, articles[key]) for key in indices if articles.has_key(key)]
+
+        return answer
+
+    def preprocess_html(self, soup) :
+        # This method is called for every page, be it cartoon or TOC. We need to process each in its own way.
+        if soup.find('div', attrs = {'id' : 'main', 'class' : 'single'}) :
+            # It's an article; find the interesting part.
+            tag = soup.find('div', attrs = {'class' : 'post'})
+            if tag :
+                # Replace any links with their text, so they don't show up underlined on my reader.
+                for link in tag.findAll('a') :
+                    link.replaceWith(link.renderContents())
+
+                # Embedded movies slow down my Sony reader; feel free to comment out.
+                for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqvimeo'}) :
+                    movie.extract()
+                for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqyoutube'}) :
+                    movie.extract()
+
+                # Build a fresh, empty document and move just the article into it.
+                homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
+                body = homeMadeSoup.find('body')
+                body.append(tag)
+
+                return homeMadeSoup
+            else :
+                # This should never happen and other famous last words...
+                return soup
+        else :
+            # It's a TOC, return the whole lot.
+            return soup
+
+    def postprocess_html(self, soup, first_fetch) :
+        # Remote images should not make it this far, but they do. They slow down my Sony eReader.
+        for img in soup.findAll('img') :
+            if img['src'].startswith('http://') :
+                img.extract()
+
+        # Happens for some movies which we are not able to view anyway.
+        for iframe in soup.findAll('iframe') :
+            if iframe['src'].startswith('http://') :
+                iframe.extract()
+
+        return soup
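parse_index() in the recipe above leans on BasicNewsRecipe.sort_index_by() to impose an explicit feed order. As an illustration only, here is a hypothetical stand-in, not calibre's actual implementation; the assumption that titles missing from the weight map sort with weight 0 is mine:

    # Order feed titles by an explicit weight map.
    def sort_index_by(index, weights):
        index.sort(key=lambda title: weights.get(title, 0))
        return index

    titles = [u'koken', u'columnisten', u'vandaag']
    print sort_index_by(titles, {u'vandaag' : 0, u'columnisten' : 1, u'koken' : 3})
    # prints [u'vandaag', u'columnisten', u'koken']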
diff --git a/resources/recipes/time_magazine.recipe b/resources/recipes/time_magazine.recipe
index c6aeb59a45..7605a0bd59 100644
--- a/resources/recipes/time_magazine.recipe
+++ b/resources/recipes/time_magazine.recipe
@@ -16,6 +16,7 @@ class Time(BasicNewsRecipe):
     encoding = 'utf-8'
     no_stylesheets = True
     language = 'en'
+    remove_javascript = True
     extra_css = ''' h1 {font-family:Arial,Sans-serif;}
                     h2 {font-family:Arial,Sans-serif;}
@@ -31,14 +32,8 @@ class Time(BasicNewsRecipe):
        .credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
        a:link{color:#CC0000;}
        '''
-
-    # remove_tags_before = dict(id="artHd")
-    # remove_tags_after = {'class':"ltCol"}
-    # remove_tags = [
-    #     {'class':['articleTools', 'enlarge', 'search','socialtools','blogtools','moretools','page','nextUp','next','subnav','RSS','line2','first','ybuzz','articlePagination','chiclets','imgcont','createListLink','rlinks','tabsWrap','pagination']},
-    #     {'id':['quigoArticle', 'contentTools', 'articleSideBar', 'header', 'navTop','articleTools','feedmodule','feedmodule3','promos','footer','linksFooter','timeArchive','belt','relatedStories','packages','Features']},
-    #     {'target':'_blank'},
-    #     ]
+
+    keep_only_tags = [ dict(name ="div",attrs = {"id" :["article",]}) ,
+                       dict(name ="div",attrs = {"class" :["artHd","artTxt","photoBkt","vertPhoto","image","copy"]}) ,]
@@ -50,6 +45,8 @@ class Time(BasicNewsRecipe):
     recursions = 1
     match_regexps = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html']
+    preprocess_regexps = [(re.compile(
+        r''), lambda m:'')]

     def parse_index(self):
         soup = self.index_to_soup('http://www.time.com/time/magazine')
@@ -75,13 +72,19 @@ class Time(BasicNewsRecipe):
         return feeds

     def find_articles(self, seched):
-        for a in seched.findNextSiblings('a', href=True, attrs={'class':'toc_hed'}):
-            yield {
-                'title' : self.tag_to_string(a),
-                'url' : 'http://www.time.com'+a['href'],
-                'date' : '',
-                'description' : self.article_description(a)
-            }
+        for a in seched.findNextSiblings(attrs={'class':['toc_hed','rule2']}):
+            if a.name == "div":
+                break
+            else:
+                yield {
+                    'title' : self.tag_to_string(a),
+                    'url' : 'http://www.time.com'+a['href'],
+                    'date' : '',
+                    'description' : self.article_description(a)
+                }
+
+
     def article_description(self, a):
         ans = []
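A convenient way to try any of these recipes without a full GUI run is calibre's ebook-convert command line, which accepts a .recipe file directly; --test limits the fetch to a couple of articles from the first couple of feeds, and -vv makes the download chatty enough to debug selectors. The file names below are just an example:

    ebook-convert ncrnext.recipe ncrnext.epub --test -vv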