New recipe for ncrnext by kwetal. Updated recipe for Harpers.

2025-07-08 18:54:09 -04:00 · 2009-11-23 08:40:34 -07:00 · 2009-11-23 08:40:34 -07:00 · df73dd322b
commit df73dd322b
parent c7834a5fc1
3 changed files with 152 additions and 16 deletions
--- a/resources/recipes/harpers.recipe
+++ b/resources/recipes/harpers.recipe
@ -29,7 +29,13 @@ class Harpers(BasicNewsRecipe):

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"'

-
+    extra_css = '''
+                h1{ font-family:georgia ; color:#111111; font-size:large;}                
+                .box-of-helpful{ font-family:arial ; font-size:x-small;}
+                p{font-family:georgia ;}
+                .caption{font-family:Verdana,sans-serif;font-size:x-small;color:#666666;}                                
+                '''
+              
    keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
    remove_tags = [
                     dict(name='table', attrs={'class':['rcnt','rcnt topline']})
@ -38,6 +44,17 @@ class Harpers(BasicNewsRecipe):

    feeds       = [(u"Harper's Magazine", u'http://www.harpers.org/rss/frontpage-rss20.xml')]

+    def get_cover_url(self):  # Returns the cover image URL scraped from the Harper's front page, or None if no cover image is found.
+        cover_url = None  # stays None when no <img class="cover"> is present
+        index = 'http://harpers.org/'
+        soup = self.index_to_soup(index)
+        link_item = soup.find(name = 'img',attrs= {'class':"cover"})
+        print link_item  # NOTE(review): looks like leftover debug output
+        if link_item:
+           cover_url = 'http://harpers.org' + link_item['src']  # src is appended to the host, so it is assumed to be site-relative (starting with '/') — TODO confirm
+        print cover_url  # NOTE(review): looks like leftover debug output
+        return cover_url
+    
    def preprocess_html(self, soup):
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(1,mcharset)
@ -47,3 +64,5 @@ class Harpers(BasicNewsRecipe):
            del item['xmlns']
        return soup

+    
+
--- a/resources/recipes/ncrnext.recipe
+++ b/resources/recipes/ncrnext.recipe
@ -0,0 +1,114 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class NrcNextRecipe(BasicNewsRecipe):
+    __license__   = 'GPL v3'
+    __author__ = 'kwetal'
+    version = 1
+    language = 'nl'
+    description = u'Dutch newsblog from the Dutch daily newspaper nrcnext.'
+    title          = u'nrcnext'
+
+    no_stylesheets = True
+    template_css = ''
+
+    # I want to do some special processing on the articles. I could not solve it with the 'extra_css' property, so we do it the hard way.
+    keep_only_tags = [dict(name='div', attrs={'id' : 'main'})]
+    # If that's overkill for you, comment out the previous line and uncomment the next one. Then get rid of the preprocess_html() method.
+    #keep_only_tags = [dict(name='div', attrs={'class' : 'post'}), dict(name='div', attrs={'class' : 'vlag'}) ]
+
+    remove_tags = [dict(name = 'div', attrs = {'class' : 'meta'}),
+                          dict(name = 'div', attrs = {'class' : 'datumlabel'}),
+                          dict(name = 'ul', attrs = {'class' : 'cats single'}),
+                          dict(name = 'ul', attrs = {'class' : 'cats onderwerpen'}),
+                          dict(name = 'ul', attrs = {'class' : 'cats rubrieken'})]
+
+    use_embedded_content = False
+
+    def parse_index(self) :  # Builds the index by scraping the section pages; returns a list of (section title, [article dict, ...]) tuples.
+        # Use the website as an index. Their RSS feeds can be out of date.
+        feeds = {}
+        feeds[u'columnisten'] = u'http://www.nrcnext.nl/columnisten/'
+        feeds[u'koken'] = u'http://www.nrcnext.nl/koken/'
+        feeds[u'geld & werk'] = u'http://www.nrcnext.nl/geld-en-werk/'
+        feeds[u'vandaag'] = u'http://www.nrcnext.nl'
+        feeds[u'city life in afrika']  = u'http://www.nrcnext.nl/city-life-in-afrika/'
+        answer = []  # placeholder; rebound by the list comprehension just before returning
+        articles = {}  # section title -> list of article dicts
+        indices = []  # section titles, in dict iteration order until sorted below
+
+        for index, feed in feeds.items() :
+            soup = self.index_to_soup(feed)
+
+            for post in soup.findAll(True, attrs={'class' : 'post'}) :
+                # Find the links to the actual articles and remember the location they're pointing to and the title
+                a = post.find('a', attrs={'rel' : 'bookmark'})
+                href = a['href']  # assumes every post contains an <a rel="bookmark">; a None result from find() would fail here — TODO confirm
+                title = a.renderContents()
+
+                if index == 'columnisten' :
+                    # In this feed/page articles can be written by more than one author. It is nice to see their names in the titles.
+                    flag = post.find('h2', attrs = {'class' : 'vlag'})
+                    author = flag.contents[0].renderContents()  # assumes the h2.vlag exists and its first child is a tag, not a bare string — TODO confirm
+                    completeTitle = u''.join([author, u': ', title])
+                else :
+                    completeTitle = title
+
+                # Add the article to a temporary list; date is left empty and the description is a placeholder paragraph
+                article = {'title' : completeTitle, 'date' : u'', 'url'  : href, 'description' : '<p>&nbsp;</p>'}
+                if not articles.has_key(index) :
+                    articles[index] = []
+                articles[index].append(article)
+
+            # Add the index title to a temporary list (done even when the page yielded no posts)
+            indices.append(index)
+
+        # Now, sort the temporary list of feeds in the order they appear on the website (the dict presumably maps title -> rank; see sort_index_by)
+        indices = self.sort_index_by(indices, {u'columnisten' : 1, u'koken' : 3, u'geld & werk' : 2, u'vandaag' : 0, u'city life in afrika' : 4})
+        # Apply this sort order to the actual list of feeds and articles, skipping sections for which no articles were collected
+        answer = [(key, articles[key]) for key in indices if articles.has_key(key)]
+
+        return answer
+
+    def preprocess_html(self, soup) :  # Returns a stripped-down document for article pages and the unchanged soup for everything else.
+        # This method is called for every page, be it cartoon or TOC. We need to process each in its own way
+        if soup.find('div', attrs = {'id' : 'main', 'class' : 'single'}) :
+            # It's an article, find the interesting part (the first div with class 'post')
+            tag = soup.find('div', attrs = {'class' : 'post'})
+            if tag :
+                # And replace any links with their text, so they don't show up underlined on my reader.
+                for link in tag.findAll('a') :
+                    link.replaceWith(link.renderContents())
+
+                # Embedded movie boxes (vvqvimeo / vvqyoutube spans) slow down my Sony reader; feel free to comment out
+                for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqvimeo'}) :
+                    movie.extract()
+                for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqyoutube'}) :
+                    movie.extract()
+
+                homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')  # fresh, otherwise empty document to hold just the post
+                body = homeMadeSoup.find('body')
+                body.append(tag)
+
+                return homeMadeSoup
+            else :
+                # This should never happen and other famous last words... Fall back to the untouched page.
+                return soup
+        else :
+            # It's a TOC, return the whole lot.
+            return soup
+
+    def postproces_html(self, soup) :  # NOTE(review): spelled 'postproces_html' — if this is meant to be calibre's postprocess_html hook it would never be called under this name; confirm
+        # Should not happen, but it does: images still pointing at a remote http:// URL slow down my Sony eReader, so drop them
+        for img in soup.findAll('img') :
+            if img['src'].startswith('http://') :  # assumes every <img> carries a src attribute — TODO confirm
+                img.extract()
+
+        # Happens for some movies which we are not able to view anyway
+        for iframe in soup.findAll('iframe') :
+            if iframe['src'].startswith('http://') :  # assumes every <iframe> carries a src attribute — TODO confirm
+                iframe.extract()  # NOTE(review): the method ends here without a return statement, so it returns None
+
+
+
+
--- a/resources/recipes/time_magazine.recipe
+++ b/resources/recipes/time_magazine.recipe
@ -16,6 +16,7 @@ class Time(BasicNewsRecipe):
    encoding = 'utf-8'
    no_stylesheets        = True
    language = 'en'
+    remove_javascript     = True

    extra_css      = ''' h1 {font-family:Arial,Sans-serif;}
                         h2 {font-family:Arial,Sans-serif;}
@ -31,14 +32,8 @@ class Time(BasicNewsRecipe):
                        .credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
                        a:link{color:#CC0000;}
                        '''
-    
-   # remove_tags_before = dict(id="artHd")
-   # remove_tags_after = {'class':"ltCol"}
-   # remove_tags    = [
-   #         {'class':['articleTools', 'enlarge', 'search','socialtools','blogtools','moretools','page','nextUp','next','subnav','RSS','line2','first','ybuzz','articlePagination','chiclets','imgcont','createListLink','rlinks','tabsWrap','pagination']},
-   #         {'id':['quigoArticle', 'contentTools', 'articleSideBar', 'header', 'navTop','articleTools','feedmodule','feedmodule3','promos','footer','linksFooter','timeArchive','belt','relatedStories','packages','Features']},
-   #         {'target':'_blank'},
-   #                   ]
+
+

    keep_only_tags = [ dict(name ="div",attrs = {"id" :["article",]}) ,
                        dict(name ="div",attrs = {"class" :["artHd","artTxt","photoBkt","vertPhoto","image","copy"]}) ,]
@ -50,6 +45,8 @@ class Time(BasicNewsRecipe):
    recursions = 1
    match_regexps = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html']

+    preprocess_regexps = [(re.compile(  # one (compiled pattern, replacement function) pair; the lambda returns '' for every match, presumably deleting it
+        r'<meta .+/>'), lambda m:'')]  # NOTE(review): '.+' is greedy and '.' stops at newlines, so one match can swallow everything up to the last '/>' on the same line

    def parse_index(self):
        soup = self.index_to_soup('http://www.time.com/time/magazine')
@ -75,13 +72,19 @@ class Time(BasicNewsRecipe):
        return feeds

    def find_articles(self, seched):
-        for a in seched.findNextSiblings('a', href=True, attrs={'class':'toc_hed'}):
-            yield {
-                    'title' : self.tag_to_string(a),
-                    'url'   : 'http://www.time.com'+a['href'],
-                    'date'  : '',
-                    'description' : self.article_description(a)
-                    }
+            articles = []  # NOTE(review): assigned but never used in the lines shown; the function still yields its results
+            for a in seched.findNextSiblings( attrs={'class':['toc_hed','rule2']}):  # now scans following siblings of any tag name with class toc_hed or rule2
+              if a.name in "div":  # NOTE(review): substring test against the string "div" — also true for names such as 'i' or 'iv'; == "div" was probably intended
+                  break  # stop at the first div — presumably the 'rule2' separator that ends this section; confirm
+              else:
+                  yield {
+                           'title' : self.tag_to_string(a),
+                           'url'   : 'http://www.time.com'+a['href'],  # NOTE(review): href=True is no longer part of the filter, so this assumes every toc_hed element has an href
+                           'date'  : '',
+                           'description' : self.article_description(a)
+                            }
+
+

    def article_description(self, a):
        ans = []