Merge from trunk

2025-08-30 23:00:21 -04:00 · 2010-10-07 16:23:44 +01:00 · 2010-10-07 16:23:44 +01:00 · 0da8bfa170
commit 0da8bfa170
parent a9e01673fd 8834c4cb5a
1 changed file with 33 additions and 26 deletions
--- a/resources/recipes/new_yorker.recipe
+++ b/resources/recipes/new_yorker.recipe
@@ -1,50 +1,57 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
-__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 newyorker.com
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import Tag
 class NewYorker(BasicNewsRecipe):
    title                 = 'The New Yorker'
    __author__            = 'Darko Miletic'
    description           = 'The best of US journalism'
    oldest_article        = 15
-    language = 'en'
+    language              = 'en'
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    publisher             = 'Conde Nast Publications'
    category              = 'news, politics, USA'
    encoding              = 'cp1252'
    publication_type      = 'magazine'
    masthead_url          = 'http://www.newyorker.com/css/i/hed/logo.gif'
    extra_css             = """
                                body {font-family: "Times New Roman",Times,serif}
                                .articleauthor{color: #9F9F9F; font-family: Arial, sans-serif; font-size: small; text-transform: uppercase}
                                .rubric{color: #CD0021; font-family: Arial, sans-serif; font-size: small; text-transform: uppercase}
                            """
-    keep_only_tags = [dict(name='div', attrs={'id':'printbody'})]
+    conversion_options = {
-    remove_tags_after = dict(name='div',attrs={'id':'articlebody'})
+                          'comment'   : description
-    remove_tags = [
+                        , 'tags'      : category
-                     dict(name='div', attrs={'class':['utils','articleRailLinks','icons'] })
+                        , 'publisher' : publisher
-                    ,dict(name='link')
+                        , 'language'  : language
-                  ]
+                        }
-
+    
-    feeds          = [(u'The New Yorker', u'http://feeds.newyorker.com/services/rss/feeds/everything.xml')]
+    keep_only_tags = [dict(name='div', attrs={'id':['articleheads','articleRail','articletext','photocredits']})]
    remove_tags    = [
                         dict(name=['meta','iframe','base','link','embed','object'])
                        ,dict(name='div', attrs={'class':['utils','articleRailLinks','icons'] })                        
                     ]
    remove_attributes = ['lang']
    feeds             = [(u'The New Yorker', u'http://feeds.newyorker.com/services/rss/feeds/everything.xml')]
    def print_version(self, url):
        return url + '?printable=true'
-    def get_article_url(self, article):
+    def image_url_processor(self, baseurl, url):
-        return article.get('guid',  None)
+        return url.strip()
-    def postprocess_html(self, soup, x):
+    def get_cover_url(self):
-        body = soup.find('body')
+        cover_url = None
-        if body:
+        soup = self.index_to_soup('http://www.newyorker.com/magazine/toc/')
-            html = soup.find('html')
+        cover_item = soup.find('img',attrs={'id':'inThisIssuePhoto'})
-            if html:
+        if cover_item:
-                body.extract()
+           cover_url = 'http://www.newyorker.com' + cover_item['src'].strip()
-                html.insert(2, body)
+        return cover_url
-        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
+        
        soup.head.insert(1,mcharset)
        return soup