mirror of https://github.com/kovidgoyal/calibre.git

commit edf5bcbab6 (parent 25538e5c04)

Updated recipes for NYTimes and Newsweek. Also adds support for setting the author of downloaded articles in the metadata.
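In recipe terms, the new capability looks like this: article dictionaries returned by a recipe's parse_index() may now carry an author key, which is threaded through the Article and Feed classes into the generated book's table of contents. A minimal sketch (the feed and article values are invented for illustration; the NYTimes diff below does exactly this):

    def parse_index(self):
        return [('Front Page', [
            dict(title='Example headline',
                 url='http://example.com/story',
                 date='Sat, 10 Jan',
                 description='Short teaser text.',
                 author='Jane Doe',       # newly supported metadata field
                 content=''),
        ])]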
@@ -16,7 +16,7 @@ class Article(object):
 
     time_offset = datetime.now() - datetime.utcnow()
 
-    def __init__(self, id, title, url, summary, published, content):
+    def __init__(self, id, title, url, author, summary, published, content):
         self.downloaded = False
         self.id = id
         self.title = title.strip() if title else title

@@ -26,6 +26,9 @@ class Article(object):
         except:
             pass
         self.url = url
+        self.author = author
+        if author and not isinstance(author, unicode):
+            author = author.decode('utf-8', 'replace')
         self.summary = summary
         if summary and not isinstance(summary, unicode):
             summary = summary.decode('utf-8', 'replace')

@@ -39,6 +42,7 @@ class Article(object):
             traceback.print_exc()
             summary = u''
         self.text_summary = summary
+        self.author = author
         self.content = content
         self.date = published
         self.utctime = datetime(*self.date[:6])

@@ -50,10 +54,11 @@ class Article(object):
         (u'''\
 Title       : %s
 URL         : %s
+Author      : %s
 Summary     : %s
 Date        : %s
 Has content : %s
-'''%(self.title, self.url, self.summary[:20]+'...', self.localtime.strftime('%a, %d %b, %Y %H:%M'),
+'''%(self.title, self.url, self.author, self.summary[:20]+'...', self.localtime.strftime('%a, %d %b, %Y %H:%M'),
     bool(self.content))).encode('utf-8')
 
     def __str__(self):
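Net effect of the Article changes: the constructor takes a new author argument, byte-string bylines are decoded to unicode, and the author shows up in the repr. A standalone sketch (Python 2, matching the codebase of the period; the field values are made up, and it assumes the class is importable as calibre.web.feeds.Article, where it lives):

    import time
    from calibre.web.feeds import Article

    a = Article(id='1', title='Example headline',
                url='http://example.com/story',
                author='Jos\xc3\xa9 Smith',   # UTF-8 byte string, as feeds often deliver
                summary='Teaser text', published=time.gmtime(),
                content='')
    print repr(a)      # repr now includes an 'Author :' line
    print a.author     # decoded via author.decode('utf-8', 'replace')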
@@ -124,7 +129,8 @@ class Feed(object):
             link = item.get('url', None)
             description = item.get('description', '')
             content = item.get('content', '')
-            article = Article(id, title, link, description, published, content)
+            author = item.get('author', '')
+            article = Article(id, title, link, author, description, published, content)
             delta = datetime.utcnow() - article.utctime
             if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
                 self.articles.append(article)

@@ -149,7 +155,9 @@ class Feed(object):
             self.logger.warning('Failed to get link for %s'%title)
             self.logger.debug(traceback.format_exc())
             link = None
 
         description = item.get('summary', None)
+        author = item.get('author', None)
+
         content = [i.value for i in item.get('content', []) if i.value]
         content = [i if isinstance(i, unicode) else i.decode('utf-8', 'replace')

@@ -159,7 +167,7 @@ class Feed(object):
             content = None
         if not link and not content:
             return
-        article = Article(id, title, link, description, published, content)
+        article = Article(id, title, link, author, description, published, content)
         delta = datetime.utcnow() - article.utctime
         if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
             self.articles.append(article)
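Both Feed code paths read the byline from the parsed feed item before constructing the Article. For context, this is the standard feedparser entry interface; a minimal sketch with a hypothetical feed URL:

    import feedparser

    d = feedparser.parse('http://example.com/rss')    # hypothetical URL
    for item in d.entries:
        author = item.get('author', None)             # None when the feed has no byline
        print item.get('title'), '-', author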
@@ -884,6 +884,9 @@ class BasicNewsRecipe(Recipe):
             for j, a in enumerate(f):
                 if getattr(a, 'downloaded', False):
                     adir = 'feed_%d/article_%d/'%(num, j)
+                    auth = a.author
+                    if not auth:
+                        auth = None
                     desc = a.text_summary
                     if not desc:
                         desc = None

@@ -893,7 +896,7 @@ class BasicNewsRecipe(Recipe):
                         self.play_order_counter += 1
                         po = self.play_order_counter
                     parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
-                            play_order=po, description=desc)
+                            play_order=po, author=auth, description=desc)
                     last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                     for sp in a.sub_pages:
                         prefix = os.path.commonprefix([opf_path, sp])

@@ -925,11 +928,15 @@ class BasicNewsRecipe(Recipe):
                 if po is None:
                     self.play_order_counter += 1
                     po = self.play_order_counter
+                auth = getattr(f, 'author', None)
+                if not auth:
+                    auth = None
                 desc = getattr(f, 'description', None)
                 if not desc:
                     desc = None
                 feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
-                    f.title, play_order=po, description=desc))
+                    f.title, play_order=po, description=desc, author=auth))
 
         else:
             entries.append('feed_%d/index.html'%0)
             feed_index(0, toc)
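The auth/desc handling above normalises empty strings to None before they reach add_item, so missing metadata is omitted from the TOC rather than written as an empty attribute. An equivalent, terser form of the same normalisation (illustrative only, not the committed code):

    class _Stub(object):                  # stand-in for an Article
        author, text_summary = '', 'Teaser'

    a = _Stub()
    auth = a.author or None               # '' -> None
    desc = a.text_summary or None         # non-empty string passes through
    print auth, desc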
@@ -9,13 +9,26 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class Newsweek(BasicNewsRecipe):
 
     title = 'Newsweek'
-    __author__ = 'Kovid Goyal'
+    __author__ = 'Kovid Goyal and Sujata Raman'
     description = 'Weekly news and current affairs in the US'
     no_stylesheets = True
 
+    extra_css = '''
+    h1{color:#383733;font-family:Arial,Helvetica,sans-serif;font-size:large;}
+    .deck{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;color:#383733;font-size:small;}
+    .articleInfo{color:#474537;font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
+    .authorName{color:#B61900;font-family:Arial,Helvetica,sans-serif;font-size:medium;}
+    .authorInfo{color:#0066CC;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
+    .articleUpdated{ font-size:xx-small; color:#73726C; font-family:Arial,Helvetica,sans-serif;}
+    .issueDate{font-family :Arial,Helvetica,sans-serif;font-size:xx-small;font-style:italic;}
+    .story{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;font-size:small;}
+    .photoCredit{color:#999999;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
+    .photoCaption{color:#0A0A09;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;font-weight:bold;}'''
+
     encoding = 'utf-8'
     language = _('English')
     remove_tags = [
-        {'class':['navbar', 'ad', 'sponsorLinksArticle', 'mm-content',
+        {'class':['fwArticle noHr','fwArticle','subinfo','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content',
                   'inline-social-links-wrapper', 'email-article',
                   'comments-and-social-links-wrapper', 'EmailArticleBlock']},
         {'id' : ['footer', 'ticker-data', 'topTenVertical',

@@ -24,8 +37,6 @@ class Newsweek(BasicNewsRecipe):
         {'class': re.compile('related-cloud')},
     ]
     keep_only_tags = [{'class':['article HorizontalHeader', 'articlecontent']}]
-
-
     recursions = 1
     match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
 
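extra_css is a stock BasicNewsRecipe attribute: the string is appended as a stylesheet to every downloaded article, which is how the Newsweek recipe above restyles the site's CSS classes (no_stylesheets = True strips the originals first). A minimal, hypothetical recipe showing the same pattern:

    from calibre.web.feeds.news import BasicNewsRecipe

    class Example(BasicNewsRecipe):
        title          = 'Example'
        no_stylesheets = True             # drop the site's own CSS
        extra_css      = '''
            h1      { font-family: Arial, sans-serif; font-size: large; }
            .byline { font-size: xx-small; color: #666; }
            '''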
@@ -16,7 +16,7 @@ class NYTimes(BasicNewsRecipe):
     __author__ = 'Kovid Goyal'
     language = _('English')
     description = 'Daily news from the New York Times (subscription version)'
-    timefmt = ' [%a, %d %b, %Y]'
+    timefmt = ''
     needs_subscription = True
     remove_tags_before = dict(id='article')
     remove_tags_after = dict(id='article')
@@ -46,39 +46,61 @@ class NYTimes(BasicNewsRecipe):
         articles = {}
         key = None
         ans = []
+        allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials',
+                              'New York','Business Day','Sports','Dining','Arts','Home','Styles']
+        excludeSectionKeywords = ['Sports','Dining','Styles']
+
+        # Find each instance of class="section-headline", class="story", class="story headline"
         for div in soup.findAll(True,
             attrs={'class':['section-headline', 'story', 'story headline']}):
 
             if div['class'] == 'section-headline':
                 key = string.capwords(feed_title(div))
+                excluded = re.compile('|'.join(excludeSectionKeywords))
+                if excluded.search(key):
+                    self.log("Skipping section %s" % key)
+                    continue
+
                 articles[key] = []
                 ans.append(key)
 
-            elif div['class'] in ['story', 'story headline']:
+            elif div['class'] in ['story', 'story headline'] :
                 a = div.find('a', href=True)
                 if not a:
                     continue
                 url = re.sub(r'\?.*', '', a['href'])
                 url += '?pagewanted=all'
                 title = self.tag_to_string(a, use_alt=True).strip()
 
                 description = ''
                 pubdate = strftime('%a, %d %b')
                 summary = div.find(True, attrs={'class':'summary'})
                 if summary:
                     description = self.tag_to_string(summary, use_alt=False)
 
+                author = ''
+                authorAttribution = div.find(True, attrs={'class':'storyheadline-author'})
+                if authorAttribution:
+                    author = self.tag_to_string(authorAttribution, use_alt=False)
+                else:
+                    authorAttribution = div.find(True, attrs={'class':'byline'})
+                    if authorAttribution:
+                        author = self.tag_to_string(authorAttribution, use_alt=False)
+
                 feed = key if key is not None else 'Uncategorized'
                 if not articles.has_key(feed):
                     articles[feed] = []
                 if not 'podcasts' in url:
                     articles[feed].append(
                         dict(title=title, url=url, date=pubdate,
-                            description=description,
+                            description=description, author=author,
                             content=''))
         ans = self.sort_index_by(ans, {'The Front Page':-1,
                                        'Dining In, Dining Out':1,
                                        'Obituaries':2})
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
 
         return ans
 
     def preprocess_html(self, soup):
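The byline lookup in parse_index above tries the storyheadline-author class first and falls back to byline. The same two-step pattern in isolation, run against made-up markup (BeautifulSoup 3 API, as bundled with calibre at the time):

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<div class="byline">By JANE DOE</div>')
    tag = soup.find(True, attrs={'class': 'storyheadline-author'}) or \
          soup.find(True, attrs={'class': 'byline'})
    author = tag.string.strip() if tag else ''
    print author    # By JANE DOE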