Use today's paper to generate WSJ recipe

2025-12-20 12:05:03 -05:00 · 2009-12-22 14:07:50 -07:00 · 2009-12-22 14:07:50 -07:00 · f79a63d19c
commit f79a63d19c
parent 2213a4648a
2 changed files with 51 additions and 56 deletions
--- a/resources/recipes/wsj.recipe
+++ b/resources/recipes/wsj.recipe
@@ -3,15 +3,16 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 from calibre.web.feeds.news import BasicNewsRecipe
 # http://online.wsj.com/page/us_in_todays_paper.html
 class WallStreetJournal(BasicNewsRecipe):
-        title = 'The Wall Street Journal'
+        title = 'The Wall Street Journal (US)'
        __author__ = 'Kovid Goyal and Sujata Raman'
-        description = 'News and current affairs.'
+        description = 'News and current affairs'
        needs_subscription = True
        language = 'en'
@@ -60,60 +61,51 @@ class WallStreetJournal(BasicNewsRecipe):
            return soup
-        def get_article_url(self, article):
+        def wsj_get_index(self):
-            try:
+            # Hand the US "In Today's Paper" page URL to index_to_soup() and
+            # return its result; parse_index() scans that result for sections.
+            return self.index_to_soup('http://online.wsj.com/page/us_in_todays_paper.html')
-                return article.feedburner_origlink.split('?')[0]
+
-            except AttributeError:
+        def parse_index(self):
-                return article.link.split('?')[0]
+            soup = self.wsj_get_index()
            # Build and return a list of (section title, [article dicts])
            # pairs from the index page.  Every article dict carries the keys
            # 'title', 'url', 'description' and 'date' ('date' is always '').
            #
            # The scan starts at the first table following the text node that
            # contains 'begin ITP Left Column'.
            left_column = soup.find(
                    text=lambda t: 'begin ITP Left Column' in str(t))
            table = left_column.findNext('table')
            current_section = None
            current_articles = []
            feeds = []
            for x in table.findAllNext(True):
                # A td with class 'b13' opens a new section: flush the
                # previous one (if it collected anything) and start afresh.
                if x.name == 'td' and x.get('class', None) == 'b13':
                    if current_articles and current_section:
                        feeds.append((current_section, current_articles))
                    current_section = self.tag_to_string(x.a).strip()
                    current_articles = []
                    self.log('\tProcessing section:', current_section)
                # An anchor with class 'bold80' inside a section is an article.
                if current_section is not None and x.name == 'a' and \
                        x.get('class', None) == 'bold80':
                    title = self.tag_to_string(x)
                    url = x.get('href', False)
                    if not url or not title:
                        continue
                    # Drop any fragment identifier from the link.
                    url = url.partition('#')[0]
                    desc = ''
                    d = x.findNextSibling(True)
                    # findNextSibling() yields None when the link is the last
                    # tag in its parent, so guard before inspecting it.
                    if d is not None and d.get('class', None) == 'arialResize':
                        desc = self.tag_to_string(d)
                        # Keep only the text before the first bullet character.
                        desc = desc.partition(u'\u2022')[0]
                    self.log('\t\tFound article:', title)
                    self.log('\t\t\t', url)
                    # Site-relative links are made absolute.
                    if url.startswith('/'):
                        url = 'http://online.wsj.com'+url
                    if desc:
                        self.log('\t\t\t', desc)
                    current_articles.append({'title': title, 'url':url,
                        'description':desc, 'date':''})
            # The loop only flushes a section when the next header appears, so
            # the last section has to be flushed here or it would be lost.
            if current_articles and current_section:
                feeds.append((current_section, current_articles))
            return feeds
        def cleanup(self):
            '''
            Open the WSJ logout URL using this recipe's browser.
            '''
            logout_url = 'http://online.wsj.com/logout?url=http://online.wsj.com'
            self.browser.open(logout_url)
        # (feed title, RSS URL) pairs.  Entries commented out with '#' are
        # disabled.
        # NOTE(review): the two 'Most Viewed' titles begin with a space --
        # presumably deliberate; confirm before normalizing.
        # NOTE(review): 'News - Health' and 'Personal Journal - Health' both
        # point at 3_7089.xml -- confirm the duplication is intended.
        feeds =  [
                #('Most Emailed - Day', 'http://online.wsj.com/xml/rss/3_7030.xml'),
                #('Most Emailed - Week', 'http://online.wsj.com/xml/rss/3_7253.xml'),
                #('Most Emailed - Month', 'http://online.wsj.com/xml/rss/3_7254.xml'),
                (' Most Viewed - Day', 'http://online.wsj.com/xml/rss/3_7198.xml'),
                (' Most Viewed - Week', 'http://online.wsj.com/xml/rss/3_7251.xml'),
                #('Most Viewed - Month', 'http://online.wsj.com/xml/rss/3_7252.xml'),
                ('Today\'s Newspaper -  Page One', 'http://online.wsj.com/xml/rss/3_7205.xml'),
                ('Today\'s Newspaper - Marketplace', 'http://online.wsj.com/xml/rss/3_7206.xml'),
                ('Today\'s Newspaper - Money & Investing', 'http://online.wsj.com/xml/rss/3_7207.xml'),
                ('Today\'s Newspaper - Personal Journal', 'http://online.wsj.com/xml/rss/3_7208.xml'),
                ('Today\'s Newspaper - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7209.xml'),
                ('Opinion', 'http://online.wsj.com/xml/rss/3_7041.xml'),
                ('News - U.S.: What\'s News', 'http://online.wsj.com/xml/rss/3_7011.xml'),
                ('News - U.S. Business', 'http://online.wsj.com/xml/rss/3_7014.xml'),
                ('News - Europe: What\'s News', 'http://online.wsj.com/xml/rss/3_7012.xml'),
                ('News - Asia: What\'s News', 'http://online.wsj.com/xml/rss/3_7013.xml'),
                ('News - World News', 'http://online.wsj.com/xml/rss/3_7085.xml'),
                ('News - Economy', 'http://online.wsj.com/xml/rss/3_7086.xml'),
                ('News - Earnings', 'http://online.wsj.com/xml/rss/3_7088.xml'),
                ('News - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
                ('News - Law', 'http://online.wsj.com/xml/rss/3_7091.xml'),
                ('News - Media & Marketing', 'http://online.wsj.com/xml/rss/3_7020.xml'),
                ('Technology - What\'s News', 'http://online.wsj.com/xml/rss/3_7015.xml'),
                ('Technology - Gadgets', 'http://online.wsj.com/xml/rss/3_7094.xml'),
                ('Technology - Telecommunications', 'http://online.wsj.com/xml/rss/3_7095.xml'),
                ('Technology - E-commerce/Media', 'http://online.wsj.com/xml/rss/3_7096.xml'),
                ('Technology - Asia', 'http://online.wsj.com/xml/rss/3_7097.xml'),
                ('Technology - Europe', 'http://online.wsj.com/xml/rss/3_7098.xml'),
                ('Markets - News', 'http://online.wsj.com/xml/rss/3_7031.xml'),
                ('Markets - Europe News', 'http://online.wsj.com/xml/rss/3_7101.xml'),
                ('Markets - Asia News', 'http://online.wsj.com/xml/rss/3_7102.xml'),
                ('Markets - Deals & Deal Makers', 'http://online.wsj.com/xml/rss/3_7099.xml'),
                ('Markets - Hedge Funds', 'http://online.wsj.com/xml/rss/3_7199.xml'),
                ('Personal Journal', 'http://online.wsj.com/xml/rss/3_7200.xml'),
                ('Personal Journal - Money', 'http://online.wsj.com/xml/rss/3_7104.xml'),
                ('Personal Journal - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
                ('Personal Journal - Autos', 'http://online.wsj.com/xml/rss/3_7092.xml'),
                ('Personal Journal - Homes', 'http://online.wsj.com/xml/rss/3_7105.xml'),
                ('Personal Journal - Travel', 'http://online.wsj.com/xml/rss/3_7106.xml'),
                ('Personal Journal - Careers', 'http://online.wsj.com/xml/rss/3_7107.xml'),
                ('Weekend & Leisure', 'http://online.wsj.com/xml/rss/3_7201.xml'),
                ('Weekend & Leisure - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7202.xml'),
                ('Weekend & Leisure - Arts & Entertainment', 'http://online.wsj.com/xml/rss/3_7177.xml'),
                ('Weekend & Leisure - Books', 'http://online.wsj.com/xml/rss/3_7203.xml'),
                ('Weekend & Leisure - Sports', 'http://online.wsj.com/xml/rss/3_7204.xml'),
                ]
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -1063,7 +1063,7 @@ class BasicNewsRecipe(Recipe):
        return parsed_feeds
    @classmethod
-    def tag_to_string(self, tag, use_alt=True):
+    def tag_to_string(self, tag, use_alt=True, normalize_whitespace=True):
        '''
        Convenience method to take a
        `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
@@ -1090,7 +1090,10 @@ class BasicNewsRecipe(Recipe):
                    strings.append(res)
                elif use_alt and item.has_key('alt'):
                    strings.append(item['alt'])
-        return u''.join(strings)
+        ans = u''.join(strings)
+        if normalize_whitespace:
+            ans = re.sub(r'\s+', ' ', ans)
+        return ans
    @classmethod
    def soup(cls, raw):