Use today's paper to generate WSJ recipe

2025-06-23 15:30:45 -04:00 · 2009-12-22 14:07:50 -07:00 · 2009-12-22 14:07:50 -07:00 · f79a63d19c
commit f79a63d19c
parent 2213a4648a
2 changed files with 51 additions and 56 deletions
--- a/resources/recipes/wsj.recipe
+++ b/resources/recipes/wsj.recipe
@ -3,15 +3,16 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'

+
 from calibre.web.feeds.news import BasicNewsRecipe

 # http://online.wsj.com/page/us_in_todays_paper.html

 class WallStreetJournal(BasicNewsRecipe):

-        title = 'The Wall Street Journal'
+        title = 'The Wall Street Journal (US)'
        __author__ = 'Kovid Goyal and Sujata Raman'
-        description = 'News and current affairs.'
+        description = 'News and current affairs'
        needs_subscription = True
        language = 'en'

@ -60,60 +61,51 @@ class WallStreetJournal(BasicNewsRecipe):

            return soup

-        def get_article_url(self, article):
-            try:
-                return article.feedburner_origlink.split('?')[0]
-            except AttributeError:
-                return article.link.split('?')[0]
+        def wsj_get_index(self):
+            '''
+            Fetch the US "In Today's Paper" index page and return it as
+            parsed by ``self.index_to_soup``.
+            '''
+            index_url = 'http://online.wsj.com/page/us_in_todays_paper.html'
+            return self.index_to_soup(index_url)
+
+        def parse_index(self):
+            '''
+            Build the list of feeds from the "In Today's Paper" index page.
+
+            Walks every tag that follows the first table after the text node
+            containing ``begin ITP Left Column``. A ``td`` with class ``b13``
+            starts a new section; an ``a`` with class ``bold80`` seen while a
+            section is active is recorded as an article of that section.
+
+            :return: A list of ``(section_title, articles)`` tuples, where
+                     ``articles`` is a list of dicts with the keys ``title``,
+                     ``url``, ``description`` and ``date`` (always ``''``).
+                     Sections without any articles are omitted.
+            '''
+            soup = self.wsj_get_index()
+
+            # Locate the start of the left column via its marker text
+            left_column = soup.find(
+                    text=lambda t: 'begin ITP Left Column' in str(t))
+
+            table = left_column.findNext('table')
+
+            current_section = None
+            current_articles = []
+            feeds = []
+            for x in table.findAllNext(True):
+                if x.name == 'td' and x.get('class', None) == 'b13':
+                    # New section header: store the section collected so far
+                    if current_articles and current_section:
+                        feeds.append((current_section, current_articles))
+                    current_section = self.tag_to_string(x.a).strip()
+                    current_articles = []
+                    self.log('\tProcessing section:', current_section)
+                if current_section is not None and x.name == 'a' and \
+                        x.get('class', None) == 'bold80':
+                    title = self.tag_to_string(x)
+                    url = x.get('href', False)
+                    if not url or not title:
+                        continue
+                    # Drop any fragment identifier from the link
+                    url = url.partition('#')[0]
+                    desc = ''
+                    d = x.findNextSibling(True)
+                    # The link may have no following sibling tag at all, in
+                    # which case findNextSibling returns None
+                    if d is not None and d.get('class', None) == 'arialResize':
+                        desc = self.tag_to_string(d)
+                        # Keep only the text before the first bullet character
+                        desc = desc.partition(u'\u2022')[0]
+                    self.log('\t\tFound article:', title)
+                    self.log('\t\t\t', url)
+                    # Site-relative links are made absolute
+                    if url.startswith('/'):
+                        url = 'http://online.wsj.com'+url
+                    if desc:
+                        self.log('\t\t\t', desc)
+                    current_articles.append({'title': title, 'url':url,
+                        'description':desc, 'date':''})
+
+            # The loop only stores a section when the next header is seen, so
+            # the last section has to be stored here or it would be lost
+            if current_articles and current_section:
+                feeds.append((current_section, current_articles))
+
+            return feeds

        def cleanup(self):
            self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')

-        feeds =  [
-                #('Most Emailed - Day', 'http://online.wsj.com/xml/rss/3_7030.xml'),
-                #('Most Emailed - Week', 'http://online.wsj.com/xml/rss/3_7253.xml'),
-                #('Most Emailed - Month', 'http://online.wsj.com/xml/rss/3_7254.xml'),
-                (' Most Viewed - Day', 'http://online.wsj.com/xml/rss/3_7198.xml'),
-                (' Most Viewed - Week', 'http://online.wsj.com/xml/rss/3_7251.xml'),
-                #('Most Viewed - Month', 'http://online.wsj.com/xml/rss/3_7252.xml'),
-                ('Today\'s Newspaper -  Page One', 'http://online.wsj.com/xml/rss/3_7205.xml'),
-                ('Today\'s Newspaper - Marketplace', 'http://online.wsj.com/xml/rss/3_7206.xml'),
-                ('Today\'s Newspaper - Money & Investing', 'http://online.wsj.com/xml/rss/3_7207.xml'),
-                ('Today\'s Newspaper - Personal Journal', 'http://online.wsj.com/xml/rss/3_7208.xml'),
-                ('Today\'s Newspaper - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7209.xml'),
-                ('Opinion', 'http://online.wsj.com/xml/rss/3_7041.xml'),
-                ('News - U.S.: What\'s News', 'http://online.wsj.com/xml/rss/3_7011.xml'),
-                ('News - U.S. Business', 'http://online.wsj.com/xml/rss/3_7014.xml'),
-                ('News - Europe: What\'s News', 'http://online.wsj.com/xml/rss/3_7012.xml'),
-                ('News - Asia: What\'s News', 'http://online.wsj.com/xml/rss/3_7013.xml'),
-                ('News - World News', 'http://online.wsj.com/xml/rss/3_7085.xml'),
-                ('News - Economy', 'http://online.wsj.com/xml/rss/3_7086.xml'),
-                ('News - Earnings', 'http://online.wsj.com/xml/rss/3_7088.xml'),
-                ('News - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
-                ('News - Law', 'http://online.wsj.com/xml/rss/3_7091.xml'),
-                ('News - Media & Marketing', 'http://online.wsj.com/xml/rss/3_7020.xml'),
-                ('Technology - What\'s News', 'http://online.wsj.com/xml/rss/3_7015.xml'),
-                ('Technology - Gadgets', 'http://online.wsj.com/xml/rss/3_7094.xml'),
-                ('Technology - Telecommunications', 'http://online.wsj.com/xml/rss/3_7095.xml'),
-                ('Technology - E-commerce/Media', 'http://online.wsj.com/xml/rss/3_7096.xml'),
-                ('Technology - Asia', 'http://online.wsj.com/xml/rss/3_7097.xml'),
-                ('Technology - Europe', 'http://online.wsj.com/xml/rss/3_7098.xml'),
-                ('Markets - News', 'http://online.wsj.com/xml/rss/3_7031.xml'),
-                ('Markets - Europe News', 'http://online.wsj.com/xml/rss/3_7101.xml'),
-                ('Markets - Asia News', 'http://online.wsj.com/xml/rss/3_7102.xml'),
-                ('Markets - Deals & Deal Makers', 'http://online.wsj.com/xml/rss/3_7099.xml'),
-                ('Markets - Hedge Funds', 'http://online.wsj.com/xml/rss/3_7199.xml'),
-                ('Personal Journal', 'http://online.wsj.com/xml/rss/3_7200.xml'),
-                ('Personal Journal - Money', 'http://online.wsj.com/xml/rss/3_7104.xml'),
-                ('Personal Journal - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
-                ('Personal Journal - Autos', 'http://online.wsj.com/xml/rss/3_7092.xml'),
-                ('Personal Journal - Homes', 'http://online.wsj.com/xml/rss/3_7105.xml'),
-                ('Personal Journal - Travel', 'http://online.wsj.com/xml/rss/3_7106.xml'),
-                ('Personal Journal - Careers', 'http://online.wsj.com/xml/rss/3_7107.xml'),
-                ('Weekend & Leisure', 'http://online.wsj.com/xml/rss/3_7201.xml'),
-                ('Weekend & Leisure - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7202.xml'),
-                ('Weekend & Leisure - Arts & Entertainment', 'http://online.wsj.com/xml/rss/3_7177.xml'),
-                ('Weekend & Leisure - Books', 'http://online.wsj.com/xml/rss/3_7203.xml'),
-                ('Weekend & Leisure - Sports', 'http://online.wsj.com/xml/rss/3_7204.xml'),
-                ]

--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@ -1063,7 +1063,7 @@ class BasicNewsRecipe(Recipe):
        return parsed_feeds

    @classmethod
-    def tag_to_string(self, tag, use_alt=True):
+    def tag_to_string(self, tag, use_alt=True, normalize_whitespace=True):
        '''
        Convenience method to take a
        `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
@ -1090,7 +1090,10 @@ class BasicNewsRecipe(Recipe):
                    strings.append(res)
                elif use_alt and item.has_key('alt'):
                    strings.append(item['alt'])
-        return u''.join(strings)
+        ans = u''.join(strings)
+        if normalize_whitespace:
+            ans = re.sub(r'\s+', ' ', ans)
+        return ans

    @classmethod
    def soup(cls, raw):