Update Wall Street Journal

Fixes #1395546 [Private bug](https://bugs.launchpad.net/calibre/+bug/1395546)
Author: Kovid Goyal
Date: 2014-11-25 21:24:51 +05:30
parent 37439fecf6
commit 4cd960d9d9
2 changed files with 293 additions and 313 deletions

File 1 of 2: 'The Wall Street Journal' (subscription recipe)

@@ -1,77 +1,136 @@
-#!/usr/bin/env python
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
-__docformat__ = 'restructuredtext en'
-
-from calibre.web.feeds.news import BasicNewsRecipe
-import copy, re
-
-# http://online.wsj.com/page/us_in_todays_paper.html
-
-class WallStreetJournal(BasicNewsRecipe):
+from calibre.web.feeds.jsnews import JavascriptRecipe
+try:
+    from calibre.web.feeds.jsnews import CSSSelect
+except ImportError:
+    def CSSSelect(expr):
+        from cssselect import HTMLTranslator
+        from lxml.etree import XPath
+        return XPath(HTMLTranslator().css_to_xpath(expr))
+
+
+class WSJ(JavascriptRecipe):
 
     title = 'The Wall Street Journal'
-    __author__ = 'Kovid Goyal and Joshua Oster-Morris'
+    __author__ = 'Kovid Goyal'
     description = 'News and current affairs'
-    needs_subscription = True
     language = 'en'
     compress_news_images = True
     compress_news_images_auto_size = 5
     max_articles_per_feed = 1000
     timefmt = ' [%a, %b %d, %Y]'
     no_stylesheets = True
     ignore_duplicate_articles = {'url'}
     remove_attributes = ['style', 'data-scrim']
+    needs_subscription = True
 
-    keep_only_tags = [
-        dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
-        dict(name='span', itemprop='author', rel='author'),
-        dict(name='article', id=['article-contents', 'articleBody']),
-        dict(name='div', id='article_story_body'),
-        dict(name='div', attrs={'class':'snippet-ad-login'}),
-    ]
-    remove_tags = [
-        dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}),
-        dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}),
-        dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
-    ]
-    preprocess_regexps = [
-        (re.compile(r'<!--\[if lte IE 8\]>.+?<!\[endif\]-->', re.DOTALL), lambda m: ''),
-        (re.compile(r'<!\[if ! lte IE 8\]>.+?<!\[endif\]>', re.DOTALL), lambda m:''),
-    ]
-    use_javascript_to_login = True
+    keep_only_tags = (
+        'h1',  # 'h2.subhead', 'h2.subHed.deck',
+        'span[itemprop=author][rel=author]',
+        'article#article-contents', 'article#articleBody',
+        'div#article_story_body',
+        # Parallax formatting
+        'div#ncTitleArea', 'section.nc-exp-artbody',
+        # Error conditions, login required and page not found
+        'div#snippet-ad-login', 'div.errorNotFound',
+    )
+
+    remove_tags = (
+        '.insetButton', '.insettipBox', '.author-info', '.media-object-video',
+        '.article_tools', 'span[data-country-code][data-ticker-code]',
+        'div.nc-exp-artmeta',
+    )
 
-    def javascript_login(self, br, username, password):
-        br.visit('https://id.wsj.com/access/pages/wsj/us/login_standalone.html?mg=com-wsj', timeout=120)
+    def do_login(self, br, username, password):
+        br.visit(
+            'https://id.wsj.com/access/pages/wsj/us/login_standalone.html?mg=com-wsj', timeout=120)  # noqa
         f = br.select_form(nr=0)
         f['username'] = username
         f['password'] = password
         br.submit(timeout=120)
 
-    def populate_article_metadata(self, article, soup, first):
-        if first and hasattr(self, 'add_toc_thumbnail'):
-            picdiv = soup.find('img', src=True)
-            if picdiv is not None:
-                self.add_toc_thumbnail(article,picdiv['src'])
-
-    def preprocess_html(self, soup):
-        # Remove thumbnail for zoomable images
-        for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
-            img = div.find('img')
-            if img is not None:
-                img.extract()
-        # Use large images
-        for img in soup.findAll('img', attrs={'data-enlarge':True}):
-            img['src'] = img['data-enlarge']
-        return soup
-
-    def wsj_get_index(self):
-        return self.index_to_soup('http://online.wsj.com/itp')
-
-    def wsj_add_feed(self,feeds,title,url):
+    def preprocess_stage2(self, article, browser, url, recursion_level):
+        # Slideshow and expandable images need to be processed here to
+        # set the src attribute correctly
+        found = 0
+        for img in browser.css_select('img[data-in-base-data-lazy]', all=True):
+            img.setAttribute('src', img.attribute('data-in-base-data-lazy'))
+            found += 1
+        for img in browser.css_select('img[data-enlarge]', all=True):
+            img.setAttribute('src', img.attribute('data-enlarge'))
+            found += 1
+        if found:
+            self.log.debug('Found %d dynamic images in:' % found, url)
+
+    def get_publication_data(self, browser):
+        return self.get_wsj_index(browser)
+
+    def abs_wsj_url(self, href):
+        if not href.startswith('http'):
+            href = 'http://online.wsj.com' + href
+        return href
+
+    def wsj_find_articles(self, url):
+        root = self.index_to_soup(url)
+
+        for x in CSSSelect('div.whatsNews-simple')(root):
+            x.getparent().remove(x)
+
+        articles = []
+
+        for a in CSSSelect('a.mjLinkItem[href]')(root):
+            container = a.xpath('ancestor::li')
+            meta = CSSSelect('.meta_sectionName')(a)
+            if meta:
+                meta = meta[0]
+                meta.getparent().remove(meta)
+                meta = self.tag_to_string(meta)
+            title = self.tag_to_string(a)
+            if meta:
+                title += ' [%s]' % meta
+            url = self.abs_wsj_url(a.get('href'))
+            desc = ''
+            if container:
+                for p in CSSSelect('p')(container[0]):
+                    desc = self.tag_to_string(p)
+                    if 'Subscriber Content' not in desc:
+                        break
+
+            articles.append({'title': title, 'url': url,
+                             'description': desc, 'date': ''})
+
+            self.log('\tFound article:', title)
+            self.log('\t\t', desc)
+
+        return articles
+
+    def wsj_find_wn_articles(self, url):
+        root = self.index_to_soup(url)
+        articles = []
+
+        whats_news = CSSSelect('div.whatsNews-simple.whatsNews-itp')(root)
+        if whats_news:
+            for a in CSSSelect('a[href]')(whats_news[-1]):
+                if '/articles/' not in a.get('href', ''):
+                    continue
+                container = a.xpath('ancestor::p')
+                for meta in CSSSelect('.meta_sectionName')(a):
+                    meta.getparent().remove(meta)
+                title = self.tag_to_string(a).strip()
+                url = self.abs_wsj_url(a.get('href'))
+                desc = ''
+                if container:
+                    desc = self.tag_to_string(container[0])
+
+                articles.append({'title': title, 'url': url,
+                                 'description': desc, 'date': ''})
+
+                self.log('\tFound WN article:', title)
+                self.log('\t\t', desc)
+
+        return articles
+
+    def wsj_add_feed(self, feeds, title, url):
         self.log('Found section:', title)
         try:
             if url.endswith('whatsnews'):
@@ -82,129 +141,47 @@ class WallStreetJournal(BasicNewsRecipe):
             articles = []
         if articles:
             feeds.append((title, articles))
-        return feeds
-
-    def abs_wsj_url(self, href):
-        if not href.startswith('http'):
-            href = 'http://online.wsj.com' + href
-        return href
-
-    def parse_index(self):
-        soup = self.wsj_get_index()
-
-        date = soup.find('span', attrs={'class':'date-date'})
-        if date is not None:
-            self.timefmt = ' [%s]'%self.tag_to_string(date)
-
-        cov = soup.find('div', attrs={'class':lambda x: x and 'itpSectionHeaderPdf' in x.split()})
-        if cov is not None:
-            a = cov.find('a', href=True)
-            if a is not None:
-                self.cover_url = a['href']
-
-        feeds = []
-        div = soup.find('div', attrs={'class':'itpHeader'})
-        div = div.find('ul', attrs={'class':'tab'})
-        for a in div.findAll('a', href=lambda x: x and '/itp/' in x):
-            pageone = a['href'].endswith('pageone')
-            if pageone:
-                title = 'Front Section'
-                url = self.abs_wsj_url(a['href'])
-                feeds = self.wsj_add_feed(feeds,title,url)
-                title = "What's News"
-                url = url.replace('pageone','whatsnews')
-                feeds = self.wsj_add_feed(feeds,title,url)
-            else:
-                title = self.tag_to_string(a)
-                url = self.abs_wsj_url(a['href'])
-                feeds = self.wsj_add_feed(feeds,title,url)
-
-        for li in soup.findAll('li', attrs={'class':'ahed_listitem'}):
-            h2 = li.find('h2')
-            if h2 is None:
-                continue
-            a = h2.find('a', href=True)
-            if a is None:
-                continue
-            url = a['href']
-            title = self.tag_to_string(a)
-            p = h2.findNextSibling('p')
-            if p is not None:
-                desc = self.tag_to_string(p)
-            else:
-                desc = ''
-            if feeds:
-                feeds[0][1].append({'title':title, 'url':url, 'description':desc, 'date':''})
-        return feeds
-
-    def wsj_find_wn_articles(self, url):
-        soup = self.index_to_soup(url)
-        articles = []
-
-        whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
-        if whats_news is not None:
-            for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x):
-                container = a.findParent(['p'])
-                meta = a.find(attrs={'class':'meta_sectionName'})
-                if meta is not None:
-                    meta.extract()
-                title = self.tag_to_string(a).strip()
-                url = a['href']
-                desc = ''
-                if container is not None:
-                    desc = self.tag_to_string(container)
-
-                articles.append({'title':title, 'url':url,
-                    'description':desc, 'date':''})
-
-                self.log('\tFound WN article:', title)
-                self.log('\t\t', desc)
-
-        return articles
-
-    def wsj_find_articles(self, url):
-        soup = self.index_to_soup(url)
-
-        whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
-        if whats_news is not None:
-            whats_news.extract()
-
-        articles = []
-
-        flavorarea = soup.find('div', attrs={'class':lambda x: x and 'ahed' in x})
-        if flavorarea is not None:
-            flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article'))
-            if flavorstory is not None:
-                flavorstory['class'] = 'mjLinkItem'
-                metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x})
-                if metapage is not None:
-                    flavorstory.append(copy.copy(metapage))  # metapage should always be A1 because that should be first on the page
-
-        for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
-            container = a.findParent(['li', 'div'])
-            meta = a.find(attrs={'class':'meta_sectionName'})
-            if meta is not None:
-                meta.extract()
-                meta = self.tag_to_string(meta).strip()
-            if meta:
-                title = self.tag_to_string(a).strip() + ' [%s]'%meta
-            else:
-                title = self.tag_to_string(a).strip()
-            url = self.abs_wsj_url(a['href'])
-            desc = ''
-            for p in container.findAll('p'):
-                desc = self.tag_to_string(p)
-                if 'Subscriber Content' not in desc:
-                    break
-
-            articles.append({'title':title, 'url':url,
-                'description':desc, 'date':''})
-
-            self.log('\tFound article:', title)
-            self.log('\t\t', desc)
-
-        return articles
-
-    def cleanup(self):
-        self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')
+    def get_wsj_index(self, browser):
+        # return self.test_wsj_index()
+        ans = {}
+        root = self.index_to_soup('http://online.wsj.com/itp')
+        for span in CSSSelect('span.date-date')(root):
+            if span.text:
+                self.timefmt = span.text
+                break
+        for a in CSSSelect('div.itpSectionHeaderPdf a[href]')(root):
+            href = a.get('href')
+            if href:
+                break
+        ans['cover'] = browser.download_file(href)
+
+        feeds = ans['index'] = []
+        for a in CSSSelect('div.itpHeader ul.tab a[href]')(root):
+            if '/itp/' not in a.get('href', ''):
+                continue
+            pageone = a.get('href').endswith('pageone')
+            if pageone:
+                title = 'Front Section'
+                url = self.abs_wsj_url(a.get('href'))
+                self.wsj_add_feed(feeds, title, url)
+                title = "What's News"
+                url = url.replace('pageone', 'whatsnews')
+                self.wsj_add_feed(feeds, title, url)
+            else:
+                title = self.tag_to_string(a)
+                url = self.abs_wsj_url(a.get('href'))
+                self.wsj_add_feed(feeds, title, url)
+        return ans
+
+    def test_wsj_index(self):
+        return {'index': [
+            ('Testing', [
+                {'title': 'Article One',
+                 'url': 'http://online.wsj.com/articles/the-end-of-the-impulse-shopper-1416872108'},  # noqa
+                {'title': 'Article Two',
+                 'url': 'http://online.wsj.com/articles/ferguson-police-officer-not-charged-in-black-teens-shooting-1416882438'},  # noqa
+                {'title': 'Article Three',
+                 'url': 'http://online.wsj.com/article/SB10634695869867284248804580297251334393676.html'},  # noqa
+            ]),
+        ]}
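Both recipes depend on the CSSSelect shim defined at the top of the file above: it compiles a CSS selector into an lxml XPath matcher once, and the compiled matcher can then be applied to any parsed tree. A minimal standalone sketch of the same idea, assuming only cssselect and lxml are installed (the sample markup is invented for illustration):

    from cssselect import HTMLTranslator
    from lxml import html
    from lxml.etree import XPath

    def CSSSelect(expr):
        # Translate the CSS selector to XPath once; the compiled XPath
        # object can be applied to any number of parsed documents.
        return XPath(HTMLTranslator().css_to_xpath(expr))

    root = html.fromstring(
        '<li><a class="mjLinkItem" href="/articles/example">'
        'Headline <span class="meta_sectionName">A1</span></a></li>')
    for a in CSSSelect('a.mjLinkItem[href]')(root):
        print(a.get('href'))  # prints: /articles/example

This is the same pattern wsj_find_articles uses, e.g. CSSSelect('a.mjLinkItem[href]')(root) to collect headline links from the index page.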

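The preprocess_stage2 hook added above runs against the live browser DOM, after the page's JavaScript has filled in the data-in-base-data-lazy and data-enlarge attributes, and promotes those URLs into the real src attribute. A rough equivalent over a plain lxml tree, for illustration only (fix_lazy_images is a hypothetical helper, not a calibre API):

    from lxml import html

    def fix_lazy_images(root):
        # Promote lazy-load and zoom targets into the src attribute,
        # mirroring what preprocess_stage2 does via browser.css_select().
        found = 0
        for img in root.xpath('//img[@data-in-base-data-lazy]'):
            img.set('src', img.get('data-in-base-data-lazy'))
            found += 1
        for img in root.xpath('//img[@data-enlarge]'):
            img.set('src', img.get('data-enlarge'))
            found += 1
        return found

    root = html.fromstring('<div><img src="thumb.jpg" data-enlarge="full.jpg"></div>')
    fix_lazy_images(root)
    print(root.xpath('//img/@src'))  # prints: ['full.jpg']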
File 2 of 2: 'Wall Street Journal (free)' recipe

@@ -1,71 +1,140 @@
-#!/usr/bin/env python
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
-__docformat__ = 'restructuredtext en'
-
-from calibre.web.feeds.news import BasicNewsRecipe
-import copy, re
-
-class WallStreetJournal(BasicNewsRecipe):
+from calibre.web.feeds.jsnews import JavascriptRecipe
+try:
+    from calibre.web.feeds.jsnews import CSSSelect
+except ImportError:
+    def CSSSelect(expr):
+        from cssselect import HTMLTranslator
+        from lxml.etree import XPath
+        return XPath(HTMLTranslator().css_to_xpath(expr))
+
+
+class WSJ(JavascriptRecipe):
 
     title = 'Wall Street Journal (free)'
-    __author__ = 'Kovid Goyal, Sujata Raman, Joshua Oster-Morris, Starson17'
+    __author__ = 'Kovid Goyal'
     description = '''News and current affairs. This recipe only fetches complete
     versions of the articles that are available free on the wsj.com website.
     To get the rest of the articles, subscribe to the WSJ and use the other WSJ
     recipe.'''
     language = 'en'
-    cover_url = 'http://dealbreaker.com/images/thumbs/Wall%20Street%20Journal%20A1.JPG'
+
+    compress_news_images = True
+    compress_news_images_auto_size = 5
     max_articles_per_feed = 1000
     timefmt = ' [%a, %b %d, %Y]'
     no_stylesheets = True
     ignore_duplicate_articles = {'url'}
     remove_attributes = ['style', 'data-scrim']
+    needs_subscription = True
 
-    keep_only_tags = [
-        dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
-        dict(name='span', itemprop='author', rel='author'),
-        dict(name='article', id=['article-contents', 'articleBody']),
-        dict(name='div', id='article_story_body'),
-        dict(name='div', attrs={'class':'snippet-ad-login'}),
-    ]
-    remove_tags = [
-        dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}),
-        dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}),
-        dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
-    ]
-    preprocess_regexps = [
-        (re.compile(r'<!--\[if lte IE 8\]>.+?<!\[endif\]-->', re.DOTALL), lambda m: ''),
-        (re.compile(r'<!\[if ! lte IE 8\]>.+?<!\[endif\]>', re.DOTALL), lambda m:''),
-    ]
-
-    def populate_article_metadata(self, article, soup, first):
-        if first and hasattr(self, 'add_toc_thumbnail'):
-            picdiv = soup.find('img', src=True)
-            if picdiv is not None:
-                self.add_toc_thumbnail(article,picdiv['src'])
-
-    def preprocess_html(self, soup):
-        # Remove thumbnail for zoomable images
-        for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
-            img = div.find('img')
-            if img is not None:
-                img.extract()
-        # Use large images
-        for img in soup.findAll('img', attrs={'data-enlarge':True}):
-            img['src'] = img['data-enlarge']
-        return soup
+    keep_only_tags = (
+        'h1',  # 'h2.subhead', 'h2.subHed.deck',
+        'span[itemprop=author][rel=author]',
+        'article#article-contents', 'article#articleBody',
+        'div#article_story_body',
+        # Parallax formatting
+        'div#ncTitleArea', 'section.nc-exp-artbody',
+        # Error conditions, login required and page not found
+        'div#snippet-ad-login', 'div.errorNotFound',
+    )
+
+    remove_tags = (
+        '.insetButton', '.insettipBox', '.author-info', '.media-object-video',
+        '.article_tools', 'span[data-country-code][data-ticker-code]',
+        'div.nc-exp-artmeta',
+    )
+
+    def do_login(self, br, username, password):
+        br.visit(
+            'https://id.wsj.com/access/pages/wsj/us/login_standalone.html?mg=com-wsj', timeout=120)  # noqa
+        f = br.select_form(nr=0)
+        f['username'] = username
+        f['password'] = password
+        br.submit(timeout=120)
+
+    def preprocess_stage2(self, article, browser, url, recursion_level):
+        # Slideshow and expandable images need to be processed here to
+        # set the src attribute correctly
+        found = 0
+        for img in browser.css_select('img[data-in-base-data-lazy]', all=True):
+            img.setAttribute('src', img.attribute('data-in-base-data-lazy'))
+            found += 1
+        for img in browser.css_select('img[data-enlarge]', all=True):
+            img.setAttribute('src', img.attribute('data-enlarge'))
+            found += 1
+        if found:
+            self.log.debug('Found %d dynamic images in:' % found, url)
+
+    def get_publication_data(self, browser):
+        return self.get_wsj_index(browser)
 
     def abs_wsj_url(self, href):
         if not href.startswith('http'):
             href = 'http://online.wsj.com' + href
         return href
 
-    def wsj_get_index(self):
-        return self.index_to_soup('http://online.wsj.com/itp')
-
-    def wsj_add_feed(self,feeds,title,url):
+    def wsj_find_articles(self, url):
+        root = self.index_to_soup(url)
+
+        for x in CSSSelect('div.whatsNews-simple')(root):
+            x.getparent().remove(x)
+
+        articles = []
+
+        for a in CSSSelect('a.mjLinkItem[href]')(root):
+            container = a.xpath('ancestor::li')
+            meta = CSSSelect('.meta_sectionName')(a)
+            if meta:
+                meta = meta[0]
+                meta.getparent().remove(meta)
+                meta = self.tag_to_string(meta)
+            title = self.tag_to_string(a)
+            if meta:
+                title += ' [%s]' % meta
+            url = self.abs_wsj_url(a.get('href'))
+            desc = ''
+            if container:
+                for p in CSSSelect('p')(container[0]):
+                    desc = self.tag_to_string(p)
+                    if 'Subscriber Content' not in desc:
+                        break
+
+            articles.append({'title': title, 'url': url,
+                             'description': desc, 'date': ''})
+
+            self.log('\tFound article:', title)
+            self.log('\t\t', desc)
+
+        return articles
+
+    def wsj_find_wn_articles(self, url):
+        root = self.index_to_soup(url)
+        articles = []
+
+        whats_news = CSSSelect('div.whatsNews-simple.whatsNews-itp')(root)
+        if whats_news:
+            for a in CSSSelect('a[href]')(whats_news[-1]):
+                if '/articles/' not in a.get('href', ''):
+                    continue
+                container = a.xpath('ancestor::p')
+                for meta in CSSSelect('.meta_sectionName')(a):
+                    meta.getparent().remove(meta)
+                title = self.tag_to_string(a).strip()
+                url = self.abs_wsj_url(a.get('href'))
+                desc = ''
+                if container:
+                    desc = self.tag_to_string(container[0])
+
+                articles.append({'title': title, 'url': url,
+                                 'description': desc, 'date': ''})
+
+                self.log('\tFound WN article:', title)
+                self.log('\t\t', desc)
+
+        return articles
+
+    def wsj_add_feed(self, feeds, title, url):
         self.log('Found section:', title)
         try:
             if url.endswith('whatsnews'):
@@ -76,113 +145,47 @@ class WallStreetJournal(BasicNewsRecipe):
             articles = []
         if articles:
             feeds.append((title, articles))
-        return feeds
-
-    def parse_index(self):
-        soup = self.wsj_get_index()
-
-        date = soup.find('span', attrs={'class':'date-date'})
-        if date is not None:
-            self.timefmt = ' [%s]'%self.tag_to_string(date)
-
-        feeds = []
-        div = soup.find('div', attrs={'class':'itpHeader'})
-        div = div.find('ul', attrs={'class':'tab'})
-        for a in div.findAll('a', href=lambda x: x and '/itp/' in x):
-            pageone = a['href'].endswith('pageone')
-            if pageone:
-                title = 'Front Section'
-                url = self.abs_wsj_url(a['href'])
-                feeds = self.wsj_add_feed(feeds,title,url)
-                title = 'What''s News'
-                url = url.replace('pageone','whatsnews')
-                feeds = self.wsj_add_feed(feeds,title,url)
-            else:
-                title = self.tag_to_string(a)
-                url = self.abs_wsj_url(a['href'])
-                feeds = self.wsj_add_feed(feeds,title,url)
-
-        for li in soup.findAll('li', attrs={'class':'ahed_listitem'}):
-            h2 = li.find('h2')
-            if h2 is None:
-                continue
-            a = h2.find('a', href=True)
-            if a is None:
-                continue
-            url = a['href']
-            title = self.tag_to_string(a)
-            p = h2.findNextSibling('p')
-            if p is not None:
-                desc = self.tag_to_string(p)
-            else:
-                desc = ''
-            if feeds:
-                feeds[0][1].append({'title':title, 'url':url, 'description':desc, 'date':''})
-        feeds = [x for x in feeds if x[0] == 'Opinion']
-        return feeds
-
-    def wsj_find_wn_articles(self, url):
-        soup = self.index_to_soup(url)
-        articles = []
-
-        whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
-        if whats_news is not None:
-            for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x):
-                container = a.findParent(['p'])
-                meta = a.find(attrs={'class':'meta_sectionName'})
-                if meta is not None:
-                    meta.extract()
-                title = self.tag_to_string(a).strip()
-                url = a['href']
-                desc = ''
-                if container is not None:
-                    desc = self.tag_to_string(container)
-
-                articles.append({'title':title, 'url':url,
-                    'description':desc, 'date':''})
-
-                self.log('\tFound WN article:', title)
-
-        return articles
-
-    def wsj_find_articles(self, url):
-        soup = self.index_to_soup(url)
-
-        whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
-        if whats_news is not None:
-            whats_news.extract()
-
-        articles = []
-
-        flavorarea = soup.find('div', attrs={'class':lambda x: x and 'ahed' in x})
-        if flavorarea is not None:
-            flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article'))
-            if flavorstory is not None:
-                flavorstory['class'] = 'mjLinkItem'
-                metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x})
-                if metapage is not None:
-                    flavorstory.append(copy.copy(metapage))  # metapage should always be A1 because that should be first on the page
-
-        for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
-            container = a.findParent(['li', 'div'])
-            meta = a.find(attrs={'class':'meta_sectionName'})
-            if meta is not None:
-                meta.extract()
-                meta = self.tag_to_string(meta).strip()
-            if meta:
-                title = self.tag_to_string(a).strip() + ' [%s]'%meta
-            else:
-                title = self.tag_to_string(a).strip()
-            url = self.abs_wsj_url(a['href'])
-            desc = ''
-            for p in container.findAll('p'):
-                desc = self.tag_to_string(p)
-                if 'Subscriber Content' not in desc:
-                    break
-
-            articles.append({'title':title, 'url':url,
-                'description':desc, 'date':''})
-
-            self.log('\tFound article:', title)
-
-        return articles
+    def get_wsj_index(self, browser):
+        # return self.test_wsj_index()
+        ans = {}
+        root = self.index_to_soup('http://online.wsj.com/itp')
+        for span in CSSSelect('span.date-date')(root):
+            if span.text:
+                self.timefmt = span.text
+                break
+        for a in CSSSelect('div.itpSectionHeaderPdf a[href]')(root):
+            href = a.get('href')
+            if href:
+                break
+        ans['cover'] = browser.download_file(href)
+
+        feeds = ans['index'] = []
+        for a in CSSSelect('div.itpHeader ul.tab a[href]')(root):
+            if '/itp/' not in a.get('href', ''):
+                continue
+            pageone = a.get('href').endswith('pageone')
+            if pageone:
+                title = 'Front Section'
+                url = self.abs_wsj_url(a.get('href'))
+                self.wsj_add_feed(feeds, title, url)
+                title = "What's News"
+                url = url.replace('pageone', 'whatsnews')
+                self.wsj_add_feed(feeds, title, url)
+            else:
+                title = self.tag_to_string(a)
+                url = self.abs_wsj_url(a.get('href'))
+                self.wsj_add_feed(feeds, title, url)
+        return ans
+
+    def test_wsj_index(self):
+        return {'index': [
+            ('Testing', [
+                {'title': 'Article One',
+                 'url': 'http://online.wsj.com/articles/the-end-of-the-impulse-shopper-1416872108'},  # noqa
+                {'title': 'Article Two',
+                 'url': 'http://online.wsj.com/articles/ferguson-police-officer-not-charged-in-black-teens-shooting-1416882438'},  # noqa
+                {'title': 'Article Three',
+                 'url': 'http://online.wsj.com/article/SB10634695869867284248804580297251334393676.html'},  # noqa
+            ]),
+        ]}
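
The commented-out return self.test_wsj_index() at the top of get_wsj_index in both files is a development switch: it feeds the pipeline a canned three-article index so the recipe can be exercised end to end without scraping the whole issue page. Both get_wsj_index and test_wsj_index return the same dict shape, which a quick check can make explicit (check_publication_data is a hypothetical helper, not part of calibre):

    def check_publication_data(data):
        # 'index' is a list of (section_title, [article, ...]) pairs and
        # every article carries at least a title and a url; a 'cover'
        # entry, as downloaded in file 1's get_wsj_index, is optional.
        for section, articles in data.get('index', []):
            assert isinstance(section, str) and section
            for art in articles:
                assert art.get('title') and art.get('url')

    check_publication_data({'index': [
        ('Testing', [
            {'title': 'Article One',
             'url': 'http://online.wsj.com/articles/the-end-of-the-impulse-shopper-1416872108'},
        ]),
    ]})

During development the recipes themselves would typically be run with calibre's ebook-convert in --test mode (plus --username and --password for the subscription version), which restricts how many feeds and articles are fetched.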