Update Wall Street Journal

Fixes #1395546 [Private bug](https://bugs.launchpad.net/calibre/+bug/1395546)
Author: Kovid Goyal
Date: 2014-11-25 21:24:51 +05:30
parent 37439fecf6
commit 4cd960d9d9
2 changed files with 293 additions and 313 deletions

File 1 of 2: 'The Wall Street Journal' (subscription recipe)

@@ -1,77 +1,136 @@
-#!/usr/bin/env python
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
-__docformat__ = 'restructuredtext en'
-
-from calibre.web.feeds.news import BasicNewsRecipe
-import copy, re
-
-# http://online.wsj.com/page/us_in_todays_paper.html
-
-class WallStreetJournal(BasicNewsRecipe):
+from calibre.web.feeds.jsnews import JavascriptRecipe
+try:
+    from calibre.web.feeds.jsnews import CSSSelect
+except ImportError:
+    def CSSSelect(expr):
+        from cssselect import HTMLTranslator
+        from lxml.etree import XPath
+        return XPath(HTMLTranslator().css_to_xpath(expr))
+
+
+class WSJ(JavascriptRecipe):
 
     title = 'The Wall Street Journal'
-    __author__ = 'Kovid Goyal and Joshua Oster-Morris'
+    __author__ = 'Kovid Goyal'
     description = 'News and current affairs'
-    needs_subscription = True
     language = 'en'
     compress_news_images = True
     compress_news_images_auto_size = 5
     max_articles_per_feed = 1000
     timefmt = ' [%a, %b %d, %Y]'
     no_stylesheets = True
     ignore_duplicate_articles = {'url'}
     remove_attributes = ['style', 'data-scrim']
+    needs_subscription = True
 
-    keep_only_tags = [
-        dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
-        dict(name='span', itemprop='author', rel='author'),
-        dict(name='article', id=['article-contents', 'articleBody']),
-        dict(name='div', id='article_story_body'),
-        dict(name='div', attrs={'class':'snippet-ad-login'}),
-    ]
-    remove_tags = [
-        dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}),
-        dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}),
-        dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
-    ]
-    preprocess_regexps = [
-        (re.compile(r'<!--\[if lte IE 8\]>.+?<!\[endif\]-->', re.DOTALL), lambda m: ''),
-        (re.compile(r'<!\[if ! lte IE 8\]>.+?<!\[endif\]>', re.DOTALL), lambda m:''),
-    ]
-    use_javascript_to_login = True
+    keep_only_tags = (
+        'h1',  # 'h2.subhead', 'h2.subHed.deck',
+        'span[itemprop=author][rel=author]',
+        'article#article-contents', 'article#articleBody',
+        'div#article_story_body',
+        # Parallax formatting
+        'div#ncTitleArea', 'section.nc-exp-artbody',
+        # Error conditions, login required and page not found
+        'div#snippet-ad-login', 'div.errorNotFound',
+    )
+
+    remove_tags = (
+        '.insetButton', '.insettipBox', '.author-info', '.media-object-video',
+        '.article_tools', 'span[data-country-code][data-ticker-code]',
+        'div.nc-exp-artmeta',
+    )
 
-    def javascript_login(self, br, username, password):
-        br.visit('https://id.wsj.com/access/pages/wsj/us/login_standalone.html?mg=com-wsj', timeout=120)
+    def do_login(self, br, username, password):
+        br.visit(
+            'https://id.wsj.com/access/pages/wsj/us/login_standalone.html?mg=com-wsj', timeout=120)  # noqa
         f = br.select_form(nr=0)
         f['username'] = username
         f['password'] = password
         br.submit(timeout=120)
 
-    def populate_article_metadata(self, article, soup, first):
-        if first and hasattr(self, 'add_toc_thumbnail'):
-            picdiv = soup.find('img', src=True)
-            if picdiv is not None:
-                self.add_toc_thumbnail(article,picdiv['src'])
-
-    def preprocess_html(self, soup):
-        # Remove thumbnail for zoomable images
-        for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
-            img = div.find('img')
-            if img is not None:
-                img.extract()
-        # Use large images
-        for img in soup.findAll('img', attrs={'data-enlarge':True}):
-            img['src'] = img['data-enlarge']
-        return soup
-
-    def wsj_get_index(self):
-        return self.index_to_soup('http://online.wsj.com/itp')
-
-    def wsj_add_feed(self,feeds,title,url):
+    def preprocess_stage2(self, article, browser, url, recursion_level):
+        # Slideshow and expandable images need to be processed here to
+        # set the src attribute correctly
+        found = 0
+        for img in browser.css_select('img[data-in-base-data-lazy]', all=True):
+            img.setAttribute('src', img.attribute('data-in-base-data-lazy'))
+            found += 1
+        for img in browser.css_select('img[data-enlarge]', all=True):
+            img.setAttribute('src', img.attribute('data-enlarge'))
+            found += 1
+        if found:
+            self.log.debug('Found %d dynamic images in:' % found, url)
+
+    def get_publication_data(self, browser):
+        return self.get_wsj_index(browser)
+
+    def abs_wsj_url(self, href):
+        if not href.startswith('http'):
+            href = 'http://online.wsj.com' + href
+        return href
+
+    def wsj_find_articles(self, url):
+        root = self.index_to_soup(url)
+
+        for x in CSSSelect('div.whatsNews-simple')(root):
+            x.getparent().remove(x)
+
+        articles = []
+
+        for a in CSSSelect('a.mjLinkItem[href]')(root):
+            container = a.xpath('ancestor::li')
+            meta = CSSSelect('.meta_sectionName')(a)
+            if meta:
+                meta = meta[0]
+                meta.getparent().remove(meta)
+                meta = self.tag_to_string(meta)
+            title = self.tag_to_string(a)
+            if meta:
+                title += ' [%s]' % meta
+            url = self.abs_wsj_url(a.get('href'))
+            desc = ''
+            if container:
+                for p in CSSSelect('p')(container[0]):
+                    desc = self.tag_to_string(p)
+                    if 'Subscriber Content' not in desc:
+                        break
+
+            articles.append({'title': title, 'url': url,
+                             'description': desc, 'date': ''})
+
+            self.log('\tFound article:', title)
+            self.log('\t\t', desc)
+
+        return articles
+
+    def wsj_find_wn_articles(self, url):
+        root = self.index_to_soup(url)
+        articles = []
+
+        whats_news = CSSSelect('div.whatsNews-simple.whatsNews-itp')(root)
+        if whats_news:
+            for a in CSSSelect('a[href]')(whats_news[-1]):
+                if '/articles/' not in a.get('href', ''):
+                    continue
+                container = a.xpath('ancestor::p')
+                for meta in CSSSelect('.meta_sectionName')(a):
+                    meta.getparent().remove(meta)
+                title = self.tag_to_string(a).strip()
+                url = self.abs_wsj_url(a.get('href'))
+                desc = ''
+                if container:
+                    desc = self.tag_to_string(container[0])
+
+                articles.append({'title': title, 'url': url,
+                                 'description': desc, 'date': ''})
+
+                self.log('\tFound WN article:', title)
+                self.log('\t\t', desc)
+
+        return articles
+
+    def wsj_add_feed(self, feeds, title, url):
         self.log('Found section:', title)
         try:
             if url.endswith('whatsnews'):
@@ -82,129 +141,47 @@ class WallStreetJournal(BasicNewsRecipe):
             articles = []
         if articles:
             feeds.append((title, articles))
-        return feeds
-
-    def abs_wsj_url(self, href):
-        if not href.startswith('http'):
-            href = 'http://online.wsj.com' + href
-        return href
-
-    def parse_index(self):
-        soup = self.wsj_get_index()
-
-        date = soup.find('span', attrs={'class':'date-date'})
-        if date is not None:
-            self.timefmt = ' [%s]'%self.tag_to_string(date)
-
-        cov = soup.find('div', attrs={'class':lambda x: x and 'itpSectionHeaderPdf' in x.split()})
-        if cov is not None:
-            a = cov.find('a', href=True)
-            if a is not None:
-                self.cover_url = a['href']
-
-        feeds = []
-        div = soup.find('div', attrs={'class':'itpHeader'})
-        div = div.find('ul', attrs={'class':'tab'})
-        for a in div.findAll('a', href=lambda x: x and '/itp/' in x):
-            pageone = a['href'].endswith('pageone')
-            if pageone:
-                title = 'Front Section'
-                url = self.abs_wsj_url(a['href'])
-                feeds = self.wsj_add_feed(feeds,title,url)
-                title = "What's News"
-                url = url.replace('pageone','whatsnews')
-                feeds = self.wsj_add_feed(feeds,title,url)
-            else:
-                title = self.tag_to_string(a)
-                url = self.abs_wsj_url(a['href'])
-                feeds = self.wsj_add_feed(feeds,title,url)
-
-        for li in soup.findAll('li', attrs={'class':'ahed_listitem'}):
-            h2 = li.find('h2')
-            if h2 is None:
-                continue
-            a = h2.find('a', href=True)
-            if a is None:
-                continue
-            url = a['href']
-            title = self.tag_to_string(a)
-            p = h2.findNextSibling('p')
-            if p is not None:
-                desc = self.tag_to_string(p)
-            else:
-                desc = ''
-            if feeds:
-                feeds[0][1].append({'title':title, 'url':url, 'description':desc, 'date':''})
-        return feeds
-
-    def wsj_find_wn_articles(self, url):
-        soup = self.index_to_soup(url)
-        articles = []
-
-        whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
-        if whats_news is not None:
-            for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x):
-                container = a.findParent(['p'])
-                meta = a.find(attrs={'class':'meta_sectionName'})
-                if meta is not None:
-                    meta.extract()
-                title = self.tag_to_string(a).strip()
-                url = a['href']
-                desc = ''
-                if container is not None:
-                    desc = self.tag_to_string(container)
-
-                articles.append({'title':title, 'url':url,
-                    'description':desc, 'date':''})
-
-                self.log('\tFound WN article:', title)
-                self.log('\t\t', desc)
-
-        return articles
-
-    def wsj_find_articles(self, url):
-        soup = self.index_to_soup(url)
-
-        whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
-        if whats_news is not None:
-            whats_news.extract()
-
-        articles = []
-
-        flavorarea = soup.find('div', attrs={'class':lambda x: x and 'ahed' in x})
-        if flavorarea is not None:
-            flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article'))
-            if flavorstory is not None:
-                flavorstory['class'] = 'mjLinkItem'
-                metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x})
-                if metapage is not None:
-                    flavorstory.append(copy.copy(metapage))  # metapage should always be A1 because that should be first on the page
-
-        for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
-            container = a.findParent(['li', 'div'])
-            meta = a.find(attrs={'class':'meta_sectionName'})
-            if meta is not None:
-                meta.extract()
-                meta = self.tag_to_string(meta).strip()
-            if meta:
-                title = self.tag_to_string(a).strip() + ' [%s]'%meta
-            else:
-                title = self.tag_to_string(a).strip()
-            url = self.abs_wsj_url(a['href'])
-            desc = ''
-            for p in container.findAll('p'):
-                desc = self.tag_to_string(p)
-                if 'Subscriber Content' not in desc:
-                    break
-
-            articles.append({'title':title, 'url':url,
-                'description':desc, 'date':''})
-
-            self.log('\tFound article:', title)
-            self.log('\t\t', desc)
-
-        return articles
-
-    def cleanup(self):
-        self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')
+    def get_wsj_index(self, browser):
+        # return self.test_wsj_index()
+        ans = {}
+        root = self.index_to_soup('http://online.wsj.com/itp')
+        for span in CSSSelect('span.date-date')(root):
+            if span.text:
+                self.timefmt = span.text
+                break
+        for a in CSSSelect('div.itpSectionHeaderPdf a[href]')(root):
+            href = a.get('href')
+            if href:
+                break
+        ans['cover'] = browser.download_file(href)
+
+        feeds = ans['index'] = []
+        for a in CSSSelect('div.itpHeader ul.tab a[href]')(root):
+            if '/itp/' not in a.get('href', ''):
+                continue
+            pageone = a.get('href').endswith('pageone')
+            if pageone:
+                title = 'Front Section'
+                url = self.abs_wsj_url(a.get('href'))
+                self.wsj_add_feed(feeds, title, url)
+                title = "What's News"
+                url = url.replace('pageone', 'whatsnews')
+                self.wsj_add_feed(feeds, title, url)
+            else:
+                title = self.tag_to_string(a)
+                url = self.abs_wsj_url(a.get('href'))
+                self.wsj_add_feed(feeds, title, url)
+        return ans
+
+    def test_wsj_index(self):
+        return {'index': [
+            ('Testing', [
+                {'title': 'Article One',
+                 'url': 'http://online.wsj.com/articles/the-end-of-the-impulse-shopper-1416872108'},  # noqa
+                {'title': 'Article Two',
+                 'url': 'http://online.wsj.com/articles/ferguson-police-officer-not-charged-in-black-teens-shooting-1416882438'},  # noqa
+                {'title': 'Article Three',
+                 'url': 'http://online.wsj.com/article/SB10634695869867284248804580297251334393676.html'},  # noqa
+            ]),
+        ]}
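Both recipes depend on the CSSSelect shim defined at the top of the file above: it compiles a CSS selector into an lxml XPath matcher once, and the compiled matcher can then be applied to any parsed tree. A minimal standalone sketch of the same idea, assuming only cssselect and lxml are installed (the sample markup is invented for illustration):

    from cssselect import HTMLTranslator
    from lxml import html
    from lxml.etree import XPath

    def CSSSelect(expr):
        # Translate the CSS selector to XPath once; the compiled XPath
        # object can be applied to any number of parsed documents.
        return XPath(HTMLTranslator().css_to_xpath(expr))

    root = html.fromstring(
        '<li><a class="mjLinkItem" href="/articles/example">'
        'Headline <span class="meta_sectionName">A1</span></a></li>')
    for a in CSSSelect('a.mjLinkItem[href]')(root):
        print(a.get('href'))  # prints: /articles/example

This is the same pattern wsj_find_articles uses, e.g. CSSSelect('a.mjLinkItem[href]')(root) to collect headline links from the index page.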

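The preprocess_stage2 hook added above runs against the live browser DOM, after the page's JavaScript has filled in the data-in-base-data-lazy and data-enlarge attributes, and promotes those URLs into the real src attribute. A rough equivalent over a plain lxml tree, for illustration only (fix_lazy_images is a hypothetical helper, not a calibre API):

    from lxml import html

    def fix_lazy_images(root):
        # Promote lazy-load and zoom targets into the src attribute,
        # mirroring what preprocess_stage2 does via browser.css_select().
        found = 0
        for img in root.xpath('//img[@data-in-base-data-lazy]'):
            img.set('src', img.get('data-in-base-data-lazy'))
            found += 1
        for img in root.xpath('//img[@data-enlarge]'):
            img.set('src', img.get('data-enlarge'))
            found += 1
        return found

    root = html.fromstring('<div><img src="thumb.jpg" data-enlarge="full.jpg"></div>')
    fix_lazy_images(root)
    print(root.xpath('//img/@src'))  # prints: ['full.jpg']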
File 2 of 2: 'Wall Street Journal (free)' recipe

@@ -1,71 +1,140 @@
-#!/usr/bin/env python
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
-__docformat__ = 'restructuredtext en'
-
-from calibre.web.feeds.news import BasicNewsRecipe
-import copy, re
-
-class WallStreetJournal(BasicNewsRecipe):
+from calibre.web.feeds.jsnews import JavascriptRecipe
+try:
+    from calibre.web.feeds.jsnews import CSSSelect
+except ImportError:
+    def CSSSelect(expr):
+        from cssselect import HTMLTranslator
+        from lxml.etree import XPath
+        return XPath(HTMLTranslator().css_to_xpath(expr))
+
+
+class WSJ(JavascriptRecipe):
 
     title = 'Wall Street Journal (free)'
-    __author__ = 'Kovid Goyal, Sujata Raman, Joshua Oster-Morris, Starson17'
+    __author__ = 'Kovid Goyal'
     description = '''News and current affairs. This recipe only fetches complete
     versions of the articles that are available free on the wsj.com website.
     To get the rest of the articles, subscribe to the WSJ and use the other WSJ
     recipe.'''
     language = 'en'
-    cover_url = 'http://dealbreaker.com/images/thumbs/Wall%20Street%20Journal%20A1.JPG'
+
+    compress_news_images = True
+    compress_news_images_auto_size = 5
     max_articles_per_feed = 1000
     timefmt = ' [%a, %b %d, %Y]'
     no_stylesheets = True
     ignore_duplicate_articles = {'url'}
     remove_attributes = ['style', 'data-scrim']
+    needs_subscription = True
 
-    keep_only_tags = [
-        dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
-        dict(name='span', itemprop='author', rel='author'),
-        dict(name='article', id=['article-contents', 'articleBody']),
-        dict(name='div', id='article_story_body'),
-        dict(name='div', attrs={'class':'snippet-ad-login'}),
-    ]
-    remove_tags = [
-        dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}),
-        dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}),
-        dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
-    ]
-    preprocess_regexps = [
-        (re.compile(r'<!--\[if lte IE 8\]>.+?<!\[endif\]-->', re.DOTALL), lambda m: ''),
-        (re.compile(r'<!\[if ! lte IE 8\]>.+?<!\[endif\]>', re.DOTALL), lambda m:''),
-    ]
-
-    def populate_article_metadata(self, article, soup, first):
-        if first and hasattr(self, 'add_toc_thumbnail'):
-            picdiv = soup.find('img', src=True)
-            if picdiv is not None:
-                self.add_toc_thumbnail(article,picdiv['src'])
-
-    def preprocess_html(self, soup):
-        # Remove thumbnail for zoomable images
-        for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
-            img = div.find('img')
-            if img is not None:
-                img.extract()
-        # Use large images
-        for img in soup.findAll('img', attrs={'data-enlarge':True}):
-            img['src'] = img['data-enlarge']
-        return soup
+    keep_only_tags = (
+        'h1',  # 'h2.subhead', 'h2.subHed.deck',
+        'span[itemprop=author][rel=author]',
+        'article#article-contents', 'article#articleBody',
+        'div#article_story_body',
+        # Parallax formatting
+        'div#ncTitleArea', 'section.nc-exp-artbody',
+        # Error conditions, login required and page not found
+        'div#snippet-ad-login', 'div.errorNotFound',
+    )
+
+    remove_tags = (
+        '.insetButton', '.insettipBox', '.author-info', '.media-object-video',
+        '.article_tools', 'span[data-country-code][data-ticker-code]',
+        'div.nc-exp-artmeta',
+    )
+
+    def do_login(self, br, username, password):
+        br.visit(
+            'https://id.wsj.com/access/pages/wsj/us/login_standalone.html?mg=com-wsj', timeout=120)  # noqa
+        f = br.select_form(nr=0)
+        f['username'] = username
+        f['password'] = password
+        br.submit(timeout=120)
+
+    def preprocess_stage2(self, article, browser, url, recursion_level):
+        # Slideshow and expandable images need to be processed here to
+        # set the src attribute correctly
+        found = 0
+        for img in browser.css_select('img[data-in-base-data-lazy]', all=True):
+            img.setAttribute('src', img.attribute('data-in-base-data-lazy'))
+            found += 1
+        for img in browser.css_select('img[data-enlarge]', all=True):
+            img.setAttribute('src', img.attribute('data-enlarge'))
+            found += 1
+        if found:
+            self.log.debug('Found %d dynamic images in:' % found, url)
+
+    def get_publication_data(self, browser):
+        return self.get_wsj_index(browser)
 
     def abs_wsj_url(self, href):
         if not href.startswith('http'):
             href = 'http://online.wsj.com' + href
         return href
 
-    def wsj_get_index(self):
-        return self.index_to_soup('http://online.wsj.com/itp')
-
-    def wsj_add_feed(self,feeds,title,url):
+    def wsj_find_articles(self, url):
+        root = self.index_to_soup(url)
+
+        for x in CSSSelect('div.whatsNews-simple')(root):
+            x.getparent().remove(x)
+
+        articles = []
+
+        for a in CSSSelect('a.mjLinkItem[href]')(root):
+            container = a.xpath('ancestor::li')
+            meta = CSSSelect('.meta_sectionName')(a)
+            if meta:
+                meta = meta[0]
+                meta.getparent().remove(meta)
+                meta = self.tag_to_string(meta)
+            title = self.tag_to_string(a)
+            if meta:
+                title += ' [%s]' % meta
+            url = self.abs_wsj_url(a.get('href'))
+            desc = ''
+            if container:
+                for p in CSSSelect('p')(container[0]):
+                    desc = self.tag_to_string(p)
+                    if 'Subscriber Content' not in desc:
+                        break
+
+            articles.append({'title': title, 'url': url,
+                             'description': desc, 'date': ''})
+
+            self.log('\tFound article:', title)
+            self.log('\t\t', desc)
+
+        return articles
+
+    def wsj_find_wn_articles(self, url):
+        root = self.index_to_soup(url)
+        articles = []
+
+        whats_news = CSSSelect('div.whatsNews-simple.whatsNews-itp')(root)
+        if whats_news:
+            for a in CSSSelect('a[href]')(whats_news[-1]):
+                if '/articles/' not in a.get('href', ''):
+                    continue
+                container = a.xpath('ancestor::p')
+                for meta in CSSSelect('.meta_sectionName')(a):
+                    meta.getparent().remove(meta)
+                title = self.tag_to_string(a).strip()
+                url = self.abs_wsj_url(a.get('href'))
+                desc = ''
+                if container:
+                    desc = self.tag_to_string(container[0])
+
+                articles.append({'title': title, 'url': url,
+                                 'description': desc, 'date': ''})
+
+                self.log('\tFound WN article:', title)
+                self.log('\t\t', desc)
+
+        return articles
+
+    def wsj_add_feed(self, feeds, title, url):
         self.log('Found section:', title)
         try:
             if url.endswith('whatsnews'):
@@ -76,113 +145,47 @@ class WallStreetJournal(BasicNewsRecipe):
             articles = []
         if articles:
             feeds.append((title, articles))
-        return feeds
-
-    def parse_index(self):
-        soup = self.wsj_get_index()
-
-        date = soup.find('span', attrs={'class':'date-date'})
-        if date is not None:
-            self.timefmt = ' [%s]'%self.tag_to_string(date)
-
-        feeds = []
-        div = soup.find('div', attrs={'class':'itpHeader'})
-        div = div.find('ul', attrs={'class':'tab'})
-        for a in div.findAll('a', href=lambda x: x and '/itp/' in x):
-            pageone = a['href'].endswith('pageone')
-            if pageone:
-                title = 'Front Section'
-                url = self.abs_wsj_url(a['href'])
-                feeds = self.wsj_add_feed(feeds,title,url)
-                title = 'What''s News'
-                url = url.replace('pageone','whatsnews')
-                feeds = self.wsj_add_feed(feeds,title,url)
-            else:
-                title = self.tag_to_string(a)
-                url = self.abs_wsj_url(a['href'])
-                feeds = self.wsj_add_feed(feeds,title,url)
-
-        for li in soup.findAll('li', attrs={'class':'ahed_listitem'}):
-            h2 = li.find('h2')
-            if h2 is None:
-                continue
-            a = h2.find('a', href=True)
-            if a is None:
-                continue
-            url = a['href']
-            title = self.tag_to_string(a)
-            p = h2.findNextSibling('p')
-            if p is not None:
-                desc = self.tag_to_string(p)
-            else:
-                desc = ''
-            if feeds:
-                feeds[0][1].append({'title':title, 'url':url, 'description':desc, 'date':''})
-        feeds = [x for x in feeds if x[0] == 'Opinion']
-        return feeds
-
-    def wsj_find_wn_articles(self, url):
-        soup = self.index_to_soup(url)
-        articles = []
-
-        whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
-        if whats_news is not None:
-            for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x):
-                container = a.findParent(['p'])
-                meta = a.find(attrs={'class':'meta_sectionName'})
-                if meta is not None:
-                    meta.extract()
-                title = self.tag_to_string(a).strip()
-                url = a['href']
-                desc = ''
-                if container is not None:
-                    desc = self.tag_to_string(container)
-
-                articles.append({'title':title, 'url':url,
-                    'description':desc, 'date':''})
-
-                self.log('\tFound WN article:', title)
-
-        return articles
-
-    def wsj_find_articles(self, url):
-        soup = self.index_to_soup(url)
-
-        whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
-        if whats_news is not None:
-            whats_news.extract()
-
-        articles = []
-
-        flavorarea = soup.find('div', attrs={'class':lambda x: x and 'ahed' in x})
-        if flavorarea is not None:
-            flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article'))
-            if flavorstory is not None:
-                flavorstory['class'] = 'mjLinkItem'
-                metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x})
-                if metapage is not None:
-                    flavorstory.append(copy.copy(metapage))  # metapage should always be A1 because that should be first on the page
-
-        for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
-            container = a.findParent(['li', 'div'])
-            meta = a.find(attrs={'class':'meta_sectionName'})
-            if meta is not None:
-                meta.extract()
-                meta = self.tag_to_string(meta).strip()
-            if meta:
-                title = self.tag_to_string(a).strip() + ' [%s]'%meta
-            else:
-                title = self.tag_to_string(a).strip()
-            url = self.abs_wsj_url(a['href'])
-            desc = ''
-            for p in container.findAll('p'):
-                desc = self.tag_to_string(p)
-                if 'Subscriber Content' not in desc:
-                    break
-
-            articles.append({'title':title, 'url':url,
-                'description':desc, 'date':''})
-
-            self.log('\tFound article:', title)
-
-        return articles
+    def get_wsj_index(self, browser):
+        # return self.test_wsj_index()
+        ans = {}
+        root = self.index_to_soup('http://online.wsj.com/itp')
+        for span in CSSSelect('span.date-date')(root):
+            if span.text:
+                self.timefmt = span.text
+                break
+        for a in CSSSelect('div.itpSectionHeaderPdf a[href]')(root):
+            href = a.get('href')
+            if href:
+                break
+        ans['cover'] = browser.download_file(href)
+
+        feeds = ans['index'] = []
+        for a in CSSSelect('div.itpHeader ul.tab a[href]')(root):
+            if '/itp/' not in a.get('href', ''):
+                continue
+            pageone = a.get('href').endswith('pageone')
+            if pageone:
+                title = 'Front Section'
+                url = self.abs_wsj_url(a.get('href'))
+                self.wsj_add_feed(feeds, title, url)
+                title = "What's News"
+                url = url.replace('pageone', 'whatsnews')
+                self.wsj_add_feed(feeds, title, url)
+            else:
+                title = self.tag_to_string(a)
+                url = self.abs_wsj_url(a.get('href'))
+                self.wsj_add_feed(feeds, title, url)
+        return ans
+
+    def test_wsj_index(self):
+        return {'index': [
+            ('Testing', [
+                {'title': 'Article One',
+                 'url': 'http://online.wsj.com/articles/the-end-of-the-impulse-shopper-1416872108'},  # noqa
+                {'title': 'Article Two',
+                 'url': 'http://online.wsj.com/articles/ferguson-police-officer-not-charged-in-black-teens-shooting-1416882438'},  # noqa
+                {'title': 'Article Three',
+                 'url': 'http://online.wsj.com/article/SB10634695869867284248804580297251334393676.html'},  # noqa
+            ]),
+        ]}
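
The commented-out return self.test_wsj_index() at the top of get_wsj_index in both files is a development switch: it feeds the pipeline a canned three-article index so the recipe can be exercised end to end without scraping the whole issue page. Both get_wsj_index and test_wsj_index return the same dict shape, which a quick check can make explicit (check_publication_data is a hypothetical helper, not part of calibre):

    def check_publication_data(data):
        # 'index' is a list of (section_title, [article, ...]) pairs and
        # every article carries at least a title and a url; a 'cover'
        # entry, as downloaded in file 1's get_wsj_index, is optional.
        for section, articles in data.get('index', []):
            assert isinstance(section, str) and section
            for art in articles:
                assert art.get('title') and art.get('url')

    check_publication_data({'index': [
        ('Testing', [
            {'title': 'Article One',
             'url': 'http://online.wsj.com/articles/the-end-of-the-impulse-shopper-1416872108'},
        ]),
    ]})

During development the recipes themselves would typically be run with calibre's ebook-convert in --test mode (plus --username and --password for the subscription version), which restricts how many feeds and articles are fetched.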