#!/usr/bin/python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2020, Jose Ortiz

from __future__ import absolute_import, division, print_function, unicode_literals

import json
from time import sleep
from mechanize import Request
from contextlib import closing

from calibre.web.feeds.news import BasicNewsRecipe


def absolutize(url):
    """Turn a site-relative path into an absolute journalofaccountancy.com URL.

    Only URLs starting with '/' are touched: the site origin is prepended and
    anything from the first '#' onwards is dropped. Every other URL is
    returned unchanged (including any fragment it may carry).
    """
    if url.startswith('/'):
        url = ('https://www.journalofaccountancy.com' + url).partition('#')[0]
    return url


def classes(classes):
    """Build a tag matcher that selects elements by CSS class.

    ``classes`` is a space separated list of class names. The returned dict
    has a single ``attrs`` entry whose ``class`` predicate is truthy when the
    element's class attribute shares at least one name with that list.
    """
    q = frozenset(classes.split(' '))
    return dict(
        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
    )


class JournalOfAccountancy(BasicNewsRecipe):
    """Recipe that downloads the current issue of the Journal of Accountancy."""

    __author__ = 'Jose Ortiz'
    language = 'en'
    title = u'Journal of Accountancy'
    description = (
        'A monthly journal of tax, financial reporting, auditing and other'
        ' topics of accountancy from American Institute of Certified Public'
        ' Accountants (AICPA).'
    )
    publication_type = 'magazine'
    masthead_url = 'http://developmentprofits.com/images/JournalOfAccountancy.jpg'
    no_stylesheets = True
    remove_javascript = True

    conversion_options = {
        'comments': description,
        'tags': 'News, Accountancy',
        'publisher': 'American Institute of Certified Public Accountants (AICPA)'
    }

    # Keep only the element(s) carrying the article page's content class.
    keep_only_tags = [classes('contentSectionArticlePage')]

    def _fetch_json(self, url, headers):
        """Request ``url`` with ``headers`` and return the decoded JSON body."""
        with closing(self.browser.open(Request(url, None, headers))) as r:
            return json.loads(r.read())

    def parse_index(self):
        """Build the article index for the most recent issue.

        The method walks the site in the same order a browser would: it opens
        the issue library page, asks the ``init`` service for the latest
        issue, asks the ``filter`` service for that issue's metadata, opens
        the issue page and finally queries the per-issue article services.

        As side effects it sets ``self.cover_url`` and ``self.timefmt`` from
        the issue metadata and pauses for three seconds before opening the
        issue page.

        Returns a list of ``(category, articles)`` tuples where every article
        is a dict with ``title``, ``url`` and ``description`` keys.
        """
        # ISSUES: open the issue library page first; the response body is not
        # used, the page is only visited before the services are queried.
        issues_url = 'https://www.journalofaccountancy.com/issues.html'
        with closing(self.browser.open(issues_url)):
            pass

        # Headers sent with every JSON service request.
        common_headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'DNT': '1',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache'
        }

        URL_TEMPLATE = 'https://www.journalofaccountancy.com/content/jofa-home/issues/jcr:content/main-content-section/issuelibrary.%s.service'

        # INIT: the first entry of the response carries the path of the issue
        # that is downloaded.
        init_url = URL_TEMPLATE % 'init'
        init_headers = {'Referer': issues_url}
        init_headers.update(common_headers)
        self.log('\nINIT URL at ', init_url)
        issue_path = self._fetch_json(init_url, init_headers)[0]['page']['path']

        # FILTER: the selector is the second-to-last component of the issue
        # path, and the full path is also sent in an 'issues' header.
        filter_url = URL_TEMPLATE % ('filter.' + issue_path.split('/')[-2])
        filter_headers = {'issues': issue_path}
        filter_headers.update(init_headers)
        self.log('\nFILTER URL at ', filter_url)
        issue_data = self._fetch_json(filter_url, filter_headers)[0]

        self.cover_url = absolutize(issue_data['issueCover']['src'])
        self.log('cover_url at ', self.cover_url)
        self.timefmt = ' ' + issue_data['issueName']

        # INDEX: visit the issue page itself before hitting its services.
        index_url = absolutize(issue_path + '.html')
        self.log('INDEX URL at ', index_url)
        self.log('3 second pause')
        sleep(3)  # mimicking human user behavior
        with closing(self.browser.open(index_url)):
            pass

        service_headers = {'Referer': index_url}
        service_headers.update(common_headers)

        def get_data(service):
            # Fetch one of the issue's JSON services, e.g. 'articletypelist'.
            service_url = (
                'https://www.journalofaccountancy.com' + issue_path +
                '/jcr:content/main-content-section/' + service + '.en.service'
            )
            self.log('\nSERVICE URL at ', service_url)
            return self._fetch_json(service_url, service_headers)

        def make_topic(category, articles):
            # Convert the service's article records into a
            # (category, [article dict, ...]) index entry.
            topic = (category, [])
            self.log(topic[0])
            for article in articles:
                title = article['articleTitle']
                url = absolutize(article['page']['path'] + '.html')
                desc = article.get('articleAbstract')
                self.log('\t', title, ' at ', url)
                topic[1].append({'title': title, 'url': url, 'description': desc})
            return topic

        ans = [
            make_topic('SPOTLIGHT', get_data('issuelanding/articles1')),
            make_topic('FEATURES', get_data('issuelanding/articles2'))
        ]
        # The remaining sections come back as a mapping of category name to
        # article list.
        for category, articles in get_data('articletypelist').items():
            ans.append(make_topic(category, articles))
        return ans