__license__ = 'GPL v3'
__copyright__ = "2008, Derry FitzGerald. 2009 Modified by Ray Kinsella and David O'Callaghan, 2011 Modified by Phil Burns, 2013 Tom Scholl, 2016 by leo738"
'''
irishtimes.com
'''
import re
import json
from uuid import uuid4
from mechanize import Request
try:
    from urllib.parse import urlencode, urljoin
except ImportError:
    # Python 2 fallback: the same helpers live in different modules.
    from urllib import urlencode
    from urlparse import urljoin

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile


class IrishTimes(BasicNewsRecipe):
    """Recipe for The Irish Times (irishtimes.com).

    Signs in with the user's subscription credentials (see ``get_browser``)
    and serves each article from its print-layout URL with the lead picture
    of the regular page prepended (see ``get_obfuscated_article``).
    """

    title = u'The Irish Times'
    __author__ = "Derry FitzGerald, Ray Kinsella, David O'Callaghan and Phil Burns, Tom Scholl"
    description = 'Daily news from The Irish Times'
    needs_subscription = True

    language = 'en_IE'

    masthead_url = 'http://www.irishtimes.com/assets/images/generic/website/logo_theirishtimes.png'

    encoding = 'utf-8'
    oldest_article = 1.0
    max_articles_per_feed = 100
    simultaneous_downloads = 5
    remove_empty_feeds = True
    no_stylesheets = True
    # Temporary files created by get_obfuscated_article are collected here.
    temp_files = []
    articles_are_obfuscated = True

    feeds = [
        ('News', 'https://www.irishtimes.com/cmlink/the-irish-times-news-1.1319192'),
        ('World', 'https://www.irishtimes.com/cmlink/irishtimesworldfeed-1.1321046'),
        ('Politics', 'https://www.irishtimes.com/cmlink/irish-times-politics-rss-1.1315953'),
        ('Business', 'https://www.irishtimes.com/cmlink/the-irish-times-business-1.1319195'),
        ('Culture', 'https://www.irishtimes.com/cmlink/the-irish-times-culture-1.1319213'),
        ('Sport', 'https://www.irishtimes.com/cmlink/the-irish-times-sport-1.1319194'),
        ('Debate', 'https://www.irishtimes.com/cmlink/debate-1.1319211'),
        ('Life & Style', 'https://www.irishtimes.com/cmlink/the-irish-times-life-style-1.1319214'),
    ]

    def get_browser(self):
        """Return a browser that is signed in to irishtimes.com.

        Opens the sign-in page, posts the credentials to the paywall login
        endpoint and stores the returned ``varnish_id`` in the
        ``IT_PW_AUTH`` cookie for ``.irishtimes.com``.

        To understand the sign-in logic, read the JavaScript attached to the
        submit button on https://www.irishtimes.com/signin

        Raises:
            ValueError: if the login response is not HTTP 200 or does not
                contain ``user_id``.
        """
        br = BasicNewsRecipe.get_browser(self)
        url = 'https://www.irishtimes.com/signin'
        # A fresh random device id (uuid4 hex without dashes) per login.
        deviceid = str(uuid4()).replace('-', '')
        # Debugging aid: call br.set_debug_http(True) here to trace traffic.
        # Load the sign-in page first; the response body itself is unused.
        br.open(url).read()
        rurl = 'https://www.irishtimes.com/auth-rest-api/v1/paywall/login'
        rq = Request(rurl, headers={
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.5',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': url,
            'X-Requested-With': 'XMLHttpRequest',
        }, data=urlencode({'username': self.username, 'password': self.password, 'deviceid': deviceid, 'persistent': 'on'}))
        r = br.open(rq)
        raw = r.read()
        # Validate before parsing: a failed login may not return JSON at all,
        # and the user should see the login error rather than a parse error.
        if r.code != 200 or b'user_id' not in raw:
            raise ValueError('Failed to log in check username/password')
        data = json.loads(raw)

        # Set cookie
        br.set_cookie('IT_PW_AUTH', data['varnish_id'], '.irishtimes.com')

        return br

    def get_obfuscated_article(self, url):
        """Build a local HTML file for the article at ``url``.

        The content comes from the print version of the page; the first
        image of the ``image-carousel`` div on the regular page, if any, is
        inserted at the start of the first paragraph.

        Args:
            url: URL of the regular (non-print) article page.

        Returns:
            The file name of the temporary ``_fa.html`` file that holds the
            prettified, UTF-8 encoded article.
        """
        # Insert a pic from the original url, but use content from the print url
        pic = None
        pics = self.index_to_soup(url)
        div = pics.find('div', {'class': re.compile('image-carousel')})
        if div:
            pic = div.img
            if pic:
                try:
                    # Make the image URL absolute, then detach the tag so it
                    # can be re-inserted into the print document.
                    pic['src'] = urljoin(url, pic['src'])
                    pic.extract()
                except Exception:
                    # Best effort: an unusable image must not lose the article.
                    pic = None

        content = self.index_to_soup(url + '?mode=print&ot=example.AjaxPageLayout.ot')
        first_paragraph = content.p
        # Skip the picture when the print page has no paragraph to host it.
        if pic and first_paragraph is not None:
            first_paragraph.insert(0, pic)

        tmp = PersistentTemporaryFile('_fa.html')
        # Recorded on the recipe; presumably to keep the file alive after
        # this call returns -- verify against calibre's temp-file handling.
        self.temp_files.append(tmp)
        try:
            tmp.write(content.prettify().encode('utf-8'))
        finally:
            # Always release the handle, even if serialising/writing fails.
            tmp.close()
        return tmp.name