#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# The coding line above is required on Python 2 because this file contains
# non-ASCII literals (the multiplication and minus signs used by the captcha).

__license__ = 'GPL v3'
__copyright__ = '2018, Gary Arnold garnold@garyarnold.com'
__docformat__ = 'restructuredtext en'

'''
granta.com
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe

# Set this variable to the URL of the issue you want to download, if not the
# current issue
force_issue_download = None


##################################################################
# Adapted from https://gist.github.com/FlyingTopHat/7cfdd5434ec704916174
def plus_with_unknown_component(first_comp, second_comp, result):
    """Solve ``first_comp + second_comp = result`` for the missing value.

    Exactly one of the three arguments is expected to be ``None``; the value
    that belongs in its place is returned.
    """
    if result is None:
        return first_comp + second_comp

    component = (first_comp if second_comp is None else second_comp)
    return result - component


def subtract_with_unknown_component(first_comp, second_comp, result):
    """Solve ``first_comp - second_comp = result`` for the missing value.

    Exactly one of the three arguments is expected to be ``None``; the value
    that belongs in its place is returned.
    """
    if result is None:
        return first_comp - second_comp

    if second_comp is None:
        return first_comp - result
    return result + second_comp


def multiply_with_unknown_component(first_comp, second_comp, result):
    """Solve ``first_comp * second_comp = result`` for the missing value.

    Exactly one of the three arguments is expected to be ``None``; the value
    that belongs in its place is returned.
    """
    if result is None:
        return first_comp * second_comp

    component = (first_comp if second_comp is None else second_comp)
    # Floor division keeps the answer an int on Python 3 as well; with true
    # division the answer would be rendered as e.g. '4.0' when converted to
    # text for the login form.
    return result // component


# Maps the operator character found in the captcha to its solver.
_CAPTCHA_SOLVERS = {
    u'+': plus_with_unknown_component,
    u'×': multiply_with_unknown_component,
    u'−': subtract_with_unknown_component,
}

# "<first> <operator> <second> = <result>", where the first operand and the
# result are optional.  The group names are the ones solve_captcha() reads.
_CAPTCHA_PATTERN = re.compile(
    u'(?P<first_component>[0-9]+)?'
    u'\\s*(?P<operator>[+×−])\\s*'
    u'(?P<second_component>[0-9]+)'
    u'\\s*(=)\\s*'
    u'(?P<result>[0-9]+)?', re.UNICODE)


def solve_captcha(captcha):
    """Return the number that completes the arithmetic problem in *captcha*.

    *captcha* is split on single spaces, every part is passed through
    text2num() (so number words become digits) and the parts are joined back
    together without separators before being parsed.

    Returns 0 when no problem can be parsed out of the text or when the
    parsed problem cannot be solved (not enough known values, or a division
    by zero).
    """
    # Convert from a word problem into a numeric problem
    numeric_problem = u''.join(
        u'%s' % text2num(part) for part in captcha.split(' '))

    # Parse into parts
    parts = _CAPTCHA_PATTERN.search(numeric_problem)
    if parts is None:
        return 0

    def as_int(group_name):
        value = parts.group(group_name)
        return int(value) if value is not None else None

    solver = _CAPTCHA_SOLVERS.get(parts.group('operator'))
    if solver is None:
        return 0

    # Calculate answer
    try:
        return solver(as_int('first_component'),
                      as_int('second_component'),
                      as_int('result'))
    except (TypeError, ZeroDivisionError):
        # TypeError: two of the three values are missing, so the helpers end
        # up doing arithmetic on None.  Fall back to the same answer used
        # when nothing could be parsed.
        return 0
##################################################################


##################################################################
# Adapted from https://github.com/ghewgill/text2num/blob/master/text2num.py
# Removes external dependency on digify library
Small = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy': 70,
    'eighty': 80,
    'ninety': 90
}

Magnitude = {
    'thousand': 1000,
    'million': 1000000,
    'billion': 1000000000,
    'trillion': 1000000000000,
    'quadrillion': 1000000000000000,
    'quintillion': 1000000000000000000,
    'sextillion': 1000000000000000000000,
    'septillion': 1000000000000000000000000,
    'octillion': 1000000000000000000000000000,
    'nonillion': 1000000000000000000000000000000,
    'decillion': 1000000000000000000000000000000000,
}


def text2num(s):
    """Convert English number words in *s* to an integer.

    *s* is split on whitespace and hyphens.  If any resulting word is not a
    known number word, *s* itself is returned unchanged, which lets callers
    pass arbitrary tokens (operators, digits, ...) through this function.
    """
    total = 0  # sum of the groups already closed by a magnitude word
    group = 0  # value of the group currently being accumulated
    for word in re.split(r"[\s-]+", s):
        small = Small.get(word)
        if small is not None:
            group += small
            continue
        if word == "hundred" and group != 0:
            group *= 100
            continue
        magnitude = Magnitude.get(word)
        if magnitude is None:
            # Not a number word: hand the input back untouched.
            return s
        total += group * magnitude
        group = 0
    return total + group
##################################################################


##################################################################
# Utilities
def absurl(url):
    """Prefix a site-relative *url* (one starting with '/') with the host."""
    if url.startswith('/'):
        url = 'https://www.granta.com' + url
    return url


def stripstyle(tag):
    """Remove the ``style`` attribute from *tag*; ``None`` is ignored."""
    if tag is not None:
        del tag['style']


def get_innermost_string(tag):
    """Follow first children down from *tag* and return the stripped text.

    Descends through ``contents[0]`` for as long as the current node has a
    non-empty ``contents`` list, then returns ``str()`` of the node reached
    with surrounding whitespace removed.
    """
    while hasattr(tag, 'contents') and len(tag.contents) > 0 and tag.contents[0] is not None:
        tag = tag.contents[0]
    return str(tag).strip()


def _heading_text(heading):
    """Return the text of *heading*, preferring the text of a link inside it.

    Returns '' when *heading* is ``None`` or has no content.
    """
    if heading is None:
        return ''
    for node in (heading.find('a'), heading):
        if node is not None and len(node.contents) > 0 and node.contents[0] is not None:
            return get_innermost_string(node.contents[0])
    return ''
##################################################################


class Granta(BasicNewsRecipe):
    """Recipe that downloads an issue of Granta magazine from granta.com."""

    title = u'Granta'
    description = u'The Magazine of New Writing'
    language = 'en'

    __author__ = 'Gary Arnold'

    needs_subscription = True

    keep_only_tags = [
        dict(name='div', attrs={'class': 'article-feature-image-container'}),
        dict(name='div', attrs={
             'class': 'col-xs-12 col-sm-12 col-md-12 article-header'}),
        dict(name='div', attrs={'class': 'carousel-inner'}),
        dict(name='div', attrs={'class': 'article-content'}),
    ]

    # NOTE(review): as written this pattern can only match the empty string,
    # so the substitution changes nothing.  It looks like markup was lost
    # from the pattern and/or the replacement - confirm against the upstream
    # recipe before relying on it.
    preprocess_regexps = [(re.compile(r'.*?', re.DOTALL), lambda m:'')]

    def get_browser(self):
        """Return a browser, logged in when a username and password are set.

        The login form on the home page carries an arithmetic captcha whose
        question is read from the ``placeholder`` attribute of the ``capcha``
        input; it is solved with solve_captcha().  '0' is submitted when no
        question is found.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username and self.password:
            # User has a subscription, log in
            response = br.open('https://granta.com/')

            # Get captcha solution
            captcha = '0'
            html = response.read()
            soup = self.index_to_soup(html)
            # 'capcha' (sic) is the field name used by the site's form.
            captcha_field = soup.find('input', attrs={'name': 'capcha'})
            captcha_question = ''
            if captcha_field is not None:
                captcha_question = captcha_field.get('placeholder')
                if captcha_question is not None:
                    captcha = str(solve_captcha(captcha_question))

            br.select_form(method="post", action="https://granta.com/")
            br['username'] = self.username
            br['password'] = self.password
            br['capcha'] = captcha
            self.log.info(u'Logging in with captcha %s, solution %s'
                          % (captcha_question, captcha))
            br.submit()

        return br

    def preprocess_html(self, soup):
        """Turn the CSS background feature image into a real ``<img>`` tag.

        Also drops inline styles from the article header's h1 and h2.
        Returns the (modified in place) *soup*.
        """
        articleHeader = soup.find(
            'div', attrs={'class': 'article-feature-image-container'})
        if articleHeader is None:
            articleHeader = soup.find(
                'div', attrs={'class': lambda x: x and 'article-header' in x.split()})

        if articleHeader is not None:
            image = articleHeader.find(
                'div', attrs={'class': 'article-feature-image'})
            if image is not None and image.attrs is not None:
                style = dict(image.attrs).get('style')
                if style is not None:
                    # The image URL lives in "background: url(...)".
                    m = re.search(r'url\(([^\)]*)\)', style)
                    if m is not None:
                        stripstyle(image)
                        image.name = 'img'
                        image['src'] = m.group(1)

            stripstyle(articleHeader.find('h1'))
            stripstyle(articleHeader.find('h2'))

        return soup

    def parse_index(self):
        """Build the article list for the current (or forced) issue.

        Returns a one-element list ``[(issue_title, articles)]`` where each
        article is a dict with 'title', 'url', 'date' and 'description'.
        Articles showing an image in their table-of-contents entry are
        skipped unless credentials are set; entries without a usable link
        are skipped as well.

        Raises ValueError when the latest issue cannot be located on the
        home page.
        """
        if force_issue_download is None:
            soup = self.index_to_soup('https://granta.com/')

            # Get latest issue
            issueInfo = soup.find(
                'div', attrs={'class': lambda x: x and 'dnd_container__heading' in x.split()})
            issueAnchor = issueInfo.find('a') if issueInfo is not None else None
            issueLink = issueAnchor.get('href') if issueAnchor is not None else None
            if not issueLink:
                raise ValueError(
                    'Could not find the latest issue on https://granta.com/')
            issueTitle = issueAnchor.contents[0] if issueAnchor.contents else ''
            issueLink = absurl(issueLink)
        else:
            issueLink = force_issue_download
            issueTitle = ''

        soup = self.index_to_soup(issueLink)

        # Find cover
        cover = soup.find('div', attrs={'class': 'product-img-container'})
        if cover is not None:
            img = cover.find('img', src=True)
            if img is not None:
                self.cover_url = absurl(img['src'])
                self.log.info('Found cover at:', self.cover_url)

        # Find TOC
        logged_in = bool(self.username and self.password)
        articles = []
        for toc in soup.findAll('div', attrs={'class': 'product-article'}):
            if not logged_in and toc.find('img') is not None:
                # Not logged in and the article is locked
                continue

            h1 = toc.find('h1')
            link = h1.find('a', href=True) if h1 is not None else None
            if link is None:
                # Nothing to download for this entry.
                continue

            title = _heading_text(h1)
            author = _heading_text(toc.find('h2'))
            if author:
                title = title + u' (%s)' % author

            url = absurl(link['href'])
            self.log.info('Found article:', title)
            self.log.info('\t', url)
            articles.append({'title': title, 'url': url,
                             'date': '', 'description': ''})

        return [(issueTitle, articles)]