From 304a198799869ccc67ce1d950725ce5b275bcbf0 Mon Sep 17 00:00:00 2001
From: Gary Arnold <garnold@cataliahealth.com>
Date: Mon, 5 Mar 2018 13:59:07 -0800
Subject: [PATCH 1/5] ga - Adds recipe for Granta magazine, subscription
 optional

---
 recipes/granta.recipe | 274 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 274 insertions(+)
 create mode 100644 recipes/granta.recipe
diff --git a/recipes/granta.recipe b/recipes/granta.recipe
new file mode 100644
index 0000000000..d5d42f2357
--- /dev/null
+++ b/recipes/granta.recipe
@@ -0,0 +1,274 @@
+#!/usr/bin/env  python2
+__license__ = 'GPL v3'
+__copyright__ = '2018, Gary Arnold garnold@garyarnold.com'
+__docformat__ = 'restructuredtext en'
+
+'''
+granta.com
+'''
+import re
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+##################################################################
+# Adapted from https://gist.github.com/FlyingTopHat/7cfdd5434ec704916174
+def plus_with_unknown_component(first_comp, second_comp, result):
+    # Solve first + second = result for whichever of the three values is None.
+    if result is not None:
+        known = second_comp if second_comp is not None else first_comp
+        return result - known
+    return first_comp + second_comp
+
+
+def subtract_with_unknown_component(first_comp, second_comp, result):
+    # Solve first - second = result for whichever of the three values is None.
+    if result is not None and second_comp is None:
+        return first_comp - result
+    return first_comp - second_comp if result is None else result + second_comp
+
+
+def multiply_with_unknown_component(first_comp, second_comp, result):
+    # Solve first * second = result for whichever of the three values is None.
+    if result is None:
+        return first_comp * second_comp
+    # Floor division keeps the answer an int ('4', not '4.0') on Python 3.
+    return result // (first_comp if second_comp is None else second_comp)
+
+
+def solve_captcha(captcha):
+    # Solve a word-problem captcha such as u'seven × = 21' and return the
+    # missing number; returns 0 when the text cannot be parsed or solved.
+
+    # Convert from a word problem into a numeric problem. u'%s' is used
+    # instead of str() so non-ASCII operators need no default-encoding help.
+    numeric_problem = u''.join(
+        u'%s' % (text2num(part),) for part in captcha.split(' '))
+
+    # Parse into parts; any one of the three numbers may be the missing one
+    pattern = re.compile(u'(?P<first_component>[0-9]+)?'
+                         + u'\\s*(?P<operator>[+×−])\\s*'
+                         + u'(?P<second_component>[0-9]+)?'
+                         + u'\\s*(=)\\s*'
+                         + u'(?P<result>[0-9]+)?', re.UNICODE)
+
+    calculationParts = re.search(pattern, numeric_problem)
+    if calculationParts is None:
+        return 0
+
+    def as_int(group_name):
+        text = calculationParts.group(group_name)
+        return None if text is None else int(text)
+    operator = calculationParts.group('operator')
+    result = as_int('result')
+    component_one = as_int('first_component')
+    component_two = as_int('second_component')
+    if [component_one, component_two, result].count(None) > 1:
+        # Two or more values missing: the problem cannot be solved
+        return 0
+
+    # Calculate answer
+    solvers = {
+        u'+': plus_with_unknown_component,
+        u'×': multiply_with_unknown_component,
+        u'−': subtract_with_unknown_component,
+    }
+    return solvers[operator](component_one, component_two, result)
+##################################################################
+
+
+##################################################################
+# Adapted from https://github.com/ghewgill/text2num/blob/master/text2num.py
+# Removes external dependency on digify library
+Small = {  # number word -> value: units and teens (0-19), then tens (20-90)
+    'zero': 0,
+    'one': 1,
+    'two': 2,
+    'three': 3,
+    'four': 4,
+    'five': 5,
+    'six': 6,
+    'seven': 7,
+    'eight': 8,
+    'nine': 9,
+    'ten': 10,
+    'eleven': 11,
+    'twelve': 12,
+    'thirteen': 13,
+    'fourteen': 14,
+    'fifteen': 15,
+    'sixteen': 16,
+    'seventeen': 17,
+    'eighteen': 18,
+    'nineteen': 19,
+    'twenty': 20,
+    'thirty': 30,
+    'forty': 40,
+    'fifty': 50,
+    'sixty': 60,
+    'seventy': 70,
+    'eighty': 80,
+    'ninety': 90
+}
+
+Magnitude = {  # magnitude word -> multiplier, successive powers of 1000
+    'thousand':     1000,
+    'million':      1000000,
+    'billion':      1000000000,
+    'trillion':     1000000000000,
+    'quadrillion':  1000000000000000,
+    'quintillion':  1000000000000000000,
+    'sextillion':   1000000000000000000000,
+    'septillion':   1000000000000000000000000,
+    'octillion':    1000000000000000000000000000,
+    'nonillion':    1000000000000000000000000000000,
+    'decillion':    1000000000000000000000000000000000,
+}
+
+
+def text2num(s):
+    # Convert English number words (e.g. 'twenty-one') to an int. Matching is
+    # case-insensitive. If any word is not a known number word, the original
+    # string s is returned unchanged so callers can pass other text through.
+    n = 0  # total of the completed thousand/million/... groups
+    g = 0  # value of the group currently being built
+    for w in re.split(r"[\s-]+", s.lower()):
+        if w in Small:
+            g += Small[w]
+        elif w == "hundred" and g != 0:
+            g *= 100
+        elif w in Magnitude:
+            # A magnitude word closes the current group
+            n += g * Magnitude[w]
+            g = 0
+        else:
+            return s
+    return n + g
+##################################################################
+
+
+##################################################################
+# Utilities
+def absurl(url):
+    # Root-relative paths get the site prefix; '//host/x' only lacks a scheme.
+    prefix = 'https:' if url.startswith('//') else 'https://www.granta.com'
+    return prefix + url if url.startswith('/') else url
+
+
+def stripstyle(tag):
+    if tag is not None:  # callers pass the result of find(), which may be None
+        del tag['style']  # remove the tag's inline style attribute
+##################################################################
+
+
+class Granta(BasicNewsRecipe):
+    # Recipe for Granta magazine (granta.com); a subscription is optional.
+    title = u'Granta'
+    description = u'Granta magazine'
+    language = 'en'
+    __author__ = 'Gary Arnold'
+
+    # 'optional' rather than True: credentials are only used when supplied
+    needs_subscription = 'optional'
+
+    keep_only_tags = [
+        dict(name='div', attrs={'class': 'article-feature-image-container'}),
+        dict(name='div', attrs={'class': 'carousel-inner'}),
+        dict(name='div', attrs={'class': 'article-content'}),
+    ]
+
+    preprocess_regexps = [
+        (re.compile(r'<head>.*?</head>', re.DOTALL), lambda m: '<head></head>')]
+
+    def get_browser(self):
+        # Return the base browser, logged in to granta.com if credentials exist
+        br = BasicNewsRecipe.get_browser(self)
+        if not (self.username and self.password):
+            return br
+        # User has a subscription, log in
+        response = br.open('https://granta.com/')
+        soup = self.index_to_soup(response.read())
+        # Get captcha solution: the question is the placeholder text of the
+        # 'capcha' input; '0' is submitted when no question can be found
+        captcha = '0'
+        captcha_question = ''
+        captcha_field = soup.find('input', attrs={'name': 'capcha'})
+        if captcha_field is not None:
+            captcha_question = captcha_field.get('placeholder')
+            if captcha_question is not None:
+                captcha = str(solve_captcha(captcha_question))
+
+        br.select_form(method="post", action="https://granta.com/")
+        br['username'] = self.username
+        br['password'] = self.password
+        br['capcha'] = captcha
+        self.log.info('Logging in with captcha ' + str(captcha_question) + ', solution ' + captcha)
+        br.submit()
+        return br
+
+    def preprocess_html(self, soup):
+        # Turn the header's CSS background image into a real <img> and strip
+        # inline styles from the header's headings. Returns the same soup.
+        articleHeader = soup.find(
+            'div', attrs={'class': 'article-feature-image-container'})
+        if articleHeader is None:
+            # This feels brittle, but bs3 demands a full match
+            articleHeader = soup.find(
+                'div', attrs={'class': 'col-xs-12 col-sm-12 col-md-12 article-header'})
+        if articleHeader is not None:
+            image = articleHeader.find(
+                'div', attrs={'class': 'article-feature-image'})
+            # A missing style attribute or a style without url(...) is skipped
+            style = image.get('style') if image is not None else None
+            m = re.search(r'url\(([^\)]*)\)', style) if style else None
+            if m is not None:
+                stripstyle(image)
+                image.name = 'img'
+                image['src'] = m.group(1)
+            stripstyle(articleHeader.find('h1'))
+            stripstyle(articleHeader.find('h2'))
+
+        return soup
+
+    def parse_index(self):
+        self.log.info('Making soup out of index')
+        soup = self.index_to_soup('https://granta.com/')
+
+        # import urllib2
+        # from calibre.ebooks.BeautifulSoup import BeautifulSoup
+        # page = urllib2.urlopen("https://granta.com/introduction-animalia/")
+        # soup = BeautifulSoup(page)
+
+        # Get latest issue
+        issueInfo = soup.find(
+            'div', attrs={'class': 'dnd_container dnd_container__heading'})
+
+        issueAnchor = issueInfo.find('a')
+        issueTitle = issueAnchor.contents[0]
+        issueLink = issueAnchor.get('href')
+        soup = self.index_to_soup(issueLink)
+
+        # Find cover
+        cover = soup.find('div', attrs={'class': 'product-img-container'})
+        # The container may hold no <img src=...>; the cover is then left unset
+        img = cover.find('img', src=True) if cover is not None else None
+        if img is not None:
+            self.cover_url = absurl(img['src'])
+            self.log.info('Found cover at:', self.cover_url)
+
+        # Find TOC
+        articles = []
+        for toc in soup.findAll('div', attrs={'class': 'product-article'}):
+            if (self.username and self.password) or (toc.find('img') is None):
+                # Either user is logged in or the article is unlocked
+                h1 = toc.find('h1')
+                h2 = toc.find('h2')
+                title = h1.find('a').contents[0].strip()
+                author = h2.find('a').contents[0].strip()
+                title = title + u' (%s)' % author
+                url = absurl(h1.find('a', href=True)['href'])
+                self.log.info('Found article:', title)
+                self.log.info('\t', url)
+                articles.append({'title': title, 'url': url,
+                                 'date': '', 'description': ''})
+        # A single feed: (issue title, list of article dicts)
+        return [(issueTitle, articles)]

From 76c2d5a545b2fdbded364cc782ddb011a1179725 Mon Sep 17 00:00:00 2001
From: Gary Arnold <garnold@cataliahealth.com>
Date: Mon, 5 Mar 2018 14:02:58 -0800
Subject: [PATCH 2/5] ga - Removes some debugging comments

---
 recipes/granta.recipe | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/recipes/granta.recipe b/recipes/granta.recipe
index d5d42f2357..d85b47ac61 100644
--- a/recipes/granta.recipe
+++ b/recipes/granta.recipe
@@ -233,11 +233,6 @@ class Granta(BasicNewsRecipe):
         self.log.info('Making soup out of index')
         soup = self.index_to_soup('https://granta.com/')
 
-        # import urllib2
-        # from calibre.ebooks.BeautifulSoup import BeautifulSoup
-        # page = urllib2.urlopen("https://granta.com/introduction-animalia/")
-        # soup = BeautifulSoup(page)
-
         # Get latest issue
         issueInfo = soup.find(
             'div', attrs={'class': 'dnd_container dnd_container__heading'})

From 840ca8092b5a1f559158854b2886047b85d0bb15 Mon Sep 17 00:00:00 2001
From: Gary Arnold <garnold@cataliahealth.com>
Date: Mon, 5 Mar 2018 14:19:30 -0800
Subject: [PATCH 3/5] ga - More resilient with different title/author
 formatting

---
 recipes/granta.recipe | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/recipes/granta.recipe b/recipes/granta.recipe
index d85b47ac61..af64509fb7 100644
--- a/recipes/granta.recipe
+++ b/recipes/granta.recipe
@@ -257,9 +257,20 @@ class Granta(BasicNewsRecipe):
                 # Either user is logged in or the article is unlocked
                 h1 = toc.find('h1')
                 h2 = toc.find('h2')
-                title = h1.find('a').contents[0].strip()
-                author = h2.find('a').contents[0].strip()
-                title = title + u' (%s)' % author
+                if h1.find('a') is not None and h1.find('a').contents is not None:
+                    title = h1.find('a').contents[0].strip()
+                elif h1.contents[0] is not None:
+                    title = h1.contents[0]
+                else:
+                    title = ''
+                if h2.find('a') is not None and h2.find('a').contents is not None:
+                    author = h2.find('a').contents[0].strip()
+                    title = title + u' (%s)' % author
+                elif h2.contents[0] is not None:
+                    author = h2.contents[0]
+                    title = title + u' (%s)' % author
+                else:
+                    author = ''
                 url = absurl(h1.find('a', href=True)['href'])
                 self.log.info('Found article:', title)
                 self.log.info('\t', url)

From 159d081456fdcc1d77acf610e964338532ff250d Mon Sep 17 00:00:00 2001
From: Gary Arnold <garnold@cataliahealth.com>
Date: Mon, 5 Mar 2018 14:55:46 -0800
Subject: [PATCH 4/5] ga - Corrects more parsing edge cases (issue 136)

---
 recipes/granta.recipe | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipes/granta.recipe b/recipes/granta.recipe
index af64509fb7..a278ef9e8f 100644
--- a/recipes/granta.recipe
+++ b/recipes/granta.recipe
@@ -259,14 +259,14 @@ class Granta(BasicNewsRecipe):
                 h2 = toc.find('h2')
                 if h1.find('a') is not None and h1.find('a').contents is not None:
                     title = h1.find('a').contents[0].strip()
-                elif h1.contents[0] is not None:
+                elif len(h1.contents) > 0 and h1.contents[0] is not None:
                     title = h1.contents[0]
                 else:
                     title = ''
                 if h2.find('a') is not None and h2.find('a').contents is not None:
                     author = h2.find('a').contents[0].strip()
                     title = title + u' (%s)' % author
-                elif h2.contents[0] is not None:
+                elif len(h2.contents) > 0 and h2.contents[0] is not None:
                     author = h2.contents[0]
                     title = title + u' (%s)' % author
                 else:

From dcab7f1569071eead879f6b7ccb813813118f064 Mon Sep 17 00:00:00 2001
From: Gary Arnold <garnold@cataliahealth.com>
Date: Mon, 5 Mar 2018 15:08:04 -0800
Subject: [PATCH 5/5] ga - Adds ability to easily override which issue will be
 downloaded

---
 recipes/granta.recipe | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/recipes/granta.recipe b/recipes/granta.recipe
index a278ef9e8f..b6539825e8 100644
--- a/recipes/granta.recipe
+++ b/recipes/granta.recipe
@@ -10,6 +10,9 @@ import re
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
+# Set this variable to the URL of the issue you want to download, if not the current issue
+force_issue_download = None
+
 ##################################################################
 # Adapted from https://gist.github.com/FlyingTopHat/7cfdd5434ec704916174
 def plus_with_unknown_component(first_comp, second_comp, result):
@@ -230,16 +233,20 @@ class Granta(BasicNewsRecipe):
         return soup
 
     def parse_index(self):
-        self.log.info('Making soup out of index')
-        soup = self.index_to_soup('https://granta.com/')
+        if force_issue_download is None:
+            soup = self.index_to_soup('https://granta.com/')
 
-        # Get latest issue
-        issueInfo = soup.find(
-            'div', attrs={'class': 'dnd_container dnd_container__heading'})
+            # Get latest issue
+            issueInfo = soup.find(
+                'div', attrs={'class': 'dnd_container dnd_container__heading'})
+
+            issueAnchor = issueInfo.find('a')
+            issueTitle = issueAnchor.contents[0]
+            issueLink = issueAnchor.get('href')
+        else:
+            issueLink = force_issue_download
+            issueTitle = ''
 
-        issueAnchor = issueInfo.find('a')
-        issueTitle = issueAnchor.contents[0]
-        issueLink = issueAnchor.get('href')
         soup = self.index_to_soup(issueLink)
 
         # Find cover