# calibre/recipes/granta.recipe

#!/usr/bin/env  python2
__license__ = 'GPL v3'
__copyright__ = '2018, Gary Arnold garnold@garyarnold.com'
__docformat__ = 'restructuredtext en'

'''
granta.com
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe

# Set this variable to the URL of the issue you want to download instead of
# the current issue. Leave it as None to download the current issue, which is
# then looked up on the granta.com front page by Granta.parse_index().
force_issue_download = None

##################################################################
# Adapted from https://gist.github.com/FlyingTopHat/7cfdd5434ec704916174


def plus_with_unknown_component(first_comp, second_comp, result):
    """Fill in the missing value of ``first_comp + second_comp = result``.

    The unknown value is passed as ``None``. When ``result`` is ``None`` the
    sum of the two components is returned. Otherwise one known component is
    subtracted from ``result``; ``second_comp`` is preferred whenever it is
    given, and ``first_comp`` is used only if ``second_comp`` is ``None``.
    """
    if result is not None:
        known = second_comp if second_comp is not None else first_comp
        return result - known

    return first_comp + second_comp


def subtract_with_unknown_component(first_comp, second_comp, result):
    """Fill in the missing value of ``first_comp - second_comp = result``.

    The unknown value is passed as ``None``:

    * ``result`` unknown: returns ``first_comp - second_comp``.
    * ``second_comp`` unknown: returns ``first_comp - result``.
    * otherwise (``first_comp`` treated as the unknown): returns
      ``result + second_comp``.
    """
    if result is None:
        return first_comp - second_comp

    if second_comp is None:
        return first_comp - result

    return result + second_comp


def multiply_with_unknown_component(first_comp, second_comp, result):
    """Fill in the missing value of ``first_comp * second_comp = result``.

    The unknown value is passed as ``None``. When ``result`` is ``None`` the
    product of the two components is returned. Otherwise ``result`` is
    divided by one known component; ``second_comp`` is preferred whenever it
    is given, and ``first_comp`` is used only if ``second_comp`` is ``None``.

    Raises:
        ZeroDivisionError: if the known component used as divisor is 0.
    """
    if result is None:
        return first_comp * second_comp

    component = first_comp if second_comp is None else second_comp
    # Use floor division so the answer stays an int on Python 3 as well.
    # With "/" the answer would be a float there and str() of it would give
    # e.g. '4.0' instead of '4'. On Python 2 "/" on ints already floors, so
    # the behaviour there is unchanged.
    return result // component


def solve_captcha(captcha):
    """Solve an arithmetic captcha that has one missing value.

    Every space-separated token of ``captcha`` is passed through
    ``text2num`` (number words become numbers, anything else is kept as is)
    and the pieces are concatenated. The resulting text is then searched for
    ``[first] <operator> second = [result]`` where the operator is one of
    ``+``, ``×`` or ``−`` and the bracketed parts are optional.

    Returns:
        The computed answer, or 0 when the text does not match the expected
        shape or does not contain enough known values to be solved.
    """
    # Convert from a word problem into a numeric problem
    numeric_problem = ''.join(
        str(text2num(part)) for part in captcha.split(' '))

    # Parse into parts
    pattern = re.compile(
            u'(?P<first_component>[0-9]+)?'
            u'\\s*(?P<operator>[+×−])\\s*'
            u'(?P<second_component>[0-9]+)'
            u'\\s*(=)\\s*'
            u'(?P<result>[0-9]+)?', re.UNICODE)

    calculationParts = re.search(pattern, numeric_problem)
    if calculationParts is None:
        return 0

    def group_as_int(name):
        # Optional groups that did not take part in the match are None.
        value = calculationParts.group(name)
        return int(value) if value is not None else None

    operator = calculationParts.group('operator')
    result = group_as_int('result')
    component_one = group_as_int('first_component')
    component_two = group_as_int('second_component')

    # The pattern always captures the second component, so the only
    # unsolvable shape is one where both the first component and the result
    # are missing. Treat it like an unparseable captcha instead of failing
    # with a TypeError on None arithmetic.
    if result is None and component_one is None:
        return 0

    # Calculate answer
    answer = 0
    if operator == u'+':
        answer = plus_with_unknown_component(component_one, component_two, result)
    elif operator == u'×':
        answer = multiply_with_unknown_component(component_one, component_two, result)
    elif operator == u'−':
        answer = subtract_with_unknown_component(component_one, component_two, result)

    return answer
##################################################################


##################################################################
# Adapted from https://github.com/ghewgill/text2num/blob/master/text2num.py
# Removes external dependency on digify library
# Number words that contribute their value directly: 'zero' to 'nineteen'
# (value == position in the list) followed by the tens 'twenty' to 'ninety'.
Small = {
    word: value
    for value, word in enumerate((
        'zero', 'one', 'two', 'three', 'four',
        'five', 'six', 'seven', 'eight', 'nine',
        'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
        'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen',
    ))
}
Small.update(
    (word, 10 * tens)
    for tens, word in enumerate((
        'twenty', 'thirty', 'forty', 'fifty',
        'sixty', 'seventy', 'eighty', 'ninety',
    ), 2)
)

# Scale words: the n-th entry (counting from 1) is worth 1000 ** n.
Magnitude = {
    word: 1000 ** power
    for power, word in enumerate((
        'thousand',
        'million',
        'billion',
        'trillion',
        'quadrillion',
        'quintillion',
        'sextillion',
        'septillion',
        'octillion',
        'nonillion',
        'decillion',
    ), 1)
}


def text2num(s):
    """Convert an English number phrase to an integer where possible.

    ``s`` is split on whitespace and hyphens. Words found in ``Small`` are
    added to the current group, ``hundred`` multiplies a non-empty group by
    100, and words found in ``Magnitude`` scale the group and add it to the
    running total.

    Returns:
        The integer value when every word is recognised; otherwise ``s``
        itself, unchanged.
    """
    total = 0
    group = 0
    for word in re.split(r"[\s-]+", s):
        if word in Small:
            group += Small[word]
            continue

        if word == "hundred" and group != 0:
            group *= 100
            continue

        if word not in Magnitude:
            # Not a number word at all: hand the input back untouched.
            return s

        total += group * Magnitude[word]
        group = 0

    return total + group
##################################################################


##################################################################
# Utilities
def absurl(url):
    """Return ``url`` with the Granta host prepended if it starts with '/'.

    URLs that do not start with a slash are returned unchanged.
    """
    return 'https://www.granta.com' + url if url.startswith('/') else url


def stripstyle(tag):
    """Delete the 'style' entry from ``tag``; do nothing when it is None."""
    if tag is None:
        return
    del tag['style']


def get_innermost_string(tag):
    """Follow first children downwards and return the final node as text.

    Starting at ``tag``, descend into ``contents[0]`` for as long as the
    current node has a non-empty ``contents`` whose first element is not
    None. The node reached is converted with ``str()`` and stripped of
    surrounding whitespace.
    """
    current = tag
    while hasattr(current, 'contents'):
        children = current.contents
        if len(children) == 0 or children[0] is None:
            break
        current = children[0]
    return str(current).strip()
##################################################################


class Granta(BasicNewsRecipe):
    """Recipe that downloads one issue of Granta from granta.com.

    The current issue is located via the site's front page unless the
    module-level ``force_issue_download`` is set to an issue URL. When a
    username and password are configured the recipe logs in first, solving
    the arithmetic captcha shown in the login form.
    """

    title = u'Granta'
    description = u'The Magazine of New Writing'
    language = 'en'

    __author__ = 'Gary Arnold'

    needs_subscription = True

    keep_only_tags = [
        dict(name='div', attrs={'class': 'article-feature-image-container'}),
        dict(name='div', attrs={'class': 'col-xs-12 col-sm-12 col-md-12 article-header'}),
        dict(name='div', attrs={'class': 'carousel-inner'}),
        dict(name='div', attrs={'class': 'article-content'}),
    ]

    # Replace the whole <head> element with an empty one before parsing.
    preprocess_regexps = [(re.compile(r'<head>.*?</head>', re.DOTALL), lambda
                           m:'<head></head>')]

    def get_browser(self):
        """Return a browser, logged in when credentials are configured.

        The login form on the front page carries an input named 'capcha'
        whose placeholder text is the captcha question. If the field or its
        placeholder is missing, the captcha answer '0' is submitted.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username and self.password:
            # User has a subscription, log in
            response = br.open('https://granta.com/')

            # Get captcha solution
            captcha = '0'
            html = response.read()
            soup = self.index_to_soup(html)
            captcha_field = soup.find('input', attrs={'name': 'capcha'})
            captcha_question = ''
            if captcha_field is not None:
                # .get() yields None for a missing attribute; indexing would
                # raise KeyError and make the None check below pointless.
                captcha_question = captcha_field.get('placeholder')
                if captcha_question is not None:
                    captcha = str(solve_captcha(captcha_question))

            br.select_form(method="post", action="https://granta.com/")
            br['username'] = self.username
            br['password'] = self.password
            br['capcha'] = captcha
            self.log.info('Logging in with captcha ' + str(captcha_question) + ', solution ' + captcha)
            br.submit()

        return br

    def preprocess_html(self, soup):
        """Turn the CSS background feature image into a real ``<img>`` tag.

        The feature image is a div whose ``style`` attribute contains
        ``url(...)``. That div is renamed to ``img`` with the URL as its
        ``src``. Inline styles are also removed from the header's h1 and h2.
        Pages without these elements are returned unchanged.
        """
        articleHeader = soup.find(
            'div', attrs={'class': 'article-feature-image-container'})
        if articleHeader is None:
            articleHeader = soup.find(
                    'div', attrs={'class': lambda x: x and 'article-header' in x.split()})
        if articleHeader is not None:
            image = articleHeader.find(
                'div', attrs={'class': 'article-feature-image'})
            if image is not None and image.attrs is not None:
                # A feature-image div without a style attribute is left alone.
                style = dict(image.attrs).get('style')
                if style is not None:
                    m = re.search(r'url\(([^\)]*)\)', style)
                    # re.search returns None when the style has no url(...).
                    if m is not None:
                        stripstyle(image)
                        image.name = 'img'
                        # CSS allows url('...') and url("..."); the quotes
                        # are not part of the address.
                        image['src'] = m.group(1).strip().strip('\'"')

            stripstyle(articleHeader.find('h1'))
            stripstyle(articleHeader.find('h2'))

        return soup

    @staticmethod
    def _heading_text(heading):
        """Return the text of a heading, preferring its link text, else ''."""
        if heading is None:
            return ''
        for node in (heading.find('a'), heading):
            if node is not None and len(node.contents) > 0 and node.contents[0] is not None:
                return get_innermost_string(node.contents[0])
        return ''

    def parse_index(self):
        """Build the article list for the issue to download.

        Returns:
            A one-element list ``[(issue_title, articles)]`` where each
            article is a dict with 'title', 'url', 'date' and 'description'.

        Raises:
            ValueError: if the current issue cannot be located on the front
                page (only when ``force_issue_download`` is None).
        """
        if force_issue_download is None:
            soup = self.index_to_soup('https://granta.com/')

            # Get latest issue
            issueInfo = soup.find(
                    'div', attrs={'class': lambda x: x and 'dnd_container__heading' in x.split()})

            issueAnchor = issueInfo.find('a') if issueInfo is not None else None
            if issueAnchor is None or not issueAnchor.get('href'):
                raise ValueError(
                    'Could not find the current issue on https://granta.com/')
            issueTitle = issueAnchor.contents[0] if issueAnchor.contents else ''
            # The link may be site-relative; absurl() leaves full URLs alone.
            issueLink = absurl(issueAnchor.get('href'))
        else:
            issueLink = force_issue_download
            issueTitle = ''

        soup = self.index_to_soup(issueLink)

        # Find cover
        cover = soup.find('div', attrs={'class': 'product-img-container'})
        img = cover.find('img', src=True) if cover is not None else None
        if img is not None:
            self.cover_url = absurl(img['src'])
            self.log.info('Found cover at:', self.cover_url)

        # Find TOC
        tocs = soup.findAll('div', attrs={'class': 'product-article'})
        articles = []
        for toc in tocs:
            logged_in = self.username and self.password
            if not (logged_in or toc.find('img') is None):
                # Not logged in and the entry carries an image: treated as
                # a locked article and skipped.
                continue

            h1 = toc.find('h1')
            link = h1.find('a', href=True) if h1 is not None else None
            if link is None:
                # Without a heading link there is nothing to download; skip
                # this entry instead of aborting the whole issue.
                continue

            title = self._heading_text(h1)
            author = self._heading_text(toc.find('h2'))
            if author:
                title = title + u' (%s)' % author

            url = absurl(link['href'])
            self.log.info('Found article:', title)
            self.log.info('\t', url)
            articles.append({'title': title, 'url': url,
                             'date': '', 'description': ''})

        return [(issueTitle, articles)]