calibre/recipes/thenewcriterion.recipe

# -*- mode: python -*-
# -*- coding: utf-8 -*-
# vi: set fenc=utf-8 ft=python :
# kate: encoding utf-8; syntax python;

__license__ = 'GPL v3'
__copyright__ = '2019, Darko Miletic <darko.miletic at gmail.com>'
'''
www.newcriterion.com
'''

try:
    from urllib.parse import urlencode
except ImportError:
    from urllib import urlencode
import re
from mechanize import Request
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile


class TheNewCriterion(BasicNewsRecipe):
    """Recipe for The New Criterion (www.newcriterion.com).

    The article list is scraped from the issue page of the current calendar
    month. A subscriber sign-in is attempted only when both a username and a
    password have been supplied. Article pages are downloaded by
    ``get_obfuscated_article`` with a retry loop and handed over as
    temporary files.
    """

    title                = 'The New Criterion'
    __author__           = 'Darko Miletic'
    description          = 'On the front lines of the battle for culture'
    publisher            = 'The Foundation for Cultural Review'
    category             = 'art, politics, USA, world'
    oldest_article       = 40
    no_stylesheets       = True
    encoding             = 'utf8'
    use_embedded_content = False
    language             = 'en'
    remove_empty_feeds   = True
    publication_type     = 'magazine'
    needs_subscription   = 'optional'
    delay                = 1
    simultaneous_downloads = 1
    # Passed as the per-request timeout in get_obfuscated_article().
    timeout                = 8
    ignore_duplicate_articles = {'url'}
    articles_are_obfuscated = True
    # Temporary files created by get_obfuscated_article() are collected here.
    # NOTE(review): this is a class-level list, so it is shared by every
    # instance of the recipe.
    temp_files              = []
    # Maximum number of download attempts per article
    # (read by get_obfuscated_article()).
    fetch_retries           = 10
    auto_cleanup         = True
    masthead_url         = 'https://www.newcriterion.com/themes/thenewcriterion/assets/img/horizontal-logo.svg'
    extra_css            = """
        body{font-family: Galliard, serif}
    """

    conversion_options = {
        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
    }

    def get_browser(self):
        """Return a browser, signed in when credentials are available.

        The site's front page is always opened first. When both
        ``self.username`` and ``self.password`` are set, the credentials are
        then POSTed to the front page with the ``onSignin`` request-handler
        header.
        """
        br = BasicNewsRecipe.get_browser(self)
        # NOTE(review): presumably this first request is made to pick up
        # session cookies before signing in -- confirm against the site.
        br.open('https://www.newcriterion.com/')
        if self.username is not None and self.password is not None:
            data = urlencode({'login': self.username, 'password': self.password})
            # The sign-in is sent as an XMLHttpRequest-style form post; the
            # handler is selected through the X-OCTOBER-REQUEST-HANDLER header.
            header = {
                'X-OCTOBER-REQUEST-HANDLER': 'onSignin',
                'X-Requested-With': 'XMLHttpRequest',
                'DNT': '1',
                'X-OCTOBER-REQUEST-PARTIALS': '',
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
            }
            request = Request('https://www.newcriterion.com/', data, header)
            br.open(request)
        return br

    def parse_index(self):
        """Build the article list from the current month's issue page.

        Returns a one-element list ``[(self.title, articles)]`` in which every
        article is a dict with the keys ``title``, ``date``, ``url`` and
        ``description``. If the page has no ``<div id="main">`` container the
        article list is empty.
        """
        site = 'https://www.newcriterion.com'
        # int() drops the leading zero of the month: '/issues/<year>/5',
        # not '/issues/<year>/05'.
        part = strftime('/issues/%Y/') + str(int(strftime('%m')))
        partf = part + '/'
        current_issue_url = site + part
        # Log before fetching so the URL is visible even if the fetch fails.
        self.log(current_issue_url)
        soup = self.index_to_soup(current_issue_url)

        # Only links that point below the current issue's path are articles.
        article_href = re.compile('^' + re.escape(partf) + '.+$')
        date = strftime(' %B %Y')
        articles = []

        main = soup.find('div', id='main')
        if main is None:
            # Without the container there is nothing to scan; report it
            # instead of failing with an AttributeError on None.
            self.log.warning(
                'No <div id="main"> found on %s; no articles collected'
                % current_issue_url)
            return [(self.title, articles)]

        for link in main.findAll('a', href=True):
            relurl = str(link['href'])
            if not article_href.search(relurl):
                continue
            # The heading and teaser sit either inside a wrapping <div> or
            # directly inside the link.
            holder = link.div if link.find('div') else link
            headline = self.tag_to_string(holder.h1).strip()
            summary = self.tag_to_string(holder.p)
            articles.append({
                'title': headline,
                'date': date,
                'url': site + relurl,
                'description': summary
            })
        return [(self.title, articles)]

    def get_obfuscated_article(self, url):
        """Download ``url`` with retries and store it in a temporary file.

        Up to ``self.fetch_retries`` download attempts are made, each with a
        timeout of ``self.timeout``. On success the page is written to a
        persistent temporary file, which is recorded in ``self.temp_files``.

        Returns the name of the temporary file, or ``None`` when every
        download attempt failed or the temporary file could not be written.
        """
        html = None
        for attempt in range(1, self.fetch_retries + 1):
            try:
                response = self.browser.open(url, timeout=self.timeout)
                html = response.read()
                break
            except Exception as err:
                # Catch Exception rather than everything, so that
                # KeyboardInterrupt/SystemExit still abort the download
                # instead of being retried.
                self.log.warning(
                    'Download attempt %d of %d failed for %s: %s'
                    % (attempt, self.fetch_retries, url, err))

        if html is None:
            self.log.error('Giving up on %s after %d attempts'
                           % (url, self.fetch_retries))
            return None

        # A failure while storing the page is not retried; as before, it
        # results in None being returned.
        try:
            tfile = PersistentTemporaryFile('_fa.html')
            try:
                tfile.write(html)
            finally:
                # Close the handle even when the write fails.
                tfile.close()
        except Exception as err:
            self.log.error('Could not store %s in a temporary file: %s'
                           % (url, err))
            return None

        self.temp_files.append(tfile)
        return tfile.name