#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2018, Kovid Goyal

from __future__ import absolute_import, division, print_function, unicode_literals

import datetime
import re

from calibre import strftime
from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.date import strptime
from calibre.web.feeds.news import BasicNewsRecipe

is_web_edition = False
oldest_web_edition_article = 7  # days

# The sections to download when downloading the web edition; comment out
# the sections you are not interested in.
web_sections = [
    ('World', 'world'),
    ('U.S.', 'us'),
    ('Politics', 'politics'),
    ('New York', 'nyregion'),
    ('Business', 'business'),
    ('Technology', 'technology'),
    ('Sports', 'sports'),
    ('Science', 'science'),
    ('Health', 'health'),
    ('Opinion', 'opinion'),
    ('Arts', 'arts'),
    ('Books', 'books'),
    ('Movies', 'movies'),
    ('Music', 'arts/music'),
    ('Television', 'arts/television'),
    ('Style', 'style'),
    ('Dining & Wine', 'dining'),
    ('Fashion & Style', 'fashion'),
    ('Home & Garden', 'garden'),
    ('Travel', 'travel'),
    ('Education', 'education'),
    ('Multimedia', 'multimedia'),
    ('Obituaries', 'obituaries'),
    ('Sunday Magazine', 'magazine'),
]

# Matches the /YYYY/MM/DD/ component of NYT article URLs
url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')


def date_from_url(url):
    m = url_date_pat.search(url)
    if m is not None:
        return datetime.date(*map(int, m.groups()))


def format_date(d):
    try:
        return strftime(' [%a, %d %b %Y]', d)
    except Exception:
        return strftime(' [%Y/%m/%d]', d)


def classes(classes):
    # Build a BeautifulSoup attrs matcher that matches any tag having at
    # least one of the given (space separated) CSS classes
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
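
# A quick illustration of the two helpers above, with a hypothetical URL and
# class names (the values are made up, the behaviour is what the code does):
#
#   date_from_url('https://www.nytimes.com/2018/05/31/theater/some-story.html')
#   returns datetime.date(2018, 5, 31), and classes('dateline byline') matches
#   any tag whose class attribute shares at least one name with that set,
#   e.g. <p class="dateline css-1a2b3c">.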


class NewYorkTimes(BasicNewsRecipe):

    title = 'The New York Times'
    if is_web_edition:
        description = 'New York Times (Web). You can edit the recipe to remove sections you are not interested in.'
    else:
        description = 'Today\'s New York Times'
    encoding = 'utf-8'
    __author__ = 'Kovid Goyal'
    language = 'en'
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    compress_news_images = True
    compress_news_images_auto_size = 5
    remove_attributes = ['style']
    remove_tags = [
        dict(attrs={'aria-label': 'tools'.split()}),
        dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}),
        dict(href='#site-content #site-index'.split()),
        dict(attrs={'aria-hidden': 'true'}),
        dict(attrs={'data-videoid': True}),
        dict(name='button meta link'.split()),
        dict(id=lambda x: x and x.startswith('story-ad-')),
        dict(name='head'),
        dict(role='toolbar'),
        dict(name='a', href=lambda x: x and '#story-continues-' in x),
        dict(name='a', href=lambda x: x and '#whats-next' in x),
        dict(id=lambda x: x and 'sharetools-' in x),
        dict(id='newsletter-promo supported-by-ad bottom-wrapper'.split()),
        classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
        dict(attrs={'class': lambda x: x and (
            'SectionBar' in x or
            'recirculation' in x or
            'ResponsiveAd' in x or
            'accessibility-visuallyHidden' in x or
            'RelatedCoverage' in x)}),
    ]

    def preprocess_html(self, soup):
        article = soup.find(id='story')
        # The NYT is apparently A/B testing a new page layout
        has_supplemental = article is not None and article.find(
            **classes('story-body-supplemental')) is not None
        if has_supplemental:
            keep_only_tags = [
                dict(id='story-header'),
                classes('story-body-supplemental story-interrupter'),
            ]
        else:
            keep_only_tags = [
                dict(id='story'),
            ]
        # Rebuild the <body> so that it contains only the parts of the page
        # we want to keep, in the order given by keep_only_tags
        body = Tag(soup, 'body')
        for spec in keep_only_tags:
            for tag in soup.find('body').findAll(**spec):
                body.insert(len(body.contents), tag)
        soup.find('body').replaceWith(body)

        # Add a space to the dateline
        t = soup.find(**classes('dateline'))
        if t is not None:
            t.insert(0, ' ')

        # Remove empty li tags
        for li in soup.findAll('li', attrs={'class': lambda x: x and x.startswith('css-')}):
            if not li.contents and not li.string:
                li.extract()

        # Ensure the headline is first
        h1 = soup.find('h1', itemprop='headline')
        if h1 is not None:
            h1.extract()
            soup.find('body').contents.insert(0, h1)

        return soup

    def read_nyt_metadata(self):
        INDEX = 'https://www.nytimes.com/section/todayspaper'
        # INDEX = 'file:///t/raw.html'
        soup = self.index_to_soup(INDEX)
        pdate = soup.find('meta', attrs={'name': 'pdate', 'content': True})['content']
        date = strptime(pdate, '%Y%m%d', assume_utc=False, as_utc=False)
        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(
            date.strftime('%Y/%m/%d'))
        self.timefmt = strftime(' [%d %b, %Y]', date)
        return soup
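
    # For reference: a pdate of '20180531' (hypothetical value) would yield
    # a cover URL of
    #   https://static01.nyt.com/images/2018/05/31/nytfrontpage/scan.jpg
    # i.e. the scan of that day's printed front page.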

    def parse_todays_sections(self, container):
        for h2 in container.findAll('h2', **classes('headline')):
            title = self.tag_to_string(h2)
            a = h2.find('a', href=True)
            url = a['href']
            if '?' in url:
                url = url.split('?')[0]
            p = h2.findParent(**classes('story-body'))
            desc = ''
            if p is not None:
                s = p.find(**classes('summary'))
                if s is not None:
                    desc = self.tag_to_string(s)
            date = ''
            d = date_from_url(url)
            if d is not None:
                date = format_date(d)
            self.log('\t', title + date, ': ', url)
            self.log('\t\t', desc)
            yield {'title': title, 'url': url, 'description': desc, 'date': date}

    def parse_todays_page(self):
        soup = self.read_nyt_metadata()
        section = soup.find(id=lambda x: x and x.startswith('collection-todays-new-york-times'))
        feeds = []
        for i, h1 in enumerate(section.findAll('h1')):
            if i == 0:
                # skip the first h1, which is the page heading, not a section
                continue
            section_title = self.tag_to_string(h1)
            self.log('Found section:', section_title)
            if i == 1:
                # the first section's articles are split across its parent
                # container and the following <div>
                container = h1.parent
                articles = list(self.parse_todays_sections(container))
                articles += list(self.parse_todays_sections(container.findNextSibling('div')))
            else:
                articles = list(self.parse_todays_sections(h1.findNextSibling('ol')))
            if articles:
                feeds.append((section_title, articles))
        return feeds

    def parse_highlights(self, container):
        for article in container.findAll('article', **classes('story')):
            h2 = article.find('h2')
            if h2 is not None:
                title = self.tag_to_string(h2)
                a = h2.find('a', href=True)
                if a is not None:
                    url = a['href']
                    desc = ''
                    p = article.find(**classes('summary'))
                    if p is not None:
                        desc = self.tag_to_string(p)
                    date = ''
                    d = date_from_url(url)
                    if d is not None:
                        date = format_date(d)
                        today = datetime.date.today()
                        delta = today - d
                        if delta.days > oldest_web_edition_article:
                            self.log.debug('\tSkipping article', title, 'as it is too old')
                            continue
                    yield {'title': title, 'url': url, 'description': desc, 'date': date}

    def parse_web_section(self, soup, slug):

        def log(article):
            self.log('\t', article['title'] + article['date'], ':', article['url'])
            if article.get('description'):
                self.log('\t\t', article['description'])

        container = soup.find(itemtype='http://schema.org/CollectionPage')
        highlights = container.find('section', **classes('highlights'))
        for article in self.parse_highlights(highlights):
            log(article)
            yield article
        extra = container.find('section', attrs={'data-collection-type': True})
        if extra is not None:
            title = self.tag_to_string(extra.find('h2'))
            for article in self.parse_highlights(extra):
                article['title'] = '{}: {}'.format(title, article['title'])
                log(article)
                yield article

    def parse_web_sections(self):
        self.read_nyt_metadata()
        feeds = []
        for section_title, slug in web_sections:
            url = 'https://www.nytimes.com/section/' + slug
            try:
                soup = self.index_to_soup(url)
            except Exception:
                self.log.error('Failed to download section:', url)
                continue
            self.log('Found section:', section_title)
            articles = list(self.parse_web_section(soup, slug))
            if articles:
                feeds.append((section_title, articles))
            if self.test and len(feeds) >= self.test[0]:
                break
        return feeds

    def parse_index(self):
        # Uncomment to debug the download of a single, known article:
        # return [('All articles', [
        #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2018/05/31/theater/best-25-plays-how-we-made-the-list.html'},
        # ])]
        if is_web_edition:
            return self.parse_web_sections()
        return self.parse_todays_page()
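
    # A minimal sketch of how to exercise this recipe from the command line
    # (assuming it is saved as nytimes.recipe; any filename works), using
    # calibre's standard recipe test invocation:
    #   ebook-convert nytimes.recipe output.epub --test -vv
    # --test downloads only a couple of articles from at most two feeds,
    # which is enough to check parse_index() and preprocess_html() without
    # fetching the whole paper.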

    # The NYT occasionally returns bogus articles, possibly because of
    # cookies, so don't store cookies
    def get_browser(self, *args, **kwargs):
        return self

    def clone_browser(self, *args, **kwargs):
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        from calibre import browser
        br = browser()
        response = br.open_novisit(*args, **kwargs)
        # Debugging code, to dump the raw HTML of mis-served articles:
        # headers = response.info()
        # if headers.get('X-PageType') == 'vi-story':
        #     import tempfile
        #     with tempfile.NamedTemporaryFile(suffix='.html', dir='/t/n', delete=False) as f:
        #         f.write(response.read())
        #     import time
        #     time.sleep(1)
        #     br = browser()
        #     response = br.open_novisit(*args, **kwargs)
        return response

    open = open_novisit
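
# Note on the browser overrides above: get_browser() returns the recipe
# object itself, and open_novisit() builds a brand-new calibre browser for
# every request, so each fetch starts with an empty cookie jar; that is the
# whole point of the cookie workaround.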