# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import unicode_literals

from calibre.web.feeds.recipes import BasicNewsRecipe
from lxml import html

__license__ = 'GPL v3'
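
# To test this recipe from the command line, calibre's documented recipe
# workflow can be used (the file name nfcmag.recipe is an assumption here;
# substitute whatever name the file is saved under):
#
#   ebook-convert nfcmag.recipe output.epub --test
#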

class Nfcmag(BasicNewsRecipe):

    __author__ = '2014, Chen Wei <weichen302@gmx.com>'
    title = 'Nan Feng Chuang / South Reviews Magazine'
    description = '''
South Reviews Magazine, established in 1985, is a Guangzhou-based political
and economic biweekly. South Reviews enjoys a reputation among its readers,
mostly government officials, business leaders and intellectuals, for being
fair and objective, with graceful narration and insightful commentary. It has
been praised as "the No. 1 Political & Economic Magazine in China".

The US magazine Time described South Reviews as "a highbrow news magazine".
Other international media organizations, such as the BBC and NHK, have filmed
South Reviews journalists at work to record their unique value and special
position in China's media industry. The Harvard-Yenching Library, Stanford
University's East Asia Library and the UC Berkeley Library have collected the
magazine since its first issue, treating it as an important source for
understanding China's economic and social reform.

Since 2008, South Reviews has been committed to transforming itself into a
research-based media organization. Most of its editors, reporters and
contributors have remarkably strong academic backgrounds, coming from Peking
University, Tsinghua University, the London School of Economics and Political
Science, the Chinese University of Hong Kong, Renmin University of China and
other well-known institutions. The magazine has established research
divisions, including the State Policy Research Center and the Brand Promotion
Research Center, which cooperate with well-known academic institutions and
provide valuable research reports for governments and companies.
'''
    language = 'zh'
    encoding = 'UTF-8'
    publisher = 'South Reviews Magazine'
    publication_type = 'magazine'
    category = 'news, Business, China'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = False

    remove_tags = [dict(attrs={'class': ['side-left', 'side-right',
                                         'breadcrumbs', 'score', 'weboNav']}),
                   dict(attrs={'id': ['header', 'footer']}),
                   dict(name=['script', 'noscript', 'style'])]
    no_stylesheets = True
    remove_javascript = True
    current_issue_url = ""
    current_issue_cover = ""
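
    # parse_index() must return the index as a list of
    # (section_title, list_of_articles) tuples, where each article is a dict
    # with at least 'title' and 'url' keys; this is the structure
    # BasicNewsRecipe uses to build the table of contents.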
    def parse_index(self):

        baseurl = 'http://www.nfcmag.com/'
        raw = self.index_to_soup('http://www.nfcmag.com/magazine', raw=True)
        soup_start = html.fromstring(raw)

        # The first plain link (no id attribute, no child image) inside the
        # 'lastest-magazine comBox' div points to the current issue; note
        # that 'lastest-magazine' is the class name actually used by the
        # site.
        els = soup_start.xpath("""//div[contains(@class, 'lastest-magazine')
                                  and contains(@class, 'comBox')]
                                  //a[@href and not(@id) and not(child::img)]
                               """)
        for x in els:
            issueurl = x.get('href')
            if not issueurl.lower().startswith('http://'):
                issueurl = baseurl + issueurl
            break

        raw = self.index_to_soup(issueurl, raw=True)
        soup_issue = html.fromstring(raw)

        # Grab the cover image from the issue page and remember its URL so
        # that get_cover_url() can return it later.
        coverimg = soup_issue.xpath("""//div[contains(@class, 'lastest-magazine')
                                       and contains(@class, 'comBox')]
                                       //img[@*] """)
        imgurl = coverimg[0].get('src')
        if not imgurl.lower().startswith('http://'):
            imgurl = baseurl + imgurl
        self.current_issue_cover = imgurl
        feeds = []

        # Each 'article-box comBox' div is one section of the issue; its h4
        # holds the section title and each h5 holds one article link.
        sections = soup_issue.xpath("""//div[contains(@class, 'article-box')
                                       and contains(@class, 'comBox')] """)
        for sec in sections:
            pages = sec.xpath('.//h5')
            sec_title = sec.xpath('.//h4')[0].text_content()
            self.log('Found section:', sec_title)
            articles = []
            for x in pages:
                url = x.xpath('.//a')[0].get('href')
                if not url.lower().startswith('http://'):
                    url = baseurl + url
                # Swap the '.html' suffix for '-s.html' to fetch the
                # single-page print view of the article.
                url = url[:-5] + '-s.html'

                title = x.text_content()

                articles.append({'title': title, 'url': url, 'date': None})

            if articles:
                feeds.append((sec_title, articles))
        return feeds

    def get_cover_url(self):
        # parse_index() stores the cover image URL of the current issue here.
        return self.current_issue_cover