# calibre/recipes/caijing.recipe

import re
from calibre.web.feeds.recipes import BasicNewsRecipe

__license__ = 'GPL v3'


class Caijing(BasicNewsRecipe):

    '''Recipe for the current issue of Caijing Magazine.

    Based on the recipe written by Eric Chen in 2011.

    When credentials are supplied the recipe logs in to the Caijing user
    service, then follows the URL found in a script on the magazine front
    page to the issue list, opens the first issue linked there and builds one
    feed per section of that issue.
    '''

    __author__ = '2014, Chen Wei <weichen302@gmx.com>'
    title = 'Caijing Magazine'
    description = '''
    Founded in 1998, the fortnightly CAIJING Magazine has firmly established
    itself as a news authority and leading voice for business and financial
    issues in China.

    CAIJING Magazine closely tracks the most important aspects of China's
    economic reforms, developments and policy changes, as well as major events
    in the capital markets. It also offers a broad international perspective
    through first-hand reporting on international political and economic
    issues.

    CAIJING Magazine is China's most widely read business and finance magazine,
    with a circulation of 225,000 per issue. It boasts top-level readers from
    government, business and academic circles.'''
    language = 'zh'
    encoding = 'UTF-8'
    publisher = 'Caijing Magazine'
    publication_type = 'magazine'
    category = 'news, Business, China'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = True

    # Page furniture (navigation, sharing widgets, comments, scripts) that is
    # stripped from every downloaded article.
    remove_tags = [dict(attrs={'class': ['head_nav', 'mcont_logo', 'header',
                                         'bottom', 'footer', 'magazine_ipad', 'cjartShare', 'ar_about',
                                         'main_rt', 'mcont_nav', 'new']}),
                   dict(attrs={'id': ['articlePl']}),
                   dict(name=['script', 'noscript', 'style'])]
    no_stylesheets = True
    remove_javascript = True
    # Both values are filled in by parse_index(); they stay empty until then.
    current_issue_url = ""
    current_issue_cover = ""

    def get_browser(self):
        '''Return a browser, logged in when a username and password are set.'''
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://service.caijing.com.cn/usermanage/login')
            br.select_form(name='mainLoginForm')
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def _find_issues_url(self, soup):
        '''Return the URL embedded in a script tag of soup, or None.

        The URL is the double-quoted value of the first whitespace-separated
        token that contains "http" (presumably a JavaScript redirect on the
        front page -- confirm against the live site).
        '''
        for script in soup.findAll('script'):
            if not script.contents:
                # A script tag without inline text has nothing to inspect.
                continue
            for token in script.contents[0].split():
                if 'http' not in token.lower():
                    continue
                pieces = token.split('"')
                if len(pieces) > 1:
                    return pieces[1]
        return None

    def parse_index(self):
        '''Build the feed list for the latest issue.

        Returns a list of (section title, articles) tuples, where each
        article is a dict with 'title', 'url' and 'date' keys. Sections
        without any linked article are left out.

        Raises ValueError when the issue list or the latest issue cannot be
        located, instead of failing later with an unrelated error.
        '''
        soup_start = self.index_to_soup('http://magazine.caijing.com.cn/')
        issuesurl = self._find_issues_url(soup_start)
        if issuesurl is None:
            raise ValueError(
                'Could not find the issue list URL on the Caijing front page')

        soup_issues = self.index_to_soup(issuesurl)
        # find the latest issue
        div = soup_issues.find('div', attrs={'class': 'fmcon'})
        issue_link = div.find('a', href=True) if div is not None else None
        if issue_link is None:
            raise ValueError(
                'Could not find the latest issue link on %s' % issuesurl)
        self.current_issue_url = issue_link['href']

        soup = self.index_to_soup(self.current_issue_url)
        coverimg = soup.find('div', {'class': 'zzfm_img'})
        cover = coverimg.find('img') if coverimg is not None else None
        if cover is not None and cover.get('src'):
            self.current_issue_cover = cover['src']
        else:
            # A missing cover should not abort the whole download.
            self.log('No cover image found for the current issue')

        section_class = re.compile(r'(fmwz_ml|zzlm_nr)2?$')
        heading_class = re.compile(r'(lmnav_bt|zzlm_bt)1?$')
        post_class = re.compile(r'(fmwz_bt|zzlm_nr_bt)')

        feeds = []
        for section in soup.findAll('div', attrs={'class': section_class}):
            section_title = self.tag_to_string(
                section.find('div', attrs={'class': heading_class}))
            self.log('Found section:', section_title)
            articles = []
            for post in section.findAll('div', attrs={'class': post_class}):
                link = post.find('a', href=True)
                if link is None:
                    # A title block without a link cannot be downloaded.
                    continue
                articles.append({'title': self.tag_to_string(post),
                                 'url': link['href'],
                                 'date': None})

            if articles:
                feeds.append((section_title, articles))
        return feeds

    def get_cover_url(self):
        '''Return the cover image URL recorded by parse_index().'''
        return self.current_issue_cover