# calibre/recipes/am730.recipe

# vim:fileencoding=UTF-8
from __future__ import unicode_literals
from __future__ import print_function
__license__ = 'GPL v3'
__copyright__ = '2013, Eddie Lau'
__Date__ = ''

'''
Change Log:
2013/09/28 -- update due to website redesign, add cover
2013/03/30 -- first version
'''

try:
    from urllib.parse import unquote
except ImportError:
    from urllib import unquote

from calibre.web.feeds.recipes import BasicNewsRecipe


class AM730(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from the AM730 news site.

    The index is assembled from a fixed list of section pages. Each section
    page is scanned for article links, and each article title is recovered
    from the percent-encoded final path segment of its link.
    """

    title = u'AM730'
    __author__ = 'Eddie Lau'
    publisher = 'AM730'
    oldest_article = 1
    max_articles_per_feed = 100
    language = 'zh'
    encoding = 'utf-8'
    auto_cleanup = False
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    description = 'http://www.am730.com.hk'
    category = 'Chinese, News, Hong Kong'
    masthead_url = 'https://upload.wikimedia.org/wikipedia/en/5/58/Am730_Hong_Kong_newspaper_logo.png'
    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 20px; margin-bottom: 20px; max-height:70%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} li {font-size:50%; margin-left:auto; margin-right:auto;}'  # noqa
    # Page furniture stripped from every article: share buttons, the print
    # logo and the image gallery widget.
    remove_tags = [
        dict(name='div', attrs={'class': 'col-xs-12 col-sm-1 col-md-1 share-button'}),
        dict(name='div', attrs={'class': 'logo-container print-logo'}),
        dict(name='div', attrs={'id': 'galleria'}),
    ]
    # Only the article body container is kept.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'col-xs-12 col-sm-12 col-md-12 news-detail-content-container'}),
    ]
    compress_news_images = True
    compress_news_images_auto_size = 16
    compress_news_images_max_size = 20  # kB
    scale_news_images = (600, 800)
    ignore_duplicate_articles = {'title', 'url'}

    # Set to True to print section names, URLs and collected articles.
    debug = False

    def get_cover_url(self):
        """Return the cover image URL; the masthead image doubles as the cover."""
        return self.masthead_url

    @staticmethod
    def _title_from_href(href):
        """Derive an article title from the last path segment of ``href``.

        The title is the part of the final path segment before the first
        ``-``, with percent-escapes decoded as UTF-8 (undecodable bytes are
        replaced rather than raising).
        """
        slug = href.split('/')[-1].split('-')[0]
        if str is bytes:
            # Python 2: urllib.unquote only round-trips UTF-8 correctly on
            # byte strings, so unquote bytes and decode the result.
            if not isinstance(slug, bytes):
                slug = slug.encode('utf-8')
            return unquote(slug).decode('utf-8', 'replace')
        # Python 3: unquote() on text decodes the escapes as UTF-8 itself.
        return unquote(slug)

    def getAMSectionArticles(self, sectionName, url):
        """Collect the articles linked from one section index page.

        :param sectionName: Section title; returned unchanged as the first
            element of the result.
        :param url: Section index URL. Only links whose ``href`` starts with
            this URL are treated as belonging to the section.
        :return: ``(sectionName, articles)`` where ``articles`` is a list of
            ``{'title': ..., 'url': ...}`` dicts in page order, without
            duplicate links, cut off once ``max_articles_per_feed`` entries
            have been collected.
        """
        soup = self.index_to_soup(url)
        articles = []
        seen_hrefs = set()
        for link in soup.findAll('a', attrs={'class': 'newsimglink'}):
            href = link.get('href')
            if not href:
                continue  # anchor without a target
            if not href.startswith(url):
                continue  # not in same section
            if href in seen_hrefs:
                continue  # already added
            seen_hrefs.add(href)

            title = self._title_from_href(href)
            if self.debug:
                print(title)
            articles.append({'title': title, 'url': href})

            if len(articles) >= self.max_articles_per_feed:
                break
        if self.debug:
            print(articles)
        return (sectionName, articles)

    def parse_index(self):
        """Build the recipe index from the hard-coded section list.

        :return: A list of ``(section title, articles)`` tuples, one per
            section, as produced by :meth:`getAMSectionArticles`.
        """
        # hard code sections
        sections = (
            ('新聞', 'https://www.am730.com.hk/news/%E6%96%B0%E8%81%9E'),
            ('財經', 'https://www.am730.com.hk/news/%E8%B2%A1%E7%B6%93'),
            ('健康', 'https://www.am730.com.hk/news/%E5%81%A5%E5%BA%B7'),
            ('科技', 'https://www.am730.com.hk/news/%E7%A7%91%E6%8A%80'),
            ('體育', 'https://www.am730.com.hk/news/%E9%AB%94%E8%82%B2'),
            ('娛樂', 'https://www.am730.com.hk/news/%E5%A8%9B%E6%A8%82'),
            ('旅遊.飲食', 'https://www.am730.com.hk/news/%E6%97%85%E9%81%8A.%E9%A3%B2%E9%A3%9F'),
        )
        feeds = []
        for section_title, section_url in sections:
            if self.debug:
                print(section_title)
                print(section_url)
            feeds.append(self.getAMSectionArticles(section_title, section_url))
        return feeds