# vim:fileencoding=UTF-8
from __future__ import unicode_literals
from __future__ import print_function

__license__ = 'GPL v3'
__copyright__ = '2013, Eddie Lau'
__Date__ = ''

'''
Change Log:
2013/09/28 -- update due to website redesign, add cover
2013/03/30 -- first version
'''

try:
    from urllib.parse import unquote
except ImportError:
    from urllib import unquote

from calibre.web.feeds.recipes import BasicNewsRecipe


class AM730(BasicNewsRecipe):
    # Calibre news-download recipe for AM730, a free Hong Kong newspaper.
    # It scrapes the section landing pages directly rather than using RSS.
    title = 'AM730'
    __author__ = 'Eddie Lau'
    publisher = 'AM730'
    oldest_article = 1
    max_articles_per_feed = 100
    language = 'zh'
    encoding = 'utf-8'
    auto_cleanup = False
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    description = 'http://www.am730.com.hk'
    category = 'Chinese, News, Hong Kong'
    masthead_url = 'https://upload.wikimedia.org/wikipedia/en/5/58/Am730_Hong_Kong_newspaper_logo.png'
    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 20px; margin-bottom: 20px; max-height:70%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} li {font-size:50%; margin-left:auto; margin-right:auto;}'  # noqa
    remove_tags = [
        dict(name='div', attrs={'class': 'col-xs-12 col-sm-1 col-md-1 share-button'}),
        dict(name='div', attrs={'class': 'logo-container print-logo'}),
        dict(name='div', attrs={'id': 'galleria'}),
    ]
    keep_only_tags = [
        dict(name='div', attrs={'class': 'col-xs-12 col-sm-12 col-md-12 news-detail-content-container'}),
        # dict(name='div', attrs={'class': 'columns-left'}),
    ]
    compress_news_images = True
    compress_news_images_auto_size = 16
    compress_news_images_max_size = 20  # kB
    scale_news_images = (600, 800)
    ignore_duplicate_articles = {'title', 'url'}
    debug = False

    def get_cover_url(self):
        # Reuse the masthead image as the cover.
        return self.masthead_url

    def getAMSectionArticles(self, sectionName, url):
        # Scrape one section landing page and collect its article links.
        soup = self.index_to_soup(url)
        articles = []
        for aTag in soup.findAll('a', attrs={'class': 'newsimglink'}):
            href = aTag.get('href')
            if not href or not href.startswith(url):
                continue  # link points outside this section
            # The last path component is '<percent-encoded headline>-<id>';
            # unquote() turns the UTF-8 percent-encoding back into text.
            title = unquote(href.split('/')[-1].split('-')[0])
            if self.debug:
                print(title)
            article = {'title': title, 'url': href}
            if article in articles:
                continue  # already added
            articles.append(article)
            if len(articles) >= self.max_articles_per_feed:
                break
        if self.debug:
            print(articles)
        return (sectionName, articles)

    def parse_index(self):
        # Hard-coded (section name, landing page URL) pairs; each URL path
        # is the percent-encoded Chinese section name.
        Sections = [
            ('新聞', 'https://www.am730.com.hk/news/%E6%96%B0%E8%81%9E'),  # News
            ('財經', 'https://www.am730.com.hk/news/%E8%B2%A1%E7%B6%93'),  # Finance
            ('健康', 'https://www.am730.com.hk/news/%E5%81%A5%E5%BA%B7'),  # Health
            ('科技', 'https://www.am730.com.hk/news/%E7%A7%91%E6%8A%80'),  # Technology
            ('體育', 'https://www.am730.com.hk/news/%E9%AB%94%E8%82%B2'),  # Sports
            ('娛樂', 'https://www.am730.com.hk/news/%E5%A8%9B%E6%A8%82'),  # Entertainment
            ('旅遊.飲食', 'https://www.am730.com.hk/news/%E6%97%85%E9%81%8A.%E9%A3%B2%E9%A3%9F'),  # Travel & Dining
        ]
        SectionsArticles = []
        for (title, url) in Sections:
            if self.debug:
                print(title)
                print(url)
            SectionsArticles.append(self.getAMSectionArticles(title, url))
        return SectionsArticles