calibre/recipes/am730.recipe
2019-04-01 13:57:21 +05:30

101 lines
3.8 KiB
Plaintext

# vim:fileencoding=UTF-8
from __future__ import unicode_literals
from __future__ import print_function
__license__ = 'GPL v3'
__copyright__ = '2013, Eddie Lau'
__Date__ = ''
'''
Change Log:
2013/09/28 -- update due to website redesign, add cover
2013/03/30 -- first version
'''
try:
from urllib.parse import unquote
except ImportError:
from urllib import unquote
from calibre.web.feeds.recipes import BasicNewsRecipe
class AM730(BasicNewsRecipe):
    """Calibre news recipe for AM730 (category: Chinese, News, Hong Kong).

    ``parse_index`` walks a hard-coded list of section index pages and, for
    each one, ``getAMSectionArticles`` collects the article links found on
    that page.  Article titles are not read from the page text; they are
    recovered from the percent-encoded last path component of each link.
    """

    # --- Recipe metadata ---
    title = u'AM730'
    __author__ = 'Eddie Lau'
    publisher = 'AM730'
    oldest_article = 1
    max_articles_per_feed = 100
    language = 'zh'
    encoding = 'utf-8'
    description = 'http://www.am730.com.hk'
    category = 'Chinese, News, Hong Kong'
    masthead_url = 'https://upload.wikimedia.org/wikipedia/en/5/58/Am730_Hong_Kong_newspaper_logo.png'

    # --- Content extraction / clean-up rules ---
    auto_cleanup = False
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 20px; margin-bottom: 20px; max-height:70%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} li {font-size:50%; margin-left:auto; margin-right:auto;}'  # noqa
    remove_tags = [
        dict(name='div', attrs={'class': 'col-xs-12 col-sm-1 col-md-1 share-button'}),
        dict(name='div', attrs={'class': 'logo-container print-logo'}),
        dict(name='div', attrs={'id': 'galleria'}),
    ]
    keep_only_tags = [
        dict(name='div', attrs={'class': 'col-xs-12 col-sm-12 col-md-12 news-detail-content-container'}),
    ]
    ignore_duplicate_articles = {'title', 'url'}

    # --- Image handling ---
    compress_news_images = True
    compress_news_images_auto_size = 16
    compress_news_images_max_size = 20  # kB
    scale_news_images = (600, 800)

    # Set to True to print section names, URLs and collected articles
    # while the index is being built.
    debug = False

    def get_cover_url(self):
        """Return the masthead logo URL, which doubles as the cover image."""
        return self.masthead_url

    @staticmethod
    def _title_from_href(href):
        """Recover an article title from the last path component of ``href``.

        The text before the first ``-`` of the last path component is
        percent-decoded as UTF-8 and returned as a text string.
        """
        slug = href.split('/')[-1].split('-')[0]
        if str is bytes:
            # Python 2: urllib.unquote must be fed bytes, otherwise the
            # percent-escapes are expanded to individual code points instead
            # of being decoded as UTF-8.
            raw = slug if isinstance(slug, bytes) else slug.encode('utf-8')
            return unquote(raw).decode('utf-8', 'replace')
        # Python 3: unquote() decodes percent-escapes as UTF-8 by default and
        # leaves any literal non-ASCII characters in the slug untouched.
        return unquote(slug)

    def getAMSectionArticles(self, sectionName, url):
        """Collect the articles linked from one section index page.

        :param sectionName: Section title; returned unchanged as the first
            element of the result.
        :param url: URL of the section index page.  Only links whose ``href``
            starts with this URL are considered part of the section.
        :return: ``(sectionName, articles)`` where ``articles`` is a list of
            ``{'title': ..., 'url': ...}`` dicts in page order, without
            repeated URLs and capped at ``max_articles_per_feed`` entries.
        """
        soup = self.index_to_soup(url)
        articles = []
        seen_urls = set()
        for link in soup.findAll('a', attrs={'class': 'newsimglink'}):
            href = link.get('href')
            if not href:
                continue  # anchor without a usable href
            if not href.startswith(url):
                continue  # not in same section
            title = self._title_from_href(href)
            if self.debug:
                print(title)
            # The title is derived from the URL, so a repeated URL is a
            # repeated article.
            if href in seen_urls:
                continue  # already added
            seen_urls.add(href)
            articles.append({'title': title, 'url': href})
            if len(articles) >= self.max_articles_per_feed:
                break
        if self.debug:
            print(articles)
        return (sectionName, articles)

    def parse_index(self):
        """Build the recipe index from the hard-coded list of sections.

        :return: A list of ``(section title, articles)`` tuples, one per
            section, as produced by ``getAMSectionArticles``.
        """
        # hard code sections
        sections = [
            ('新聞', 'https://www.am730.com.hk/news/%E6%96%B0%E8%81%9E'),
            ('財經', 'https://www.am730.com.hk/news/%E8%B2%A1%E7%B6%93'),
            ('健康', 'https://www.am730.com.hk/news/%E5%81%A5%E5%BA%B7'),
            ('科技', 'https://www.am730.com.hk/news/%E7%A7%91%E6%8A%80'),
            ('體育', 'https://www.am730.com.hk/news/%E9%AB%94%E8%82%B2'),
            ('娛樂', 'https://www.am730.com.hk/news/%E5%A8%9B%E6%A8%82'),
            ('旅遊.飲食', 'https://www.am730.com.hk/news/%E6%97%85%E9%81%8A.%E9%A3%B2%E9%A3%9F'),
        ]
        section_articles = []
        for section_title, section_url in sections:
            if self.debug:
                print(section_title)
                print(section_url)
            section_articles.append(
                self.getAMSectionArticles(section_title, section_url))
        return section_articles