Rewrite non-working recipe

Kennyl 2017-01-25 23:21:03 +08:00 committed by GitHub
parent 2693a2c614
commit d7527ddef8


@@ -15,6 +15,8 @@ from calibre.utils.date import now as nowf
 import os
 import datetime
 import re
+import urllib
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
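Note: the new urllib import is used by getAMSectionArticles further down to recover article titles from percent-encoded URLs. This is the Python 2 API; in Python 3 the same function lives at urllib.parse.unquote. A minimal sketch of what it does, using the article URL from the commented-out test line later in this diff:

    import urllib
    href = 'https://www.am730.com.hk/news/%E6%96%B0%E8%81%9E/%E7%8B%97%E7%8B%97%E6%95%91%E5%A2%AE%E6%BF%80%E6%B5%81%E5%90%8C%E4%BC%B4%EF%BC%9F%E6%90%B6%E6%A8%B9%E6%9E%9D%EF%BC%9F-15432'
    slug = href.split('/')[-1].split('-')[0]  # drop the path and the trailing article id
    print urllib.unquote(slug)                # prints the UTF-8 bytes of the Chinese title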
@@ -24,13 +26,12 @@ from calibre.ebooks.metadata import MetaInformation
 from calibre.utils.localization import canonicalize_lang


-class AppleDaily(BasicNewsRecipe):
+class AM730(BasicNewsRecipe):
     title = u'AM730'
     __author__ = 'Eddie Lau'
     publisher = 'AM730'
     oldest_article = 1
     max_articles_per_feed = 100
-    auto_cleanup = False
     language = 'zh'
     encoding = 'utf-8'
     auto_cleanup = False
@@ -39,240 +40,66 @@ class AppleDaily(BasicNewsRecipe):
     no_stylesheets = True
     description = 'http://www.am730.com.hk'
     category = 'Chinese, News, Hong Kong'
-    masthead_url = 'http://www.am730.com.hk/images/logo.jpg'
+    masthead_url = 'https://upload.wikimedia.org/wikipedia/en/5/58/Am730_Hong_Kong_newspaper_logo.png'
     extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 20px; margin-bottom: 20px; max-height:70%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} li {font-size:50%; margin-left:auto; margin-right:auto;}'  # noqa
-    keep_only_tags = [dict(name='h2', attrs={'class': 'printTopic'}),
-                      dict(name='div', attrs={'id': 'article_content'}),
-                      dict(name='div', attrs={'id': 'slider'})]
-    remove_tags = [dict(name='img', attrs={'src': 'images/am730_article_logo.jpg'}),
-                   dict(name='img', attrs={'src': 'images/am_endmark.gif'})]
+    remove_tags = [dict(name='div', attrs={'class': 'col-xs-12 col-sm-1 col-md-1 share-button'}),
+                   dict(name='div', attrs={'class': 'logo-container print-logo'}),
+                   dict(name='div', attrs={'id': 'galleria'})]
+    keep_only_tags = [dict(name='div', attrs={'class': 'col-xs-12 col-sm-12 col-md-12 news-detail-content-container'}),
+                      # dict(name='div', attrs={'class': 'columns-left'})
+                      ]
+    compress_news_images = True
+    compress_news_images_auto_size = 16
+    compress_news_images_max_size = 20  # kB
+    scale_news_images = (600, 800)
+    ignore_duplicate_articles = {'title', 'url'}
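Note: the five attributes added above are stock BasicNewsRecipe options. Per the calibre documentation (as I read it): scale_news_images bounds every downloaded image to 600x800 pixels; compress_news_images enables JPEG recompression; compress_news_images_auto_size targets roughly (w * h) / 16 bytes per image; and compress_news_images_max_size caps each image at 20 kB, overriding the auto size when both are set, so the 16 here is effectively inert. ignore_duplicate_articles makes calibre drop articles whose title or url has already been seen.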
-    def get_dtlocal(self):
-        dt_utc = datetime.datetime.utcnow()
-        # convert UTC to local hk time - at HKT 6am, all news are available
-        return dt_utc + datetime.timedelta(8.0 / 24) - datetime.timedelta(6.0 / 24)
-
-    def get_fetchdate(self):
-        if __Date__ != '':
-            return __Date__
-        else:
-            return self.get_dtlocal().strftime("%Y%m%d")
-
-    def get_fetchformatteddate(self):
-        if __Date__ != '':
-            return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8]
-        else:
-            return self.get_dtlocal().strftime("%Y-%m-%d")
-
-    def get_fetchyear(self):
-        if __Date__ != '':
-            return __Date__[0:4]
-        else:
-            return self.get_dtlocal().strftime("%Y")
+    debug = False
+
+    def getAMSectionArticles(self, sectionName, url):
+        # print sectionName
+        soup = self.index_to_soup(url)
+        articles = []
+        for aTag in soup.findAll('a', attrs={'class': 'newsimglink'}):
+            href = aTag.get('href', False)
+            if not href.encode('utf-8').startswith(url.encode('utf-8')):
+                continue  # not in same section
+            title = href.split('/')[-1].split('-')[0]
+            title = urllib.unquote(title.encode('ASCII'))  # .decode('utf-8')
+            if self.debug:
+                print title
+            try:
+                if articles.index({'title': title, 'url': href}) >= 0:
+                    # print 'already added'
+                    continue  # already added
+            except:
+                pass
+            articles.append({'title': title, 'url': href})
+            if len(articles) >= self.max_articles_per_feed:
+                break
+        if self.debug:
+            print articles
+        return (sectionName, articles)
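Note: the try/except around articles.index() in the added method is an indirect membership test. list.index() raises ValueError when the item is absent, so the bare except swallows the "not found" case and the entry gets appended. A self-contained sketch of the more direct spelling (same behaviour):

    articles = []
    for title, href in [('a', 'u1'), ('a', 'u1'), ('b', 'u2')]:
        entry = {'title': title, 'url': href}
        if entry in articles:
            continue  # already added
        articles.append(entry)
    print len(articles)  # 2: the duplicate was skipped

With ignore_duplicate_articles = {'title', 'url'} set above, calibre also filters duplicates across sections itself; the in-loop check only dedups within one section page.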
-    def get_fetchmonth(self):
-        if __Date__ != '':
-            return __Date__[4:6]
-        else:
-            return self.get_dtlocal().strftime("%m")
-
-    def get_fetchday(self):
-        if __Date__ != '':
-            return __Date__[6:8]
-        else:
-            return self.get_dtlocal().strftime("%d")
-
-    # Note: does not work with custom date given by __Date__
-    def get_weekday(self):
-        return self.get_dtlocal().weekday()
-
-    def get_cover_url(self):
-        soup = self.index_to_soup('http://www.am730.com.hk')
-        cover = 'http://www.am730.com.hk/' + \
-            soup.find(attrs={'id': 'mini_news_img'}).find(
-                'img').get('src', False)
-        br = BasicNewsRecipe.get_browser(self)
-        try:
-            br.open(cover)
-        except:
-            cover = None
-        return cover
-
-    def populate_article_metadata(self, article, soup, first):
-        if first and hasattr(self, 'add_toc_thumbnail'):
-            picdiv = soup.find('img')
-            if picdiv is not None:
-                self.add_toc_thumbnail(article, picdiv['src'])
     def parse_index(self):
-        feeds = []
-        soup = self.index_to_soup('http://www.am730.com.hk/')
-        optgroups = soup.findAll('optgroup')
-        for optgroup in optgroups:
-            sectitle = optgroup.get('label')
-            articles = []
-            for option in optgroup.findAll('option'):
-                articlelink = "http://www.am730.com.hk/" + option.get('value')
-                title = option.string
-                articles.append({'title': title, 'url': articlelink})
-            feeds.append((sectitle, articles))
-        return feeds
+        # hard-coded sections
+        Sections = [('新聞', 'https://www.am730.com.hk/news/%E6%96%B0%E8%81%9E'),
+                    ('財經', 'https://www.am730.com.hk/news/%E8%B2%A1%E7%B6%93'),
+                    ('健康', 'https://www.am730.com.hk/news/%E5%81%A5%E5%BA%B7'),
+                    ('科技', 'https://www.am730.com.hk/news/%E7%A7%91%E6%8A%80'),
+                    ('體育', 'https://www.am730.com.hk/news/%E9%AB%94%E8%82%B2'),
+                    ('娛樂', 'https://www.am730.com.hk/news/%E5%A8%9B%E6%A8%82'),
+                    ('旅遊.飲食', 'https://www.am730.com.hk/news/%E6%97%85%E9%81%8A.%E9%A3%B2%E9%A3%9F')
+                    ]  # articles = []
+        SectionsArticles = []
+        for (title, url) in Sections:
+            if self.debug:
+                print title
+                print url
+            SectionsArticles.append(self.getAMSectionArticles(title, url))
+        # articles.append({'title': '狗狗救墮激流同伴?搶樹枝?', 'url': 'https://www.am730.com.hk/news/%E6%96%B0%E8%81%9E/%E7%8B%97%E7%8B%97%E6%95%91%E5%A2%AE%E6%BF%80%E6%B5%81%E5%90%8C%E4%BC%B4%EF%BC%9F%E6%90%B6%E6%A8%B9%E6%9E%9D%EF%BC%9F-15432'})
+        # feeds.append(articles[0]['url'])
+        return SectionsArticles
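Note: each hard-coded section URL is just the /news/ prefix plus the percent-encoded UTF-8 section name, so the table above could in principle be generated rather than pasted. A hypothetical sketch (not in the commit):

    import urllib
    base = 'https://www.am730.com.hk/news/'
    names = [u'新聞', u'財經', u'健康', u'科技', u'體育', u'娛樂', u'旅遊.飲食']
    Sections = [(n, base + urllib.quote(n.encode('utf-8'))) for n in names]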
-
-    def create_opf(self, feeds, dir=None):
-        if dir is None:
-            dir = self.output_dir
-        title = self.short_title()
-        if self.output_profile.periodical_date_in_title:
-            title += strftime(self.timefmt)
-        mi = MetaInformation(title, [__appname__])
-        mi.publisher = __appname__
-        mi.author_sort = __appname__
-        if self.publication_type:
-            mi.publication_type = 'periodical:' + \
-                self.publication_type + ':' + self.short_title()
-        mi.timestamp = nowf()
-        article_titles, aseen = [], set()
-        for f in feeds:
-            for a in f:
-                if a.title and a.title not in aseen:
-                    aseen.add(a.title)
-                    article_titles.append(force_unicode(a.title, 'utf-8'))
-        mi.comments = self.description
-        if not isinstance(mi.comments, unicode):
-            mi.comments = mi.comments.decode('utf-8', 'replace')
-        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
-                        '\n\n'.join(article_titles))
-        language = canonicalize_lang(self.language)
-        if language is not None:
-            mi.language = language
-        # This one affects the pub date shown in kindle title
-        # now appears to need the time field to be > 12.00noon as well
-        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(
-            self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
-        opf_path = os.path.join(dir, 'index.opf')
-        ncx_path = os.path.join(dir, 'index.ncx')
-        opf = OPFCreator(dir, mi)
-        # Add mastheadImage entry to <guide> section
-        mp = getattr(self, 'masthead_path', None)
-        if mp is not None and os.access(mp, os.R_OK):
-            from calibre.ebooks.metadata.opf2 import Guide
-            ref = Guide.Reference(os.path.basename(
-                self.masthead_path), os.getcwdu())
-            ref.type = 'masthead'
-            ref.title = 'Masthead Image'
-            opf.guide.append(ref)
-        manifest = [os.path.join(dir, 'feed_%d' % i)
-                    for i in range(len(feeds))]
-        manifest.append(os.path.join(dir, 'index.html'))
-        manifest.append(os.path.join(dir, 'index.ncx'))
-        # Get cover
-        cpath = getattr(self, 'cover_path', None)
-        if cpath is None:
-            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
-            if self.default_cover(pf):
-                cpath = pf.name
-        if cpath is not None and os.access(cpath, os.R_OK):
-            opf.cover = cpath
-            manifest.append(cpath)
-        # Get masthead
-        mpath = getattr(self, 'masthead_path', None)
-        if mpath is not None and os.access(mpath, os.R_OK):
-            manifest.append(mpath)
-        opf.create_manifest_from_files_in(manifest)
-        for mani in opf.manifest:
-            if mani.path.endswith('.ncx'):
-                mani.id = 'ncx'
-            if mani.path.endswith('mastheadImage.jpg'):
-                mani.id = 'masthead-image'
-        entries = ['index.html']
-        toc = TOC(base_path=dir)
-        self.play_order_counter = 0
-        self.play_order_map = {}
-
-        def feed_index(num, parent):
-            f = feeds[num]
-            for j, a in enumerate(f):
-                if getattr(a, 'downloaded', False):
-                    adir = 'feed_%d/article_%d/' % (num, j)
-                    auth = a.author
-                    if not auth:
-                        auth = None
-                    desc = a.text_summary
-                    if not desc:
-                        desc = None
-                    else:
-                        desc = self.description_limiter(desc)
-                    tt = a.toc_thumbnail if a.toc_thumbnail else None
-                    entries.append('%sindex.html' % adir)
-                    po = self.play_order_map.get(entries[-1], None)
-                    if po is None:
-                        self.play_order_counter += 1
-                        po = self.play_order_counter
-                    parent.add_item('%sindex.html' % adir, None,
-                                    a.title if a.title else _('Untitled Article'),
-                                    play_order=po, author=auth,
-                                    description=desc, toc_thumbnail=tt)
-                    last = os.path.join(
-                        self.output_dir, ('%sindex.html' % adir).replace('/', os.sep))
-                    for sp in a.sub_pages:
-                        prefix = os.path.commonprefix([opf_path, sp])
-                        relp = sp[len(prefix):]
-                        entries.append(relp.replace(os.sep, '/'))
-                        last = sp
-                    if os.path.exists(last):
-                        with open(last, 'rb') as fi:
-                            src = fi.read().decode('utf-8')
-                        soup = BeautifulSoup(src)
-                        body = soup.find('body')
-                        if body is not None:
-                            prefix = '/'.join('..' for i in range(2 *
-                                              len(re.findall(r'link\d+', last))))
-                            templ = self.navbar.generate(True, num, j, len(f),
-                                                         not self.has_single_feed,
-                                                         a.orig_url, __appname__, prefix=prefix,
-                                                         center=self.center_navbar)
-                            elem = BeautifulSoup(templ.render(
-                                doctype='xhtml').decode('utf-8')).find('div')
-                            body.insert(len(body.contents), elem)
-                        with open(last, 'wb') as fi:
-                            fi.write(unicode(soup).encode('utf-8'))
-
-        if len(feeds) == 0:
-            raise Exception('All feeds are empty, aborting.')
-        if len(feeds) > 1:
-            for i, f in enumerate(feeds):
-                entries.append('feed_%d/index.html' % i)
-                po = self.play_order_map.get(entries[-1], None)
-                if po is None:
-                    self.play_order_counter += 1
-                    po = self.play_order_counter
-                auth = getattr(f, 'author', None)
-                if not auth:
-                    auth = None
-                desc = getattr(f, 'description', None)
-                if not desc:
-                    desc = None
-                feed_index(i, toc.add_item('feed_%d/index.html' % i, None,
-                                           f.title, play_order=po, description=desc, author=auth))
-        else:
-            entries.append('feed_%d/index.html' % 0)
-            feed_index(0, toc)
-        for i, p in enumerate(entries):
-            entries[i] = os.path.join(dir, p.replace('/', os.sep))
-        opf.create_spine(entries)
-        opf.set_toc(toc)
-        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
-            opf.render(opf_file, ncx_file)
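Note: the deleted date helpers and create_opf come from the recipe this one was adapted from (the old class was still named AppleDaily), and create_opf appears to largely duplicate the default OPF/NCX generation in BasicNewsRecipe. With them removed, the rewritten recipe falls back to calibre's stock book-assembly path and only overrides index parsing and content cleanup.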