Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Delete Apple Daily
It's no longer under publication.
This commit is contained in:
parent 10d995c26c
commit 693378d64e
@@ -1,305 +0,0 @@
# vim:fileencoding=UTF-8
from __future__ import unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2013-2015, Eddie Lau'
__Date__ = ''

from calibre import (__appname__, force_unicode, strftime)
from calibre.utils.date import now as nowf, utcnow
import os
import datetime
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang

class AppleDaily(BasicNewsRecipe):
    title = u'蘋果日報 (香港)'
    __author__ = 'Eddie Lau'
    publisher = '蘋果日報'
    publication_type = 'newspaper'
    oldest_article = 1
    max_articles_per_feed = 100
    auto_cleanup = False
    language = 'zh'
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    description = 'http://hkm.appledaily.com/'
    category = 'Chinese, News, Hong Kong'
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/8/86/Apple_Daily_Title.svg'

    extra_css = ('img {display: block; margin-left: auto; margin-right: auto; '
                 'margin-top: 10px; margin-bottom: 10px; max-height:90%;} '
                 'h1 {font-size:125%; text-align:left; font-weight:bold;} '
                 'p{font-size:90%;} '
                 'p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}')

    keep_only_tags = [dict(name='div', attrs={'id': 'content-article'})]
    remove_tags = [dict(name='div', attrs={'class': 'prev-next-btn'}),
                   dict(name='p', attrs={'class': 'next'}),
                   dict(name='meta'),
                   dict(name='link')]

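    # keep_only_tags isolates the story body (the content-article div);
    # remove_tags then strips the prev/next navigation and any stray
    # <meta> and <link> elements left inside it.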
    def get_dtlocal(self):
        dt_utc = utcnow()
        # Convert UTC to local HK time (UTC+8); by HKT 6am all of the day's
        # news is available, so shift back 6 hours to pick the issue date.
        return dt_utc + datetime.timedelta(8.0 / 24) - datetime.timedelta(6.0 / 24)

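    # A worked example of the offset above (times illustrative): the net
    # shift is +2h. At 20:00 UTC it is 04:00 HKT the next day, before the
    # 6am cutoff; 20:00 + 2h stays on the same date, so the previous day's
    # issue is fetched. At 23:00 UTC (07:00 HKT next day) 23:00 + 2h rolls
    # over to the next date, so the new issue is fetched.
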
    def get_fetchdate(self):
        if __Date__ != '':
            return __Date__
        else:
            return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        if __Date__ != '':
            return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchyear(self):
        if __Date__ != '':
            return __Date__[0:4]
        else:
            return self.get_dtlocal().strftime("%Y")

    def get_fetchmonth(self):
        if __Date__ != '':
            return __Date__[4:6]
        else:
            return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
        if __Date__ != '':
            return __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%d")

    # Note: does not work with a custom date given by __Date__
    def get_weekday(self):
        return self.get_dtlocal().weekday()

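    # To rebuild a specific back issue (illustrative), set the module-level
    # constant near the top of this file, e.g. __Date__ = '20150601':
    # get_fetchdate() then returns '20150601' and get_fetchformatteddate()
    # returns '2015-06-01'. As noted above, get_weekday() ignores __Date__.
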
    def get_cover_url(self):
        soup = self.index_to_soup('http://hkm.appledaily.com/')
        cover = soup.find(attrs={'class': 'top-news'}).get('src', False)
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except Exception:
            cover = None
        return cover

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, picdiv['src'])

    def parse_index(self):
        feeds = []
        soup = self.index_to_soup('http://hkm.appledaily.com/')
        ul = soup.find(attrs={'class': 'menu'})
        sectionList = []
        for li in ul.findAll('li'):
            relativea = li.find('a', href=True).get('href', False)
            a = 'http://hkm.appledaily.com/' + relativea
            title = li.find('a', text=True).strip()
            # if (time.tzname != 'HKT'):
            #     if (title == u'三藩市'):
            #         continue
            #     if (title == u'洛杉磯'):
            #         continue
            #     if (title == u'紐 約'):
            #         continue
            #     if (title == u'美 國'):
            #         continue
            # if (not title == u'動新聞') and (relativea.startswith('list.php')):
            if (relativea.find('category=daily') != -1) and (relativea.startswith('list.php')):
                sectionList.append((title, a))
        for title, url in sectionList:
            title = title.replace(" ", "")
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        soup = self.index_to_soup(url)
        ul = soup.find(attrs={'class': 'list'})
        current_articles = []
        if ul is None:
            return current_articles
        for li in ul.findAll('li'):
            a = li.find('a', href=True)
            title = li.find('p', text=True).strip()
            if a is not None:
                current_articles.append(
                    {'title': title, 'url': 'http://hkm.appledaily.com/' + a.get('href', False)})
        return current_articles

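    # Shape of the list parse_section returns (values illustrative, not
    # scraped):
    #   [{'title': u'要聞', 'url': 'http://hkm.appledaily.com/...'}, ...]
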
    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        if self.output_profile.periodical_date_in_title:
            title += strftime(self.timefmt)
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        if self.publication_type:
            mi.publication_type = 'periodical:' + \
                self.publication_type + ':' + self.short_title()
        mi.timestamp = nowf()
        article_titles, aseen = [], set()
        for f in feeds:
            for a in f:
                if a.title and a.title not in aseen:
                    aseen.add(a.title)
                    article_titles.append(force_unicode(a.title, 'utf-8'))

        mi.comments = self.description
        if not isinstance(mi.comments, type(u'')):
            mi.comments = mi.comments.decode('utf-8', 'replace')
        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
                        '\n\n'.join(article_titles))

        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This affects the publication date shown in the Kindle title;
        # it appears the time field also needs to be later than 12:00 noon.
        # mi.pubdate = nowf()
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(
            self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')

        opf = OPFCreator(dir, mi)
        # Add a mastheadImage entry to the <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(
                self.masthead_path), os.getcwd())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)

        manifest = [os.path.join(dir, 'feed_%d' % i)
                    for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))

        # Get the cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)

        # Get the masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)

        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'

        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/' % (num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html' % adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html' % adir, None,
                                    a.title if a.title else _('Untitled Article'),
                                    play_order=po, author=auth,
                                    description=desc, toc_thumbnail=tt)
                    last = os.path.join(
                        self.output_dir, ('%sindex.html' % adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        # Fix the flow-player div tag's parent
                        src = src.replace('height:260px !important;', '')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..' for i in range(
                                2 * len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(
                                True, num, j, len(f), not self.has_single_feed,
                                a.orig_url, __appname__, prefix=prefix,
                                center=self.center_navbar)
                            translatedTempl = re.sub(
                                '<hr.*<br',
                                '<hr>本篇由 ' + __appname__ +
                                ' 快取自 <a href="http://hkm.appledaily.com/">蘋果日報</a> ; ' +
                                '<a href="' + a.orig_url + '">本篇來源位置</a>。<br',
                                templ.render(doctype='xhtml').decode('utf-8'),
                                flags=re.S)
                            elem = BeautifulSoup(translatedTempl).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(type(u'')(soup).encode('utf-8'))

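        # The re.sub above swaps the stock navbar text for an attribution
        # line, so each article page ends with markup roughly like this
        # (illustrative; the exact output comes from navbar.generate()):
        #   <hr>本篇由 calibre 快取自 <a href="http://hkm.appledaily.com/">蘋果日報</a> ;
        #   <a href="...">本篇來源位置</a>。<br/>
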
        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html' % i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html' % i, None,
                                           f.title, play_order=po, description=desc, author=auth))
        else:
            entries.append('feed_%d/index.html' % 0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file:
            opf.render(opf_file, ncx_file)
@@ -1,5 +1,4 @@
 from calibre.web.feeds.news import BasicNewsRecipe, classes
-from calibre.ptempfile import PersistentTemporaryFile


 class STHKRecipe(BasicNewsRecipe):
     title = '星島日報 (香港)'
@@ -37,32 +36,23 @@ class STHKRecipe(BasicNewsRecipe):
         except Exception as e:
             url = e.hdrs.get('location')
         soup = self.index_to_soup(url)
-        link = soup.find('a', href=True)
-        skip_sections =[ # add sections you want to skip
+        link = soup.find('a', href=True)['href']
+        skip_sections = [  # add sections you want to skip
             '/video/', '/videos/', '/media/', 'podcast'
         ]
-        if any(x in link['href'] for x in skip_sections):
-            self.log('Aborting Article ', link['href'])
+        if any(x in link for x in skip_sections):
+            self.log('Aborting Article ', link)
             self.abort_article('skipping video links')
+        html = br.open(link).read()
+        return {'data': html, 'url': link}

-        self.log('Downloading ', link['href'])
-        html = br.open(link['href']).read()
-        pt = PersistentTemporaryFile('.html')
-        pt.write(html)
-        pt.close()
-        return pt.name
-
-    feeds = []
-
-    sections = [
-        'daily', 'realtime', 'education', 'property', 'racing', 'supplement', 'kol'
-    ]
+    feeds = [
+        ('日報', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com%2Fdaily%2F&hl=zh-HK&gl=HK&ceid=HK:zh'),
+        ('即時', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com%2Frealtime%2F&hl=zh-HK&gl=HK&ceid=HK:zh'),
+        ('副刊', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com%2Fsupplement%2F&hl=zh-HK&gl=HK&ceid=HK:zh'),
+        ('其他的新聞', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com&hl=zh-HK&gl=HK&ceid=HK:zh')
+    ]

-    for sec in sections:
-        a = 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com{}&hl=zh-HK&gl=HK&ceid=HK:zh'
-        feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F')))
-    feeds.append(('Others', a.format('')))

     def populate_article_metadata(self, article, soup, first):
         article.title = article.title.replace(' - 星島頭條', '')
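All four new feed URLs follow one Google News RSS pattern: "when:27h" limits results to roughly the last 27 hours, and "allinurl:" restricts matches to URLs under a given std.stheadline.com path. A minimal sketch of how such a URL can be assembled (the helper name and defaults below are illustrative, not part of the recipe):

    from urllib.parse import quote

    def google_news_rss(site, section='', hours=27, hl='zh-HK', gl='HK', ceid='HK:zh'):
        # Percent-encode the site/section URL so it survives inside the q= query.
        path = 'https://' + site + ('/' + section + '/' if section else '')
        q = 'when:{}h+allinurl:{}'.format(hours, quote(path, safe=''))
        return ('https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}'
                .format(q, hl, gl, ceid))

    # e.g. google_news_rss('std.stheadline.com', 'daily') reproduces the
    # 日報 feed URL above.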