diff --git a/recipes/apple_daily.recipe b/recipes/apple_daily.recipe
deleted file mode 100644
index bb95ca5ec8..0000000000
--- a/recipes/apple_daily.recipe
+++ /dev/null
@@ -1,305 +0,0 @@
-# vim:fileencoding=UTF-8
-from __future__ import unicode_literals
-__license__ = 'GPL v3'
-__copyright__ = '2013-2015, Eddie Lau'
-__Date__ = ''
-
-from calibre import (__appname__, force_unicode, strftime)
-from calibre.utils.date import now as nowf, utcnow
-import os
-import datetime
-import re
-from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre.ebooks.metadata.opf2 import OPFCreator
-from calibre.ebooks.metadata.toc import TOC
-from calibre.ebooks.metadata import MetaInformation
-from calibre.utils.localization import canonicalize_lang
-
-
-class AppleDaily(BasicNewsRecipe):
-    title = u'蘋果日報 (香港)'
-    __author__ = 'Eddie Lau'
-    publisher = '蘋果日報'
-    publication_type = 'newspaper'
-    oldest_article = 1
-    max_articles_per_feed = 100
-    auto_cleanup = False
-    language = 'zh'
-    encoding = 'utf-8'
-    auto_cleanup = False
-    remove_javascript = True
-    use_embedded_content = False
-    no_stylesheets = True
-    description = 'http://hkm.appledaily.com/'
-    category = 'Chinese, News, Hong Kong'
-    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/8/86/Apple_Daily_Title.svg'
-
-    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} h1 {font-size:125%; text-align:left; font-weight:bold;} p{font-size:90%;} p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}'  # noqa
-    keep_only_tags = [dict(name='div', attrs={'id': 'content-article'})]
-    remove_tags = [dict(name='div', attrs={'class': 'prev-next-btn'}),
-                   dict(name='p', attrs={'class': 'next'}),
-                   dict(name='meta'),
-                   dict(name='link')]
-
-    def get_dtlocal(self):
-        dt_utc = utcnow()
-        # convert UTC to local hk time - at HKT 6am, all news are available
-        return dt_utc + datetime.timedelta(8.0 / 24) - datetime.timedelta(6.0 / 24)
-
-    def get_fetchdate(self):
-        if __Date__ != '':
-            return __Date__
-        else:
-            return self.get_dtlocal().strftime("%Y%m%d")
-
-    def get_fetchformatteddate(self):
-        if __Date__ != '':
-            return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8]
-        else:
-            return self.get_dtlocal().strftime("%Y-%m-%d")
-
-    def get_fetchyear(self):
-        if __Date__ != '':
-            return __Date__[0:4]
-        else:
-            return self.get_dtlocal().strftime("%Y")
-
-    def get_fetchmonth(self):
-        if __Date__ != '':
-            return __Date__[4:6]
-        else:
-            return self.get_dtlocal().strftime("%m")
-
-    def get_fetchday(self):
-        if __Date__ != '':
-            return __Date__[6:8]
-        else:
-            return self.get_dtlocal().strftime("%d")
-
-    # Note: does not work with custom date given by __Date__
-    def get_weekday(self):
-        return self.get_dtlocal().weekday()
-
-    def get_cover_url(self):
-        soup = self.index_to_soup('http://hkm.appledaily.com/')
-        cover = soup.find(attrs={'class': 'top-news'}).get('src', False)
-        br = BasicNewsRecipe.get_browser(self)
-        try:
-            br.open(cover)
-        except:
-            cover = None
-        return cover
-
-    def populate_article_metadata(self, article, soup, first):
-        if first and hasattr(self, 'add_toc_thumbnail'):
-            picdiv = soup.find('img')
-            if picdiv is not None:
-                self.add_toc_thumbnail(article, picdiv['src'])
-
-    def parse_index(self):
-        feeds = []
-        soup = self.index_to_soup('http://hkm.appledaily.com/')
-        ul = soup.find(attrs={'class': 'menu'})
-        sectionList = []
-        for li in ul.findAll('li'):
-            relativea = li.find('a', href=True).get('href', False)
-            a = 'http://hkm.appledaily.com/' + relativea
-            title = li.find('a', text=True).strip()
-            # if (time.tzname != 'HKT'):
-            #     if (title == u'三藩市'):
-            #         continue
-            #     if (title == u'洛杉磯'):
-            #         continue
-            #     if (title == u'紐  約'):
-            #         continue
-            #     if (title == u'美  國'):
-            #         continue
-            # if (not title == u'動新聞') and (relativea.startswith('list.php')):
-            if (relativea.find('category=daily') != -1) and (relativea.startswith('list.php')):
-                sectionList.append((title, a))
-        for title, url in sectionList:
-            title = title.replace(" ", "")
-            articles = self.parse_section(url)
-            if articles:
-                feeds.append((title, articles))
-        return feeds
-
-    def parse_section(self, url):
-        soup = self.index_to_soup(url)
-        ul = soup.find(attrs={'class': 'list'})
-        current_articles = []
-        if ul is None:
-            return current_articles
-        for li in ul.findAll('li'):
-            a = li.find('a', href=True)
-            title = li.find('p', text=True).strip()
-            if a is not None:
-                current_articles.append(
-                    {'title': title, 'url': 'http://hkm.appledaily.com/' + a.get('href', False)})
-            pass
-        return current_articles
-
-    def create_opf(self, feeds, dir=None):
-        if dir is None:
-            dir = self.output_dir
-        title = self.short_title()
-        if self.output_profile.periodical_date_in_title:
-            title += strftime(self.timefmt)
-        mi = MetaInformation(title, [__appname__])
-        mi.publisher = __appname__
-        mi.author_sort = __appname__
-        if self.publication_type:
-            mi.publication_type = 'periodical:' + \
-                self.publication_type + ':' + self.short_title()
-        mi.timestamp = nowf()
-        article_titles, aseen = [], set()
-        for f in feeds:
-            for a in f:
-                if a.title and a.title not in aseen:
-                    aseen.add(a.title)
-                    article_titles.append(force_unicode(a.title, 'utf-8'))
-
-        mi.comments = self.description
-        if not isinstance(mi.comments, type(u'')):
-            mi.comments = mi.comments.decode('utf-8', 'replace')
-        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
-                        '\n\n'.join(article_titles))
-
-        language = canonicalize_lang(self.language)
-        if language is not None:
-            mi.language = language
-        # This one affects the pub date shown in kindle title
-        # mi.pubdate = nowf()
-        # now appears to need the time field to be > 12.00noon as well
-        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(
-            self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
-        opf_path = os.path.join(dir, 'index.opf')
-        ncx_path = os.path.join(dir, 'index.ncx')
-
-        opf = OPFCreator(dir, mi)
-        # Add mastheadImage entry to <guide> section
-        mp = getattr(self, 'masthead_path', None)
-        if mp is not None and os.access(mp, os.R_OK):
-            from calibre.ebooks.metadata.opf2 import Guide
-            ref = Guide.Reference(os.path.basename(
-                self.masthead_path), os.getcwd())
-            ref.type = 'masthead'
-            ref.title = 'Masthead Image'
-            opf.guide.append(ref)
-
-        manifest = [os.path.join(dir, 'feed_%d' % i)
-                    for i in range(len(feeds))]
-        manifest.append(os.path.join(dir, 'index.html'))
-        manifest.append(os.path.join(dir, 'index.ncx'))
-
-        # Get cover
-        cpath = getattr(self, 'cover_path', None)
-        if cpath is None:
-            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
-            if self.default_cover(pf):
-                cpath = pf.name
-        if cpath is not None and os.access(cpath, os.R_OK):
-            opf.cover = cpath
-            manifest.append(cpath)
-
-        # Get masthead
-        mpath = getattr(self, 'masthead_path', None)
-        if mpath is not None and os.access(mpath, os.R_OK):
-            manifest.append(mpath)
-
-        opf.create_manifest_from_files_in(manifest)
-        for mani in opf.manifest:
-            if mani.path.endswith('.ncx'):
-                mani.id = 'ncx'
-            if mani.path.endswith('mastheadImage.jpg'):
-                mani.id = 'masthead-image'
-
-        entries = ['index.html']
-        toc = TOC(base_path=dir)
-        self.play_order_counter = 0
-        self.play_order_map = {}
-
-        def feed_index(num, parent):
-            f = feeds[num]
-            for j, a in enumerate(f):
-                if getattr(a, 'downloaded', False):
-                    adir = 'feed_%d/article_%d/' % (num, j)
-                    auth = a.author
-                    if not auth:
-                        auth = None
-                    desc = a.text_summary
-                    if not desc:
-                        desc = None
-                    else:
-                        desc = self.description_limiter(desc)
-                    tt = a.toc_thumbnail if a.toc_thumbnail else None
-                    entries.append('%sindex.html' % adir)
-                    po = self.play_order_map.get(entries[-1], None)
-                    if po is None:
-                        self.play_order_counter += 1
-                        po = self.play_order_counter
-                    parent.add_item('%sindex.html' % adir, None,
-                                    a.title if a.title else _(
-                                        'Untitled Article'),
-                                    play_order=po, author=auth,
-                                    description=desc, toc_thumbnail=tt)
-                    last = os.path.join(
-                        self.output_dir, ('%sindex.html' % adir).replace('/', os.sep))
-                    for sp in a.sub_pages:
-                        prefix = os.path.commonprefix([opf_path, sp])
-                        relp = sp[len(prefix):]
-                        entries.append(relp.replace(os.sep, '/'))
-                        last = sp
-
-                    if os.path.exists(last):
-                        with open(last, 'rb') as fi:
-                            src = fi.read().decode('utf-8')
-                        src = src.replace('height:260px !important;','')  # fix flow-player div tag parent
-                        soup = BeautifulSoup(src)
-                        body = soup.find('body')
-                        if body is not None:
-                            prefix = '/'.join('..'for i in range(2 *
-                                                                 len(re.findall(r'link\d+', last))))
-                            templ = self.navbar.generate(True, num, j, len(f),
-                                                         not self.has_single_feed,
-                                                         a.orig_url, __appname__, prefix=prefix,
-                                                         center=self.center_navbar)
-                            translatedTempl = re.sub(
-                                '本篇由 ' + __appname__ +
-                                ' 快取自 蘋果日報 ; 本篇來源位置。' +
-
-        if len(feeds) > 1:
-            for i, f in enumerate(feeds):
-                entries.append('feed_%d/index.html' % i)
-                po = self.play_order_map.get(entries[-1], None)
-                if po is None:
-                    self.play_order_counter += 1
-                    po = self.play_order_counter
-                auth = getattr(f, 'author', None)
-                if not auth:
-                    auth = None
-                desc = getattr(f, 'description', None)
-                if not desc:
-                    desc = None
-                feed_index(i, toc.add_item('feed_%d/index.html' % i, None,
-                                           f.title, play_order=po, description=desc, author=auth))
-
-        else:
-            entries.append('feed_%d/index.html' % 0)
-            feed_index(0, toc)
-
-        for i, p in enumerate(entries):
-            entries[i] = os.path.join(dir, p.replace('/', os.sep))
-        opf.create_spine(entries)
-        opf.set_toc(toc)
-
-        with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file:
-            opf.render(opf_file, ncx_file)
diff --git a/recipes/singtaohk.recipe b/recipes/singtaohk.recipe
index 226a2d7ce2..86137d1458 100644
--- a/recipes/singtaohk.recipe
+++ b/recipes/singtaohk.recipe
@@ -1,5 +1,4 @@
 from calibre.web.feeds.news import BasicNewsRecipe, classes
-from calibre.ptempfile import PersistentTemporaryFile
 
 class STHKRecipe(BasicNewsRecipe):
     title = '星島日報 (香港)'
@@ -37,32 +36,23 @@ class STHKRecipe(BasicNewsRecipe):
         except Exception as e:
             url = e.hdrs.get('location')
         soup = self.index_to_soup(url)
-        link = soup.find('a', href=True)
-        skip_sections =[ # add sections you want to skip
+        link = soup.find('a', href=True)['href']
+        skip_sections = [ # add sections you want to skip
             '/video/', '/videos/', '/media/', 'podcast'
         ]
 
-        if any(x in link['href'] for x in skip_sections):
-            self.log('Aborting Article ', link['href'])
+        if any(x in link for x in skip_sections):
+            self.log('Aborting Article ', link)
             self.abort_article('skipping video links')
+        html = br.open(link).read()
+        return ({ 'data': html, 'url': link })
 
-        self.log('Downloading ', link['href'])
-        html = br.open(link['href']).read()
-        pt = PersistentTemporaryFile('.html')
-        pt.write(html)
-        pt.close()
-        return pt.name
-
-    feeds = []
-
-    sections = [
-        'daily', 'realtime', 'education', 'property', 'racing', 'supplement', 'kol'
+    feeds = [
+        ('日報', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com%2Fdaily%2F&hl=zh-HK&gl=HK&ceid=HK:zh'),
+        ('即時', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com%2Frealtime%2F&hl=zh-HK&gl=HK&ceid=HK:zh'),
+        ('副刊', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com%2Fsupplement%2F&hl=zh-HK&gl=HK&ceid=HK:zh'),
+        ('其他的 新聞', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com&hl=zh-HK&gl=HK&ceid=HK:zh')
     ]
-    for sec in sections:
-        a = 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com{}&hl=zh-HK&gl=HK&ceid=HK:zh'
-        feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F')))
-    feeds.append(('Others', a.format('')))
-
 
     def populate_article_metadata(self, article, soup, first):
         article.title = article.title.replace(' - 星島頭條', '')