# vim:fileencoding=UTF-8 from __future__ import unicode_literals __license__ = 'GPL v3' __copyright__ = '2013-2015, Eddie Lau' __Date__ = '' from calibre import (__appname__, force_unicode, strftime) from calibre.utils.date import now as nowf import os import datetime import re from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata import MetaInformation from calibre.utils.localization import canonicalize_lang class AppleDaily(BasicNewsRecipe): title = u'蘋果日報 (香港)' __author__ = 'Eddie Lau' publisher = '蘋果日報' publication_type= 'newspaper' oldest_article = 1 max_articles_per_feed = 100 auto_cleanup = False language = 'zh' encoding = 'utf-8' auto_cleanup = False remove_javascript = True use_embedded_content = False no_stylesheets = True description = 'http://hkm.appledaily.com/' category = 'Chinese, News, Hong Kong' masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/8/86/Apple_Daily_Title.svg' extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} h1 {font-size:125%; text-align:left; font-weight:bold;} p{font-size:90%;} p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}' # noqa keep_only_tags = [dict(name='div', attrs={'id': 'content-article'})] remove_tags = [dict(name='div', attrs={'class': 'prev-next-btn'}), dict(name='p', attrs={'class': 'next'}), dict(name='meta'), dict(name='link')] def get_dtlocal(self): dt_utc = datetime.datetime.utcnow() # convert UTC to local hk time - at HKT 6am, all news are available return dt_utc + datetime.timedelta(8.0 / 24) - datetime.timedelta(6.0 / 24) def get_fetchdate(self): if __Date__ != '': return __Date__ else: return self.get_dtlocal().strftime("%Y%m%d") def get_fetchformatteddate(self): if __Date__ != '': return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8] else: return self.get_dtlocal().strftime("%Y-%m-%d") def get_fetchyear(self): if __Date__ != '': return __Date__[0:4] else: return self.get_dtlocal().strftime("%Y") def get_fetchmonth(self): if __Date__ != '': return __Date__[4:6] else: return self.get_dtlocal().strftime("%m") def get_fetchday(self): if __Date__ != '': return __Date__[6:8] else: return self.get_dtlocal().strftime("%d") # Note: does not work with custom date given by __Date__ def get_weekday(self): return self.get_dtlocal().weekday() def get_cover_url(self): soup = self.index_to_soup('http://hkm.appledaily.com/') cover = soup.find(attrs={'class': 'top-news'}).get('src', False) br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: cover = None return cover def populate_article_metadata(self, article, soup, first): if first and hasattr(self, 'add_toc_thumbnail'): picdiv = soup.find('img') if picdiv is not None: self.add_toc_thumbnail(article, picdiv['src']) def parse_index(self): feeds = [] soup = self.index_to_soup('http://hkm.appledaily.com/') ul = soup.find(attrs={'class': 'menu'}) sectionList = [] for li in ul.findAll('li'): relativea = li.find('a', href=True).get('href', False) a = 'http://hkm.appledaily.com/' + relativea title = li.find('a', text=True).strip() # if (time.tzname != 'HKT'): # if (title == u'三藩市'): # continue # if (title == u'洛杉磯'): # continue # if (title == u'紐  約'): # continue # if (title == u'美  國'): # continue # if (not title == u'動新聞') and (relativea.startswith('list.php')): if (relativea.find('category=daily')!= -1)and (relativea.startswith('list.php')): sectionList.append((title, a)) for title, url in sectionList: title = title.replace(" ", "") articles = self.parse_section(url) if articles: feeds.append((title, articles)) return feeds def parse_section(self, url): soup = self.index_to_soup(url) ul = soup.find(attrs={'class': 'list'}) current_articles = [] if ul is None : return current_articles for li in ul.findAll('li'): a = li.find('a', href=True) title = li.find('p', text=True).strip() if a is not None: current_articles.append( {'title': title, 'url': 'http://hkm.appledaily.com/' + a.get('href', False)}) pass return current_articles def create_opf(self, feeds, dir=None): if dir is None: dir = self.output_dir title = self.short_title() if self.output_profile.periodical_date_in_title: title += strftime(self.timefmt) mi = MetaInformation(title, [__appname__]) mi.publisher = __appname__ mi.author_sort = __appname__ if self.publication_type: mi.publication_type = 'periodical:' + \ self.publication_type + ':' + self.short_title() mi.timestamp = nowf() article_titles, aseen = [], set() for f in feeds: for a in f: if a.title and a.title not in aseen: aseen.add(a.title) article_titles.append(force_unicode(a.title, 'utf-8')) mi.comments = self.description if not isinstance(mi.comments, type(u'')): mi.comments = mi.comments.decode('utf-8', 'replace') mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' + '\n\n'.join(article_titles)) language = canonicalize_lang(self.language) if language is not None: mi.language = language # This one affects the pub date shown in kindle title # mi.pubdate = nowf() # now appears to need the time field to be > 12.00noon as well mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int( self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) opf_path = os.path.join(dir, 'index.opf') ncx_path = os.path.join(dir, 'index.ncx') opf = OPFCreator(dir, mi) # Add mastheadImage entry to section mp = getattr(self, 'masthead_path', None) if mp is not None and os.access(mp, os.R_OK): from calibre.ebooks.metadata.opf2 import Guide ref = Guide.Reference(os.path.basename( self.masthead_path), os.getcwd()) ref.type = 'masthead' ref.title = 'Masthead Image' opf.guide.append(ref) manifest = [os.path.join(dir, 'feed_%d' % i) for i in range(len(feeds))] manifest.append(os.path.join(dir, 'index.html')) manifest.append(os.path.join(dir, 'index.ncx')) # Get cover cpath = getattr(self, 'cover_path', None) if cpath is None: pf = open(os.path.join(dir, 'cover.jpg'), 'wb') if self.default_cover(pf): cpath = pf.name if cpath is not None and os.access(cpath, os.R_OK): opf.cover = cpath manifest.append(cpath) # Get masthead mpath = getattr(self, 'masthead_path', None) if mpath is not None and os.access(mpath, os.R_OK): manifest.append(mpath) opf.create_manifest_from_files_in(manifest) for mani in opf.manifest: if mani.path.endswith('.ncx'): mani.id = 'ncx' if mani.path.endswith('mastheadImage.jpg'): mani.id = 'masthead-image' entries = ['index.html'] toc = TOC(base_path=dir) self.play_order_counter = 0 self.play_order_map = {} def feed_index(num, parent): f = feeds[num] for j, a in enumerate(f): if getattr(a, 'downloaded', False): adir = 'feed_%d/article_%d/' % (num, j) auth = a.author if not auth: auth = None desc = a.text_summary if not desc: desc = None else: desc = self.description_limiter(desc) tt = a.toc_thumbnail if a.toc_thumbnail else None entries.append('%sindex.html' % adir) po = self.play_order_map.get(entries[-1], None) if po is None: self.play_order_counter += 1 po = self.play_order_counter parent.add_item('%sindex.html' % adir, None, a.title if a.title else _( 'Untitled Article'), play_order=po, author=auth, description=desc, toc_thumbnail=tt) last = os.path.join( self.output_dir, ('%sindex.html' % adir).replace('/', os.sep)) for sp in a.sub_pages: prefix = os.path.commonprefix([opf_path, sp]) relp = sp[len(prefix):] entries.append(relp.replace(os.sep, '/')) last = sp if os.path.exists(last): with open(last, 'rb') as fi: src = fi.read().decode('utf-8') src = src.replace('height:260px !important;','') # fix flow-player div tag parent soup = BeautifulSoup(src) body = soup.find('body') if body is not None: prefix = '/'.join('..'for i in range(2 * len(re.findall(r'link\d+', last)))) templ = self.navbar.generate(True, num, j, len(f), not self.has_single_feed, a.orig_url, __appname__, prefix=prefix, center=self.center_navbar) translatedTempl =re.sub( '本篇由 '+__appname__+ ' 快取自 蘋果日報 ; 本篇來源位置。'+ ' 1: for i, f in enumerate(feeds): entries.append('feed_%d/index.html' % i) po = self.play_order_map.get(entries[-1], None) if po is None: self.play_order_counter += 1 po = self.play_order_counter auth = getattr(f, 'author', None) if not auth: auth = None desc = getattr(f, 'description', None) if not desc: desc = None feed_index(i, toc.add_item('feed_%d/index.html' % i, None, f.title, play_order=po, description=desc, author=auth)) else: entries.append('feed_%d/index.html' % 0) feed_index(0, toc) for i, p in enumerate(entries): entries[i] = os.path.join(dir, p.replace('/', os.sep)) opf.create_spine(entries) opf.set_toc(toc) with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file: opf.render(opf_file, ncx_file)