# vim:fileencoding=UTF-8
from __future__ import unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2011-2013, Eddie Lau'

# data source: normal, mobile
__Source__ = 'mobile'
# Set it to False if you do not want the output to be generated as a
# periodical (Default: True)
__MakePeriodical__ = True
# Set it to True if your device supports display of CJK titles
# (Default: False)
__UseChineseTitle__ = True
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# Set it to True if you want to include a summary in Kindle's article view
# (Default: False)
__IncludeSummary__ = True
# Set it to True if you want thumbnail images in Kindle's article view
# (Default: True)
__IncludeThumbnails__ = True

'''
Change Log:
2013/03/31 -- fix cover retrieval code and heading size, and remove &nbsp; in summary
2011/12/29 -- first version done
'''

from calibre.utils.date import now as nowf
import os
import datetime
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang


# MAIN CLASS
class STHKRecipe(BasicNewsRecipe):
    if __UseChineseTitle__ is True:
        title = u'\u661f\u5cf6\u65e5\u5831 (\u9999\u6e2f)'
    else:
        title = 'Sing Tao Daily - Hong Kong'
    description = 'Hong Kong Chinese Newspaper (http://singtao.com)'
    category = 'Chinese, News, Hong Kong'
    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:200%;}'  # noqa
    masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png'
    if __Source__ == 'normal':
        keep_only_tags = [
            dict(name='td', attrs={'class': ['bodyhead', 'bodytext']})]
    else:
        keep_only_tags = [dict(name='td', attrs={'class': ['stmobheadline']}),
                          dict(name='img', attrs={'width': ['146']}),
                          dict(name='td', attrs={'class': ['bodytextg']}),
                          ]
    if __KeepImages__:
        remove_tags = [dict(name='hr')]
    else:
        remove_tags = [dict(name='hr'), dict(name='img')]
    remove_attributes = ['align']
    # NOTE: the tag pattern and the markup in the replacement string were
    # lost in this copy of the file; collapsing runs of three or more <br>
    # tags down to two is an assumption consistent with the fragments that
    # survived.
    preprocess_regexps = [
        (re.compile(r'(<br\s*/?>\s*){3,}', re.DOTALL | re.IGNORECASE),
         lambda match: '<br><br>'),
    ]
    oldest_article = 1
    max_articles_per_feed = 200
    __author__ = 'Eddie Lau'
    publisher = 'Sing Tao Ltd.'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'zh'
    encoding = 'Big5-HKSCS'
    recursions = 0
    conversion_options = {'linearize_tables': True}
    timefmt = ''
    auto_cleanup = False
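
    # The helpers below compute the date of "today's edition" in Hong Kong
    # time. datetime.timedelta() accepts days as a float, so 8.0 / 24 is
    # +8 hours (UTC -> HKT) and 4.0 / 24 is -4 hours: before 4:00am HKT,
    # when the day's news is not yet fully posted, the recipe keeps fetching
    # the previous day's edition. For example, 19:00 UTC on 2013-03-31 maps
    # to 23:00 the same day, while 21:00 UTC rolls over to 2013-04-01.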
    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local hk time - at HKT 4.00am, all news are available
        dt_local = dt_utc + \
            datetime.timedelta(8.0 / 24) - datetime.timedelta(4.0 / 24)
        return dt_local

    def get_fetchdate(self):
        return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchyear(self):
        return self.get_dtlocal().strftime("%Y")

    def get_fetchmonth(self):
        return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
        return self.get_dtlocal().strftime("%d")

    def get_cover_url(self):
        soup = self.index_to_soup('http://m.singtao.com/')
        cover = soup.find(attrs={'class': 'special'}).get('src', False)
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except Exception:
            cover = None
        return cover
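
    # parse_index() must return calibre's feed structure: a list of
    # (section_title, list_of_articles) tuples, where each article is a dict
    # with 'title', 'url', 'description' and 'date' keys. Both the printed
    # edition ('normal') branch and the mobile branch build that same shape.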
    def parse_index(self):
        feeds = []
        if __Source__ == 'normal':
            # single-item section
            for title, url in [(u'\u793e\u8ad6 Editorial',
                                'http://singtao.com/yesterday/jou/j_index.html')]:
                article = self.parse_singleitem_section(url)
                if article:
                    feeds.append((title, article))
            # multiple-item sections
            for title, url, baseurl in [
                    (u'\u8981\u805e\u6e2f\u805e Local', 'http://singtao.com/yesterday/loc/a_index.html', '/'),
                    (u'\u8ca1\u7d93 Finance', 'http://singtao.com/yesterday/fin/d_index.html', '/'),
                    (u'\u5730\u7522 Properties', 'http://singtao.com/yesterday/pro/h_index.html', '/'),
                    (u'\u6559\u80b2 Education', 'http://singtao.com/yesterday/edu/g_index.asp', '/'),
                    (u'\u5a1b\u6a02 Entertainment', 'http://singtao.com/yesterday/ent/f_index.html', '/'),
                    (u'\u9ad4\u80b2 Sports', 'http://singtao.com/yesterday/spo/c_index.html', '/'),
                    (u'\u99ac\u7d93 Horse Racing', 'http://singtao.com/yesterday/rac/n_index.html', '/'),
                    (u'\u526f\u520a Supplements', 'http://singtao.com/yesterday/sup/m_index.html', '/'),
                    (u'\u570b\u969b World', 'http://singtao.com/yesterday/int/b_index.html', '/'),
                    (u'\u4e2d\u570b China', 'http://singtao.com/yesterday/chi/e_index.html', '/')]:
                articles = self.parse_section_withouttext(url, baseurl)
                if articles:
                    feeds.append((title, articles))
        else:  # use mobile
            # single-item section
            for title, url in [(u'\u793e\u8ad6 Editorial',
                                'http://m.singtao.com/showContent.php?main=paper&sub=0&title=0')]:
                article = self.parse_singleitem_section_m(url)
                if article:
                    feeds.append((title, article))
            # multiple-item sections
            for title, url, baseurl in [
                    (u'\u8981\u805e\u6e2f\u805e Local', 'http://m.singtao.com/showTitle.php?main=paper&sub=1', 'http://m.singtao.com/'),
                    (u'\u8ca1\u7d93 Finance', 'http://m.singtao.com/showTitle.php?main=paper&sub=2', 'http://m.singtao.com/'),
                    (u'\u5730\u7522 Properties', 'http://m.singtao.com/showTitle.php?main=paper&sub=3', 'http://m.singtao.com/'),
                    (u'\u6559\u80b2 Education', 'http://m.singtao.com/showTitle.php?main=paper&sub=4', 'http://m.singtao.com/'),
                    (u'\u5a1b\u6a02 Entertainment', 'http://m.singtao.com/showTitle.php?main=paper&sub=5', 'http://m.singtao.com/'),
                    (u'\u99ac\u7d93 Horse Racing', 'http://m.singtao.com/showTitle.php?main=paper&sub=6', 'http://m.singtao.com/'),
                    (u'\u9ad4\u80b2 Sports', 'http://m.singtao.com/showTitle.php?main=paper&sub=7', 'http://m.singtao.com/'),
                    (u'\u526f\u520a Supplements', 'http://m.singtao.com/showTitle.php?main=paper&sub=8', 'http://m.singtao.com/'),
                    (u'\u570b\u969b World', 'http://m.singtao.com/showTitle.php?main=paper&sub=9', 'http://m.singtao.com/'),
                    (u'\u4e2d\u570b China', 'http://m.singtao.com/showTitle.php?main=paper&sub=10', 'http://m.singtao.com/')]:
                articles = self.parse_multiitem_section_m(url, baseurl)
                if articles:
                    feeds.append((title, articles))
        return feeds

    def parse_singleitem_section(self, url):
        current_articles = []
        current_articles.append(
            {'title': '', 'url': url, 'description': '', 'date': ''})
        return current_articles

    def parse_singleitem_section_m(self, url):
        current_articles = []
        current_articles.append(
            {'title': '', 'url': url, 'description': '', 'date': ''})
        return current_articles

    def parse_section(self, url):
        soup = self.index_to_soup(url)
        # find the article link tables
        tables = soup.findAll(name='table', attrs={'width': ['436']})
        current_articles_all = []
        for table in tables:
            divs = table.findAll(name='a')
            current_articles = []
            included_urls = []
            for i in divs:
                title = self.tag_to_string(i)
                urlstr = i.get('href', False)
                urlstr = url + '/../' + urlstr
                if urlstr not in included_urls:
                    current_articles.append(
                        {'title': title, 'url': urlstr, 'description': '', 'date': ''})
                    included_urls.append(urlstr)
            current_articles_all.extend(current_articles)
        return current_articles_all

    def parse_section_withouttext(self, url, baseurl):
        soup = self.index_to_soup(url)
        # find all <a> tags, excluding the section-heading links
        links = soup.findAll(name='a')
        for elink in soup.findAll(name='a', attrs={'class': 'secondhead'}):
            links.remove(elink)
        for elink in soup.findAll(name='a', attrs={'class': 'second02'}):
            links.remove(elink)
        current_articles_all = []
        included_urls = []
        for link in links:
            title = self.tag_to_string(link)
            if len(title.strip()) > 0:
                urlstr = link.get('href', False)
                if urlstr.rfind(baseurl) == -1 and urlstr.rfind('mailto:') == -1:
                    urlstr = url + '/../' + urlstr
                    if urlstr not in included_urls:
                        current_articles_all.append(
                            {'title': title, 'url': urlstr, 'description': '', 'date': ''})
                        included_urls.append(urlstr)
        return current_articles_all
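
    # On the mobile site every article link sits inside a
    # <span class="urlurl"> container and carries a relative href, so links
    # are collected per container and resolved against baseurl
    # ('http://m.singtao.com/') rather than against the section page URL.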
    def parse_multiitem_section_m(self, url, baseurl):
        soup = self.index_to_soup(url)
        current_articles_all = []
        included_urls = []
        for linkraw in soup.findAll(name='span', attrs={'class': 'urlurl'}):
            # search within the link container, not the whole page
            for link in linkraw.findAll(name='a'):
                title = self.tag_to_string(link)
                if len(title.strip()) > 0:
                    urlstr = link.get('href', False)
                    urlstr = baseurl + urlstr
                    if urlstr not in included_urls:
                        current_articles_all.append(
                            {'title': title, 'url': urlstr, 'description': '', 'date': ''})
                        included_urls.append(urlstr)
        return current_articles_all

    def populate_article_metadata(self, article, soup, first):
        if __Source__ == 'normal':
            # get title if not fetched in parse_section() function
            if article.title == '' or len(article.title.strip()) == 0:
                articletitle = soup.findAll('td', attrs={'class': 'bodyhead'})
                if articletitle:
                    articletitlemod = articletitle[0].find('font')
                    if articletitlemod:
                        article.title = articletitlemod.string.strip()
                    else:
                        article.title = articletitle[0].string.strip()
        else:
            # use the title in the text in any case
            articletitle = soup.findAll(
                'td', attrs={'class': 'stmobheadline'})
            if articletitle:
                articletitle[0].br.extract()
                article.title = articletitle[0].contents[0]
        # get thumbnail image
        if __IncludeThumbnails__ and first and hasattr(self, 'add_toc_thumbnail'):
            img = soup.find('img')
            if img is not None:
                self.add_toc_thumbnail(article, img['src'])
        try:
            if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
                # look for content
                if __Source__ == 'normal':
                    articlebodies = soup.findAll(
                        'font', attrs={'class': 'bodytext'})
                else:
                    articlebodies = soup.findAll(
                        'div', attrs={'class': 'hkadj'})
                if articlebodies:
                    for articlebody in articlebodies:
                        if articlebody:
                            # the text may or may not be enclosed in a <p>
                            # tag
                            paras = articlebody.findAll('p')
                            if not paras:
                                paras = articlebody
                            textFound = False
                            for p in paras:
                                if not textFound:
                                    summary_candidate = self.tag_to_string(
                                        p).strip().replace('&nbsp;', '')
                                    if len(summary_candidate) > 0:
                                        summary_candidate = summary_candidate.replace(
                                            u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1)
                                        article.summary = article.text_summary = summary_candidate
                                        textFound = True
            else:
                # display a simple text
                # article.summary = article.text_summary = u'\u66f4\u591a......'
                # display word counts
                counts = 0
                if __Source__ == 'normal':
                    articlebodies = soup.findAll(
                        'font', attrs={'class': 'bodytext'})
                else:
                    articlebodies = soup.findAll(
                        'div', attrs={'class': 'hkadj'})
                if articlebodies:
                    for articlebody in articlebodies:
                        # the text may or may not be enclosed in a <p>
                        # tag
                        paras = articlebody.findAll('p')
                        if not paras:
                            paras = articlebody
                        for p in paras:
                            summary_candidate = self.tag_to_string(p).strip()
                            counts += len(summary_candidate)
                # report the character count, e.g. u'\uff08123\u5b57\uff09'
                article.summary = article.text_summary = u'\uff08' + \
                    str(counts) + u'\u5b57\uff09'
        except Exception:
            self.log("Error creating article descriptions")
            return
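
    # create_opf() below started as a copy of BasicNewsRecipe.create_opf()
    # from calibre 0.8.31; the numbered 'change' comments mark where this
    # recipe deviates from the stock implementation (periodical flag,
    # publisher metadata, adjusted dates, and skipping the article list in
    # the book description).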
    # override from the one in version 0.8.31
    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        # change 1: allow our own flag to tell if a periodical is to be
        # generated; also use the customised date instead of the current time
        if __MakePeriodical__ is False or self.output_profile.periodical_date_in_title:
            title = title + ' ' + self.get_fetchformatteddate()
        # end of change 1
        # change 2: __appname__ replaced by the newspaper publisher
        __appname__ = self.publisher
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        # change 3: use the __MakePeriodical__ flag to tell if a periodical
        # should be generated
        if __MakePeriodical__ is True:
            mi.publication_type = 'periodical:' + \
                self.publication_type + ':' + self.short_title()
        else:
            mi.publication_type = self.publication_type + ':' + self.short_title()
        # change 4: in the following, all the nowf() calls are changed to the
        # adjusted time; this one does not matter
        mi.timestamp = nowf()
        # change 5: skip listing the articles in the book description
        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # this one affects the pub date shown in the Kindle title; it appears
        # to need a time later than 12:00 noon as well
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(
            self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')
        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to the <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(
                self.masthead_path), os.getcwd())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)
        manifest = [os.path.join(dir, 'feed_%d' % i)
                    for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))
        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)
        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)
        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'
        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/' % (num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html' % adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html' % adir, None,
                                    a.title if a.title else 'Untitled Article',
                                    play_order=po, author=auth,
                                    description=desc, toc_thumbnail=tt)
                    last = os.path.join(
                        self.output_dir, ('%sindex.html' % adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp
                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..' for i in range(
                                2 * len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(
                                True, num, j, len(f),
                                not self.has_single_feed,
                                a.orig_url, __appname__, prefix=prefix,
                                center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(
                                doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(type(u'')(soup).encode('utf-8'))

        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html' % i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html' % i, None,
                                           f.title, play_order=po,
                                           description=desc, author=auth))
        else:
            entries.append('feed_%d/index.html' % 0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file:
            opf.render(opf_file, ncx_file)
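
# A convenient way to test changes to this recipe is calibre's command-line
# converter in test mode, which fetches only a couple of articles per feed;
# the file name below is whatever this file is saved as:
#
#   ebook-convert singtaohk.recipe output.epub --test -vv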