__license__ = 'GPL v3'
__copyright__ = '2010, Eddie Lau'
'''
Change Log:
2010/12/07: add entertainment section, use newspaper front page as ebook cover,
            suppress date display in section list (to avoid wrong date display
            in case the user generates the ebook in a time zone different from HKT)
2010/11/22: add English section, remove eco-news section which is not updated
            daily, correct ordering of articles
2010/11/12: add news image and eco-news section
2010/11/08: add parsing of finance section
2010/11/06: temporary work-around for Kindle device having no capability to
            display unicode in section/article list
2010/10/31: skip repeated articles in section pages
'''

import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre import __appname__
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation


class MPHKRecipe(BasicNewsRecipe):
    IsKindleUsed = True  # to avoid generating a periodical in which CJK characters can't be displayed in section/article view

    title = 'Ming Pao - Hong Kong'
    oldest_article = 1
    max_articles_per_feed = 100
    __author__ = 'Eddie Lau'
    description = 'Hong Kong Chinese Newspaper'
    publisher = 'news.mingpao.com'
    category = 'Chinese, News, Hong Kong'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'zh'
    encoding = 'Big5-HKSCS'
    recursions = 0
    conversion_options = {'linearize_tables': True}
    timefmt = ''
    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
    keep_only_tags = [dict(name='h1'),
                      dict(name='font', attrs={'style': ['font-size:14pt; line-height:160%;']}),  # for entertainment page title
                      dict(attrs={'class': ['photo']}),
                      dict(attrs={'id': ['newscontent']}),  # entertainment page content
                      dict(attrs={'id': ['newscontent01', 'newscontent02']})]
    remove_tags = [dict(name='style'),
                   dict(attrs={'id': ['newscontent135']})]  # for the finance page
    remove_attributes = ['width']
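    # Normalize the article markup before the tag filters above run: the
    # headline tags are promoted to <h1> so keep_only_tags retains them, and
    # the link paragraphs that clutter entertainment pages are stripped.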
    preprocess_regexps = [
        (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
         lambda match: '<h1>'),
        (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
         lambda match: '</h1>'),
        (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE),  # for entertainment page
         lambda match: '')
    ]
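    # image_url_processor() is the hook calibre calls to rewrite each image
    # URL before it is fetched; this recipe keeps it as a pass-through for
    # now (see the note inside the method).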
    def image_url_processor(cls, baseurl, url):
        # Intended trick: break the url at the first occurrence of a digit
        # and add an additional '_' in front of it, e.g.
        #   m = re.search(r'\d', url)
        #   if m is not None:
        #       url = url[:m.start()] + '_' + url[m.start():]
        # Not working yet -- this may need to move into the preprocess_html()
        # method instead, so the url is returned unchanged for now.
        return url

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        # Convert UTC towards local Hong Kong time (UTC+8), deliberately six
        # hours short: the fetch date only rolls over at around HKT 6.00am,
        # by which time all news for the day are available.
        dt_local = dt_utc + datetime.timedelta(hours=2)
        return dt_local

    def get_fetchdate(self):
        return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchday(self):
        return self.get_dtlocal().strftime("%d")

    def get_cover_url(self):
        cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + \
                self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except:
            cover = None
        return cover

    def parse_index(self):
        feeds = []
        dateStr = self.get_fetchdate()
        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
                           (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
                           (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
                           (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                           (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
                           (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
                           ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
                           (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
                           (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                           (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        # special - finance
        fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
        if fin_articles:
            feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
        # special - eco-friendly (not updated daily, so currently disabled)
        # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
        # if eco_articles:
        #     feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
        # special - entertainment
        ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
        if ent_articles:
            feeds.append((u'\u5f71\u8996 Entertainment', ent_articles))
        return feeds
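    # Section pages list their stories in 'bullet'/'bullet_grey' divs. The
    # divs are walked in reverse so that when an article is repeated on the
    # page only its last occurrence is kept; the collected list is then
    # reversed again to restore reading order.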
    def parse_section(self, url):
        dateStr = self.get_fetchdate()
        soup = self.index_to_soup(url)
        divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']})
        current_articles = []
        included_urls = []
        divs.reverse()
        for i in divs:
            a = i.find('a', href=True)
            title = self.tag_to_string(a)
            url = a.get('href', False)
            url = 'http://news.mingpao.com/' + dateStr + '/' + url
            if url not in included_urls and 'Redirect' not in url:
                current_articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def parse_fin_section(self, url):
        dateStr = self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        current_articles = []
        for i in a:
            url = i.get('href', False)
            if dateStr in url and 'index' not in url:
                title = self.tag_to_string(i)
                url = 'http://www.mpfinance.com/cfm/' + url
                current_articles.append({'title': title, 'url': url, 'description': ''})
        return current_articles

    def parse_eco_section(self, url):
        dateStr = self.get_fetchdate()
        soup = self.index_to_soup(url)
        divs = soup.findAll(attrs={'class': ['bullet']})
        current_articles = []
        included_urls = []
        for i in divs:
            a = i.find('a', href=True)
            title = self.tag_to_string(a)
            url = a.get('href', False)
            url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' + url
            if url not in included_urls and 'Redirect' not in url and '.txt' in url and dateStr in url:
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        return current_articles

    def parse_ent_section(self, url):
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
            if url not in included_urls and '.txt' in url and 'star' in url:
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(width=True):
            del item['width']
        for item in soup.findAll(align=True):
            del item['align']  # e.g. the align="absmiddle" used on inline images
        return soup
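    # create_opf() largely mirrors the stock BasicNewsRecipe implementation,
    # but when IsKindleUsed is set it omits the 'periodical:' publication-type
    # prefix and stamps the title, timestamp and pubdate with the Hong Kong
    # fetch date, so the book is not rendered in the Kindle periodical view
    # (where CJK section/article names cannot be displayed).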
    def create_opf(self, feeds, dir=None):
        if not self.IsKindleUsed:
            super(MPHKRecipe, self).create_opf(feeds, dir)
            return
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        title += ' ' + self.get_fetchdate()
        # if self.output_profile.periodical_date_in_title:
        #     title += strftime(self.timefmt)
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        mi.publication_type = self.publication_type + ':' + self.short_title()
        # mi.timestamp = nowf()
        mi.timestamp = self.get_dtlocal()
        mi.comments = self.description
        if not isinstance(mi.comments, unicode):
            mi.comments = mi.comments.decode('utf-8', 'replace')
        # mi.pubdate = nowf()
        mi.pubdate = self.get_dtlocal()
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')
        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to the <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)

        manifest = [os.path.join(dir, 'feed_%d' % i) for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))

        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)

        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)

        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'

        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/' % (num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    entries.append('%sindex.html' % adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html' % adir, None,
                                    a.title if a.title else _('Untitled Article'),
                                    play_order=po, author=auth, description=desc)
                    last = os.path.join(self.output_dir, ('%sindex.html' % adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp
                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..' for i in range(2 * len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                                         not self.has_single_feed,
                                                         a.orig_url, __appname__,
                                                         prefix=prefix,
                                                         center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(unicode(soup).encode('utf-8'))

        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html' % i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html' % i, None,
                                           f.title, play_order=po,
                                           description=desc, author=auth))
        else:
            entries.append('feed_%d/index.html' % 0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
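# A quick way to try the recipe from the command line (a sketch; the output
# file name is arbitrary, and --test restricts the fetch to a few articles):
#
#   ebook-convert ming_pao.recipe ming_pao.mobi --test
#
# calibre compiles the class above, runs parse_index() for the current HKT
# fetch date and assembles the ebook via the custom create_opf().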