Updated Ming Pao

Kovid Goyal 2011-03-08 18:54:37 -07:00
parent 14717a5e92
commit a4b50102ff


@@ -1,7 +1,20 @@
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Eddie Lau'
# Users of Kindle 3 (which has limited system-level CJK support)
# should replace the following "True" with "False".
__MakePeriodical__ = True
# Set it to True if your device can display CJK titles
__UseChineseTitle__ = False
'''
Change Log:
2011/03/06: add a new article source for the finance section and a new section "Columns"
2011/02/28: rearrange the sections
[Disabled until the Kindle has better CJK support and can remember the last (section, article) read in Sections & Articles
View] use the same title when generating a periodical, so past issues are automatically filed into the "Past Issues"
folder on a Kindle 3
2011/02/20: skip duplicated links in the finance section, move photos that may span a whole page to the end of articles,
clean up the indentation
2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
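Reviewer note: the two module-level switches above are only consumed in create_opf() near the end of this diff. A minimal sketch of how the title is assembled from them (build_title and the sample date string are hypothetical, for illustration only):

def build_title(use_chinese_title, make_periodical, date_str='20110308'):
    # mirrors the title logic in create_opf() below
    title = u'\u660e\u5831 (\u9999\u6e2f)' if use_chinese_title else 'Ming Pao - Hong Kong'
    if not make_periodical:
        # standalone book: append the fetch date so issues stay distinguishable
        title = title + ' ' + date_str
    return title

print(build_title(False, True))   # Ming Pao - Hong Kong
print(build_title(False, False))  # Ming Pao - Hong Kong 20110308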
@@ -19,21 +32,17 @@ import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
class MPHKRecipe(BasicNewsRecipe):
IsCJKWellSupported = True # Set to False to avoid generating a periodical whose section/article view cannot display CJK characters
title = 'Ming Pao - Hong Kong'
oldest_article = 1
max_articles_per_feed = 100
__author__ = 'Eddie Lau'
description = ('Hong Kong Chinese Newspaper (http://news.mingpao.com). If '
'you are using a Kindle with firmware < 3.1, customize the '
'recipe')
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
publisher = 'MingPao'
category = 'Chinese, News, Hong Kong'
remove_javascript = True
@@ -48,12 +57,14 @@ class MPHKRecipe(BasicNewsRecipe):
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'),
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
dict(attrs={'id':['newscontent']}), # entertainment page content
dict(name='font', attrs={'color':['AA0000']}), # for column articles title
dict(attrs={'id':['newscontent']}), # entertainment and column page content
dict(attrs={'id':['newscontent01','newscontent02']}),
dict(attrs={'class':['photo']})
]
remove_tags = [dict(name='style'),
dict(attrs={'id':['newscontent135']})] # for the finance page
dict(attrs={'id':['newscontent135']}), # for the finance page
dict(name='table')] # for content fetched from life.mingpao.com
remove_attributes = ['width']
preprocess_regexps = [
(re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
@@ -61,7 +72,12 @@ class MPHKRecipe(BasicNewsRecipe):
(re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
lambda match: '</h1>'),
(re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
lambda match: '')
lambda match: ''),
# skip <br> after title in life.mingpao.com fetched article
(re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
lambda match: "<div id='newscontent'>"),
(re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
lambda match: "</b>")
]
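Reviewer note: calibre applies the (pattern, function) pairs above to the raw page HTML before parsing. A standalone sketch of the same substitution technique, using a made-up snippet in the shape of a life.mingpao.com page:

import re

rules = [
    (re.compile(r'<h5>', re.DOTALL | re.IGNORECASE), lambda match: '<h1>'),
    (re.compile(r'</h5>', re.DOTALL | re.IGNORECASE), lambda match: '</h1>'),
    (re.compile(r"<div id='newscontent'><br>", re.DOTALL | re.IGNORECASE),
     lambda match: "<div id='newscontent'>"),
]

raw = "<div id='newscontent'><br><h5>Sample headline</h5><p>Body...</p>"
for pattern, func in rules:
    raw = pattern.sub(func, raw)
print(raw)  # <div id='newscontent'><h1>Sample headline</h1><p>Body...</p>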
def image_url_processor(cls, baseurl, url):
@@ -129,28 +145,55 @@ class MPHKRecipe(BasicNewsRecipe):
def parse_index(self):
feeds = []
dateStr = self.get_fetchdate()
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
articles = self.parse_section(url)
if articles:
feeds.append((title, articles))
# special - editorial
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr')
if ed_articles:
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
articles = self.parse_section(url)
if articles:
feeds.append((title, articles))
# special - finance
fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
if fin_articles:
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
articles = self.parse_section(url)
if articles:
feeds.append((title, articles))
# special - entertainment
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
if ent_articles:
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url)
if articles:
feeds.append((title, articles))
# special - columns
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn')
if col_articles:
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
return feeds
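Reviewer note: parse_index() must hand calibre a list of (section title, articles) pairs, each article a dict with at least a title and url. A sketch of the shape built above (URLs and values are illustrative only):

feeds = [
    (u'\u8981\u805e Headline', [
        {'title': 'First headline',
         'url': 'http://news.mingpao.com/20110308/gaa1.htm',
         'description': ''},
    ]),
    (u'\u7d93\u6fdf Finance', [
        {'title': 'A finance story',
         'url': 'http://life.mingpao.com/cfm/dailynews3.cfm?File=20110308/nalea/a1.txt',
         'description': ''},
    ]),
]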
def parse_section(self, url):
@@ -171,15 +214,33 @@ class MPHKRecipe(BasicNewsRecipe):
current_articles.reverse()
return current_articles
def parse_ed_section(self, url):
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
a.reverse()
current_articles = []
included_urls = []
for i in a:
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1):
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
current_articles.reverse()
return current_articles
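Reviewer note: the chained `not url.rfind(...) == -1` tests above are plain substring checks; written with `in`, the same filter reads directly. An equivalent sketch (the sample URL is hypothetical):

def is_editorial_link(url, included_urls):
    # same test as above: an unseen URL that contains both '.txt' and 'nal'
    return url not in included_urls and '.txt' in url and 'nal' in url

url = 'http://life.mingpao.com/cfm/dailynews3.cfm?File=20110308/nalmr/lead.txt'
print(is_editorial_link(url, []))  # True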
def parse_fin_section(self, url):
dateStr = self.get_fetchdate()
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
current_articles = []
included_urls = []
for i in a:
url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
#url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
#if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1):
title = self.tag_to_string(i)
current_articles.append({'title': title, 'url': url, 'description':''})
included_urls.append(url)
@@ -201,6 +262,22 @@ class MPHKRecipe(BasicNewsRecipe):
current_articles.reverse()
return current_articles
def parse_col_section(self, url):
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
a.reverse()
current_articles = []
included_urls = []
for i in a:
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1):
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
current_articles.reverse()
return current_articles
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
@@ -213,18 +290,18 @@ class MPHKRecipe(BasicNewsRecipe):
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
if self.IsCJKWellSupported == True:
# use Chinese title
title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate()
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u9999\u6e2f)'
else:
# use English title
title = self.short_title() + ' ' + self.get_fetchformatteddate()
if True: # force date in title
# title += strftime(self.timefmt)
title = self.short_title()
# if not generating a periodical, append the date to the title
if __MakePeriodical__ == False:
title = title + ' ' + self.get_fetchformatteddate()
if True:
mi = MetaInformation(title, [self.publisher])
mi.publisher = self.publisher
mi.author_sort = self.publisher
if self.IsCJKWellSupported == True:
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
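Reviewer note: the publication_type string is what marks the output as a periodical on the device; everything after the first colon is metadata. A sketch of the two branches above, assuming short_title() returns 'Ming Pao - Hong Kong' and publication_type is 'newspaper':

def opf_publication_type(make_periodical,
                         publication_type='newspaper',
                         short_title='Ming Pao - Hong Kong'):
    # mirrors the final branch of create_opf() above
    if make_periodical:
        return 'periodical:' + publication_type + ':' + short_title
    return publication_type + ':' + short_title

print(opf_publication_type(True))   # periodical:newspaper:Ming Pao - Hong Kong
print(opf_publication_type(False))  # newspaper:Ming Pao - Hong Kong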