"),
(re.compile(r"
", re.DOTALL|re.IGNORECASE),
- lambda match: "")
+ lambda match: ""),
+ (re.compile(r'

', re.DOTALL|re.IGNORECASE),
+ lambda match: ''),
+ (re.compile(r'

', re.DOTALL|re.IGNORECASE),
+ lambda match: ''),
+ (re.compile(r'

', re.DOTALL|re.IGNORECASE),
+ lambda match: ''),
+ #(re.compile(r'[
.+?]', re.DOTALL|re.IGNORECASE),
+ #lambda match: '')
]
elif __Region__ == 'Vancouver':
if __UseChineseTitle__ == True:
@@ -221,6 +240,10 @@ class MPRecipe(BasicNewsRecipe):
return __Date__[6:8]
else:
return self.get_dtlocal().strftime("%d")
+
+ # Note: does not work with custom date given by __Date__
+ # Returns the local day of week as an int (datetime.weekday(): Monday == 0,
+ # Sunday == 6). Callers below compare the result against 6 to special-case
+ # Sunday-only section handling.
+ def get_weekday(self):
+ return self.get_dtlocal().weekday()
def get_cover_url(self):
if __Region__ == 'Hong Kong':
@@ -260,7 +283,23 @@ class MPRecipe(BasicNewsRecipe):
if articles:
feeds.append((title, articles))
- if __InclPremium__ == True:
+# if __InclPremium__ == True:
+# # parse column section articles directly from .txt files
+# for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
+# ]:
+# articles = self.parse_section2_txt(url, keystr)
+# if articles:
+# feeds.append((title, articles))
+#
+# for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+# (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+# articles = self.parse_section(url)
+# if articles:
+# feeds.append((title, articles))
+
+ # new
+ if __InclPremium__ == True and (self.get_weekday() <> 6 or __ParseSelectedMobile__ == False):
+ # if both not on Sunday and not __ParseSelectedMobile__, go ahead
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
@@ -268,17 +307,45 @@ class MPRecipe(BasicNewsRecipe):
if articles:
feeds.append((title, articles))
- for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
- (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
- articles = self.parse_section(url)
+ if __InclPremium__ == False or self.get_weekday() <> 6:
+ for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
+ if articles:
+ feeds.append((title, articles))
+ else:
+ if __InclPremium__ == True and __ParseSelectedMobile__ == True:
+ articles = self.parse_section_mobile('http://premium.mingpao.com/pda/palm', 'HotNews1.cfm?cat=ja&token=b218bc260b89c0&online=1')
+ if articles:
+ feeds.append((u'\u526f\u520a Supplement', articles))
+ else:
+ for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
+ if articles:
+ feeds.append((title, articles))
+
+ for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm', 'em')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
if articles:
feeds.append((title, articles))
+ # end of new
else:
- for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
- (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
- (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
- (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
- articles = self.parse_section(url)
+ for title, url, seckey in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm', 'ga'),
+ (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm', 'gb'),
+ (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm', 'gf'),
+ (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm', 'mr')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
if articles:
feeds.append((title, articles))
@@ -287,10 +354,13 @@ class MPRecipe(BasicNewsRecipe):
#if ed_articles:
# feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
- for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
- (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
- (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
- articles = self.parse_section(url)
+ for title, url, seckey in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm', 'fa'),
+ (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm', 'ca'),
+ (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm', 'ta')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
if articles:
feeds.append((title, articles))
@@ -322,7 +392,9 @@ class MPRecipe(BasicNewsRecipe):
if articles:
feeds.append((title, articles))
- if __InclPremium__ == True:
+
+ if __InclPremium__ == True and (self.get_weekday() <> 6 or __ParseSelectedMobile__ == False):
+ # if both not on Sunday or not __ParseSelectedMobile__, go ahead
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
@@ -330,12 +402,36 @@ class MPRecipe(BasicNewsRecipe):
if articles:
feeds.append((title, articles))
- for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
- (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
- articles = self.parse_section(url)
+ if __InclPremium__ == False or self.get_weekday() <> 6:
+ for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
+ if articles:
+ feeds.append((title, articles))
+ else:
+ if __InclPremium__ == True and __ParseSelectedMobile__ == True:
+ articles = self.parse_section_mobile('http://premium.mingpao.com/pda/palm', 'HotNews1.cfm?cat=ja&token=b218bc260b89c0&online=1')
+ if articles:
+ feeds.append((u'\u526f\u520a Supplement', articles))
+ else:
+ for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
+ if articles:
+ feeds.append((title, articles))
+
+ for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm', 'em')]:
+ if __ParseTxt__ == False:
+ articles = self.parse_section(url)
+ else:
+ articles = self.parse_section_txt(url, seckey)
if articles:
feeds.append((title, articles))
-
+
elif __Region__ == 'Vancouver':
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
@@ -366,7 +462,7 @@ class MPRecipe(BasicNewsRecipe):
feeds.append((title, articles))
return feeds
- # parse from news.mingpao.com
+ # parse from news.mingpao.com (web html)
def parse_section(self, url):
dateStr = self.get_fetchdate()
soup = self.index_to_soup(url)
@@ -379,17 +475,57 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(a)
url = a.get('href', False)
url = 'http://news.mingpao.com/' + dateStr + '/' +url
- # replace the url to the print-friendly version
- if __ParsePFF__ == True:
+ # replace the url to the alternative version
+ if __ParsePF__ == True:
+ # printer-friendly option
if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
url = re.sub('%2F.*%2F', '/', url)
- title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
+ if __InclPremium__ == True:
+ title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
url = url.replace('%2Etxt', '_print.htm')
url = url.replace('%5F', '_')
else:
url = url.replace('.htm', '_print.htm')
- if url not in included_urls and url.rfind('Redirect') == -1:
+ #if url not in included_urls and url.rfind('Redirect') == -1 and (__InclPremium__ == False or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
+ if url not in included_urls and (__InclPremium__ == True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
+ current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
+ included_urls.append(url)
+ current_articles.reverse()
+ return current_articles
+
+ # parse from news.mingpao.com (txt)
+ # Build the article list for one section, rewriting every link to the
+ # plain-text mirror under http://news.mingpao.com/ftp/WebNews2/.
+ # url: the section index page; ch: two-letter section key used when
+ # composing the txt path for non-Redirect links.
+ def parse_section_txt(self, url, ch):
+ dateStr = self.get_fetchdate()
+ soup = self.index_to_soup(url)
+ divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
+ current_articles = []
+ included_urls = []
+ # index lists newest first; reverse so we append oldest-to-newest,
+ # then reverse the result again below
+ divs.reverse()
+ for i in divs:
+ a = i.find('a', href = True)
+ title = self.tag_to_string(a)
+ url = a.get('href', False)
+ #print 'Base url: ', url
+ # replace the url to the alternative version
+ # text version
+ if url.rfind('Redirect') <> -1:
+ # premium/redirected link: strip the Redirect.cfm wrapper and
+ # percent-encoded path pieces to reach the raw .txt file
+ url = 'http://news.mingpao.com/' + dateStr + '/' +url
+ #print 'original url: ', url
+ url = re.sub(dateStr + '/../cfm/Redirect.cfm.*NewsFile=', 'ftp/WebNews2/', url)
+ url = re.sub('%2F', '/', url)
+ if __InclPremium__ == True:
+ # drop the paid-content marker u'\u6536\u8cbb\u5167\u5bb9' from the title
+ title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
+ url = url.replace('%2Etxt', '.txt')
+ url = url.replace('%5F', '_')
+ else:
+ # get the first two char in url as ch
+ seckey = url[0:2]
+ url = url.replace('.htm', '.txt')
+ url = 'http://news.mingpao.com/ftp/WebNews2/' + dateStr + '/' + ch + '/' + seckey + '/' + url
+ #print 'updated url: ', url
+ # skip duplicates; unless premium content is included, also skip
+ # articles whose title still carries the paid-content marker
+ if url not in included_urls and (__InclPremium__ == True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
+ #if url not in included_urls and (url.rfind('Redirect') == -1) and (__InclPremium__ == False or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
 current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
 included_urls.append(url)
 current_articles.reverse()
@@ -415,7 +551,7 @@ class MPRecipe(BasicNewsRecipe):
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
except:
- print 'skipping a premium article'
+ print 'skipping a premium article'
current_articles.reverse()
return current_articles
@@ -437,6 +573,20 @@ class MPRecipe(BasicNewsRecipe):
current_articles.reverse()
return current_articles
+ # parse from mobile version
+ # Collect article links from a premium.mingpao.com mobile index page.
+ # Only anchors whose href points at HotNews2.cfm are treated as articles;
+ # relative hrefs are resolved against `base`. NOTE(review): unlike the
+ # other section parsers this does not reverse the list, so articles keep
+ # the page's original order — presumably intentional; confirm.
+ def parse_section_mobile(self, base, page):
+ soup = self.index_to_soup(base + '/' + page)
+ a = soup.findAll('a', href=True)
+ current_articles = []
+ included_urls = []
+ for i in a:
+ title = self.tag_to_string(i)
+ url = i.get('href', False)
+ if url not in included_urls and url.rfind('HotNews2.cfm') <> -1:
+ current_articles.append({'title': title, 'url': base + '/' + url, 'description': ''})
+ included_urls.append(url)
+ return current_articles
+
# parse from www.mingpaovan.com
def parse_section3(self, url, baseUrl):
self.get_fetchdate()
@@ -631,15 +781,22 @@ class MPRecipe(BasicNewsRecipe):
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
- # find the location of the first _
- pos = img.find('_')
- if pos > -1:
- # if found, insert _ after the first _
- newimg = img[0:pos] + '_' + img[pos:]
- new_html = new_html.replace(img, newimg)
+ if __ParseTxt__ == False:
+ # find the location of the first _
+ pos = img.find('_')
+ if pos > -1:
+ # if found, insert _ after the first _
+ newimg = img[0:pos] + '_' + img[pos:]
+ new_html = new_html.replace(img, newimg)
+ else:
+ # if not found, insert _ after "
+ new_html = new_html.replace(img[1:], '"_' + img[1:])
else:
- # if not found, insert _ after "
- new_html = new_html.replace(img[1:], '"_' + img[1:])
+ # insert to front
+ #print 'imgstr: ', img
+ pos = img.find('_')
+ new_html = new_html.replace(img[5:], '_' + img[5:])
+
elif url.rfind('life.mingpao.com') > -1:
imglist = re.findall('src=\'?.*?jpg\'', new_html)
br = mechanize.Browser()
@@ -673,9 +830,13 @@ class MPRecipe(BasicNewsRecipe):
newimg = img[0:pos+1] + '_' + img[pos+1:]
#print 'Use hi-res img', newimg
new_html = new_html.replace(img, newimg)
+ # test
+ #print new_html
return new_html
def preprocess_html(self, soup):
+ for mobiletitle in soup.findAll('font', attrs={'color': ['navy']}):
+ mobiletitle.name = 'h1'
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll(style=True):
@@ -909,3 +1070,4 @@ class MPRecipe(BasicNewsRecipe):
opf.render(opf_file, ncx_file)
+