Updated Ming Pao and Various Taiwanese news sources by Eddie Lau

Kovid Goyal 2011-05-12 10:10:14 -06:00
parent c7c9ade376
commit 16c92f3d23
4 changed files with 505 additions and 304 deletions

View File

@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
# dug from http://www.mobileread.com/forums/showthread.php?p=1012294
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1277443634(BasicNewsRecipe):
title = u'中時電子報'
oldest_article = 1
max_articles_per_feed = 100
feeds = [(u'焦點', u'http://rss.chinatimes.com/rss/focus-u.rss'),
(u'政治', u'http://rss.chinatimes.com/rss/Politic-u.rss'),
(u'社會', u'http://rss.chinatimes.com/rss/social-u.rss'),
(u'國際', u'http://rss.chinatimes.com/rss/international-u.rss'),
(u'兩岸', u'http://rss.chinatimes.com/rss/mainland-u.rss'),
(u'地方', u'http://rss.chinatimes.com/rss/local-u.rss'),
(u'言論', u'http://rss.chinatimes.com/rss/comment-u.rss'),
(u'科技', u'http://rss.chinatimes.com/rss/technology-u.rss'),
(u'運動', u'http://rss.chinatimes.com/rss/sport-u.rss'),
(u'藝文', u'http://rss.chinatimes.com/rss/philology-u.rss'),
#(u'旺報', u'http://rss.chinatimes.com/rss/want-u.rss'),
#(u'財經', u'http://rss.chinatimes.com/rss/finance-u.rss'), # broken links
#(u'股市', u'http://rss.chinatimes.com/rss/stock-u.rss') # broken links
]
__author__ = 'einstuerzende, updated by Eddie Lau'
__version__ = '1.0'
language = 'zh'
publisher = 'China Times Group'
description = 'China Times (Taiwan)'
category = 'News, Chinese, Taiwan'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
encoding = 'big5'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.fcuaa.org/gif/chinatimeslogo.gif'
cover_url = 'http://www.fcuaa.org/gif/chinatimeslogo.gif'
keep_only_tags = [dict(name='div', attrs={'class':['articlebox','articlebox clearfix']})]
remove_tags = [dict(name='div', attrs={'class':['focus-news']})]

View File

@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
# dug from http://www.mobileread.com/forums/showthread.php?p=1012294
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1277443634(BasicNewsRecipe):
title = u'自由電子報'
oldest_article = 1
max_articles_per_feed = 100
feeds = [(u'焦點新聞', u'http://www.libertytimes.com.tw/rss/fo.xml'),
(u'政治新聞', u'http://www.libertytimes.com.tw/rss/p.xml'),
(u'生活新聞', u'http://www.libertytimes.com.tw/rss/life.xml'),
(u'國際新聞', u'http://www.libertytimes.com.tw/rss/int.xml'),
(u'自由廣場', u'http://www.libertytimes.com.tw/rss/o.xml'),
(u'社會新聞', u'http://www.libertytimes.com.tw/rss/so.xml'),
(u'體育新聞', u'http://www.libertytimes.com.tw/rss/sp.xml'),
(u'財經焦點', u'http://www.libertytimes.com.tw/rss/e.xml'),
(u'證券理財', u'http://www.libertytimes.com.tw/rss/stock.xml'),
(u'影視焦點', u'http://www.libertytimes.com.tw/rss/show.xml'),
(u'北部新聞', u'http://www.libertytimes.com.tw/rss/north.xml'),
(u'中部新聞', u'http://www.libertytimes.com.tw/rss/center.xml'),
(u'南部新聞', u'http://www.libertytimes.com.tw/rss/south.xml'),
(u'大台北新聞', u'http://www.libertytimes.com.tw/rss/taipei.xml'),
(u'藝術文化', u'http://www.libertytimes.com.tw/rss/art.xml'),
]
extra_css = '''span[class='insubject1'][id='newtitle'] {font-size:200%; font-weight:bold;}'''
__author__ = 'einstuerzende, updated by Eddie Lau'
__version__ = '1.1'
language = 'zh'
publisher = 'Liberty Times Group'
description = 'Liberty Times (Taiwan)'
category = 'News, Chinese, Taiwan'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
encoding = 'big5'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.libertytimes.com.tw/2008/images/img_auto/005/logo_new.gif'
cover_url = 'http://www.libertytimes.com.tw/2008/images/img_auto/005/logo_new.gif'
keep_only_tags = [dict(name='td', attrs={'id':['newsContent']})]

View File

@@ -1,15 +1,18 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Eddie Lau'
# Users of Kindle 3 with limited system-level CJK support
# please replace the following "True" with "False".
__MakePeriodical__ = True
# Turn below to True if your device supports display of CJK titles
__UseChineseTitle__ = False
# Turn below to True if you wish to use life.mingpao.com as the main article source
__UseLife__ = True
'''
Change Log:
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
2011/03/06: add new articles for finance section, also a new section "Columns"
2011/02/28: rearrange the sections
[Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles
@@ -32,41 +35,43 @@ import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation

class MPHKRecipe(BasicNewsRecipe):
    title = 'Ming Pao - Hong Kong'
    oldest_article = 1
    max_articles_per_feed = 100
    __author__ = 'Eddie Lau'
    description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
    publisher = 'MingPao'
    category = 'Chinese, News, Hong Kong'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'zh'
    encoding = 'Big5-HKSCS'
    recursions = 0
    conversion_options = {'linearize_tables':True}
    timefmt = ''
    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
    keep_only_tags = [dict(name='h1'),
                      dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                      dict(name='font', attrs={'color':['AA0000']}), # for column articles title
                      dict(attrs={'id':['newscontent']}), # entertainment and column page content
                      dict(attrs={'id':['newscontent01','newscontent02']}),
                      dict(attrs={'class':['photo']}),
                      dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
                      ]
    remove_tags = [dict(name='style'),
                   dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com
                   dict(name='table')] # for content fetched from life.mingpao.com
    remove_attributes = ['width']
    preprocess_regexps = [
                          (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
                           lambda match: '<h1>'),
                          (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
@@ -80,10 +85,10 @@ class MPHKRecipe(BasicNewsRecipe):
                           lambda match: "</b>")
                         ]

    def image_url_processor(cls, baseurl, url):
        # trick: break the url at the first occurrence of a digit, add an additional
        # '_' at the front
        # not working, may need to move this to preprocess_html() method
        # minIdx = 10000
        # i0 = url.find('0')
        # if i0 >= 0 and i0 < minIdx:
@@ -115,314 +120,357 @@ class MPHKRecipe(BasicNewsRecipe):
        # i9 = url.find('9')
        # if i9 >= 0 and i9 < minIdx:
        #     minIdx = i9
        return url

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local hk time - at around HKT 6.00am, all news are available
        dt_local = dt_utc - datetime.timedelta(-2.0/24)
        return dt_local

    def get_fetchdate(self):
        return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchday(self):
        # dt_utc = datetime.datetime.utcnow()
        # convert UTC to local hk time - at around HKT 6.00am, all news are available
        # dt_local = dt_utc - datetime.timedelta(-2.0/24)
        return self.get_dtlocal().strftime("%d")

    def get_cover_url(self):
        cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except:
            cover = None
        return cover

    def parse_index(self):
        feeds = []
        dateStr = self.get_fetchdate()

        if __UseLife__:
            for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'),
                                       (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'),
                                       (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'),
                                       (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr', 'nal'),
                                       (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalfa', 'nal'),
                                       (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalca', 'nal'),
                                       (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalta', 'nal'),
                                       (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
                                       (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
                                       (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
                                       (u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl')]:
                articles = self.parse_section2(url, keystr)
                if articles:
                    feeds.append((title, articles))

            for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                               (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
                articles = self.parse_section(url)
                if articles:
                    feeds.append((title, articles))
        else:
            for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
                               (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
                articles = self.parse_section(url)
                if articles:
                    feeds.append((title, articles))

            # special- editorial
            ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr')
            if ed_articles:
                feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))

            for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                               (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
                               (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
                articles = self.parse_section(url)
                if articles:
                    feeds.append((title, articles))

            # special - finance
            #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
            fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
            if fin_articles:
                feeds.append((u'\u7d93\u6fdf Finance', fin_articles))

            for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
                               (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
                articles = self.parse_section(url)
                if articles:
                    feeds.append((title, articles))

            # special - entertainment
            ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
            if ent_articles:
                feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))

            for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                               (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
                articles = self.parse_section(url)
                if articles:
                    feeds.append((title, articles))

            # special- columns
            col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn')
            if col_articles:
                feeds.append((u'\u5c08\u6b04 Columns', col_articles))

        return feeds

    # parse from news.mingpao.com
    def parse_section(self, url):
        dateStr = self.get_fetchdate()
        soup = self.index_to_soup(url)
        divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
        current_articles = []
        included_urls = []
        divs.reverse()
        for i in divs:
            a = i.find('a', href = True)
            title = self.tag_to_string(a)
            url = a.get('href', False)
            url = 'http://news.mingpao.com/' + dateStr + '/' + url
            if url not in included_urls and url.rfind('Redirect') == -1:
                current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    # parse from life.mingpao.com
    def parse_section2(self, url, keystr):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def parse_ed_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1):
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def parse_fin_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        current_articles = []
        included_urls = []
        for i in a:
            #url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            #if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
            if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1):
                title = self.tag_to_string(i)
                current_articles.append({'title': title, 'url': url, 'description':''})
                included_urls.append(url)
        return current_articles

    def parse_ent_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1):
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def parse_col_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1):
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(style=True):
            del item['width']
        for item in soup.findAll(stype=True):
            del item['absmiddle']
        return soup

    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        if __UseChineseTitle__ == True:
            title = u'\u660e\u5831 (\u9999\u6e2f)'
        else:
            title = self.short_title()
        # if not generating a periodical, force date to apply in title
        if __MakePeriodical__ == False:
            title = title + ' ' + self.get_fetchformatteddate()
        if True:
            mi = MetaInformation(title, [self.publisher])
            mi.publisher = self.publisher
            mi.author_sort = self.publisher
            if __MakePeriodical__ == True:
                mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
            else:
                mi.publication_type = self.publication_type+':'+self.short_title()
            #mi.timestamp = nowf()
            mi.timestamp = self.get_dtlocal()
            mi.comments = self.description
            if not isinstance(mi.comments, unicode):
                mi.comments = mi.comments.decode('utf-8', 'replace')
            #mi.pubdate = nowf()
            mi.pubdate = self.get_dtlocal()
            opf_path = os.path.join(dir, 'index.opf')
            ncx_path = os.path.join(dir, 'index.ncx')
            opf = OPFCreator(dir, mi)
            # Add mastheadImage entry to <guide> section
            mp = getattr(self, 'masthead_path', None)
            if mp is not None and os.access(mp, os.R_OK):
                from calibre.ebooks.metadata.opf2 import Guide
                ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
                ref.type = 'masthead'
                ref.title = 'Masthead Image'
                opf.guide.append(ref)

            manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
            manifest.append(os.path.join(dir, 'index.html'))
            manifest.append(os.path.join(dir, 'index.ncx'))

            # Get cover
            cpath = getattr(self, 'cover_path', None)
            if cpath is None:
                pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
                if self.default_cover(pf):
                    cpath = pf.name
            if cpath is not None and os.access(cpath, os.R_OK):
                opf.cover = cpath
                manifest.append(cpath)

            # Get masthead
            mpath = getattr(self, 'masthead_path', None)
            if mpath is not None and os.access(mpath, os.R_OK):
                manifest.append(mpath)

            opf.create_manifest_from_files_in(manifest)
            for mani in opf.manifest:
                if mani.path.endswith('.ncx'):
                    mani.id = 'ncx'
                if mani.path.endswith('mastheadImage.jpg'):
                    mani.id = 'masthead-image'
            entries = ['index.html']
            toc = TOC(base_path=dir)
            self.play_order_counter = 0
            self.play_order_map = {}

            def feed_index(num, parent):
                f = feeds[num]
                for j, a in enumerate(f):
                    if getattr(a, 'downloaded', False):
                        adir = 'feed_%d/article_%d/'%(num, j)
                        auth = a.author
                        if not auth:
                            auth = None
                        desc = a.text_summary
                        if not desc:
                            desc = None
                        else:
                            desc = self.description_limiter(desc)
                        entries.append('%sindex.html'%adir)
                        po = self.play_order_map.get(entries[-1], None)
                        if po is None:
                            self.play_order_counter += 1
                            po = self.play_order_counter
                        parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
                                        play_order=po, author=auth, description=desc)
                        last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                        for sp in a.sub_pages:
                            prefix = os.path.commonprefix([opf_path, sp])
                            relp = sp[len(prefix):]
                            entries.append(relp.replace(os.sep, '/'))
                            last = sp
                        if os.path.exists(last):
                            with open(last, 'rb') as fi:
                                src = fi.read().decode('utf-8')
                            soup = BeautifulSoup(src)
                            body = soup.find('body')
                            if body is not None:
                                prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
                                templ = self.navbar.generate(True, num, j, len(f),
                                                             not self.has_single_feed,
                                                             a.orig_url, self.publisher, prefix=prefix,
                                                             center=self.center_navbar)
                                elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                                body.insert(len(body.contents), elem)
                                with open(last, 'wb') as fi:
                                    fi.write(unicode(soup).encode('utf-8'))

            if len(feeds) == 0:
                raise Exception('All feeds are empty, aborting.')
            if len(feeds) > 1:
                for i, f in enumerate(feeds):
                    entries.append('feed_%d/index.html'%i)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    auth = getattr(f, 'author', None)
                    if not auth:
                        auth = None
                    desc = getattr(f, 'description', None)
                    if not desc:
                        desc = None
                    feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                                               f.title, play_order=po, description=desc, author=auth))
            else:
                entries.append('feed_%d/index.html'%0)
                feed_index(0, toc)
            for i, p in enumerate(entries):
                entries[i] = os.path.join(dir, p.replace('/', os.sep))
            opf.create_spine(entries)
            opf.set_toc(toc)

            with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
                opf.render(opf_file, ncx_file)

View File

@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class UnitedDaily(BasicNewsRecipe):
title = u'聯合新聞網'
oldest_article = 1
max_articles_per_feed = 100
feeds = [(u'焦點', u'http://udn.com/udnrss/focus.xml'),
(u'政治', u'http://udn.com/udnrss/politics.xml'),
(u'社會', u'http://udn.com/udnrss/social.xml'),
(u'生活', u'http://udn.com/udnrss/life.xml'),
(u'綜合', u'http://udn.com/udnrss/education.xml'),
(u'意見評論', u'http://udn.com/udnrss/opinion.xml'),
(u'大台北', u'http://udn.com/udnrss/local_taipei.xml'),
(u'桃竹苗', u'http://udn.com/udnrss/local_tyhcml.xml'),
(u'中彰投', u'http://udn.com/udnrss/local_tcchnt.xml'),
(u'雲嘉南', u'http://udn.com/udnrss/local_ylcytn.xml'),
(u'高屏離島', u'http://udn.com/udnrss/local_ksptisland.xml'),
(u'基宜花東', u'http://udn.com/udnrss/local_klilhltt.xml'),
(u'台灣百寶鄉', u'http://udn.com/udnrss/local_oddlyenough.xml'),
(u'兩岸要聞', u'http://udn.com/udnrss/mainland.xml'),
(u'國際焦點', u'http://udn.com/udnrss/international.xml'),
(u'台商經貿', u'http://udn.com/udnrss/financechina.xml'),
(u'國際財經', u'http://udn.com/udnrss/financeworld.xml'),
(u'財經焦點', u'http://udn.com/udnrss/financesfocus.xml'),
(u'股市要聞', u'http://udn.com/udnrss/stock.xml'),
(u'股市快訊', u'http://udn.com/udnrss/stklatest.xml'),
(u'稅務法務', u'http://udn.com/udnrss/tax.xml'),
(u'房市情報', u'http://udn.com/udnrss/houses.xml'),
(u'棒球', u'http://udn.com/udnrss/baseball.xml'),
(u'籃球', u'http://udn.com/udnrss/basketball.xml'),
(u'體壇動態', u'http://udn.com/udnrss/sportsfocus.xml'),
(u'熱門星聞', u'http://udn.com/udnrss/starsfocus.xml'),
(u'廣電港陸', u'http://udn.com/udnrss/tv.xml'),
(u'海外星球', u'http://udn.com/udnrss/starswestern.xml'),
(u'日韓星情', u'http://udn.com/udnrss/starsjk.xml'),
(u'電影世界', u'http://udn.com/udnrss/movie.xml'),
(u'流行音樂', u'http://udn.com/udnrss/music.xml'),
(u'觀點專題', u'http://udn.com/udnrss/starssubject.xml'),
(u'食樂指南', u'http://udn.com/udnrss/food.xml'),
(u'折扣好康', u'http://udn.com/udnrss/shopping.xml'),
(u'醫藥新聞', u'http://udn.com/udnrss/health.xml'),
(u'家婦繽紛', u'http://udn.com/udnrss/benfen.xml'),
(u'談星論命', u'http://udn.com/udnrss/astrology.xml'),
(u'文化副刊', u'http://udn.com/udnrss/reading.xml'),
]
extra_css = '''div[id='story_title'] {font-size:200%; font-weight:bold;}'''
__author__ = 'Eddie Lau'
__version__ = '1.0'
language = 'zh'
publisher = 'United Daily News Group'
description = 'United Daily (Taiwan)'
category = 'News, Chinese, Taiwan'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
encoding = 'big5'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://udn.com/NEWS/2004/images/logo_udn.gif'
cover_url = 'http://udn.com/NEWS/2004/images/logo_udn.gif'
keep_only_tags = [dict(name='div', attrs={'id':['story_title','story_author', 'story']})]
remove_tags = [dict(name='div', attrs={'id':['mvouter']})]