Updated Ming Pao and Various Taiwanese news sources by Eddie Lau

Kovid Goyal 2011-05-12 10:10:14 -06:00
parent c7c9ade376
commit 16c92f3d23
4 changed files with 505 additions and 304 deletions

View File

@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
# dug from http://www.mobileread.com/forums/showthread.php?p=1012294
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1277443634(BasicNewsRecipe):
title = u'中時電子報'
oldest_article = 1
max_articles_per_feed = 100
feeds = [(u'焦點', u'http://rss.chinatimes.com/rss/focus-u.rss'),
(u'政治', u'http://rss.chinatimes.com/rss/Politic-u.rss'),
(u'社會', u'http://rss.chinatimes.com/rss/social-u.rss'),
(u'國際', u'http://rss.chinatimes.com/rss/international-u.rss'),
(u'兩岸', u'http://rss.chinatimes.com/rss/mainland-u.rss'),
(u'地方', u'http://rss.chinatimes.com/rss/local-u.rss'),
(u'言論', u'http://rss.chinatimes.com/rss/comment-u.rss'),
(u'科技', u'http://rss.chinatimes.com/rss/technology-u.rss'),
(u'運動', u'http://rss.chinatimes.com/rss/sport-u.rss'),
(u'藝文', u'http://rss.chinatimes.com/rss/philology-u.rss'),
#(u'旺報', u'http://rss.chinatimes.com/rss/want-u.rss'),
#(u'財經', u'http://rss.chinatimes.com/rss/finance-u.rss'), # broken links
#(u'股市', u'http://rss.chinatimes.com/rss/stock-u.rss') # broken links
]
__author__ = 'einstuerzende, updated by Eddie Lau'
__version__ = '1.0'
language = 'zh'
publisher = 'China Times Group'
description = 'China Times (Taiwan)'
category = 'News, Chinese, Taiwan'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
encoding = 'big5'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.fcuaa.org/gif/chinatimeslogo.gif'
cover_url = 'http://www.fcuaa.org/gif/chinatimeslogo.gif'
keep_only_tags = [dict(name='div', attrs={'class':['articlebox','articlebox clearfix']})]
remove_tags = [dict(name='div', attrs={'class':['focus-news']})]

View File

@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
# dug from http://www.mobileread.com/forums/showthread.php?p=1012294
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1277443634(BasicNewsRecipe):
title = u'自由電子報'
oldest_article = 1
max_articles_per_feed = 100
feeds = [(u'焦點新聞', u'http://www.libertytimes.com.tw/rss/fo.xml'),
(u'政治新聞', u'http://www.libertytimes.com.tw/rss/p.xml'),
(u'生活新聞', u'http://www.libertytimes.com.tw/rss/life.xml'),
(u'國際新聞', u'http://www.libertytimes.com.tw/rss/int.xml'),
(u'自由廣場', u'http://www.libertytimes.com.tw/rss/o.xml'),
(u'社會新聞', u'http://www.libertytimes.com.tw/rss/so.xml'),
(u'體育新聞', u'http://www.libertytimes.com.tw/rss/sp.xml'),
(u'財經焦點', u'http://www.libertytimes.com.tw/rss/e.xml'),
(u'證券理財', u'http://www.libertytimes.com.tw/rss/stock.xml'),
(u'影視焦點', u'http://www.libertytimes.com.tw/rss/show.xml'),
(u'北部新聞', u'http://www.libertytimes.com.tw/rss/north.xml'),
(u'中部新聞', u'http://www.libertytimes.com.tw/rss/center.xml'),
(u'南部新聞', u'http://www.libertytimes.com.tw/rss/south.xml'),
(u'大台北新聞', u'http://www.libertytimes.com.tw/rss/taipei.xml'),
(u'藝術文化', u'http://www.libertytimes.com.tw/rss/art.xml'),
]
extra_css = '''span[class='insubject1'][id='newtitle'] {font-size:200%; font-weight:bold;}'''
__author__ = 'einstuerzende, updated by Eddie Lau'
__version__ = '1.1'
language = 'zh'
publisher = 'Liberty Times Group'
description = 'Liberty Times (Taiwan)'
category = 'News, Chinese, Taiwan'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
encoding = 'big5'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.libertytimes.com.tw/2008/images/img_auto/005/logo_new.gif'
cover_url = 'http://www.libertytimes.com.tw/2008/images/img_auto/005/logo_new.gif'
keep_only_tags = [dict(name='td', attrs={'id':['newsContent']})]

View File

@@ -1,15 +1,18 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Eddie Lau'
# Users of Kindle 3 with limited system-level CJK support
# please replace the following "True" with "False".
__MakePeriodical__ = True
# Turn below to True if your device supports display of CJK titles
__UseChineseTitle__ = False
# Turn below to True if you wish to use life.mingpao.com as the main article source
__UseLife__ = True
'''
Change Log:
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
2011/03/06: add new articles for finance section, also a new section "Columns"
2011/02/28: rearrange the sections
[Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles
@@ -32,41 +35,43 @@ import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation

class MPHKRecipe(BasicNewsRecipe):
    title = 'Ming Pao - Hong Kong'
    oldest_article = 1
    max_articles_per_feed = 100
    __author__ = 'Eddie Lau'
    description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
    publisher = 'MingPao'
    category = 'Chinese, News, Hong Kong'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'zh'
    encoding = 'Big5-HKSCS'
    recursions = 0
    conversion_options = {'linearize_tables':True}
    timefmt = ''
    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
    keep_only_tags = [dict(name='h1'),
                      dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                      dict(name='font', attrs={'color':['AA0000']}), # for column articles title
                      dict(attrs={'id':['newscontent']}), # entertainment and column page content
                      dict(attrs={'id':['newscontent01','newscontent02']}),
                      dict(attrs={'class':['photo']}),
                      dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
                      ]
    remove_tags = [dict(name='style'),
                   dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com
                   dict(name='table')] # for content fetched from life.mingpao.com
    remove_attributes = ['width']
    preprocess_regexps = [
                          (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
                           lambda match: '<h1>'),
                          (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
@@ -80,10 +85,10 @@ class MPHKRecipe(BasicNewsRecipe):
                           lambda match: "</b>")
                         ]

    def image_url_processor(cls, baseurl, url):
        # trick: break the url at the first occurrence of a digit, add an additional
        # '_' at the front
        # not working, may need to move this to preprocess_html() method
        # minIdx = 10000
        # i0 = url.find('0')
        # if i0 >= 0 and i0 < minIdx:
@@ -115,314 +120,357 @@ class MPHKRecipe(BasicNewsRecipe):
        # i9 = url.find('9')
        # if i9 >= 0 and i9 < minIdx:
        #     minIdx = i9
        return url

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local hk time - at around HKT 6.00am, all news are available
        dt_local = dt_utc - datetime.timedelta(-2.0/24)
        return dt_local

    def get_fetchdate(self):
        return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchday(self):
        # dt_utc = datetime.datetime.utcnow()
        # convert UTC to local hk time - at around HKT 6.00am, all news are available
        # dt_local = dt_utc - datetime.timedelta(-2.0/24)
        return self.get_dtlocal().strftime("%d")

    def get_cover_url(self):
        cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except:
            cover = None
        return cover

    def parse_index(self):
        feeds = []
        dateStr = self.get_fetchdate()

        if __UseLife__:
            for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'),
                                       (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'),
                                       (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'),
                                       (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr', 'nal'),
                                       (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalfa', 'nal'),
                                       (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalca', 'nal'),
                                       (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalta', 'nal'),
                                       (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
                                       (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
                                       (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
                                       (u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl')]:
                articles = self.parse_section2(url, keystr)
                if articles:
                    feeds.append((title, articles))

            for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                               (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
                articles = self.parse_section(url)
                if articles:
                    feeds.append((title, articles))
        else:
            for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
                               (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
                articles = self.parse_section(url)
                if articles:
                    feeds.append((title, articles))

            # special- editorial
            ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr')
            if ed_articles:
                feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))

            for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                               (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
                               (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
                articles = self.parse_section(url)
                if articles:
                    feeds.append((title, articles))

            # special - finance
            #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
            fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
            if fin_articles:
                feeds.append((u'\u7d93\u6fdf Finance', fin_articles))

            for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
                               (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
                articles = self.parse_section(url)
                if articles:
                    feeds.append((title, articles))

            # special - entertainment
            ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
            if ent_articles:
                feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))

            for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                               (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
                articles = self.parse_section(url)
                if articles:
                    feeds.append((title, articles))

            # special- columns
            col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn')
            if col_articles:
                feeds.append((u'\u5c08\u6b04 Columns', col_articles))

        return feeds

    # parse from news.mingpao.com
    def parse_section(self, url):
        dateStr = self.get_fetchdate()
        soup = self.index_to_soup(url)
        divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
        current_articles = []
        included_urls = []
        divs.reverse()
        for i in divs:
            a = i.find('a', href = True)
            title = self.tag_to_string(a)
            url = a.get('href', False)
            url = 'http://news.mingpao.com/' + dateStr + '/' + url
            if url not in included_urls and url.rfind('Redirect') == -1:
                current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    # parse from life.mingpao.com
    def parse_section2(self, url, keystr):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def parse_ed_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1):
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def parse_fin_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        current_articles = []
        included_urls = []
        for i in a:
            #url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            #if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
            if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1):
                title = self.tag_to_string(i)
                current_articles.append({'title': title, 'url': url, 'description':''})
                included_urls.append(url)
        return current_articles

    def parse_ent_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1):
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def parse_col_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1):
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(style=True):
            del item['width']
        for item in soup.findAll(stype=True):
            del item['absmiddle']
        return soup

    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        if __UseChineseTitle__ == True:
            title = u'\u660e\u5831 (\u9999\u6e2f)'
        else:
            title = self.short_title()
        # if not generating a periodical, force date to apply in title
        if __MakePeriodical__ == False:
            title = title + ' ' + self.get_fetchformatteddate()
        if True:
            mi = MetaInformation(title, [self.publisher])
            mi.publisher = self.publisher
            mi.author_sort = self.publisher
            if __MakePeriodical__ == True:
                mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
            else:
                mi.publication_type = self.publication_type+':'+self.short_title()
            #mi.timestamp = nowf()
            mi.timestamp = self.get_dtlocal()
            mi.comments = self.description
            if not isinstance(mi.comments, unicode):
                mi.comments = mi.comments.decode('utf-8', 'replace')
            #mi.pubdate = nowf()
            mi.pubdate = self.get_dtlocal()
            opf_path = os.path.join(dir, 'index.opf')
            ncx_path = os.path.join(dir, 'index.ncx')
            opf = OPFCreator(dir, mi)
            # Add mastheadImage entry to <guide> section
            mp = getattr(self, 'masthead_path', None)
            if mp is not None and os.access(mp, os.R_OK):
                from calibre.ebooks.metadata.opf2 import Guide
                ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
                ref.type = 'masthead'
                ref.title = 'Masthead Image'
                opf.guide.append(ref)

            manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
            manifest.append(os.path.join(dir, 'index.html'))
            manifest.append(os.path.join(dir, 'index.ncx'))

            # Get cover
            cpath = getattr(self, 'cover_path', None)
            if cpath is None:
                pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
                if self.default_cover(pf):
                    cpath = pf.name
            if cpath is not None and os.access(cpath, os.R_OK):
                opf.cover = cpath
                manifest.append(cpath)

            # Get masthead
            mpath = getattr(self, 'masthead_path', None)
            if mpath is not None and os.access(mpath, os.R_OK):
                manifest.append(mpath)

            opf.create_manifest_from_files_in(manifest)
            for mani in opf.manifest:
                if mani.path.endswith('.ncx'):
                    mani.id = 'ncx'
                if mani.path.endswith('mastheadImage.jpg'):
                    mani.id = 'masthead-image'
            entries = ['index.html']
            toc = TOC(base_path=dir)
            self.play_order_counter = 0
            self.play_order_map = {}

            def feed_index(num, parent):
                f = feeds[num]
                for j, a in enumerate(f):
                    if getattr(a, 'downloaded', False):
                        adir = 'feed_%d/article_%d/'%(num, j)
                        auth = a.author
                        if not auth:
                            auth = None
                        desc = a.text_summary
                        if not desc:
                            desc = None
                        else:
                            desc = self.description_limiter(desc)
                        entries.append('%sindex.html'%adir)
                        po = self.play_order_map.get(entries[-1], None)
                        if po is None:
                            self.play_order_counter += 1
                            po = self.play_order_counter
                        parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
                                        play_order=po, author=auth, description=desc)
                        last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                        for sp in a.sub_pages:
                            prefix = os.path.commonprefix([opf_path, sp])
                            relp = sp[len(prefix):]
                            entries.append(relp.replace(os.sep, '/'))
                            last = sp
                        if os.path.exists(last):
                            with open(last, 'rb') as fi:
                                src = fi.read().decode('utf-8')
                            soup = BeautifulSoup(src)
                            body = soup.find('body')
                            if body is not None:
                                prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
                                templ = self.navbar.generate(True, num, j, len(f),
                                                             not self.has_single_feed,
                                                             a.orig_url, self.publisher, prefix=prefix,
                                                             center=self.center_navbar)
                                elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                                body.insert(len(body.contents), elem)
                                with open(last, 'wb') as fi:
                                    fi.write(unicode(soup).encode('utf-8'))

            if len(feeds) == 0:
                raise Exception('All feeds are empty, aborting.')
            if len(feeds) > 1:
                for i, f in enumerate(feeds):
                    entries.append('feed_%d/index.html'%i)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    auth = getattr(f, 'author', None)
                    if not auth:
                        auth = None
                    desc = getattr(f, 'description', None)
                    if not desc:
                        desc = None
                    feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                                               f.title, play_order=po, description=desc, author=auth))
            else:
                entries.append('feed_%d/index.html'%0)
                feed_index(0, toc)
            for i, p in enumerate(entries):
                entries[i] = os.path.join(dir, p.replace('/', os.sep))
            opf.create_spine(entries)
            opf.set_toc(toc)

            with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
                opf.render(opf_file, ncx_file)

View File

@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class UnitedDaily(BasicNewsRecipe):
title = u'聯合新聞網'
oldest_article = 1
max_articles_per_feed = 100
feeds = [(u'焦點', u'http://udn.com/udnrss/focus.xml'),
(u'政治', u'http://udn.com/udnrss/politics.xml'),
(u'社會', u'http://udn.com/udnrss/social.xml'),
(u'生活', u'http://udn.com/udnrss/life.xml'),
(u'綜合', u'http://udn.com/udnrss/education.xml'),
(u'意見評論', u'http://udn.com/udnrss/opinion.xml'),
(u'大台北', u'http://udn.com/udnrss/local_taipei.xml'),
(u'桃竹苗', u'http://udn.com/udnrss/local_tyhcml.xml'),
(u'中彰投', u'http://udn.com/udnrss/local_tcchnt.xml'),
(u'雲嘉南', u'http://udn.com/udnrss/local_ylcytn.xml'),
(u'高屏離島', u'http://udn.com/udnrss/local_ksptisland.xml'),
(u'基宜花東', u'http://udn.com/udnrss/local_klilhltt.xml'),
(u'台灣百寶鄉', u'http://udn.com/udnrss/local_oddlyenough.xml'),
(u'兩岸要聞', u'http://udn.com/udnrss/mainland.xml'),
(u'國際焦點', u'http://udn.com/udnrss/international.xml'),
(u'台商經貿', u'http://udn.com/udnrss/financechina.xml'),
(u'國際財經', u'http://udn.com/udnrss/financeworld.xml'),
(u'財經焦點', u'http://udn.com/udnrss/financesfocus.xml'),
(u'股市要聞', u'http://udn.com/udnrss/stock.xml'),
(u'股市快訊', u'http://udn.com/udnrss/stklatest.xml'),
(u'稅務法務', u'http://udn.com/udnrss/tax.xml'),
(u'房市情報', u'http://udn.com/udnrss/houses.xml'),
(u'棒球', u'http://udn.com/udnrss/baseball.xml'),
(u'籃球', u'http://udn.com/udnrss/basketball.xml'),
(u'體壇動態', u'http://udn.com/udnrss/sportsfocus.xml'),
(u'熱門星聞', u'http://udn.com/udnrss/starsfocus.xml'),
(u'廣電港陸', u'http://udn.com/udnrss/tv.xml'),
(u'海外星球', u'http://udn.com/udnrss/starswestern.xml'),
(u'日韓星情', u'http://udn.com/udnrss/starsjk.xml'),
(u'電影世界', u'http://udn.com/udnrss/movie.xml'),
(u'流行音樂', u'http://udn.com/udnrss/music.xml'),
(u'觀點專題', u'http://udn.com/udnrss/starssubject.xml'),
(u'食樂指南', u'http://udn.com/udnrss/food.xml'),
(u'折扣好康', u'http://udn.com/udnrss/shopping.xml'),
(u'醫藥新聞', u'http://udn.com/udnrss/health.xml'),
(u'家婦繽紛', u'http://udn.com/udnrss/benfen.xml'),
(u'談星論命', u'http://udn.com/udnrss/astrology.xml'),
(u'文化副刊', u'http://udn.com/udnrss/reading.xml'),
]
extra_css = '''div[id='story_title'] {font-size:200%; font-weight:bold;}'''
__author__ = 'Eddie Lau'
__version__ = '1.0'
language = 'zh'
publisher = 'United Daily News Group'
description = 'United Daily (Taiwan)'
category = 'News, Chinese, Taiwan'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
encoding = 'big5'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://udn.com/NEWS/2004/images/logo_udn.gif'
cover_url = 'http://udn.com/NEWS/2004/images/logo_udn.gif'
keep_only_tags = [dict(name='div', attrs={'id':['story_title','story_author', 'story']})]
remove_tags = [dict(name='div', attrs={'id':['mvouter']})]