mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add Ming Pao Vancouver and Toronto by Eddie Lau
This commit is contained in:
parent
6747188703
commit
eb0358025d
@ -1,17 +1,23 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010-2011, Eddie Lau'
|
||||
|
||||
# Region - Hong Kong, Vancouver, Toronto
|
||||
__Region__ = 'Hong Kong'
|
||||
# Users of Kindle 3 with limited system-level CJK support
|
||||
# please replace the following "True" with "False".
|
||||
__MakePeriodical__ = True
|
||||
# Turn below to true if your device supports display of CJK titles
|
||||
__UseChineseTitle__ = False
|
||||
# Trun below to true if you wish to use life.mingpao.com as the main article source
|
||||
# Set it to False if you want to skip images
|
||||
__KeepImages__ = True
|
||||
# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
|
||||
__UseLife__ = True
|
||||
|
||||
|
||||
'''
|
||||
Change Log:
|
||||
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
|
||||
provide options to remove all images in the file
|
||||
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
|
||||
2011/03/06: add new articles for finance section, also a new section "Columns"
|
||||
2011/02/28: rearrange the sections
|
||||
@ -34,21 +40,96 @@ Change Log:
|
||||
import os, datetime, re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from contextlib import nested
|
||||
|
||||
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
|
||||
class MPHKRecipe(BasicNewsRecipe):
|
||||
title = 'Ming Pao - Hong Kong'
|
||||
# MAIN CLASS
|
||||
class MPRecipe(BasicNewsRecipe):
|
||||
if __Region__ == 'Hong Kong':
|
||||
title = 'Ming Pao - Hong Kong'
|
||||
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
|
||||
category = 'Chinese, News, Hong Kong'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
|
||||
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
|
||||
keep_only_tags = [dict(name='h1'),
|
||||
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
|
||||
dict(name='font', attrs={'color':['AA0000']}), # for column articles title
|
||||
dict(attrs={'id':['newscontent']}), # entertainment and column page content
|
||||
dict(attrs={'id':['newscontent01','newscontent02']}),
|
||||
dict(attrs={'class':['photo']}),
|
||||
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
|
||||
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
|
||||
]
|
||||
if __KeepImages__:
|
||||
remove_tags = [dict(name='style'),
|
||||
dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com
|
||||
dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
|
||||
#dict(name='table') # for content fetched from life.mingpao.com
|
||||
]
|
||||
else:
|
||||
remove_tags = [dict(name='style'),
|
||||
dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com
|
||||
dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
|
||||
dict(name='img'),
|
||||
#dict(name='table') # for content fetched from life.mingpao.com
|
||||
]
|
||||
remove_attributes = ['width']
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: '<h1>'),
|
||||
(re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: '</h1>'),
|
||||
(re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
|
||||
lambda match: ''),
|
||||
# skip <br> after title in life.mingpao.com fetched article
|
||||
(re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
|
||||
lambda match: "<div id='newscontent'>"),
|
||||
(re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
|
||||
lambda match: "</b>")
|
||||
]
|
||||
elif __Region__ == 'Vancouver':
|
||||
title = 'Ming Pao - Vancouver'
|
||||
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
|
||||
category = 'Chinese, News, Vancouver'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
|
||||
masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif'
|
||||
keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
|
||||
dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
|
||||
dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
|
||||
]
|
||||
if __KeepImages__:
|
||||
remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})] # the magnifier icon
|
||||
else:
|
||||
remove_tags = [dict(name='img')]
|
||||
remove_attributes = ['width']
|
||||
preprocess_regexps = [(re.compile(r' ', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: ''),
|
||||
]
|
||||
elif __Region__ == 'Toronto':
|
||||
title = 'Ming Pao - Toronto'
|
||||
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
|
||||
category = 'Chinese, News, Toronto'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
|
||||
masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif'
|
||||
keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
|
||||
dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
|
||||
dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
|
||||
]
|
||||
if __KeepImages__:
|
||||
remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})] # the magnifier icon
|
||||
else:
|
||||
remove_tags = [dict(name='img')]
|
||||
remove_attributes = ['width']
|
||||
preprocess_regexps = [(re.compile(r' ', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: ''),
|
||||
]
|
||||
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
__author__ = 'Eddie Lau'
|
||||
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
|
||||
publisher = 'MingPao'
|
||||
category = 'Chinese, News, Hong Kong'
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
@ -57,33 +138,6 @@ class MPHKRecipe(BasicNewsRecipe):
|
||||
recursions = 0
|
||||
conversion_options = {'linearize_tables':True}
|
||||
timefmt = ''
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
|
||||
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
|
||||
keep_only_tags = [dict(name='h1'),
|
||||
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
|
||||
dict(name='font', attrs={'color':['AA0000']}), # for column articles title
|
||||
dict(attrs={'id':['newscontent']}), # entertainment and column page content
|
||||
dict(attrs={'id':['newscontent01','newscontent02']}),
|
||||
dict(attrs={'class':['photo']}),
|
||||
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
|
||||
]
|
||||
remove_tags = [dict(name='style'),
|
||||
dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com
|
||||
dict(name='table')] # for content fetched from life.mingpao.com
|
||||
remove_attributes = ['width']
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: '<h1>'),
|
||||
(re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: '</h1>'),
|
||||
(re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
|
||||
lambda match: ''),
|
||||
# skip <br> after title in life.mingpao.com fetched article
|
||||
(re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
|
||||
lambda match: "<div id='newscontent'>"),
|
||||
(re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
|
||||
lambda match: "</b>")
|
||||
]
|
||||
|
||||
def image_url_processor(cls, baseurl, url):
|
||||
# trick: break the url at the first occurance of digit, add an additional
|
||||
@ -124,8 +178,18 @@ class MPHKRecipe(BasicNewsRecipe):
|
||||
|
||||
def get_dtlocal(self):
|
||||
dt_utc = datetime.datetime.utcnow()
|
||||
# convert UTC to local hk time - at around HKT 6.00am, all news are available
|
||||
dt_local = dt_utc - datetime.timedelta(-2.0/24)
|
||||
if __Region__ == 'Hong Kong':
|
||||
# convert UTC to local hk time - at HKT 4.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
|
||||
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
|
||||
elif __Region__ == 'Vancouver':
|
||||
# convert UTC to local Vancouver time - at PST time 4.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(4.5/24)
|
||||
#dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(4.5/24)
|
||||
elif __Region__ == 'Toronto':
|
||||
# convert UTC to local Toronto time - at EST time 4.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(-5.0/24) - datetime.timedelta(4.5/24)
|
||||
#dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(4.5/24)
|
||||
return dt_local
|
||||
|
||||
def get_fetchdate(self):
|
||||
@ -135,13 +199,15 @@ class MPHKRecipe(BasicNewsRecipe):
|
||||
return self.get_dtlocal().strftime("%Y-%m-%d")
|
||||
|
||||
def get_fetchday(self):
|
||||
# dt_utc = datetime.datetime.utcnow()
|
||||
# convert UTC to local hk time - at around HKT 6.00am, all news are available
|
||||
# dt_local = dt_utc - datetime.timedelta(-2.0/24)
|
||||
return self.get_dtlocal().strftime("%d")
|
||||
|
||||
def get_cover_url(self):
|
||||
cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
|
||||
if __Region__ == 'Hong Kong':
|
||||
cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
|
||||
elif __Region__ == 'Vancouver':
|
||||
cover = 'http://www.mingpaovan.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg'
|
||||
elif __Region__ == 'Toronto':
|
||||
cover = 'http://www.mingpaotor.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg'
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
try:
|
||||
br.open(cover)
|
||||
@ -153,76 +219,104 @@ class MPHKRecipe(BasicNewsRecipe):
|
||||
feeds = []
|
||||
dateStr = self.get_fetchdate()
|
||||
|
||||
if __UseLife__:
|
||||
for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'),
|
||||
(u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'),
|
||||
(u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'),
|
||||
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr', 'nal'),
|
||||
(u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalfa', 'nal'),
|
||||
(u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalca', 'nal'),
|
||||
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
|
||||
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
|
||||
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
|
||||
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
|
||||
articles = self.parse_section2(url, keystr)
|
||||
if __Region__ == 'Hong Kong':
|
||||
if __UseLife__:
|
||||
for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'),
|
||||
(u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'),
|
||||
(u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'),
|
||||
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr', 'nal'),
|
||||
(u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalfa', 'nal'),
|
||||
(u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalca', 'nal'),
|
||||
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
|
||||
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
|
||||
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
|
||||
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
|
||||
articles = self.parse_section2(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
else:
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
|
||||
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
|
||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special- editorial
|
||||
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
|
||||
if ed_articles:
|
||||
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
||||
|
||||
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
|
||||
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special - finance
|
||||
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
|
||||
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
|
||||
if fin_articles:
|
||||
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||
|
||||
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special - entertainment
|
||||
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
||||
if ent_articles:
|
||||
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
|
||||
# special- columns
|
||||
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
|
||||
if col_articles:
|
||||
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
|
||||
elif __Region__ == 'Vancouver':
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
|
||||
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
|
||||
(u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VDindex.htm'),
|
||||
(u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-VGindex.htm'),
|
||||
(u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VTindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VCindex.htm'),
|
||||
(u'\u7d93\u6fdf Economics', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VSindex.htm'),
|
||||
(u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
|
||||
(u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'),]:
|
||||
articles = self.parse_section3(url, 'http://www.mingpaovan.com/')
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
elif __Region__ == 'Toronto':
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'),
|
||||
(u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TDindex.htm'),
|
||||
(u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TFindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TCAindex.htm'),
|
||||
(u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TTAindex.htm'),
|
||||
(u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-GAindex.htm'),
|
||||
(u'\u7d93\u6fdf Economics', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TSindex.htm'),
|
||||
(u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
|
||||
(u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'),]:
|
||||
articles = self.parse_section3(url, 'http://www.mingpaotor.com/')
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
else:
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
|
||||
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
|
||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special- editorial
|
||||
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
|
||||
if ed_articles:
|
||||
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
||||
|
||||
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
|
||||
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special - finance
|
||||
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
|
||||
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
|
||||
if fin_articles:
|
||||
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||
|
||||
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special - entertainment
|
||||
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
||||
if ent_articles:
|
||||
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
|
||||
# special- columns
|
||||
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
|
||||
if col_articles:
|
||||
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
|
||||
|
||||
return feeds
|
||||
|
||||
# parse from news.mingpao.com
|
||||
@ -256,11 +350,30 @@ class MPHKRecipe(BasicNewsRecipe):
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
|
||||
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
# parse from www.mingpaovan.com
|
||||
def parse_section3(self, url, baseUrl):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
divs = soup.findAll(attrs={'class': ['ListContentLargeLink']})
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
divs.reverse()
|
||||
for i in divs:
|
||||
title = self.tag_to_string(i)
|
||||
urlstr = i.get('href', False)
|
||||
urlstr = baseUrl + '/' + urlstr.replace('../../../', '')
|
||||
if urlstr not in included_urls:
|
||||
current_articles.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
|
||||
included_urls.append(urlstr)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
def parse_ed_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
@ -338,7 +451,12 @@ class MPHKRecipe(BasicNewsRecipe):
|
||||
if dir is None:
|
||||
dir = self.output_dir
|
||||
if __UseChineseTitle__ == True:
|
||||
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
||||
if __Region__ == 'Hong Kong':
|
||||
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
||||
elif __Region__ == 'Vancouver':
|
||||
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
|
||||
elif __Region__ == 'Toronto':
|
||||
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
|
||||
else:
|
||||
title = self.short_title()
|
||||
# if not generating a periodical, force date to apply in title
|
||||
|
594
recipes/ming_pao_toronto.recipe
Normal file
594
recipes/ming_pao_toronto.recipe
Normal file
@ -0,0 +1,594 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010-2011, Eddie Lau'
|
||||
|
||||
# Region - Hong Kong, Vancouver, Toronto
|
||||
__Region__ = 'Toronto'
|
||||
# Users of Kindle 3 with limited system-level CJK support
|
||||
# please replace the following "True" with "False".
|
||||
__MakePeriodical__ = True
|
||||
# Turn below to true if your device supports display of CJK titles
|
||||
__UseChineseTitle__ = False
|
||||
# Set it to False if you want to skip images
|
||||
__KeepImages__ = True
|
||||
# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
|
||||
__UseLife__ = True
|
||||
|
||||
|
||||
'''
|
||||
Change Log:
|
||||
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
|
||||
provide options to remove all images in the file
|
||||
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
|
||||
2011/03/06: add new articles for finance section, also a new section "Columns"
|
||||
2011/02/28: rearrange the sections
|
||||
[Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles
|
||||
View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues"
|
||||
folder in Kindle 3
|
||||
2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
|
||||
clean up the indentation
|
||||
2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
|
||||
(to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
|
||||
2010/11/22: add English section, remove eco-news section which is not updated daily, correct
|
||||
ordering of articles
|
||||
2010/11/12: add news image and eco-news section
|
||||
2010/11/08: add parsing of finance section
|
||||
2010/11/06: temporary work-around for Kindle device having no capability to display unicode
|
||||
in section/article list.
|
||||
2010/10/31: skip repeated articles in section pages
|
||||
'''
|
||||
|
||||
import os, datetime, re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from contextlib import nested
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
|
||||
# MAIN CLASS
|
||||
class MPRecipe(BasicNewsRecipe):
|
||||
if __Region__ == 'Hong Kong':
|
||||
title = 'Ming Pao - Hong Kong'
|
||||
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
|
||||
category = 'Chinese, News, Hong Kong'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
|
||||
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
|
||||
keep_only_tags = [dict(name='h1'),
|
||||
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
|
||||
dict(name='font', attrs={'color':['AA0000']}), # for column articles title
|
||||
dict(attrs={'id':['newscontent']}), # entertainment and column page content
|
||||
dict(attrs={'id':['newscontent01','newscontent02']}),
|
||||
dict(attrs={'class':['photo']}),
|
||||
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
|
||||
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
|
||||
]
|
||||
if __KeepImages__:
|
||||
remove_tags = [dict(name='style'),
|
||||
dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com
|
||||
dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
|
||||
#dict(name='table') # for content fetched from life.mingpao.com
|
||||
]
|
||||
else:
|
||||
remove_tags = [dict(name='style'),
|
||||
dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com
|
||||
dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
|
||||
dict(name='img'),
|
||||
#dict(name='table') # for content fetched from life.mingpao.com
|
||||
]
|
||||
remove_attributes = ['width']
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: '<h1>'),
|
||||
(re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: '</h1>'),
|
||||
(re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
|
||||
lambda match: ''),
|
||||
# skip <br> after title in life.mingpao.com fetched article
|
||||
(re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
|
||||
lambda match: "<div id='newscontent'>"),
|
||||
(re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
|
||||
lambda match: "</b>")
|
||||
]
|
||||
elif __Region__ == 'Vancouver':
|
||||
title = 'Ming Pao - Vancouver'
|
||||
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
|
||||
category = 'Chinese, News, Vancouver'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
|
||||
masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif'
|
||||
keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
|
||||
dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
|
||||
dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
|
||||
]
|
||||
if __KeepImages__:
|
||||
remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})] # the magnifier icon
|
||||
else:
|
||||
remove_tags = [dict(name='img')]
|
||||
remove_attributes = ['width']
|
||||
preprocess_regexps = [(re.compile(r' ', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: ''),
|
||||
]
|
||||
elif __Region__ == 'Toronto':
|
||||
title = 'Ming Pao - Toronto'
|
||||
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
|
||||
category = 'Chinese, News, Toronto'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
|
||||
masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif'
|
||||
keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
|
||||
dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
|
||||
dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
|
||||
]
|
||||
if __KeepImages__:
|
||||
remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})] # the magnifier icon
|
||||
else:
|
||||
remove_tags = [dict(name='img')]
|
||||
remove_attributes = ['width']
|
||||
preprocess_regexps = [(re.compile(r' ', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: ''),
|
||||
]
|
||||
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
__author__ = 'Eddie Lau'
|
||||
publisher = 'MingPao'
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
language = 'zh'
|
||||
encoding = 'Big5-HKSCS'
|
||||
recursions = 0
|
||||
conversion_options = {'linearize_tables':True}
|
||||
timefmt = ''
|
||||
|
||||
def image_url_processor(cls, baseurl, url):
|
||||
# trick: break the url at the first occurance of digit, add an additional
|
||||
# '_' at the front
|
||||
# not working, may need to move this to preprocess_html() method
|
||||
# minIdx = 10000
|
||||
# i0 = url.find('0')
|
||||
# if i0 >= 0 and i0 < minIdx:
|
||||
# minIdx = i0
|
||||
# i1 = url.find('1')
|
||||
# if i1 >= 0 and i1 < minIdx:
|
||||
# minIdx = i1
|
||||
# i2 = url.find('2')
|
||||
# if i2 >= 0 and i2 < minIdx:
|
||||
# minIdx = i2
|
||||
# i3 = url.find('3')
|
||||
# if i3 >= 0 and i0 < minIdx:
|
||||
# minIdx = i3
|
||||
# i4 = url.find('4')
|
||||
# if i4 >= 0 and i4 < minIdx:
|
||||
# minIdx = i4
|
||||
# i5 = url.find('5')
|
||||
# if i5 >= 0 and i5 < minIdx:
|
||||
# minIdx = i5
|
||||
# i6 = url.find('6')
|
||||
# if i6 >= 0 and i6 < minIdx:
|
||||
# minIdx = i6
|
||||
# i7 = url.find('7')
|
||||
# if i7 >= 0 and i7 < minIdx:
|
||||
# minIdx = i7
|
||||
# i8 = url.find('8')
|
||||
# if i8 >= 0 and i8 < minIdx:
|
||||
# minIdx = i8
|
||||
# i9 = url.find('9')
|
||||
# if i9 >= 0 and i9 < minIdx:
|
||||
# minIdx = i9
|
||||
return url
|
||||
|
||||
def get_dtlocal(self):
|
||||
dt_utc = datetime.datetime.utcnow()
|
||||
if __Region__ == 'Hong Kong':
|
||||
# convert UTC to local hk time - at HKT 4.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
|
||||
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
|
||||
elif __Region__ == 'Vancouver':
|
||||
# convert UTC to local Vancouver time - at PST time 4.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(4.5/24)
|
||||
#dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(4.5/24)
|
||||
elif __Region__ == 'Toronto':
|
||||
# convert UTC to local Toronto time - at EST time 4.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(-5.0/24) - datetime.timedelta(4.5/24)
|
||||
#dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(4.5/24)
|
||||
return dt_local
|
||||
|
||||
def get_fetchdate(self):
|
||||
return self.get_dtlocal().strftime("%Y%m%d")
|
||||
|
||||
def get_fetchformatteddate(self):
|
||||
return self.get_dtlocal().strftime("%Y-%m-%d")
|
||||
|
||||
def get_fetchday(self):
|
||||
return self.get_dtlocal().strftime("%d")
|
||||
|
||||
def get_cover_url(self):
|
||||
if __Region__ == 'Hong Kong':
|
||||
cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
|
||||
elif __Region__ == 'Vancouver':
|
||||
cover = 'http://www.mingpaovan.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg'
|
||||
elif __Region__ == 'Toronto':
|
||||
cover = 'http://www.mingpaotor.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg'
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
try:
|
||||
br.open(cover)
|
||||
except:
|
||||
cover = None
|
||||
return cover
|
||||
|
||||
def parse_index(self):
|
||||
feeds = []
|
||||
dateStr = self.get_fetchdate()
|
||||
|
||||
if __Region__ == 'Hong Kong':
|
||||
if __UseLife__:
|
||||
for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'),
|
||||
(u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'),
|
||||
(u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'),
|
||||
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr', 'nal'),
|
||||
(u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalfa', 'nal'),
|
||||
(u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalca', 'nal'),
|
||||
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
|
||||
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
|
||||
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
|
||||
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
|
||||
articles = self.parse_section2(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
else:
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
|
||||
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
|
||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special- editorial
|
||||
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
|
||||
if ed_articles:
|
||||
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
||||
|
||||
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
|
||||
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special - finance
|
||||
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
|
||||
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
|
||||
if fin_articles:
|
||||
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||
|
||||
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special - entertainment
|
||||
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
||||
if ent_articles:
|
||||
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
|
||||
# special- columns
|
||||
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
|
||||
if col_articles:
|
||||
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
|
||||
elif __Region__ == 'Vancouver':
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
|
||||
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
|
||||
(u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VDindex.htm'),
|
||||
(u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-VGindex.htm'),
|
||||
(u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VTindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VCindex.htm'),
|
||||
(u'\u7d93\u6fdf Economics', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VSindex.htm'),
|
||||
(u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
|
||||
(u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'),]:
|
||||
articles = self.parse_section3(url, 'http://www.mingpaovan.com/')
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
elif __Region__ == 'Toronto':
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'),
|
||||
(u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TDindex.htm'),
|
||||
(u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TFindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TCAindex.htm'),
|
||||
(u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TTAindex.htm'),
|
||||
(u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-GAindex.htm'),
|
||||
(u'\u7d93\u6fdf Economics', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TSindex.htm'),
|
||||
(u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
|
||||
(u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'),]:
|
||||
articles = self.parse_section3(url, 'http://www.mingpaotor.com/')
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
return feeds
|
||||
|
||||
# parse from news.mingpao.com
|
||||
def parse_section(self, url):
|
||||
dateStr = self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
divs.reverse()
|
||||
for i in divs:
|
||||
a = i.find('a', href = True)
|
||||
title = self.tag_to_string(a)
|
||||
url = a.get('href', False)
|
||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
||||
if url not in included_urls and url.rfind('Redirect') == -1:
|
||||
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
# parse from life.mingpao.com
|
||||
def parse_section2(self, url, keystr):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
|
||||
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
# parse from www.mingpaovan.com
|
||||
def parse_section3(self, url, baseUrl):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
divs = soup.findAll(attrs={'class': ['ListContentLargeLink']})
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
divs.reverse()
|
||||
for i in divs:
|
||||
title = self.tag_to_string(i)
|
||||
urlstr = i.get('href', False)
|
||||
urlstr = baseUrl + '/' + urlstr.replace('../../../', '')
|
||||
if urlstr not in included_urls:
|
||||
current_articles.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
|
||||
included_urls.append(urlstr)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
def parse_ed_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1):
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
def parse_fin_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href= True)
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
#url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
#if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
|
||||
if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1):
|
||||
title = self.tag_to_string(i)
|
||||
current_articles.append({'title': title, 'url': url, 'description':''})
|
||||
included_urls.append(url)
|
||||
return current_articles
|
||||
|
||||
def parse_ent_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1):
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
def parse_col_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1):
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for item in soup.findAll(style=True):
|
||||
del item['width']
|
||||
for item in soup.findAll(stype=True):
|
||||
del item['absmiddle']
|
||||
return soup
|
||||
|
||||
def create_opf(self, feeds, dir=None):
|
||||
if dir is None:
|
||||
dir = self.output_dir
|
||||
if __UseChineseTitle__ == True:
|
||||
if __Region__ == 'Hong Kong':
|
||||
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
||||
elif __Region__ == 'Vancouver':
|
||||
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
|
||||
elif __Region__ == 'Toronto':
|
||||
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
|
||||
else:
|
||||
title = self.short_title()
|
||||
# if not generating a periodical, force date to apply in title
|
||||
if __MakePeriodical__ == False:
|
||||
title = title + ' ' + self.get_fetchformatteddate()
|
||||
if True:
|
||||
mi = MetaInformation(title, [self.publisher])
|
||||
mi.publisher = self.publisher
|
||||
mi.author_sort = self.publisher
|
||||
if __MakePeriodical__ == True:
|
||||
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||
else:
|
||||
mi.publication_type = self.publication_type+':'+self.short_title()
|
||||
#mi.timestamp = nowf()
|
||||
mi.timestamp = self.get_dtlocal()
|
||||
mi.comments = self.description
|
||||
if not isinstance(mi.comments, unicode):
|
||||
mi.comments = mi.comments.decode('utf-8', 'replace')
|
||||
#mi.pubdate = nowf()
|
||||
mi.pubdate = self.get_dtlocal()
|
||||
opf_path = os.path.join(dir, 'index.opf')
|
||||
ncx_path = os.path.join(dir, 'index.ncx')
|
||||
opf = OPFCreator(dir, mi)
|
||||
# Add mastheadImage entry to <guide> section
|
||||
mp = getattr(self, 'masthead_path', None)
|
||||
if mp is not None and os.access(mp, os.R_OK):
|
||||
from calibre.ebooks.metadata.opf2 import Guide
|
||||
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
|
||||
ref.type = 'masthead'
|
||||
ref.title = 'Masthead Image'
|
||||
opf.guide.append(ref)
|
||||
|
||||
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
||||
manifest.append(os.path.join(dir, 'index.html'))
|
||||
manifest.append(os.path.join(dir, 'index.ncx'))
|
||||
|
||||
# Get cover
|
||||
cpath = getattr(self, 'cover_path', None)
|
||||
if cpath is None:
|
||||
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
|
||||
if self.default_cover(pf):
|
||||
cpath = pf.name
|
||||
if cpath is not None and os.access(cpath, os.R_OK):
|
||||
opf.cover = cpath
|
||||
manifest.append(cpath)
|
||||
|
||||
# Get masthead
|
||||
mpath = getattr(self, 'masthead_path', None)
|
||||
if mpath is not None and os.access(mpath, os.R_OK):
|
||||
manifest.append(mpath)
|
||||
|
||||
opf.create_manifest_from_files_in(manifest)
|
||||
for mani in opf.manifest:
|
||||
if mani.path.endswith('.ncx'):
|
||||
mani.id = 'ncx'
|
||||
if mani.path.endswith('mastheadImage.jpg'):
|
||||
mani.id = 'masthead-image'
|
||||
entries = ['index.html']
|
||||
toc = TOC(base_path=dir)
|
||||
self.play_order_counter = 0
|
||||
self.play_order_map = {}
|
||||
|
||||
def feed_index(num, parent):
|
||||
f = feeds[num]
|
||||
for j, a in enumerate(f):
|
||||
if getattr(a, 'downloaded', False):
|
||||
adir = 'feed_%d/article_%d/'%(num, j)
|
||||
auth = a.author
|
||||
if not auth:
|
||||
auth = None
|
||||
desc = a.text_summary
|
||||
if not desc:
|
||||
desc = None
|
||||
else:
|
||||
desc = self.description_limiter(desc)
|
||||
entries.append('%sindex.html'%adir)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
|
||||
play_order=po, author=auth, description=desc)
|
||||
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
|
||||
for sp in a.sub_pages:
|
||||
prefix = os.path.commonprefix([opf_path, sp])
|
||||
relp = sp[len(prefix):]
|
||||
entries.append(relp.replace(os.sep, '/'))
|
||||
last = sp
|
||||
|
||||
if os.path.exists(last):
|
||||
with open(last, 'rb') as fi:
|
||||
src = fi.read().decode('utf-8')
|
||||
soup = BeautifulSoup(src)
|
||||
body = soup.find('body')
|
||||
if body is not None:
|
||||
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
|
||||
templ = self.navbar.generate(True, num, j, len(f),
|
||||
not self.has_single_feed,
|
||||
a.orig_url, self.publisher, prefix=prefix,
|
||||
center=self.center_navbar)
|
||||
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
|
||||
body.insert(len(body.contents), elem)
|
||||
with open(last, 'wb') as fi:
|
||||
fi.write(unicode(soup).encode('utf-8'))
|
||||
if len(feeds) == 0:
|
||||
raise Exception('All feeds are empty, aborting.')
|
||||
|
||||
if len(feeds) > 1:
|
||||
for i, f in enumerate(feeds):
|
||||
entries.append('feed_%d/index.html'%i)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
auth = getattr(f, 'author', None)
|
||||
if not auth:
|
||||
auth = None
|
||||
desc = getattr(f, 'description', None)
|
||||
if not desc:
|
||||
desc = None
|
||||
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
|
||||
f.title, play_order=po, description=desc, author=auth))
|
||||
|
||||
else:
|
||||
entries.append('feed_%d/index.html'%0)
|
||||
feed_index(0, toc)
|
||||
|
||||
for i, p in enumerate(entries):
|
||||
entries[i] = os.path.join(dir, p.replace('/', os.sep))
|
||||
opf.create_spine(entries)
|
||||
opf.set_toc(toc)
|
||||
|
||||
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
|
||||
opf.render(opf_file, ncx_file)
|
||||
|
594
recipes/ming_pao_vancouver.recipe
Normal file
594
recipes/ming_pao_vancouver.recipe
Normal file
@ -0,0 +1,594 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010-2011, Eddie Lau'
|
||||
|
||||
# Region - Hong Kong, Vancouver, Toronto
|
||||
__Region__ = 'Vancouver'
|
||||
# Users of Kindle 3 with limited system-level CJK support
|
||||
# please replace the following "True" with "False".
|
||||
__MakePeriodical__ = True
|
||||
# Turn below to true if your device supports display of CJK titles
|
||||
__UseChineseTitle__ = False
|
||||
# Set it to False if you want to skip images
|
||||
__KeepImages__ = True
|
||||
# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
|
||||
__UseLife__ = True
|
||||
|
||||
|
||||
'''
|
||||
Change Log:
|
||||
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
|
||||
provide options to remove all images in the file
|
||||
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
|
||||
2011/03/06: add new articles for finance section, also a new section "Columns"
|
||||
2011/02/28: rearrange the sections
|
||||
[Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles
|
||||
View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues"
|
||||
folder in Kindle 3
|
||||
2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
|
||||
clean up the indentation
|
||||
2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
|
||||
(to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
|
||||
2010/11/22: add English section, remove eco-news section which is not updated daily, correct
|
||||
ordering of articles
|
||||
2010/11/12: add news image and eco-news section
|
||||
2010/11/08: add parsing of finance section
|
||||
2010/11/06: temporary work-around for Kindle device having no capability to display unicode
|
||||
in section/article list.
|
||||
2010/10/31: skip repeated articles in section pages
|
||||
'''
|
||||
|
||||
import os, datetime, re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from contextlib import nested
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
|
||||
# MAIN CLASS
|
||||
class MPRecipe(BasicNewsRecipe):
|
||||
if __Region__ == 'Hong Kong':
|
||||
title = 'Ming Pao - Hong Kong'
|
||||
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
|
||||
category = 'Chinese, News, Hong Kong'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
|
||||
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
|
||||
keep_only_tags = [dict(name='h1'),
|
||||
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
|
||||
dict(name='font', attrs={'color':['AA0000']}), # for column articles title
|
||||
dict(attrs={'id':['newscontent']}), # entertainment and column page content
|
||||
dict(attrs={'id':['newscontent01','newscontent02']}),
|
||||
dict(attrs={'class':['photo']}),
|
||||
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
|
||||
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
|
||||
]
|
||||
if __KeepImages__:
|
||||
remove_tags = [dict(name='style'),
|
||||
dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com
|
||||
dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
|
||||
#dict(name='table') # for content fetched from life.mingpao.com
|
||||
]
|
||||
else:
|
||||
remove_tags = [dict(name='style'),
|
||||
dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com
|
||||
dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
|
||||
dict(name='img'),
|
||||
#dict(name='table') # for content fetched from life.mingpao.com
|
||||
]
|
||||
remove_attributes = ['width']
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: '<h1>'),
|
||||
(re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: '</h1>'),
|
||||
(re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
|
||||
lambda match: ''),
|
||||
# skip <br> after title in life.mingpao.com fetched article
|
||||
(re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
|
||||
lambda match: "<div id='newscontent'>"),
|
||||
(re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
|
||||
lambda match: "</b>")
|
||||
]
|
||||
elif __Region__ == 'Vancouver':
|
||||
title = 'Ming Pao - Vancouver'
|
||||
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
|
||||
category = 'Chinese, News, Vancouver'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
|
||||
masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif'
|
||||
keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
|
||||
dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
|
||||
dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
|
||||
]
|
||||
if __KeepImages__:
|
||||
remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})] # the magnifier icon
|
||||
else:
|
||||
remove_tags = [dict(name='img')]
|
||||
remove_attributes = ['width']
|
||||
preprocess_regexps = [(re.compile(r' ', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: ''),
|
||||
]
|
||||
elif __Region__ == 'Toronto':
|
||||
title = 'Ming Pao - Toronto'
|
||||
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
|
||||
category = 'Chinese, News, Toronto'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
|
||||
masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif'
|
||||
keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
|
||||
dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
|
||||
dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
|
||||
]
|
||||
if __KeepImages__:
|
||||
remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})] # the magnifier icon
|
||||
else:
|
||||
remove_tags = [dict(name='img')]
|
||||
remove_attributes = ['width']
|
||||
preprocess_regexps = [(re.compile(r' ', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: ''),
|
||||
]
|
||||
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
__author__ = 'Eddie Lau'
|
||||
publisher = 'MingPao'
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
language = 'zh'
|
||||
encoding = 'Big5-HKSCS'
|
||||
recursions = 0
|
||||
conversion_options = {'linearize_tables':True}
|
||||
timefmt = ''
|
||||
|
||||
def image_url_processor(cls, baseurl, url):
|
||||
# trick: break the url at the first occurance of digit, add an additional
|
||||
# '_' at the front
|
||||
# not working, may need to move this to preprocess_html() method
|
||||
# minIdx = 10000
|
||||
# i0 = url.find('0')
|
||||
# if i0 >= 0 and i0 < minIdx:
|
||||
# minIdx = i0
|
||||
# i1 = url.find('1')
|
||||
# if i1 >= 0 and i1 < minIdx:
|
||||
# minIdx = i1
|
||||
# i2 = url.find('2')
|
||||
# if i2 >= 0 and i2 < minIdx:
|
||||
# minIdx = i2
|
||||
# i3 = url.find('3')
|
||||
# if i3 >= 0 and i0 < minIdx:
|
||||
# minIdx = i3
|
||||
# i4 = url.find('4')
|
||||
# if i4 >= 0 and i4 < minIdx:
|
||||
# minIdx = i4
|
||||
# i5 = url.find('5')
|
||||
# if i5 >= 0 and i5 < minIdx:
|
||||
# minIdx = i5
|
||||
# i6 = url.find('6')
|
||||
# if i6 >= 0 and i6 < minIdx:
|
||||
# minIdx = i6
|
||||
# i7 = url.find('7')
|
||||
# if i7 >= 0 and i7 < minIdx:
|
||||
# minIdx = i7
|
||||
# i8 = url.find('8')
|
||||
# if i8 >= 0 and i8 < minIdx:
|
||||
# minIdx = i8
|
||||
# i9 = url.find('9')
|
||||
# if i9 >= 0 and i9 < minIdx:
|
||||
# minIdx = i9
|
||||
return url
|
||||
|
||||
def get_dtlocal(self):
|
||||
dt_utc = datetime.datetime.utcnow()
|
||||
if __Region__ == 'Hong Kong':
|
||||
# convert UTC to local hk time - at HKT 4.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
|
||||
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
|
||||
elif __Region__ == 'Vancouver':
|
||||
# convert UTC to local Vancouver time - at PST time 4.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(4.5/24)
|
||||
#dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(4.5/24)
|
||||
elif __Region__ == 'Toronto':
|
||||
# convert UTC to local Toronto time - at EST time 4.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(-5.0/24) - datetime.timedelta(4.5/24)
|
||||
#dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(4.5/24)
|
||||
return dt_local
|
||||
|
||||
def get_fetchdate(self):
|
||||
return self.get_dtlocal().strftime("%Y%m%d")
|
||||
|
||||
def get_fetchformatteddate(self):
|
||||
return self.get_dtlocal().strftime("%Y-%m-%d")
|
||||
|
||||
def get_fetchday(self):
|
||||
return self.get_dtlocal().strftime("%d")
|
||||
|
||||
def get_cover_url(self):
|
||||
if __Region__ == 'Hong Kong':
|
||||
cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
|
||||
elif __Region__ == 'Vancouver':
|
||||
cover = 'http://www.mingpaovan.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg'
|
||||
elif __Region__ == 'Toronto':
|
||||
cover = 'http://www.mingpaotor.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg'
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
try:
|
||||
br.open(cover)
|
||||
except:
|
||||
cover = None
|
||||
return cover
|
||||
|
||||
def parse_index(self):
|
||||
feeds = []
|
||||
dateStr = self.get_fetchdate()
|
||||
|
||||
if __Region__ == 'Hong Kong':
|
||||
if __UseLife__:
|
||||
for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'),
|
||||
(u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'),
|
||||
(u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'),
|
||||
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr', 'nal'),
|
||||
(u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalfa', 'nal'),
|
||||
(u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalca', 'nal'),
|
||||
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
|
||||
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
|
||||
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
|
||||
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
|
||||
articles = self.parse_section2(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
else:
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
|
||||
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
|
||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special- editorial
|
||||
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
|
||||
if ed_articles:
|
||||
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
||||
|
||||
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
|
||||
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special - finance
|
||||
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
|
||||
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
|
||||
if fin_articles:
|
||||
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||
|
||||
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special - entertainment
|
||||
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
||||
if ent_articles:
|
||||
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
|
||||
# special- columns
|
||||
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
|
||||
if col_articles:
|
||||
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
|
||||
elif __Region__ == 'Vancouver':
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
|
||||
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
|
||||
(u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VDindex.htm'),
|
||||
(u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-VGindex.htm'),
|
||||
(u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VTindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VCindex.htm'),
|
||||
(u'\u7d93\u6fdf Economics', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VSindex.htm'),
|
||||
(u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
|
||||
(u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'),]:
|
||||
articles = self.parse_section3(url, 'http://www.mingpaovan.com/')
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
elif __Region__ == 'Toronto':
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'),
|
||||
(u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TDindex.htm'),
|
||||
(u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TFindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TCAindex.htm'),
|
||||
(u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TTAindex.htm'),
|
||||
(u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-GAindex.htm'),
|
||||
(u'\u7d93\u6fdf Economics', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TSindex.htm'),
|
||||
(u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
|
||||
(u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'),]:
|
||||
articles = self.parse_section3(url, 'http://www.mingpaotor.com/')
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
return feeds
|
||||
|
||||
# parse from news.mingpao.com
|
||||
def parse_section(self, url):
|
||||
dateStr = self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
divs.reverse()
|
||||
for i in divs:
|
||||
a = i.find('a', href = True)
|
||||
title = self.tag_to_string(a)
|
||||
url = a.get('href', False)
|
||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
||||
if url not in included_urls and url.rfind('Redirect') == -1:
|
||||
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
# parse from life.mingpao.com
|
||||
def parse_section2(self, url, keystr):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
|
||||
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
# parse from www.mingpaovan.com
|
||||
def parse_section3(self, url, baseUrl):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
divs = soup.findAll(attrs={'class': ['ListContentLargeLink']})
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
divs.reverse()
|
||||
for i in divs:
|
||||
title = self.tag_to_string(i)
|
||||
urlstr = i.get('href', False)
|
||||
urlstr = baseUrl + '/' + urlstr.replace('../../../', '')
|
||||
if urlstr not in included_urls:
|
||||
current_articles.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
|
||||
included_urls.append(urlstr)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
def parse_ed_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1):
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
def parse_fin_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href= True)
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
#url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
#if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
|
||||
if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1):
|
||||
title = self.tag_to_string(i)
|
||||
current_articles.append({'title': title, 'url': url, 'description':''})
|
||||
included_urls.append(url)
|
||||
return current_articles
|
||||
|
||||
def parse_ent_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1):
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
def parse_col_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1):
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for item in soup.findAll(style=True):
|
||||
del item['width']
|
||||
for item in soup.findAll(stype=True):
|
||||
del item['absmiddle']
|
||||
return soup
|
||||
|
||||
def create_opf(self, feeds, dir=None):
|
||||
if dir is None:
|
||||
dir = self.output_dir
|
||||
if __UseChineseTitle__ == True:
|
||||
if __Region__ == 'Hong Kong':
|
||||
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
||||
elif __Region__ == 'Vancouver':
|
||||
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
|
||||
elif __Region__ == 'Toronto':
|
||||
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
|
||||
else:
|
||||
title = self.short_title()
|
||||
# if not generating a periodical, force date to apply in title
|
||||
if __MakePeriodical__ == False:
|
||||
title = title + ' ' + self.get_fetchformatteddate()
|
||||
if True:
|
||||
mi = MetaInformation(title, [self.publisher])
|
||||
mi.publisher = self.publisher
|
||||
mi.author_sort = self.publisher
|
||||
if __MakePeriodical__ == True:
|
||||
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||
else:
|
||||
mi.publication_type = self.publication_type+':'+self.short_title()
|
||||
#mi.timestamp = nowf()
|
||||
mi.timestamp = self.get_dtlocal()
|
||||
mi.comments = self.description
|
||||
if not isinstance(mi.comments, unicode):
|
||||
mi.comments = mi.comments.decode('utf-8', 'replace')
|
||||
#mi.pubdate = nowf()
|
||||
mi.pubdate = self.get_dtlocal()
|
||||
opf_path = os.path.join(dir, 'index.opf')
|
||||
ncx_path = os.path.join(dir, 'index.ncx')
|
||||
opf = OPFCreator(dir, mi)
|
||||
# Add mastheadImage entry to <guide> section
|
||||
mp = getattr(self, 'masthead_path', None)
|
||||
if mp is not None and os.access(mp, os.R_OK):
|
||||
from calibre.ebooks.metadata.opf2 import Guide
|
||||
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
|
||||
ref.type = 'masthead'
|
||||
ref.title = 'Masthead Image'
|
||||
opf.guide.append(ref)
|
||||
|
||||
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
||||
manifest.append(os.path.join(dir, 'index.html'))
|
||||
manifest.append(os.path.join(dir, 'index.ncx'))
|
||||
|
||||
# Get cover
|
||||
cpath = getattr(self, 'cover_path', None)
|
||||
if cpath is None:
|
||||
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
|
||||
if self.default_cover(pf):
|
||||
cpath = pf.name
|
||||
if cpath is not None and os.access(cpath, os.R_OK):
|
||||
opf.cover = cpath
|
||||
manifest.append(cpath)
|
||||
|
||||
# Get masthead
|
||||
mpath = getattr(self, 'masthead_path', None)
|
||||
if mpath is not None and os.access(mpath, os.R_OK):
|
||||
manifest.append(mpath)
|
||||
|
||||
opf.create_manifest_from_files_in(manifest)
|
||||
for mani in opf.manifest:
|
||||
if mani.path.endswith('.ncx'):
|
||||
mani.id = 'ncx'
|
||||
if mani.path.endswith('mastheadImage.jpg'):
|
||||
mani.id = 'masthead-image'
|
||||
entries = ['index.html']
|
||||
toc = TOC(base_path=dir)
|
||||
self.play_order_counter = 0
|
||||
self.play_order_map = {}
|
||||
|
||||
def feed_index(num, parent):
|
||||
f = feeds[num]
|
||||
for j, a in enumerate(f):
|
||||
if getattr(a, 'downloaded', False):
|
||||
adir = 'feed_%d/article_%d/'%(num, j)
|
||||
auth = a.author
|
||||
if not auth:
|
||||
auth = None
|
||||
desc = a.text_summary
|
||||
if not desc:
|
||||
desc = None
|
||||
else:
|
||||
desc = self.description_limiter(desc)
|
||||
entries.append('%sindex.html'%adir)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
|
||||
play_order=po, author=auth, description=desc)
|
||||
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
|
||||
for sp in a.sub_pages:
|
||||
prefix = os.path.commonprefix([opf_path, sp])
|
||||
relp = sp[len(prefix):]
|
||||
entries.append(relp.replace(os.sep, '/'))
|
||||
last = sp
|
||||
|
||||
if os.path.exists(last):
|
||||
with open(last, 'rb') as fi:
|
||||
src = fi.read().decode('utf-8')
|
||||
soup = BeautifulSoup(src)
|
||||
body = soup.find('body')
|
||||
if body is not None:
|
||||
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
|
||||
templ = self.navbar.generate(True, num, j, len(f),
|
||||
not self.has_single_feed,
|
||||
a.orig_url, self.publisher, prefix=prefix,
|
||||
center=self.center_navbar)
|
||||
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
|
||||
body.insert(len(body.contents), elem)
|
||||
with open(last, 'wb') as fi:
|
||||
fi.write(unicode(soup).encode('utf-8'))
|
||||
if len(feeds) == 0:
|
||||
raise Exception('All feeds are empty, aborting.')
|
||||
|
||||
if len(feeds) > 1:
|
||||
for i, f in enumerate(feeds):
|
||||
entries.append('feed_%d/index.html'%i)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
auth = getattr(f, 'author', None)
|
||||
if not auth:
|
||||
auth = None
|
||||
desc = getattr(f, 'description', None)
|
||||
if not desc:
|
||||
desc = None
|
||||
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
|
||||
f.title, play_order=po, description=desc, author=auth))
|
||||
|
||||
else:
|
||||
entries.append('feed_%d/index.html'%0)
|
||||
feed_index(0, toc)
|
||||
|
||||
for i, p in enumerate(entries):
|
||||
entries[i] = os.path.join(dir, p.replace('/', os.sep))
|
||||
opf.create_spine(entries)
|
||||
opf.set_toc(toc)
|
||||
|
||||
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
|
||||
opf.render(opf_file, ncx_file)
|
||||
|
Loading…
x
Reference in New Issue
Block a user