"),
- (re.compile(r"
", re.DOTALL | re.IGNORECASE),
- lambda match: "")
- ]
- elif __Region__ == 'Vancouver':
- if __UseChineseTitle__ is True:
- title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
- else:
- title = 'Ming Pao - Vancouver'
- description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
- category = 'Chinese, News, Vancouver'
- extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa
- masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif'
- keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
- dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[
- '3'], 'cellpadding':['3'], 'id':['tblContent3']}),
- dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[
- '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
- ]
- if __KeepImages__:
- # the magnifier icon
- remove_tags = [
- dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})]
- else:
- remove_tags = [dict(name='img')]
- remove_attributes = ['width']
- preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL | re.IGNORECASE),
- lambda match: ''),
- ]
- elif __Region__ == 'Toronto':
- if __UseChineseTitle__ is True:
- title = u'\u660e\u5831 (\u591a\u502b\u591a)'
- else:
- title = 'Ming Pao - Toronto'
- description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
- category = 'Chinese, News, Toronto'
- extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa
- masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif'
- keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
- dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[
- '3'], 'cellpadding':['3'], 'id':['tblContent3']}),
- dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[
- '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
- ]
- if __KeepImages__:
- # the magnifier icon
- remove_tags = [
- dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})]
- else:
- remove_tags = [dict(name='img')]
- remove_attributes = ['width']
- preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL | re.IGNORECASE),
- lambda match: ''),
- ]
-
- oldest_article = 1
- max_articles_per_feed = 100
- __author__ = 'Eddie Lau'
- publisher = 'MingPao'
- remove_javascript = True
- use_embedded_content = False
- no_stylesheets = True
- language = 'zh'
- encoding = 'Big5-HKSCS'
- recursions = 0
- conversion_options = {'linearize_tables': True}
- timefmt = ''
-
- def get_dtlocal(self):
- dt_utc = datetime.datetime.utcnow()
- if __Region__ == 'Hong Kong':
- # convert UTC to local hk time - at HKT 4.30am, all news are
- # available
- dt_local = dt_utc + \
- datetime.timedelta(8.0 / 24) - datetime.timedelta(4.5 / 24)
- # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
- elif __Region__ == 'Vancouver':
- # convert UTC to local Vancouver time - at PST time 5.30am, all
- # news are available
- dt_local = dt_utc + \
- datetime.timedelta(-8.0 / 24) - datetime.timedelta(5.5 / 24)
- # dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(5.5/24)
- elif __Region__ == 'Toronto':
- # convert UTC to local Toronto time - at EST time 8.30am, all news
- # are available
- dt_local = dt_utc + \
- datetime.timedelta(-5.0 / 24) - datetime.timedelta(8.5 / 24)
- # dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(8.5/24)
- return dt_local
-
- def get_fetchdate(self):
- if __Date__ != '':
- return __Date__
- else:
- return self.get_dtlocal().strftime("%Y%m%d")
-
- def get_fetchformatteddate(self):
- if __Date__ != '':
- return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8]
- else:
- return self.get_dtlocal().strftime("%Y-%m-%d")
-
- def get_fetchyear(self):
- if __Date__ != '':
- return __Date__[0:4]
- else:
- return self.get_dtlocal().strftime("%Y")
-
- def get_fetchmonth(self):
- if __Date__ != '':
- return __Date__[4:6]
- else:
- return self.get_dtlocal().strftime("%m")
-
- def get_fetchday(self):
- if __Date__ != '':
- return __Date__[6:8]
- else:
- return self.get_dtlocal().strftime("%d")
-
- def get_cover_url(self):
- if __Region__ == 'Hong Kong':
- cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + \
- '_' + self.get_fetchday() + 'gacov.jpg'
- elif __Region__ == 'Vancouver':
- cover = 'http://www.mingpaovan.com/ftp/News/' + \
- self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg'
- elif __Region__ == 'Toronto':
- cover = 'http://www.mingpaotor.com/ftp/News/' + \
- self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg'
- br = BasicNewsRecipe.get_browser(self)
- try:
- br.open(cover)
- except:
- cover = None
- return cover
-
- def parse_index(self):
- feeds = []
- dateStr = self.get_fetchdate()
-
- if __Region__ == 'Hong Kong':
- if __UseLife__:
- for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'),
- (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' +
- dateStr + '&Category=nalgb', 'nal'),
- (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' +
- dateStr + '&Category=nalgf', 'nal'),
- (u'\u793e\u8a55/\u7b46\u9663 Editorial',
- 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr', 'nal'),
- (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' +
- dateStr + '&Category=nalfa', 'nal'),
- (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' +
- dateStr + '&Category=nalca', 'nal'),
- (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' +
- dateStr + '&Category=nalta', 'nal'),
- (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' +
- dateStr + '&Category=nalea', 'nal'),
- (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' +
- dateStr + '&Category=nalsp', 'nal'),
- (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' +
- dateStr + '&Category=nalma', 'nal')
- ]:
- if __InclPremium__ is True:
- articles = self.parse_section2_txt(url, keystr)
- else:
- articles = self.parse_section2(url, keystr)
- if articles:
- feeds.append((title, articles))
-
- if __InclPremium__ is True:
- # parse column section articles directly from .txt files
- for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa
- ]:
- articles = self.parse_section2_txt(url, keystr)
- if articles:
- feeds.append((title, articles))
-
- for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
- (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
- articles = self.parse_section(url)
- if articles:
- feeds.append((title, articles))
- else:
- for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
- (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' +
- dateStr + '/gbindex.htm'),
- (u'\u6559\u80b2 Education',
- 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
- (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
- articles = self.parse_section(url)
- if articles:
- feeds.append((title, articles))
-
- # special- editorial
- # ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
- # if ed_articles:
- # feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
-
- for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
- (u'\u4e2d\u570b China', 'http://news.mingpao.com/' +
- dateStr + '/caindex.htm'),
- (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
- articles = self.parse_section(url)
- if articles:
- feeds.append((title, articles))
-
- # special - finance
- # fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
- # fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
- # if fin_articles:
- # feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
-
- for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
- articles = self.parse_section2_txt(url, keystr)
- if articles:
- feeds.append((title, articles))
-
- # for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
- # (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
- # articles = self.parse_section(url)
- # if articles:
- # feeds.append((title, articles))
-
- # special - entertainment
- # ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
- # if ent_articles:
- # feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
-
- for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
- ]:
- articles = self.parse_section2_txt(url, keystr)
- if articles:
- feeds.append((title, articles))
-
- if __InclPremium__ is True:
- # parse column section articles directly from .txt files
- for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa
- ]:
- articles = self.parse_section2_txt(url, keystr)
- if articles:
- feeds.append((title, articles))
-
- for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
- (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
- articles = self.parse_section(url)
- if articles:
- feeds.append((title, articles))
-
- elif __Region__ == 'Vancouver':
- for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
- (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' +
- dateStr + '/VBindex.htm'),
- (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' +
- dateStr + '/VDindex.htm'),
- (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' +
- dateStr + '/HK-VGindex.htm'),
- (u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' +
- dateStr + '/VTindex.htm'),
- (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' +
- dateStr + '/VCindex.htm'),
- (u'\u7d93\u6fdf Economics',
- 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'),
- (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' +
- dateStr + '/VSindex.htm'),
- (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' +
- dateStr + '/HK-MAindex.htm'),
- (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'), ]:
- articles = self.parse_section3(
- url, 'http://www.mingpaovan.com/')
- if articles:
- feeds.append((title, articles))
- elif __Region__ == 'Toronto':
- for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'),
- (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' +
- dateStr + '/TDindex.htm'),
- (u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' +
- dateStr + '/TFindex.htm'),
- (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' +
- dateStr + '/TCAindex.htm'),
- (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' +
- dateStr + '/TTAindex.htm'),
- (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' +
- dateStr + '/HK-GAindex.htm'),
- (u'\u7d93\u6fdf Economics',
- 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'),
- (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' +
- dateStr + '/TSindex.htm'),
- (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' +
- dateStr + '/HK-MAindex.htm'),
- (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'), ]:
- articles = self.parse_section3(
- url, 'http://www.mingpaotor.com/')
- if articles:
- feeds.append((title, articles))
- return feeds
-
- # parse from news.mingpao.com
- def parse_section(self, url):
- dateStr = self.get_fetchdate()
- soup = self.index_to_soup(url)
- divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']})
- current_articles = []
- included_urls = []
- divs.reverse()
- for i in divs:
- a = i.find('a', href=True)
- title = self.tag_to_string(a)
- url = a.get('href', False)
- url = 'http://news.mingpao.com/' + dateStr + '/' + url
- # replace the url with the print-friendly version
- if __ParsePFF__ is True:
- if url.rfind('Redirect') != -1 and __InclPremium__ is True:
- url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
- url = re.sub('%2F.*%2F', '/', url)
- title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
- url = url.replace('%2Etxt', '_print.htm')
- url = url.replace('%5F', '_')
- else:
- url = url.replace('.htm', '_print.htm')
- if url not in included_urls and url.rfind('Redirect') == -1:
- current_articles.append(
- {'title': title, 'url': url, 'description': '', 'date': ''})
- included_urls.append(url)
- current_articles.reverse()
- return current_articles
-
- # parse from life.mingpao.com
- def parse_section2(self, url, keystr):
- br = mechanize.Browser()
- br.set_handle_redirect(False)
- self.get_fetchdate()
- soup = self.index_to_soup(url)
- a = soup.findAll('a', href=True)
- a.reverse()
- current_articles = []
- included_urls = []
- for i in a:
- title = self.tag_to_string(i)
- url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
- if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
- try:
- br.open_novisit(url)
- # use printed version of the article
- url = url.replace('dailynews3.cfm', 'dailynews3a.cfm')
- current_articles.append(
- {'title': title, 'url': url, 'description': ''})
- included_urls.append(url)
- except:
- print('skipping a premium article')
- current_articles.reverse()
- return current_articles
-
- # parse from text file of life.mingpao.com
- def parse_section2_txt(self, url, keystr):
- self.get_fetchdate()
- soup = self.index_to_soup(url)
- a = soup.findAll('a', href=True)
- a.reverse()
- current_articles = []
- included_urls = []
- for i in a:
- title = self.tag_to_string(i)
- url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
- if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
- # use printed version of the article
- url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/')
- current_articles.append(
- {'title': title, 'url': url, 'description': ''})
- included_urls.append(url)
- current_articles.reverse()
- return current_articles
-
- # parse from www.mingpaovan.com
- def parse_section3(self, url, baseUrl):
- self.get_fetchdate()
- soup = self.index_to_soup(url)
- divs = soup.findAll(attrs={'class': ['ListContentLargeLink']})
- current_articles = []
- included_urls = []
- divs.reverse()
- for i in divs:
- title = self.tag_to_string(i)
- urlstr = i.get('href', False)
- urlstr = baseUrl + '/' + urlstr.replace('../../../', '')
- if urlstr not in included_urls:
- current_articles.append(
- {'title': title, 'url': urlstr, 'description': '', 'date': ''})
- included_urls.append(urlstr)
- current_articles.reverse()
- return current_articles
-
- def parse_ed_section(self, url):
- self.get_fetchdate()
- soup = self.index_to_soup(url)
- a = soup.findAll('a', href=True)
- a.reverse()
- current_articles = []
- included_urls = []
- for i in a:
- title = self.tag_to_string(i)
- url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
- if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1):
- current_articles.append(
- {'title': title, 'url': url, 'description': ''})
- included_urls.append(url)
- current_articles.reverse()
- return current_articles
-
- def parse_fin_section(self, url):
- self.get_fetchdate()
- soup = self.index_to_soup(url)
- a = soup.findAll('a', href=True)
- current_articles = []
- included_urls = []
- for i in a:
- # url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
- url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
- # if url not in included_urls and not url.rfind(dateStr) == -1 and
- # url.rfind('index') == -1:
- if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1):
- title = self.tag_to_string(i)
- current_articles.append(
- {'title': title, 'url': url, 'description': ''})
- included_urls.append(url)
- return current_articles
-
- def parse_ent_section(self, url):
- self.get_fetchdate()
- soup = self.index_to_soup(url)
- a = soup.findAll('a', href=True)
- a.reverse()
- current_articles = []
- included_urls = []
- for i in a:
- title = self.tag_to_string(i)
- url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
- if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1):
- current_articles.append(
- {'title': title, 'url': url, 'description': ''})
- included_urls.append(url)
- current_articles.reverse()
- return current_articles
-
- def parse_col_section(self, url):
- self.get_fetchdate()
- soup = self.index_to_soup(url)
- a = soup.findAll('a', href=True)
- a.reverse()
- current_articles = []
- included_urls = []
- for i in a:
- title = self.tag_to_string(i)
- url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
- if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1):
- current_articles.append(
- {'title': title, 'url': url, 'description': ''})
- included_urls.append(url)
- current_articles.reverse()
- return current_articles
-
- # preprocess those .txt and javascript based files
- def preprocess_raw_html(self, raw_html, url):
- new_html = raw_html
- if url.rfind('ftp') != -1 or url.rfind('_print.htm') != -1:
- if url.rfind('_print.htm') != -1:
- # javascript based file
- splitter = re.compile(r'\n')
- new_raw_html = '<html><head><title>Untitled</title></head>'
- new_raw_html = new_raw_html + '<body>'
- for item in splitter.split(raw_html):
- if item.startswith('var heading1 ='):
- heading = item.replace('var heading1 = \'', '')
- heading = heading.replace('\'', '')
- heading = heading.replace(';', '')
- new_raw_html = new_raw_html + '<div class="heading">' + heading
- if item.startswith('var heading2 ='):
- heading = item.replace('var heading2 = \'', '')
- heading = heading.replace('\'', '')
- heading = heading.replace(';', '')
- if heading != '':
- new_raw_html = new_raw_html + '<br>' + heading + '</div>'
- else:
- new_raw_html = new_raw_html + '</div>'
- if item.startswith('var content ='):
- content = item.replace("var content = ", '')
- content = content.replace('\'', '')
- content = content.replace(';', '')
- new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
- if item.startswith('var photocontent ='):
- photo = item.replace('var photocontent = \'', '')
- photo = photo.replace('\'', '')
- photo = photo.replace(';', '')
- photo = photo.replace('<tr>', '')
- photo = photo.replace('<td>', '')
- photo = photo.replace('</tr>', '')
- photo = photo.replace('</td>', '')
- photo = photo.replace('class="photo"', '')
- new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
- new_html = new_raw_html + '</body></html>'
- else:
- # .txt based file
- splitter = re.compile(r'\n')  # split the raw text into lines
- new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
- next_is_img_txt = False
- title_started = False
- title_break_reached = False
- met_article_start_char = False
- for item in splitter.split(raw_html):
- item = item.strip()
- # if title already reached but break between title and
- # content not yet found, record title_break_reached
- if title_started is True and title_break_reached is False and item == '':
- title_break_reached = True
- # if title reached and title_break_reached and met_article_start_char is False and item is not empty
- # start content
- elif title_started is True and title_break_reached is True and met_article_start_char is False:
- if item != '':
- met_article_start_char = True
- new_raw_html = new_raw_html + '</div><div class="content"><p>' + item
- if __HiResImg__ is True:
- # TODO: add a _ in front of an image url
- if url.rfind('news.mingpao.com') > -1:
- imglist = re.findall('src="?.*?jpg"', new_html)
- br = mechanize.Browser()
- br.set_handle_redirect(False)
- for img in imglist:
- gifimg = img.replace('jpg"', 'gif"')
- try:
- br.open_novisit(
- url + "/../" + gifimg[5:len(gifimg) - 1])
- new_html = new_html.replace(img, gifimg)
- except:
- # find the location of the first _
- pos = img.find('_')
- if pos > -1:
- # if found, insert _ after the first _
- newimg = img[0:pos] + '_' + img[pos:]
- new_html = new_html.replace(img, newimg)
- else:
- # if not found, insert _ after "
- new_html = new_html.replace(
- img[1:], '"_' + img[1:])
- elif url.rfind('life.mingpao.com') > -1:
- imglist = re.findall('src=\'?.*?jpg\'', new_html)
- br = mechanize.Browser()
- br.set_handle_redirect(False)
- # print 'Img list: ', imglist, '\n'
- for img in imglist:
- # print 'Found img: ', img
- gifimg = img.replace('jpg\'', 'gif\'')
- try:
- gifurl = re.sub(r'dailynews.*txt', '', url)
- br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1])
- new_html = new_html.replace(img, gifimg)
- except:
- pos = img.rfind('/')
- newimg = img[0:pos + 1] + '_' + img[pos + 1:]
- new_html = new_html.replace(img, newimg)
- # repeat with src quoted by double quotes, for text parsed from
- # src txt
- imglist = re.findall('src="?.*?jpg"', new_html)
- for img in imglist:
- # print 'Found img: ', img
- gifimg = img.replace('jpg"', 'gif"')
- try:
- # print 'url', url
- pos = url.rfind('/')
- gifurl = url[:pos + 1]
- # print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
- br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1])
- new_html = new_html.replace(img, gifimg)
- except:
- pos = img.find('"')
- newimg = img[0:pos + 1] + '_' + img[pos + 1:]
- # print 'Use hi-res img', newimg
- new_html = new_html.replace(img, newimg)
- return new_html
-
- def preprocess_html(self, soup):
- for item in soup.findAll(style=True):
- del item['style']
- for item in soup.findAll(width=True):
- del item['width']
- for item in soup.findAll(align='absmiddle'):
- del item['align']
- return soup
-
- def populate_article_metadata(self, article, soup, first):
- # thumbnails shouldn't be available if using hi-res images
- if __IncludeThumbnails__ and __HiResImg__ is False and first and hasattr(self, 'add_toc_thumbnail'):
- img = soup.find('img')
- if img is not None:
- self.add_toc_thumbnail(article, img['src'])
-
- try:
- if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
- # look for content
- articlebodies = soup.findAll(
- 'div', attrs={'id': 'newscontent'})
- if not articlebodies:
- articlebodies = soup.findAll(
- 'div', attrs={'id': 'newscontent01'})
- if not articlebodies:
- articlebodies = soup.findAll(
- 'div', attrs={'class': 'content'})
- if not articlebodies:
- articlebodies = soup.findAll('div', attrs={'id': 'font'})
- if articlebodies:
- for articlebody in articlebodies:
- if articlebody:
- # the text may or may not be enclosed in a <p> tag
- paras = articlebody.findAll('p')
- if not paras:
- paras = articlebody
- textFound = False
- for p in paras:
- if not textFound:
- summary_candidate = self.tag_to_string(
- p).strip()
- summary_candidate = summary_candidate.replace(
- u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
- if len(summary_candidate) > 0:
- article.summary = article.text_summary = summary_candidate
- textFound = True
- else:
- # display a simple text
- # article.summary = article.text_summary = u'\u66f4\u591a......'
- # display word counts
- counts = 0
- articlebodies = soup.findAll(
- 'div', attrs={'id': 'newscontent'})
- if not articlebodies:
- articlebodies = soup.findAll(
- 'div', attrs={'id': 'newscontent01'})
- if not articlebodies:
- articlebodies = soup.findAll(
- 'div', attrs={'class': 'content'})
- if not articlebodies:
- articlebodies = soup.findAll('div', attrs={'id': 'font'})
- if articlebodies:
- for articlebody in articlebodies:
- # the text may or may not be enclosed in a <p> tag
- paras = articlebody.findAll('p')
- if not paras:
- paras = articlebody
- for p in paras:
- summary_candidate = self.tag_to_string(p).strip()
- counts += len(summary_candidate)
- article.summary = article.text_summary = u'\uff08' + \
- str(counts) + u'\u5b57\uff09'
- except:
- self.log("Error creating article descriptions")
- return
-
- # override from the one in version 0.8.31
- def create_opf(self, feeds, dir=None):
- if dir is None:
- dir = self.output_dir
- title = self.short_title()
- # change 1: allow our own flag to tell if a periodical is to be generated
- # also use a custom date instead of the current time
- if __MakePeriodical__ is False or self.output_profile.periodical_date_in_title:
- title = title + ' ' + self.get_fetchformatteddate()
- # end of change 1
- # change 2: __appname__ replaced by newspaper publisher
- __appname__ = self.publisher
- mi = MetaInformation(title, [__appname__])
- mi.publisher = __appname__
- mi.author_sort = __appname__
- # change 3: use __MakePeriodical__ flag to tell if a periodical should
- # be generated
- if __MakePeriodical__ is True:
- mi.publication_type = 'periodical:' + \
- self.publication_type + ':' + self.short_title()
- else:
- mi.publication_type = self.publication_type + ':' + self.short_title()
- # mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
- # change 4: in the following, all the nowf() are changed to adjusted time
- # This one doesn't matter
- mi.timestamp = nowf()
- # change 5: skip listing the articles
- # article_titles, aseen = [], set()
- # for f in feeds:
- # for a in f:
- # if a.title and a.title not in aseen:
- # aseen.add(a.title)
- # article_titles.append(force_unicode(a.title, 'utf-8'))
-
- # mi.comments = self.description
- # if not isinstance(mi.comments, unicode):
- # mi.comments = mi.comments.decode('utf-8', 'replace')
- # mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
- # '\n\n'.join(article_titles))
-
- language = canonicalize_lang(self.language)
- if language is not None:
- mi.language = language
- # This one affects the pub date shown in kindle title
- # mi.pubdate = nowf()
- # now appears to need the time field to be > 12.00noon as well
- mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(
- self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
- opf_path = os.path.join(dir, 'index.opf')
- ncx_path = os.path.join(dir, 'index.ncx')
-
- opf = OPFCreator(dir, mi)
- # Add mastheadImage entry to <guide> section