Update Ming Pao

This commit is contained in:
Kovid Goyal 2011-10-19 06:06:36 +05:30
parent f31f109c23
commit 68a29c213d

View File

@ -4,24 +4,25 @@ __copyright__ = '2010-2011, Eddie Lau'
# Region - Hong Kong, Vancouver, Toronto # Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Hong Kong' __Region__ = 'Hong Kong'
# Users of Kindle 3 with limited system-level CJK support # Users of Kindle 3 with limited system-level CJK support
# please replace the following "True" with "False". # please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True __MakePeriodical__ = True
# Turn below to True if your device supports display of CJK titles # Turn below to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = False __UseChineseTitle__ = False
# Set it to False if you want to skip images # Set it to False if you want to skip images (Default: True)
__KeepImages__ = True __KeepImages__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source # (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True __UseLife__ = True
# (HK only) Set below to True to include the column section, which is now premium content # (HK only) Set below to True to include premium content (Default: False)
__InclCols__ = False __InclPremium__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats # (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
__ParsePFF__ = False __ParsePFF__ = True
# (HK only) Turn below to True if you wish to have hi-res images # (HK only) Turn below to True if you wish to have hi-res images (Default: False)
__HiResImg__ = False __HiResImg__ = False
''' '''
Change Log: Change Log:
2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles 2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional. 2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly. 2011/09/18: parse "column" section stuff from source text file directly.
@ -72,7 +73,7 @@ class MPRecipe(BasicNewsRecipe):
dict(attrs={'class':['content']}), # for content from txt dict(attrs={'class':['content']}), # for content from txt
dict(attrs={'class':['photo']}), dict(attrs={'class':['photo']}),
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com dict(name='img', attrs={'width':['180'], 'alt':['????']}), # images for source from life.mingpao.com
dict(attrs={'class':['images']}) # for images from txt dict(attrs={'class':['images']}) # for images from txt
] ]
if __KeepImages__: if __KeepImages__:
@ -208,11 +209,14 @@ class MPRecipe(BasicNewsRecipe):
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'), (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal') (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]: ]:
if __InclPremium__ == True:
articles = self.parse_section2_txt(url, keystr)
else:
articles = self.parse_section2(url, keystr) articles = self.parse_section2(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
if __InclCols__ == True: if __InclPremium__ == True:
# parse column section articles directly from .txt files # parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]: ]:
@ -253,7 +257,7 @@ class MPRecipe(BasicNewsRecipe):
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) # feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]: for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
articles = self.parse_section2(url, keystr) articles = self.parse_section2_txt(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
@ -270,11 +274,11 @@ class MPRecipe(BasicNewsRecipe):
for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal') for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]: ]:
articles = self.parse_section2(url, keystr) articles = self.parse_section2_txt(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
if __InclCols__ == True: if __InclPremium__ == True:
# parse column section articles directly from .txt files # parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]: ]:
@ -333,7 +337,7 @@ class MPRecipe(BasicNewsRecipe):
url = 'http://news.mingpao.com/' + dateStr + '/' +url url = 'http://news.mingpao.com/' + dateStr + '/' +url
# replace the url with the print-friendly version # replace the url with the print-friendly version
if __ParsePFF__ == True: if __ParsePFF__ == True:
if url.rfind('Redirect') <> -1: if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
url = re.sub(dateStr + '.*' + dateStr, dateStr, url) url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
url = re.sub('%2F.*%2F', '/', url) url = re.sub('%2F.*%2F', '/', url)
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
@ -349,6 +353,8 @@ class MPRecipe(BasicNewsRecipe):
# parse from life.mingpao.com # parse from life.mingpao.com
def parse_section2(self, url, keystr): def parse_section2(self, url, keystr):
br = mechanize.Browser()
br.set_handle_redirect(False)
self.get_fetchdate() self.get_fetchdate()
soup = self.index_to_soup(url) soup = self.index_to_soup(url)
a = soup.findAll('a', href=True) a = soup.findAll('a', href=True)
@ -359,9 +365,13 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(i) title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False) url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
try:
br.open_novisit(url)
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''}) current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url) included_urls.append(url)
except:
print 'skipping a premium article'
current_articles.reverse() current_articles.reverse()
return current_articles return current_articles
@ -553,6 +563,7 @@ class MPRecipe(BasicNewsRecipe):
# .txt based file # .txt based file
splitter = re.compile(r'\n') # Match newline characters splitter = re.compile(r'\n') # Match newline characters
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">' new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_mov_link = False
next_is_img_txt = False next_is_img_txt = False
title_started = False title_started = False
met_article_start_char = False met_article_start_char = False
@ -561,19 +572,30 @@ class MPRecipe(BasicNewsRecipe):
met_article_start_char = True met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n' new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else: else:
if next_is_img_txt == False: if next_is_img_txt == False and next_is_mov_link == False:
if item.startswith('='): item = item.strip()
if item.startswith("=@"):
next_is_mov_link = True
elif item.startswith("=?"):
next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
elif item.startswith('='):
next_is_img_txt = True next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n' new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
else: else:
if met_article_start_char == False: if item <> '':
if next_is_img_txt == False and met_article_start_char == False:
if title_started == False: if title_started == False:
#print 'Title started at ', item
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n' new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
title_started = True title_started = True
else: else:
new_raw_html = new_raw_html + item + '\n' new_raw_html = new_raw_html + item + '\n'
else: else:
new_raw_html = new_raw_html + item + '<p>\n' new_raw_html = new_raw_html + item + '<p>\n'
else:
if next_is_mov_link == True:
next_is_mov_link = False
else: else:
next_is_img_txt = False next_is_img_txt = False
new_raw_html = new_raw_html + item + '\n' new_raw_html = new_raw_html + item + '\n'
@ -678,7 +700,7 @@ class MPRecipe(BasicNewsRecipe):
if po is None: if po is None:
self.play_order_counter += 1 self.play_order_counter += 1
po = self.play_order_counter po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else ('Untitled Article'), parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
play_order=po, author=auth, description=desc) play_order=po, author=auth, description=desc)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages: for sp in a.sub_pages: