Update Ming Pao

Kovid Goyal 2011-10-19 06:06:36 +05:30
parent f31f109c23
commit 68a29c213d


@ -4,24 +4,25 @@ __copyright__ = '2010-2011, Eddie Lau'
# Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Hong Kong'
# Users of Kindle 3 with limited system-level CJK support
# please replace the following "True" with "False".
# please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True
# Turn below to True if your device supports display of CJK titles
# Turn below to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = False
# Set it to False if you want to skip images
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) Set to True to include the column section, which is now premium content
__InclCols__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats
__ParsePFF__ = False
# (HK only) Turn below to True if you wish to fetch hi-res images
# (HK only) Set to True to include premium content (Default: False)
__InclPremium__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
__ParsePFF__ = True
# (HK only) Turn below to True if you wish to fetch hi-res images (Default: False)
__HiResImg__ = False
'''
Change Log:
2011/10/17: disabled fetching of premium content; improved txt source parsing
2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly.
@ -72,7 +73,7 @@ class MPRecipe(BasicNewsRecipe):
dict(attrs={'class':['content']}), # for content from txt
dict(attrs={'class':['photo']}),
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
dict(attrs={'class':['images']}) # for images from txt
]
if __KeepImages__:
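Each dict above is a calibre tag filter: it keeps tags whose name and attribute values match. Purely as an illustration (this is not calibre's internal matching code, and the sample HTML is made up), the img entry behaves roughly like this BeautifulSoup query:

# sketch only: an equivalent BeautifulSoup lookup for the img filter above
from calibre.ebooks.BeautifulSoup import BeautifulSoup  # the import style calibre recipes of this era use

html = u'<div><img width="180" alt="按圖放大" src="a.jpg"/><img width="60" src="b.jpg"/></div>'
soup = BeautifulSoup(html)
# a list value means "match any of the listed strings" for that attribute
hits = soup.findAll('img', attrs={'width': ['180'], 'alt': [u'按圖放大']})
# hits contains only the first <img>, the full-size photo the recipe keeps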
@ -208,11 +209,14 @@ class MPRecipe(BasicNewsRecipe):
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
articles = self.parse_section2(url, keystr)
if __InclPremium__ == True:
articles = self.parse_section2_txt(url, keystr)
else:
articles = self.parse_section2(url, keystr)
if articles:
feeds.append((title, articles))
if __InclCols__ == True:
if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
@ -253,7 +257,7 @@ class MPRecipe(BasicNewsRecipe):
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
articles = self.parse_section2(url, keystr)
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
@ -270,11 +274,11 @@ class MPRecipe(BasicNewsRecipe):
for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
articles = self.parse_section2(url, keystr)
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
if __InclCols__ == True:
if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
@ -333,7 +337,7 @@ class MPRecipe(BasicNewsRecipe):
url = 'http://news.mingpao.com/' + dateStr + '/' +url
# replace the url to the print-friendly version
if __ParsePFF__ == True:
if url.rfind('Redirect') <> -1:
if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
url = re.sub('%2F.*%2F', '/', url)
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
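For orientation, the two re.sub calls above turn a Redirect-style link into the direct printer-friendly path: the first drops everything between the duplicated date, and the second collapses the percent-encoded '%2F' separators back to '/'. A minimal sketch, using a made-up URL purely to exercise both substitutions (real Redirect links on news.mingpao.com may be shaped differently):

import re

dateStr = '20111019'
# hypothetical redirect-style URL, invented only to show both substitutions
url = 'SpecialRedirect.cfm?date=' + dateStr + '&file=' + dateStr + '%2Fga%2Fgaa1.htm'
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)  # drop everything between the duplicated dates
url = re.sub('%2F.*%2F', '/', url)                    # collapse the percent-encoded path back to '/'
# -> 'SpecialRedirect.cfm?date=20111019/gaa1.htm' for this made-up input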
@ -349,6 +353,8 @@ class MPRecipe(BasicNewsRecipe):
# parse from life.mingpao.com
def parse_section2(self, url, keystr):
br = mechanize.Browser()
br.set_handle_redirect(False)
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
@ -359,9 +365,13 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
try:
br.open_novisit(url)
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
except:
print 'skipping a premium article'
current_articles.reverse()
return current_articles
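The try/except added above works because redirects are no longer followed: br.set_handle_redirect(False) makes a premium article's 3xx answer surface as an exception from open_novisit, so the article is skipped instead of fetched. The same probe as a standalone sketch (the function name and usage are illustrative, not part of the recipe):

import mechanize

def is_directly_fetchable(url):
    # True when the page answers with content, False when it redirects
    # (which is how premium articles behave according to the recipe above)
    br = mechanize.Browser()
    br.set_handle_redirect(False)   # a 3xx response now raises instead of being followed
    try:
        br.open_novisit(url)        # probe without adding the page to browser history
        return True
    except Exception:
        return False

# usage sketch: keep only the articles that can be fetched directly
# urls = [u for u in candidate_urls if is_directly_fetchable(u)]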
@ -553,6 +563,7 @@ class MPRecipe(BasicNewsRecipe):
# .txt based file
splitter = re.compile(r'\n') # split the text source on newlines
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_mov_link = False
next_is_img_txt = False
title_started = False
met_article_start_char = False
@ -561,22 +572,33 @@ class MPRecipe(BasicNewsRecipe):
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else:
if next_is_img_txt == False:
if item.startswith('='):
if next_is_img_txt == False and next_is_mov_link == False:
item = item.strip()
if item.startswith("=@"):
next_is_mov_link = True
elif item.startswith("=?"):
next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
elif item.startswith('='):
next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
else:
if met_article_start_char == False:
if title_started == False:
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
title_started = True
if item <> '':
if next_is_img_txt == False and met_article_start_char == False:
if title_started == False:
#print 'Title started at ', item
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
title_started = True
else:
new_raw_html = new_raw_html + item + '\n'
else:
new_raw_html = new_raw_html + item + '\n'
else:
new_raw_html = new_raw_html + item + '<p>\n'
new_raw_html = new_raw_html + item + '<p>\n'
else:
next_is_img_txt = False
new_raw_html = new_raw_html + item + '\n'
if next_is_mov_link == True:
next_is_mov_link = False
else:
next_is_img_txt = False
new_raw_html = new_raw_html + item + '\n'
return new_raw_html + '</div></body></html>'
def preprocess_html(self, soup):
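The reworked loop above dispatches on the leading marker of each line in the .txt source: '=@' introduces a movie link (the marker and the following line are dropped), '=?' names a .gif image, a bare '=' names a .jpg image, and the line after an image marker is its caption. A simplified sketch of just that marker handling (the heading and article-start bookkeeping of the real method is omitted; the line layout is assumed to match the recipe's source files):

def render_txt(lines):
    html = []
    next_is_mov_link = False   # the line after '=@' is dropped
    next_is_img_txt = False    # the line after '='/'=?' is an image caption
    for item in lines:
        item = item.strip()
        if next_is_mov_link:
            next_is_mov_link = False            # movie link: skip this line entirely
        elif next_is_img_txt:
            next_is_img_txt = False
            html.append(item)                   # caption text under the image
        elif item.startswith('=@'):
            next_is_mov_link = True
        elif item.startswith('=?'):
            next_is_img_txt = True
            html.append('<img src="' + item[2:].strip() + '.gif" /><p>')
        elif item.startswith('='):
            next_is_img_txt = True
            html.append('<img src="' + item[1:].strip() + '.jpg" /><p>')
        else:
            html.append(item + '<p>')           # ordinary body text line
    return '\n'.join(html)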
@ -678,7 +700,7 @@ class MPRecipe(BasicNewsRecipe):
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else ('Untitled Article'),
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
play_order=po, author=auth, description=desc)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages: