mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Ming Pao
This commit is contained in:
parent
f31f109c23
commit
68a29c213d
@ -4,26 +4,27 @@ __copyright__ = '2010-2011, Eddie Lau'
|
|||||||
# Region - Hong Kong, Vancouver, Toronto
|
# Region - Hong Kong, Vancouver, Toronto
|
||||||
__Region__ = 'Hong Kong'
|
__Region__ = 'Hong Kong'
|
||||||
# Users of Kindle 3 with limited system-level CJK support
|
# Users of Kindle 3 with limited system-level CJK support
|
||||||
# please replace the following "True" with "False".
|
# please replace the following "True" with "False". (Default: True)
|
||||||
__MakePeriodical__ = True
|
__MakePeriodical__ = True
|
||||||
# Turn below to True if your device supports display of CJK titles
|
# Turn below to True if your device supports display of CJK titles (Default: False)
|
||||||
__UseChineseTitle__ = False
|
__UseChineseTitle__ = False
|
||||||
# Set it to False if you want to skip images
|
# Set it to False if you want to skip images (Default: True)
|
||||||
__KeepImages__ = True
|
__KeepImages__ = True
|
||||||
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source
|
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
|
||||||
__UseLife__ = True
|
__UseLife__ = True
|
||||||
# (HK only) It is to disable the column section which is now a premium content
|
# (HK only) Set to True to include premium content; leave as False to skip it (Default: False)
|
||||||
__InclCols__ = False
|
__InclPremium__ = False
|
||||||
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats
|
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
|
||||||
__ParsePFF__ = False
|
__ParsePFF__ = True
|
||||||
# (HK only) Turn below to True if you want hi-res images
|
# (HK only) Turn below to True if you want hi-res images (Default: False)
|
||||||
__HiResImg__ = False
|
__HiResImg__ = False
|
||||||
|
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Change Log:
|
Change Log:
|
||||||
|
2011/10/17: disable fetching of premium content, also improved txt source parsing
|
||||||
2011/10/04: option to get hi-res photos for the articles
|
2011/10/04: option to get hi-res photos for the articles
|
||||||
2011/09/21: fetching "column" section is made optional.
|
2011/09/21: fetching "column" section is made optional.
|
||||||
2011/09/18: parse "column" section stuff from source text file directly.
|
2011/09/18: parse "column" section stuff from source text file directly.
|
||||||
2011/09/07: disable "column" section as it is no longer offered free.
|
2011/09/07: disable "column" section as it is no longer offered free.
|
||||||
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
|
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
|
||||||
@ -72,7 +73,7 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
dict(attrs={'class':['content']}), # for content from txt
|
dict(attrs={'class':['content']}), # for content from txt
|
||||||
dict(attrs={'class':['photo']}),
|
dict(attrs={'class':['photo']}),
|
||||||
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
|
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
|
||||||
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
|
dict(name='img', attrs={'width':['180'], 'alt':['????']}), # images for source from life.mingpao.com
|
||||||
dict(attrs={'class':['images']}) # for images from txt
|
dict(attrs={'class':['images']}) # for images from txt
|
||||||
]
|
]
|
||||||
if __KeepImages__:
|
if __KeepImages__:
|
||||||
@ -208,18 +209,21 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
|
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
|
||||||
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
|
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
|
||||||
]:
|
]:
|
||||||
articles = self.parse_section2(url, keystr)
|
if __InclPremium__ == True:
|
||||||
|
articles = self.parse_section2_txt(url, keystr)
|
||||||
|
else:
|
||||||
|
articles = self.parse_section2(url, keystr)
|
||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
|
||||||
if __InclCols__ == True:
|
if __InclPremium__ == True:
|
||||||
# parse column section articles directly from .txt files
|
# parse column section articles directly from .txt files
|
||||||
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
|
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
|
||||||
]:
|
]:
|
||||||
articles = self.parse_section2_txt(url, keystr)
|
articles = self.parse_section2_txt(url, keystr)
|
||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
|
||||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||||
articles = self.parse_section(url)
|
articles = self.parse_section(url)
|
||||||
@ -253,10 +257,10 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||||
|
|
||||||
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
|
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
|
||||||
articles = self.parse_section2(url, keystr)
|
articles = self.parse_section2_txt(url, keystr)
|
||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
|
||||||
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||||
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||||
# articles = self.parse_section(url)
|
# articles = self.parse_section(url)
|
||||||
@ -270,18 +274,18 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
|
|
||||||
for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
|
for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
|
||||||
]:
|
]:
|
||||||
articles = self.parse_section2(url, keystr)
|
articles = self.parse_section2_txt(url, keystr)
|
||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
|
||||||
if __InclCols__ == True:
|
if __InclPremium__ == True:
|
||||||
# parse column section articles directly from .txt files
|
# parse column section articles directly from .txt files
|
||||||
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
|
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
|
||||||
]:
|
]:
|
||||||
articles = self.parse_section2_txt(url, keystr)
|
articles = self.parse_section2_txt(url, keystr)
|
||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
|
||||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||||
articles = self.parse_section(url)
|
articles = self.parse_section(url)
|
||||||
@ -333,7 +337,7 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
||||||
# replace the url to the print-friendly version
|
# replace the url to the print-friendly version
|
||||||
if __ParsePFF__ == True:
|
if __ParsePFF__ == True:
|
||||||
if url.rfind('Redirect') <> -1:
|
if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
|
||||||
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
|
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
|
||||||
url = re.sub('%2F.*%2F', '/', url)
|
url = re.sub('%2F.*%2F', '/', url)
|
||||||
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
|
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
|
||||||
@ -349,6 +353,8 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
|
|
||||||
# parse from life.mingpao.com
|
# parse from life.mingpao.com
|
||||||
def parse_section2(self, url, keystr):
|
def parse_section2(self, url, keystr):
|
||||||
|
br = mechanize.Browser()
|
||||||
|
br.set_handle_redirect(False)
|
||||||
self.get_fetchdate()
|
self.get_fetchdate()
|
||||||
soup = self.index_to_soup(url)
|
soup = self.index_to_soup(url)
|
||||||
a = soup.findAll('a', href=True)
|
a = soup.findAll('a', href=True)
|
||||||
@ -359,9 +365,13 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
title = self.tag_to_string(i)
|
title = self.tag_to_string(i)
|
||||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
|
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
|
||||||
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
|
try:
|
||||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
br.open_novisit(url)
|
||||||
included_urls.append(url)
|
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
|
||||||
|
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||||
|
included_urls.append(url)
|
||||||
|
except:
|
||||||
|
print 'skipping a premium article'
|
||||||
current_articles.reverse()
|
current_articles.reverse()
|
||||||
return current_articles
|
return current_articles
|
||||||
|
|
||||||
@ -382,7 +392,7 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
included_urls.append(url)
|
included_urls.append(url)
|
||||||
current_articles.reverse()
|
current_articles.reverse()
|
||||||
return current_articles
|
return current_articles
|
||||||
|
|
||||||
# parse from www.mingpaovan.com
|
# parse from www.mingpaovan.com
|
||||||
def parse_section3(self, url, baseUrl):
|
def parse_section3(self, url, baseUrl):
|
||||||
self.get_fetchdate()
|
self.get_fetchdate()
|
||||||
@ -470,23 +480,23 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
|
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
|
||||||
if __HiResImg__ == True:
|
if __HiResImg__ == True:
|
||||||
# TODO: add a _ in front of an image url
|
# TODO: add a _ in front of an image url
|
||||||
if url.rfind('news.mingpao.com') > -1:
|
if url.rfind('news.mingpao.com') > -1:
|
||||||
imglist = re.findall('src="?.*?jpg"', raw_html)
|
imglist = re.findall('src="?.*?jpg"', raw_html)
|
||||||
br = mechanize.Browser()
|
br = mechanize.Browser()
|
||||||
br.set_handle_redirect(False)
|
br.set_handle_redirect(False)
|
||||||
for img in imglist:
|
for img in imglist:
|
||||||
gifimg = img.replace('jpg"', 'gif"')
|
gifimg = img.replace('jpg"', 'gif"')
|
||||||
try:
|
try:
|
||||||
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
|
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
|
||||||
raw_html = raw_html.replace(img, gifimg)
|
raw_html = raw_html.replace(img, gifimg)
|
||||||
except:
|
except:
|
||||||
# find the location of the first _
|
# find the location of the first _
|
||||||
pos = img.find('_')
|
pos = img.find('_')
|
||||||
if pos > -1:
|
if pos > -1:
|
||||||
# if found, insert _ after the first _
|
# if found, insert _ after the first _
|
||||||
newimg = img[0:pos] + '_' + img[pos:]
|
newimg = img[0:pos] + '_' + img[pos:]
|
||||||
raw_html = raw_html.replace(img, newimg)
|
raw_html = raw_html.replace(img, newimg)
|
||||||
else:
|
else:
|
||||||
# if not found, insert _ after "
|
# if not found, insert _ after "
|
||||||
raw_html = raw_html.replace(img[1:], '"_' + img[1:])
|
raw_html = raw_html.replace(img[1:], '"_' + img[1:])
|
||||||
elif url.rfind('life.mingpao.com') > -1:
|
elif url.rfind('life.mingpao.com') > -1:
|
||||||
@ -510,7 +520,7 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
pos = img.rfind('/')
|
pos = img.rfind('/')
|
||||||
newimg = img[0:pos+1] + '_' + img[pos+1:]
|
newimg = img[0:pos+1] + '_' + img[pos+1:]
|
||||||
#print 'newimg: ', newimg
|
#print 'newimg: ', newimg
|
||||||
raw_html = raw_html.replace(img, newimg)
|
raw_html = raw_html.replace(img, newimg)
|
||||||
if url.rfind('ftp') == -1 and url.rfind('_print.htm') == -1:
|
if url.rfind('ftp') == -1 and url.rfind('_print.htm') == -1:
|
||||||
return raw_html
|
return raw_html
|
||||||
else:
|
else:
|
||||||
@ -549,10 +559,11 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
photo = photo.replace('class="photo"', '')
|
photo = photo.replace('class="photo"', '')
|
||||||
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
|
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
|
||||||
return new_raw_html + '</body></html>'
|
return new_raw_html + '</body></html>'
|
||||||
else:
|
else:
|
||||||
# .txt based file
|
# .txt based file
|
||||||
splitter = re.compile(r'\n') # Split the text source on newlines
|
splitter = re.compile(r'\n') # Split the text source on newlines
|
||||||
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
|
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
|
||||||
|
next_is_mov_link = False
|
||||||
next_is_img_txt = False
|
next_is_img_txt = False
|
||||||
title_started = False
|
title_started = False
|
||||||
met_article_start_char = False
|
met_article_start_char = False
|
||||||
@ -561,24 +572,35 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
met_article_start_char = True
|
met_article_start_char = True
|
||||||
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
|
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
|
||||||
else:
|
else:
|
||||||
if next_is_img_txt == False:
|
if next_is_img_txt == False and next_is_mov_link == False:
|
||||||
if item.startswith('='):
|
item = item.strip()
|
||||||
|
if item.startswith("=@"):
|
||||||
|
next_is_mov_link = True
|
||||||
|
elif item.startswith("=?"):
|
||||||
|
next_is_img_txt = True
|
||||||
|
new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
|
||||||
|
elif item.startswith('='):
|
||||||
next_is_img_txt = True
|
next_is_img_txt = True
|
||||||
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
|
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
|
||||||
else:
|
else:
|
||||||
if met_article_start_char == False:
|
if item <> '':
|
||||||
if title_started == False:
|
if next_is_img_txt == False and met_article_start_char == False:
|
||||||
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
|
if title_started == False:
|
||||||
title_started = True
|
#print 'Title started at ', item
|
||||||
|
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
|
||||||
|
title_started = True
|
||||||
|
else:
|
||||||
|
new_raw_html = new_raw_html + item + '\n'
|
||||||
else:
|
else:
|
||||||
new_raw_html = new_raw_html + item + '\n'
|
new_raw_html = new_raw_html + item + '<p>\n'
|
||||||
else:
|
|
||||||
new_raw_html = new_raw_html + item + '<p>\n'
|
|
||||||
else:
|
else:
|
||||||
next_is_img_txt = False
|
if next_is_mov_link == True:
|
||||||
new_raw_html = new_raw_html + item + '\n'
|
next_is_mov_link = False
|
||||||
|
else:
|
||||||
|
next_is_img_txt = False
|
||||||
|
new_raw_html = new_raw_html + item + '\n'
|
||||||
return new_raw_html + '</div></body></html>'
|
return new_raw_html + '</div></body></html>'
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for item in soup.findAll(style=True):
|
for item in soup.findAll(style=True):
|
||||||
del item['style']
|
del item['style']
|
||||||
@ -587,7 +609,7 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
for item in soup.findAll(stype=True):
|
for item in soup.findAll(stype=True):
|
||||||
del item['absmiddle']
|
del item['absmiddle']
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
def create_opf(self, feeds, dir=None):
|
def create_opf(self, feeds, dir=None):
|
||||||
if dir is None:
|
if dir is None:
|
||||||
dir = self.output_dir
|
dir = self.output_dir
|
||||||
@ -678,7 +700,7 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
if po is None:
|
if po is None:
|
||||||
self.play_order_counter += 1
|
self.play_order_counter += 1
|
||||||
po = self.play_order_counter
|
po = self.play_order_counter
|
||||||
parent.add_item('%sindex.html'%adir, None, a.title if a.title else ('Untitled Article'),
|
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
|
||||||
play_order=po, author=auth, description=desc)
|
play_order=po, author=auth, description=desc)
|
||||||
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
|
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
|
||||||
for sp in a.sub_pages:
|
for sp in a.sub_pages:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user