This commit is contained in:
Kovid Goyal 2011-12-22 19:04:37 +05:30
parent 6240a00209
commit fc8e5feabd
5 changed files with 66 additions and 69 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

View File

@ -29,14 +29,14 @@ __Date__ = ''
''' '''
Change Log: Change Log:
2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away 2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device. download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010' 2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt 2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing 2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing 2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles 2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional. 2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly. 2011/09/18: parse "column" section stuff from source text file directly.
2011/09/07: disable "column" section as it is no longer offered free. 2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
@ -60,7 +60,6 @@ Change Log:
2010/10/31: skip repeated articles in section pages 2010/10/31: skip repeated articles in section pages
''' '''
from calibre import (browser, iswindows, __appname__, force_unicode, preferred_encoding, as_unicode)
from calibre.utils.date import now as nowf from calibre.utils.date import now as nowf
import os, datetime, re, mechanize import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
@ -204,13 +203,13 @@ class MPRecipe(BasicNewsRecipe):
return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8] return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
else: else:
return self.get_dtlocal().strftime("%Y-%m-%d") return self.get_dtlocal().strftime("%Y-%m-%d")
def get_fetchyear(self): def get_fetchyear(self):
if __Date__ <> '': if __Date__ <> '':
return __Date__[0:4] return __Date__[0:4]
else: else:
return self.get_dtlocal().strftime("%Y") return self.get_dtlocal().strftime("%Y")
def get_fetchmonth(self): def get_fetchmonth(self):
if __Date__ <> '': if __Date__ <> '':
return __Date__[4:6] return __Date__[4:6]
@ -268,7 +267,7 @@ class MPRecipe(BasicNewsRecipe):
articles = self.parse_section2_txt(url, keystr) articles = self.parse_section2_txt(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url) articles = self.parse_section(url)
@ -305,7 +304,7 @@ class MPRecipe(BasicNewsRecipe):
articles = self.parse_section2_txt(url, keystr) articles = self.parse_section2_txt(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), #for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: # (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
# articles = self.parse_section(url) # articles = self.parse_section(url)
@ -322,7 +321,7 @@ class MPRecipe(BasicNewsRecipe):
articles = self.parse_section2_txt(url, keystr) articles = self.parse_section2_txt(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
if __InclPremium__ == True: if __InclPremium__ == True:
# parse column section articles directly from .txt files # parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
@ -330,7 +329,7 @@ class MPRecipe(BasicNewsRecipe):
articles = self.parse_section2_txt(url, keystr) articles = self.parse_section2_txt(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url) articles = self.parse_section(url)
@ -410,7 +409,7 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(i) title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False) url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
try: try:
br.open_novisit(url) br.open_novisit(url)
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''}) current_articles.append({'title': title, 'url': url, 'description': ''})
@ -437,7 +436,7 @@ class MPRecipe(BasicNewsRecipe):
included_urls.append(url) included_urls.append(url)
current_articles.reverse() current_articles.reverse()
return current_articles return current_articles
# parse from www.mingpaovan.com # parse from www.mingpaovan.com
def parse_section3(self, url, baseUrl): def parse_section3(self, url, baseUrl):
self.get_fetchdate() self.get_fetchdate()
@ -559,7 +558,7 @@ class MPRecipe(BasicNewsRecipe):
photo = photo.replace('class="photo"', '') photo = photo.replace('class="photo"', '')
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>' new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
new_html = new_raw_html + '</body></html>' new_html = new_raw_html + '</body></html>'
else: else:
# .txt based file # .txt based file
splitter = re.compile(r'\n') # Match non-digits splitter = re.compile(r'\n') # Match non-digits
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">' new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
@ -622,23 +621,23 @@ class MPRecipe(BasicNewsRecipe):
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010') #raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
if __HiResImg__ == True: if __HiResImg__ == True:
# TODO: add a _ in front of an image url # TODO: add a _ in front of an image url
if url.rfind('news.mingpao.com') > -1: if url.rfind('news.mingpao.com') > -1:
imglist = re.findall('src="?.*?jpg"', new_html) imglist = re.findall('src="?.*?jpg"', new_html)
br = mechanize.Browser() br = mechanize.Browser()
br.set_handle_redirect(False) br.set_handle_redirect(False)
for img in imglist: for img in imglist:
gifimg = img.replace('jpg"', 'gif"') gifimg = img.replace('jpg"', 'gif"')
try: try:
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1]) br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg) new_html = new_html.replace(img, gifimg)
except: except:
# find the location of the first _ # find the location of the first _
pos = img.find('_') pos = img.find('_')
if pos > -1: if pos > -1:
# if found, insert _ after the first _ # if found, insert _ after the first _
newimg = img[0:pos] + '_' + img[pos:] newimg = img[0:pos] + '_' + img[pos:]
new_html = new_html.replace(img, newimg) new_html = new_html.replace(img, newimg)
else: else:
# if not found, insert _ after " # if not found, insert _ after "
new_html = new_html.replace(img[1:], '"_' + img[1:]) new_html = new_html.replace(img[1:], '"_' + img[1:])
elif url.rfind('life.mingpao.com') > -1: elif url.rfind('life.mingpao.com') > -1:
@ -675,7 +674,7 @@ class MPRecipe(BasicNewsRecipe):
#print 'Use hi-res img', newimg #print 'Use hi-res img', newimg
new_html = new_html.replace(img, newimg) new_html = new_html.replace(img, newimg)
return new_html return new_html
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
@ -684,7 +683,7 @@ class MPRecipe(BasicNewsRecipe):
for item in soup.findAll(stype=True): for item in soup.findAll(stype=True):
del item['absmiddle'] del item['absmiddle']
return soup return soup
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images # thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'): if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
@ -699,7 +698,7 @@ class MPRecipe(BasicNewsRecipe):
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'}) articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'}) articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'}) articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies: if articlebodies:
@ -721,12 +720,12 @@ class MPRecipe(BasicNewsRecipe):
# display a simple text # display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......' #article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts # display word counts
counts = 0 counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'}) articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'}) articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'}) articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'}) articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies: if articlebodies:
@ -908,5 +907,5 @@ class MPRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file) opf.render(opf_file, ncx_file)

View File

@ -29,14 +29,14 @@ __Date__ = ''
''' '''
Change Log: Change Log:
2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away 2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device. download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010' 2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt 2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing 2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing 2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles 2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional. 2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly. 2011/09/18: parse "column" section stuff from source text file directly.
2011/09/07: disable "column" section as it is no longer offered free. 2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
@ -60,7 +60,6 @@ Change Log:
2010/10/31: skip repeated articles in section pages 2010/10/31: skip repeated articles in section pages
''' '''
from calibre import (browser, iswindows, __appname__, force_unicode, preferred_encoding, as_unicode)
from calibre.utils.date import now as nowf from calibre.utils.date import now as nowf
import os, datetime, re, mechanize import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
@ -204,13 +203,13 @@ class MPRecipe(BasicNewsRecipe):
return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8] return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
else: else:
return self.get_dtlocal().strftime("%Y-%m-%d") return self.get_dtlocal().strftime("%Y-%m-%d")
def get_fetchyear(self): def get_fetchyear(self):
if __Date__ <> '': if __Date__ <> '':
return __Date__[0:4] return __Date__[0:4]
else: else:
return self.get_dtlocal().strftime("%Y") return self.get_dtlocal().strftime("%Y")
def get_fetchmonth(self): def get_fetchmonth(self):
if __Date__ <> '': if __Date__ <> '':
return __Date__[4:6] return __Date__[4:6]
@ -268,7 +267,7 @@ class MPRecipe(BasicNewsRecipe):
articles = self.parse_section2_txt(url, keystr) articles = self.parse_section2_txt(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url) articles = self.parse_section(url)
@ -305,7 +304,7 @@ class MPRecipe(BasicNewsRecipe):
articles = self.parse_section2_txt(url, keystr) articles = self.parse_section2_txt(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), #for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: # (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
# articles = self.parse_section(url) # articles = self.parse_section(url)
@ -322,7 +321,7 @@ class MPRecipe(BasicNewsRecipe):
articles = self.parse_section2_txt(url, keystr) articles = self.parse_section2_txt(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
if __InclPremium__ == True: if __InclPremium__ == True:
# parse column section articles directly from .txt files # parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
@ -330,7 +329,7 @@ class MPRecipe(BasicNewsRecipe):
articles = self.parse_section2_txt(url, keystr) articles = self.parse_section2_txt(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url) articles = self.parse_section(url)
@ -410,7 +409,7 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(i) title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False) url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
try: try:
br.open_novisit(url) br.open_novisit(url)
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''}) current_articles.append({'title': title, 'url': url, 'description': ''})
@ -437,7 +436,7 @@ class MPRecipe(BasicNewsRecipe):
included_urls.append(url) included_urls.append(url)
current_articles.reverse() current_articles.reverse()
return current_articles return current_articles
# parse from www.mingpaovan.com # parse from www.mingpaovan.com
def parse_section3(self, url, baseUrl): def parse_section3(self, url, baseUrl):
self.get_fetchdate() self.get_fetchdate()
@ -559,7 +558,7 @@ class MPRecipe(BasicNewsRecipe):
photo = photo.replace('class="photo"', '') photo = photo.replace('class="photo"', '')
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>' new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
new_html = new_raw_html + '</body></html>' new_html = new_raw_html + '</body></html>'
else: else:
# .txt based file # .txt based file
splitter = re.compile(r'\n') # Match non-digits splitter = re.compile(r'\n') # Match non-digits
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">' new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
@ -622,23 +621,23 @@ class MPRecipe(BasicNewsRecipe):
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010') #raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
if __HiResImg__ == True: if __HiResImg__ == True:
# TODO: add a _ in front of an image url # TODO: add a _ in front of an image url
if url.rfind('news.mingpao.com') > -1: if url.rfind('news.mingpao.com') > -1:
imglist = re.findall('src="?.*?jpg"', new_html) imglist = re.findall('src="?.*?jpg"', new_html)
br = mechanize.Browser() br = mechanize.Browser()
br.set_handle_redirect(False) br.set_handle_redirect(False)
for img in imglist: for img in imglist:
gifimg = img.replace('jpg"', 'gif"') gifimg = img.replace('jpg"', 'gif"')
try: try:
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1]) br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg) new_html = new_html.replace(img, gifimg)
except: except:
# find the location of the first _ # find the location of the first _
pos = img.find('_') pos = img.find('_')
if pos > -1: if pos > -1:
# if found, insert _ after the first _ # if found, insert _ after the first _
newimg = img[0:pos] + '_' + img[pos:] newimg = img[0:pos] + '_' + img[pos:]
new_html = new_html.replace(img, newimg) new_html = new_html.replace(img, newimg)
else: else:
# if not found, insert _ after " # if not found, insert _ after "
new_html = new_html.replace(img[1:], '"_' + img[1:]) new_html = new_html.replace(img[1:], '"_' + img[1:])
elif url.rfind('life.mingpao.com') > -1: elif url.rfind('life.mingpao.com') > -1:
@ -675,7 +674,7 @@ class MPRecipe(BasicNewsRecipe):
#print 'Use hi-res img', newimg #print 'Use hi-res img', newimg
new_html = new_html.replace(img, newimg) new_html = new_html.replace(img, newimg)
return new_html return new_html
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
@ -684,7 +683,7 @@ class MPRecipe(BasicNewsRecipe):
for item in soup.findAll(stype=True): for item in soup.findAll(stype=True):
del item['absmiddle'] del item['absmiddle']
return soup return soup
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images # thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'): if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
@ -699,7 +698,7 @@ class MPRecipe(BasicNewsRecipe):
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'}) articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'}) articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'}) articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies: if articlebodies:
@ -721,12 +720,12 @@ class MPRecipe(BasicNewsRecipe):
# display a simple text # display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......' #article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts # display word counts
counts = 0 counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'}) articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'}) articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'}) articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'}) articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies: if articlebodies:
@ -908,5 +907,5 @@ class MPRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file) opf.render(opf_file, ncx_file)

View File

@ -29,14 +29,14 @@ __Date__ = ''
''' '''
Change Log: Change Log:
2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away 2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device. download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010' 2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt 2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing 2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing 2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles 2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional. 2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly. 2011/09/18: parse "column" section stuff from source text file directly.
2011/09/07: disable "column" section as it is no longer offered free. 2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
@ -60,7 +60,6 @@ Change Log:
2010/10/31: skip repeated articles in section pages 2010/10/31: skip repeated articles in section pages
''' '''
from calibre import (browser, iswindows, __appname__, force_unicode, preferred_encoding, as_unicode)
from calibre.utils.date import now as nowf from calibre.utils.date import now as nowf
import os, datetime, re, mechanize import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
@ -204,13 +203,13 @@ class MPRecipe(BasicNewsRecipe):
return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8] return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
else: else:
return self.get_dtlocal().strftime("%Y-%m-%d") return self.get_dtlocal().strftime("%Y-%m-%d")
def get_fetchyear(self): def get_fetchyear(self):
if __Date__ <> '': if __Date__ <> '':
return __Date__[0:4] return __Date__[0:4]
else: else:
return self.get_dtlocal().strftime("%Y") return self.get_dtlocal().strftime("%Y")
def get_fetchmonth(self): def get_fetchmonth(self):
if __Date__ <> '': if __Date__ <> '':
return __Date__[4:6] return __Date__[4:6]
@ -268,7 +267,7 @@ class MPRecipe(BasicNewsRecipe):
articles = self.parse_section2_txt(url, keystr) articles = self.parse_section2_txt(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url) articles = self.parse_section(url)
@ -305,7 +304,7 @@ class MPRecipe(BasicNewsRecipe):
articles = self.parse_section2_txt(url, keystr) articles = self.parse_section2_txt(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), #for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: # (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
# articles = self.parse_section(url) # articles = self.parse_section(url)
@ -322,7 +321,7 @@ class MPRecipe(BasicNewsRecipe):
articles = self.parse_section2_txt(url, keystr) articles = self.parse_section2_txt(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
if __InclPremium__ == True: if __InclPremium__ == True:
# parse column section articles directly from .txt files # parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
@ -330,7 +329,7 @@ class MPRecipe(BasicNewsRecipe):
articles = self.parse_section2_txt(url, keystr) articles = self.parse_section2_txt(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url) articles = self.parse_section(url)
@ -410,7 +409,7 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(i) title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False) url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
try: try:
br.open_novisit(url) br.open_novisit(url)
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''}) current_articles.append({'title': title, 'url': url, 'description': ''})
@ -437,7 +436,7 @@ class MPRecipe(BasicNewsRecipe):
included_urls.append(url) included_urls.append(url)
current_articles.reverse() current_articles.reverse()
return current_articles return current_articles
# parse from www.mingpaovan.com # parse from www.mingpaovan.com
def parse_section3(self, url, baseUrl): def parse_section3(self, url, baseUrl):
self.get_fetchdate() self.get_fetchdate()
@ -559,7 +558,7 @@ class MPRecipe(BasicNewsRecipe):
photo = photo.replace('class="photo"', '') photo = photo.replace('class="photo"', '')
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>' new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
new_html = new_raw_html + '</body></html>' new_html = new_raw_html + '</body></html>'
else: else:
# .txt based file # .txt based file
splitter = re.compile(r'\n') # Match non-digits splitter = re.compile(r'\n') # Match non-digits
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">' new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
@ -622,23 +621,23 @@ class MPRecipe(BasicNewsRecipe):
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010') #raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
if __HiResImg__ == True: if __HiResImg__ == True:
# TODO: add a _ in front of an image url # TODO: add a _ in front of an image url
if url.rfind('news.mingpao.com') > -1: if url.rfind('news.mingpao.com') > -1:
imglist = re.findall('src="?.*?jpg"', new_html) imglist = re.findall('src="?.*?jpg"', new_html)
br = mechanize.Browser() br = mechanize.Browser()
br.set_handle_redirect(False) br.set_handle_redirect(False)
for img in imglist: for img in imglist:
gifimg = img.replace('jpg"', 'gif"') gifimg = img.replace('jpg"', 'gif"')
try: try:
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1]) br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg) new_html = new_html.replace(img, gifimg)
except: except:
# find the location of the first _ # find the location of the first _
pos = img.find('_') pos = img.find('_')
if pos > -1: if pos > -1:
# if found, insert _ after the first _ # if found, insert _ after the first _
newimg = img[0:pos] + '_' + img[pos:] newimg = img[0:pos] + '_' + img[pos:]
new_html = new_html.replace(img, newimg) new_html = new_html.replace(img, newimg)
else: else:
# if not found, insert _ after " # if not found, insert _ after "
new_html = new_html.replace(img[1:], '"_' + img[1:]) new_html = new_html.replace(img[1:], '"_' + img[1:])
elif url.rfind('life.mingpao.com') > -1: elif url.rfind('life.mingpao.com') > -1:
@ -675,7 +674,7 @@ class MPRecipe(BasicNewsRecipe):
#print 'Use hi-res img', newimg #print 'Use hi-res img', newimg
new_html = new_html.replace(img, newimg) new_html = new_html.replace(img, newimg)
return new_html return new_html
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
@ -684,7 +683,7 @@ class MPRecipe(BasicNewsRecipe):
for item in soup.findAll(stype=True): for item in soup.findAll(stype=True):
del item['absmiddle'] del item['absmiddle']
return soup return soup
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images # thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'): if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
@ -699,7 +698,7 @@ class MPRecipe(BasicNewsRecipe):
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'}) articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'}) articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'}) articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies: if articlebodies:
@ -721,12 +720,12 @@ class MPRecipe(BasicNewsRecipe):
# display a simple text # display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......' #article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts # display word counts
counts = 0 counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'}) articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'}) articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'}) articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies: if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'}) articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies: if articlebodies:
@ -908,5 +907,5 @@ class MPRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file) opf.render(opf_file, ncx_file)