__license__ = 'GPL v3' __copyright__ = '2010-2011, Eddie Lau' # Region - Hong Kong, Vancouver, Toronto __Region__ = 'Vancouver' # Users of Kindle 3 with limited system-level CJK support # please replace the following "True" with "False". (Default: True) __MakePeriodical__ = True # Turn below to True if your device supports display of CJK titles (Default: False) __UseChineseTitle__ = False # Set it to False if you want to skip images (Default: True) __KeepImages__ = True # Set it to True if you want to include a summary in Kindle's article view (Default: False) __IncludeSummary__ = False # Set it to True if you want thumbnail images in Kindle's article view (Default: True) __IncludeThumbnails__ = True # (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True) __UseLife__ = True # (HK only) It is to disable premium content (Default: False) __InclPremium__ = False # (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True) __ParsePFF__ = True # (HK only) Turn below to True if you wish hi-res images (Default: False) __HiResImg__ = False # Override the date returned by the program if specifying a YYYYMMDD below __Date__ = '' ''' Change Log: 2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device. 
2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010' 2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt 2011/10/19: fix a bug in txt source parsing 2011/10/17: disable fetching of premium content, also improved txt source parsing 2011/10/04: option to get hi-res photos for the articles 2011/09/21: fetching "column" section is made optional. 2011/09/18: parse "column" section stuff from source text file directly. 2011/09/07: disable "column" section as it is no longer offered free. 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source provide options to remove all images in the file 2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages 2011/03/06: add new articles for finance section, also a new section "Columns" 2011/02/28: rearrange the sections [Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues" folder in Kindle 3 2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles clean up the indentation 2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT) 2010/11/22: add English section, remove eco-news section which is not updated daily, correct ordering of articles 2010/11/12: add news image and eco-news section 2010/11/08: add parsing of finance section 2010/11/06: temporary work-around for Kindle device having no capability to display unicode in section/article list. 
2010/10/31: skip repeated articles in section pages ''' from calibre.utils.date import now as nowf import os, datetime, re, mechanize from calibre.web.feeds.recipes import BasicNewsRecipe from contextlib import nested from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata import MetaInformation from calibre.utils.localization import canonicalize_lang # MAIN CLASS class MPRecipe(BasicNewsRecipe): if __Region__ == 'Hong Kong': if __UseChineseTitle__ == True: title = u'\u660e\u5831 (\u9999\u6e2f)' else: title = 'Ming Pao - Hong Kong' description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' category = 'Chinese, News, Hong Kong' extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}' masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' keep_only_tags = [dict(name='h1'), dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title dict(name='font', attrs={'color':['AA0000']}), # for column articles title dict(attrs={'class':['heading']}), # for heading from txt dict(attrs={'id':['newscontent']}), # entertainment and column page content dict(attrs={'id':['newscontent01','newscontent02']}), dict(attrs={'class':['content']}), # for content from txt dict(attrs={'class':['photo']}), dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com dict(name='img', attrs={'width':['180'], 'alt':['????']}), # images for source from life.mingpao.com dict(attrs={'class':['images']}) # for images from txt ] if __KeepImages__: remove_tags = [dict(name='style'), 
dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article #dict(name='table') # for content fetched from life.mingpao.com ] else: remove_tags = [dict(name='style'), dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article dict(name='img'), #dict(name='table') # for content fetched from life.mingpao.com ] remove_attributes = ['width'] preprocess_regexps = [ (re.compile(r'
' + item + '
\n' #if item.startswith(u'\u3010'): # met_article_start_char = True # new_raw_html = new_raw_html + '
' + item + '
\n'
else:
if next_is_img_txt == False:
if item.startswith("=@"):
print 'skip movie link'
elif item.startswith("=?"):
next_is_img_txt = True
new_raw_html += '
\n'
elif item.startswith('=='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[2:].strip() + '.jpg'
new_raw_html += '
\n'
else:
new_raw_html += '
\n'
elif item.startswith('='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[1:].strip() + '.jpg'
new_raw_html += '
\n'
else:
new_raw_html += '
\n' else: if next_is_img_txt == False and met_article_start_char == False: if item <> '': if title_started == False: #print 'Title started at ', item new_raw_html = new_raw_html + '
\n' else: next_is_img_txt = False new_raw_html = new_raw_html + item + '\n' new_html = new_raw_html + '
def preprocess_html(self, soup):
    """Strip presentational attributes from every tag in ``soup``.

    The tree is modified in place: ``style`` and ``width`` attributes are
    deleted wherever they occur, and ``align`` is deleted where its value
    is ``absmiddle``.

    :param soup: parsed article tree exposing ``findAll(**attrs)`` and
        tags that support ``del tag[name]``.
    :return: the same ``soup`` object that was passed in.
    """
    for item in soup.findAll(style=True):
        del item['style']
    # This loop used to query ``style=True`` a second time; after the loop
    # above nothing matches any more, so ``width`` was never removed.
    for item in soup.findAll(width=True):
        del item['width']
    # This loop used to filter on a ``stype`` attribute and delete an
    # ``absmiddle`` attribute, neither of which exists in HTML.
    # "absmiddle" is a legacy *value* of ``align``, so presumably that is
    # what was meant -- TODO confirm against real article pages.
    for item in soup.findAll(align='absmiddle'):
        del item['align']
    return soup
def populate_article_metadata(self, article, soup, first):
    """Attach a TOC thumbnail and a short description to ``article``.

    Thumbnail: when ``__IncludeThumbnails__`` is set, ``__HiResImg__`` is
    not, ``first`` is true and the recipe has ``add_toc_thumbnail``, the
    first ``<img>`` that carries a ``src`` is registered as the thumbnail.

    Description: when ``__IncludeSummary__`` is set and the article has no
    text summary yet, the first non-empty paragraph of the article body is
    used (with one leading "Ming Pao" news marker removed). In every other
    case the summary is set to the character count of the body text.
    Nothing is written when no article container is found.

    :param article: object with ``summary`` / ``text_summary`` attributes.
    :param soup: parsed article page (``find`` / ``findAll``).
    :param first: true for the first page of the article.
    :return: ``None``. Errors while building the description are logged
        through ``self.log`` and otherwise ignored.
    """
    # Thumbnails shouldn't be available if using hi-res images.
    if (__IncludeThumbnails__ and not __HiResImg__ and first
            and hasattr(self, 'add_toc_thumbnail')):
        img = soup.find('img')
        # An <img> without a src cannot serve as a thumbnail; skipping it
        # avoids a KeyError that would escape the try block below.
        src = img.get('src') if img is not None else None
        if src:
            self.add_toc_thumbnail(article, src)

    try:
        want_summary = (__IncludeSummary__
                        and len(article.text_summary.strip()) == 0)

        # Try the known article containers in order and keep the first
        # selector that matches anything.
        articlebodies = []
        for attrs in ({'id': 'newscontent'}, {'id': 'newscontent01'},
                      {'class': 'content'}, {'id': 'font'}):
            articlebodies = soup.findAll('div', attrs=attrs)
            if articlebodies:
                break

        if want_summary:
            # Every matching container is examined, so text found in a
            # later container replaces text from an earlier one.
            for articlebody in articlebodies:
                if not articlebody:
                    continue
                # The text may or may not be enclosed in <p> tags.
                paras = articlebody.findAll('p') or articlebody
                for p in paras:
                    summary_candidate = self.tag_to_string(p).strip()
                    summary_candidate = summary_candidate.replace(
                        u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
                    if len(summary_candidate) > 0:
                        article.summary = article.text_summary = summary_candidate
                        break
        elif articlebodies:
            # Display the word count instead of a text summary.
            counts = 0
            for articlebody in articlebodies:
                # The text may or may not be enclosed in <p> tags.
                paras = articlebody.findAll('p') or articlebody
                for p in paras:
                    counts += len(self.tag_to_string(p).strip())
            article.summary = article.text_summary = (
                u'\uff08' + str(counts) + u'\u5b57\uff09')
    except Exception:
        # Descriptions are best-effort: a parsing problem must not abort the
        # download, but KeyboardInterrupt/SystemExit are no longer swallowed.
        self.log("Error creating article descriptions")
        return
'periodical:'+self.publication_type+':'+self.short_title() else: mi.publication_type = self.publication_type+':'+self.short_title() #mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() # change 4: in the following, all the nowf() are changed to adjusted time # This one doesn't matter mi.timestamp = nowf() # change 5: skip listing the articles #article_titles, aseen = [], set() #for f in feeds: # for a in f: # if a.title and a.title not in aseen: # aseen.add(a.title) # article_titles.append(force_unicode(a.title, 'utf-8')) #mi.comments = self.description #if not isinstance(mi.comments, unicode): # mi.comments = mi.comments.decode('utf-8', 'replace') #mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' + # '\n\n'.join(article_titles)) language = canonicalize_lang(self.language) if language is not None: mi.language = language # This one affects the pub date shown in kindle title #mi.pubdate = nowf() # now appears to need the time field to be > 12.00noon as well mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) opf_path = os.path.join(dir, 'index.opf') ncx_path = os.path.join(dir, 'index.ncx') opf = OPFCreator(dir, mi) # Add mastheadImage entry to