Updated Ming Pao

commit ff6dd9c16a, parent cef64ff0e7
@@ -10,6 +10,10 @@ __MakePeriodical__ = True
 __UseChineseTitle__ = False
 # Set it to False if you want to skip images (Default: True)
 __KeepImages__ = True
+# Set it to True if you want to include a summary in Kindle's article view (Default: False)
+__IncludeSummary__ = False
+# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
+__IncludeThumbnails__ = True
 # (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
 __UseLife__ = True
 # (HK only) It is to disable premium content (Default: False)
@@ -24,6 +28,9 @@ __Date__ = ''

 '''
 Change Log:
+2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
+            from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
+            download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
 2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
 2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
 2011/10/19: fix a bug in txt source parsing
@@ -53,6 +60,8 @@ Change Log:
 2010/10/31: skip repeated articles in section pages
 '''

+from calibre import (browser, iswindows, __appname__, force_unicode, preferred_encoding, as_unicode)
+from calibre.utils.date import now as nowf
 import os, datetime, re, mechanize
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
@@ -60,11 +69,15 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.localization import canonicalize_lang

 # MAIN CLASS
 class MPRecipe(BasicNewsRecipe):
     if __Region__ == 'Hong Kong':
-        title = 'Ming Pao - Hong Kong'
+        if __UseChineseTitle__ == True:
+            title = u'\u660e\u5831 (\u9999\u6e2f)'
+        else:
+            title = 'Ming Pao - Hong Kong'
         description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
         category = 'Chinese, News, Hong Kong'
         extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
@@ -109,7 +122,10 @@ class MPRecipe(BasicNewsRecipe):
                               lambda match: "</b>")
                 ]
     elif __Region__ == 'Vancouver':
-        title = 'Ming Pao - Vancouver'
+        if __UseChineseTitle__ == True:
+            title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
+        else:
+            title = 'Ming Pao - Vancouver'
         description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
         category = 'Chinese, News, Vancouver'
         extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@@ -127,7 +143,10 @@ class MPRecipe(BasicNewsRecipe):
                               lambda match: ''),
                 ]
     elif __Region__ == 'Toronto':
-        title = 'Ming Pao - Toronto'
+        if __UseChineseTitle__ == True:
+            title = u'\u660e\u5831 (\u591a\u502b\u591a)'
+        else:
+            title = 'Ming Pao - Toronto'
         description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
         category = 'Chinese, News, Toronto'
         extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@@ -161,9 +180,9 @@ class MPRecipe(BasicNewsRecipe):
     def get_dtlocal(self):
         dt_utc = datetime.datetime.utcnow()
         if __Region__ == 'Hong Kong':
-            # convert UTC to local hk time - at HKT 5.30am, all news are available
-            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
-            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
+            # convert UTC to local hk time - at HKT 4.30am, all news are available
+            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
+            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
         elif __Region__ == 'Vancouver':
             # convert UTC to local Vancouver time - at PST time 5.30am, all news are available
             dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
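A note on the date arithmetic in this hunk: the recipe shifts UTC into local time with a fixed offset and then subtracts the availability cutoff, so the computed date only rolls over once the day's paper is actually online (now 4:30am HKT instead of 5:30am). A minimal standalone sketch of the same logic; the helper name and sample timestamps are illustrative, not part of the commit:

    import datetime

    def hk_edition_date(dt_utc):
        # UTC -> HKT is a fixed +8h shift; subtracting 4.5h makes the date
        # roll over at 4:30am HKT instead of midnight, matching the comment
        # "at HKT 4.30am, all news are available".
        dt_local = dt_utc + datetime.timedelta(hours=8) - datetime.timedelta(hours=4.5)
        return dt_local.strftime("%Y%m%d")

    # 20:00 UTC on Dec 18 is 04:00 HKT on Dec 19, still before the cutoff,
    # so the previous day's edition is fetched.
    print(hk_edition_date(datetime.datetime(2011, 12, 18, 20, 0)))  # 20111218
    # 21:00 UTC is 05:00 HKT on Dec 19, after the cutoff.
    print(hk_edition_date(datetime.datetime(2011, 12, 18, 21, 0)))  # 20111219

Doing this with plain timedelta arithmetic keeps the recipe free of a pytz dependency, which is presumably why the astimezone variant stays commented out.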
@@ -185,6 +204,18 @@ class MPRecipe(BasicNewsRecipe):
             return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
         else:
             return self.get_dtlocal().strftime("%Y-%m-%d")

+    def get_fetchyear(self):
+        if __Date__ <> '':
+            return __Date__[0:4]
+        else:
+            return self.get_dtlocal().strftime("%Y")
+
+    def get_fetchmonth(self):
+        if __Date__ <> '':
+            return __Date__[4:6]
+        else:
+            return self.get_dtlocal().strftime("%m")
+
     def get_fetchday(self):
         if __Date__ <> '':
@@ -654,77 +685,153 @@ class MPRecipe(BasicNewsRecipe):
                 del item['absmiddle']
         return soup

+    def populate_article_metadata(self, article, soup, first):
+        # thumbnails shouldn't be available if using hi-res images
+        if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
+            img = soup.find('img')
+            if img is not None:
+                self.add_toc_thumbnail(article, img['src'])
+
+        try:
+            if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
+                # look for content
+                articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'class':'content'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div', attrs={'id':'font'})
+                if articlebodies:
+                    for articlebody in articlebodies:
+                        if articlebody:
+                            # the text may or may not be enclosed in <p></p> tag
+                            paras = articlebody.findAll('p')
+                            if not paras:
+                                paras = articlebody
+                            textFound = False
+                            for p in paras:
+                                if not textFound:
+                                    summary_candidate = self.tag_to_string(p).strip()
+                                    summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
+                                    if len(summary_candidate) > 0:
+                                        article.summary = article.text_summary = summary_candidate
+                                        textFound = True
+            else:
+                # display a simple text
+                #article.summary = article.text_summary = u'\u66f4\u591a......'
+                # display word counts
+                counts = 0
+                articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'class':'content'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div', attrs={'id':'font'})
+                if articlebodies:
+                    for articlebody in articlebodies:
+                        # the text may or may not be enclosed in <p></p> tag
+                        paras = articlebody.findAll('p')
+                        if not paras:
+                            paras = articlebody
+                        for p in paras:
+                            summary_candidate = self.tag_to_string(p).strip()
+                            counts += len(summary_candidate)
+                article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
+        except:
+            self.log("Error creating article descriptions")
+            return
+
+    # override from the one in version 0.8.31
     def create_opf(self, feeds, dir=None):
         if dir is None:
             dir = self.output_dir
-        if __UseChineseTitle__ == True:
-            if __Region__ == 'Hong Kong':
-                title = u'\u660e\u5831 (\u9999\u6e2f)'
-            elif __Region__ == 'Vancouver':
-                title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
-            elif __Region__ == 'Toronto':
-                title = u'\u660e\u5831 (\u591a\u502b\u591a)'
-        else:
-            title = self.short_title()
-        # if not generating a periodical, force date to apply in title
-        if __MakePeriodical__ == False:
-            title = title + ' ' + self.get_fetchformatteddate()
-        if True:
-            mi = MetaInformation(title, [self.publisher])
-            mi.publisher = self.publisher
-            mi.author_sort = self.publisher
-            if __MakePeriodical__ == True:
-                mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
-            else:
-                mi.publication_type = self.publication_type+':'+self.short_title()
-            #mi.timestamp = nowf()
-            mi.timestamp = self.get_dtlocal()
-            mi.comments = self.description
-            if not isinstance(mi.comments, unicode):
-                mi.comments = mi.comments.decode('utf-8', 'replace')
-            #mi.pubdate = nowf()
-            mi.pubdate = self.get_dtlocal()
-            opf_path = os.path.join(dir, 'index.opf')
-            ncx_path = os.path.join(dir, 'index.ncx')
-            opf = OPFCreator(dir, mi)
-            # Add mastheadImage entry to <guide> section
-            mp = getattr(self, 'masthead_path', None)
-            if mp is not None and os.access(mp, os.R_OK):
-                from calibre.ebooks.metadata.opf2 import Guide
-                ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
-                ref.type = 'masthead'
-                ref.title = 'Masthead Image'
-                opf.guide.append(ref)
-
-            manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
-            manifest.append(os.path.join(dir, 'index.html'))
-            manifest.append(os.path.join(dir, 'index.ncx'))
-
-            # Get cover
-            cpath = getattr(self, 'cover_path', None)
-            if cpath is None:
-                pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
-                if self.default_cover(pf):
-                    cpath = pf.name
-            if cpath is not None and os.access(cpath, os.R_OK):
-                opf.cover = cpath
-                manifest.append(cpath)
-
-            # Get masthead
-            mpath = getattr(self, 'masthead_path', None)
-            if mpath is not None and os.access(mpath, os.R_OK):
-                manifest.append(mpath)
-
-            opf.create_manifest_from_files_in(manifest)
-            for mani in opf.manifest:
-                if mani.path.endswith('.ncx'):
-                    mani.id = 'ncx'
-                if mani.path.endswith('mastheadImage.jpg'):
-                    mani.id = 'masthead-image'
-            entries = ['index.html']
-            toc = TOC(base_path=dir)
-            self.play_order_counter = 0
-            self.play_order_map = {}
+        title = self.short_title()
+        # change 1: allow our own flag to tell if a periodical is to be generated
+        # also use customed date instead of current time
+        if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
+            title = title + ' ' + self.get_fetchformatteddate()
+        # end of change 1
+        # change 2: __appname__ replaced by newspaper publisher
+        __appname__ = self.publisher
+        mi = MetaInformation(title, [__appname__])
+        mi.publisher = __appname__
+        mi.author_sort = __appname__
+        # change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
+        if __MakePeriodical__ == True:
+            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+        else:
+            mi.publication_type = self.publication_type+':'+self.short_title()
+            #mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+        # change 4: in the following, all the nowf() are changed to adjusted time
+        # This one doesn't matter
+        mi.timestamp = nowf()
+        # change 5: skip listing the articles
+        #article_titles, aseen = [], set()
+        #for f in feeds:
+        #    for a in f:
+        #        if a.title and a.title not in aseen:
+        #            aseen.add(a.title)
+        #            article_titles.append(force_unicode(a.title, 'utf-8'))
+
+        #mi.comments = self.description
+        #if not isinstance(mi.comments, unicode):
+        #    mi.comments = mi.comments.decode('utf-8', 'replace')
+        #mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
+        #    '\n\n'.join(article_titles))
+
+        language = canonicalize_lang(self.language)
+        if language is not None:
+            mi.language = language
+        # This one affects the pub date shown in kindle title
+        #mi.pubdate = nowf()
+        # now appears to need the time field to be > 12.00noon as well
+        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
+        opf_path = os.path.join(dir, 'index.opf')
+        ncx_path = os.path.join(dir, 'index.ncx')
+
+        opf = OPFCreator(dir, mi)
+        # Add mastheadImage entry to <guide> section
+        mp = getattr(self, 'masthead_path', None)
+        if mp is not None and os.access(mp, os.R_OK):
+            from calibre.ebooks.metadata.opf2 import Guide
+            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+            ref.type = 'masthead'
+            ref.title = 'Masthead Image'
+            opf.guide.append(ref)
+
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+        manifest.append(os.path.join(dir, 'index.html'))
+        manifest.append(os.path.join(dir, 'index.ncx'))
+
+        # Get cover
+        cpath = getattr(self, 'cover_path', None)
+        if cpath is None:
+            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+            if self.default_cover(pf):
+                cpath = pf.name
+        if cpath is not None and os.access(cpath, os.R_OK):
+            opf.cover = cpath
+            manifest.append(cpath)
+
+        # Get masthead
+        mpath = getattr(self, 'masthead_path', None)
+        if mpath is not None and os.access(mpath, os.R_OK):
+            manifest.append(mpath)
+
+        opf.create_manifest_from_files_in(manifest)
+        for mani in opf.manifest:
+            if mani.path.endswith('.ncx'):
+                mani.id = 'ncx'
+            if mani.path.endswith('mastheadImage.jpg'):
+                mani.id = 'masthead-image'
+
+        entries = ['index.html']
+        toc = TOC(base_path=dir)
+        self.play_order_counter = 0
+        self.play_order_map = {}

         def feed_index(num, parent):
             f = feeds[num]
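The new populate_article_metadata above does two independent things: it registers the first image as the TOC thumbnail (only when hi-res images are off), and it fills Kindle's article view with either the first real paragraph (with the leading u'\u3010\u660e\u5831\u5c08\u8a0a\u3011' credit stripped) or a character count such as （123字）. A reduced sketch of just the summary selection, operating on plain strings instead of a parsed page; the function and variable names are illustrative, not from the commit:

    # -*- coding: utf-8 -*-

    def make_summary(paras, include_summary):
        # paras: article paragraphs already flattened to plain text
        if include_summary:
            for p in paras:
                # strip the boilerplate credit, then take the first non-empty paragraph
                candidate = p.strip().replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
                if len(candidate) > 0:
                    return candidate
            return u''
        # summaries disabled: report the total character count instead, e.g. (123字)
        counts = sum(len(p.strip()) for p in paras)
        return u'\uff08' + str(counts) + u'\u5b57\uff09'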
@@ -739,13 +846,16 @@ class MPRecipe(BasicNewsRecipe):
                     desc = None
                 else:
                     desc = self.description_limiter(desc)
+                tt = a.toc_thumbnail if a.toc_thumbnail else None
                 entries.append('%sindex.html'%adir)
                 po = self.play_order_map.get(entries[-1], None)
                 if po is None:
                     self.play_order_counter += 1
                     po = self.play_order_counter
-                parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
-                        play_order=po, author=auth, description=desc)
+                parent.add_item('%sindex.html'%adir, None,
+                        a.title if a.title else _('Untitled Article'),
+                        play_order=po, author=auth,
+                        description=desc, toc_thumbnail=tt)
                 last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                 for sp in a.sub_pages:
                     prefix = os.path.commonprefix([opf_path, sp])
@@ -762,7 +872,7 @@ class MPRecipe(BasicNewsRecipe):
                     prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                     templ = self.navbar.generate(True, num, j, len(f),
                                     not self.has_single_feed,
-                                    a.orig_url, self.publisher, prefix=prefix,
+                                    a.orig_url, __appname__, prefix=prefix,
                                     center=self.center_navbar)
                     elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                     body.insert(len(body.contents), elem)
@@ -785,7 +895,7 @@ class MPRecipe(BasicNewsRecipe):
                 if not desc:
                     desc = None
                 feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                     f.title, play_order=po, description=desc, author=auth))

         else:
             entries.append('feed_%d/index.html'%0)
@@ -798,4 +908,5 @@ class MPRecipe(BasicNewsRecipe):

         with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
             opf.render(opf_file, ncx_file)

+
@@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
 # Region - Hong Kong, Vancouver, Toronto
 __Region__ = 'Toronto'
 # Users of Kindle 3 with limited system-level CJK support
-# please replace the following "True" with "False".
+# please replace the following "True" with "False". (Default: True)
 __MakePeriodical__ = True
-# Turn below to true if your device supports display of CJK titles
+# Turn below to True if your device supports display of CJK titles (Default: False)
 __UseChineseTitle__ = False
-# Set it to False if you want to skip images
+# Set it to False if you want to skip images (Default: True)
 __KeepImages__ = True
-# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
+# Set it to True if you want to include a summary in Kindle's article view (Default: False)
+__IncludeSummary__ = False
+# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
+__IncludeThumbnails__ = True
+# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
 __UseLife__ = True
+# (HK only) It is to disable premium content (Default: False)
+__InclPremium__ = False
+# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
+__ParsePFF__ = True
+# (HK only) Turn below to True if you wish hi-res images (Default: False)
+__HiResImg__ = False
+# Override the date returned by the program if specifying a YYYYMMDD below
+__Date__ = ''
+

 '''
 Change Log:
+2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
+            from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
+            download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
+2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
+2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
+2011/10/19: fix a bug in txt source parsing
+2011/10/17: disable fetching of premium content, also improved txt source parsing
+2011/10/04: option to get hi-res photos for the articles
+2011/09/21: fetching "column" section is made optional.
+2011/09/18: parse "column" section stuff from source text file directly.
+2011/09/07: disable "column" section as it is no longer offered free.
 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
             provide options to remove all images in the file
 2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
@@ -37,30 +60,39 @@ Change Log:
 2010/10/31: skip repeated articles in section pages
 '''

-import os, datetime, re
+from calibre import (browser, iswindows, __appname__, force_unicode, preferred_encoding, as_unicode)
+from calibre.utils.date import now as nowf
+import os, datetime, re, mechanize
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.localization import canonicalize_lang

 # MAIN CLASS
 class MPRecipe(BasicNewsRecipe):
     if __Region__ == 'Hong Kong':
-        title = 'Ming Pao - Hong Kong'
+        if __UseChineseTitle__ == True:
+            title = u'\u660e\u5831 (\u9999\u6e2f)'
+        else:
+            title = 'Ming Pao - Hong Kong'
         description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
         category = 'Chinese, News, Hong Kong'
-        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
         masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
         keep_only_tags = [dict(name='h1'),
                           dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                           dict(name='font', attrs={'color':['AA0000']}), # for column articles title
+                          dict(attrs={'class':['heading']}), # for heading from txt
                           dict(attrs={'id':['newscontent']}), # entertainment and column page content
                           dict(attrs={'id':['newscontent01','newscontent02']}),
+                          dict(attrs={'class':['content']}), # for content from txt
                           dict(attrs={'class':['photo']}),
                           dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
-                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
+                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
+                          dict(attrs={'class':['images']}) # for images from txt
                          ]
     if __KeepImages__:
         remove_tags = [dict(name='style'),
@@ -90,7 +122,10 @@ class MPRecipe(BasicNewsRecipe):
                               lambda match: "</b>")
                 ]
     elif __Region__ == 'Vancouver':
-        title = 'Ming Pao - Vancouver'
+        if __UseChineseTitle__ == True:
+            title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
+        else:
+            title = 'Ming Pao - Vancouver'
         description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
         category = 'Chinese, News, Vancouver'
         extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@@ -108,7 +143,10 @@ class MPRecipe(BasicNewsRecipe):
                               lambda match: ''),
                 ]
     elif __Region__ == 'Toronto':
-        title = 'Ming Pao - Toronto'
+        if __UseChineseTitle__ == True:
+            title = u'\u660e\u5831 (\u591a\u502b\u591a)'
+        else:
+            title = 'Ming Pao - Toronto'
         description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
         category = 'Chinese, News, Toronto'
         extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@@ -139,49 +177,12 @@ class MPRecipe(BasicNewsRecipe):
     conversion_options = {'linearize_tables':True}
     timefmt = ''

-    def image_url_processor(cls, baseurl, url):
-        # trick: break the url at the first occurance of digit, add an additional
-        # '_' at the front
-        # not working, may need to move this to preprocess_html() method
-        # minIdx = 10000
-        # i0 = url.find('0')
-        # if i0 >= 0 and i0 < minIdx:
-        #     minIdx = i0
-        # i1 = url.find('1')
-        # if i1 >= 0 and i1 < minIdx:
-        #     minIdx = i1
-        # i2 = url.find('2')
-        # if i2 >= 0 and i2 < minIdx:
-        #     minIdx = i2
-        # i3 = url.find('3')
-        # if i3 >= 0 and i0 < minIdx:
-        #     minIdx = i3
-        # i4 = url.find('4')
-        # if i4 >= 0 and i4 < minIdx:
-        #     minIdx = i4
-        # i5 = url.find('5')
-        # if i5 >= 0 and i5 < minIdx:
-        #     minIdx = i5
-        # i6 = url.find('6')
-        # if i6 >= 0 and i6 < minIdx:
-        #     minIdx = i6
-        # i7 = url.find('7')
-        # if i7 >= 0 and i7 < minIdx:
-        #     minIdx = i7
-        # i8 = url.find('8')
-        # if i8 >= 0 and i8 < minIdx:
-        #     minIdx = i8
-        # i9 = url.find('9')
-        # if i9 >= 0 and i9 < minIdx:
-        #     minIdx = i9
-        return url
-
     def get_dtlocal(self):
         dt_utc = datetime.datetime.utcnow()
         if __Region__ == 'Hong Kong':
-            # convert UTC to local hk time - at HKT 5.30am, all news are available
-            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
-            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
+            # convert UTC to local hk time - at HKT 4.30am, all news are available
+            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
+            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
         elif __Region__ == 'Vancouver':
             # convert UTC to local Vancouver time - at PST time 5.30am, all news are available
             dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@@ -193,13 +194,34 @@ class MPRecipe(BasicNewsRecipe):
         return dt_local

     def get_fetchdate(self):
-        return self.get_dtlocal().strftime("%Y%m%d")
+        if __Date__ <> '':
+            return __Date__
+        else:
+            return self.get_dtlocal().strftime("%Y%m%d")

     def get_fetchformatteddate(self):
-        return self.get_dtlocal().strftime("%Y-%m-%d")
+        if __Date__ <> '':
+            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
+        else:
+            return self.get_dtlocal().strftime("%Y-%m-%d")
+
+    def get_fetchyear(self):
+        if __Date__ <> '':
+            return __Date__[0:4]
+        else:
+            return self.get_dtlocal().strftime("%Y")
+
+    def get_fetchmonth(self):
+        if __Date__ <> '':
+            return __Date__[4:6]
+        else:
+            return self.get_dtlocal().strftime("%m")

     def get_fetchday(self):
-        return self.get_dtlocal().strftime("%d")
+        if __Date__ <> '':
+            return __Date__[6:8]
+        else:
+            return self.get_dtlocal().strftime("%d")

     def get_cover_url(self):
         if __Region__ == 'Hong Kong':
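After this hunk every get_fetch* helper checks the __Date__ override before falling back to the computed local date, so one YYYYMMDD string drives the whole fetch (the recipe spells the comparison with the old-style <> operator; it is equivalent to !=). Sliced out on its own, with an illustrative fallback standing in for get_dtlocal():

    import datetime

    __Date__ = '20111218'  # YYYYMMDD override; '' means use the computed local date

    def get_fetchformatteddate():
        if __Date__ != '':
            return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8]
        return datetime.datetime.utcnow().strftime("%Y-%m-%d")  # stand-in for get_dtlocal()

    print(get_fetchformatteddate())  # '2011-12-18'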
@@ -230,12 +252,23 @@ class MPRecipe(BasicNewsRecipe):
                                        (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
                                        (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
                                        (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
-                                       (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
-                                       (u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
-                articles = self.parse_section2(url, keystr)
+                                       (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
+                                       ]:
+                if __InclPremium__ == True:
+                    articles = self.parse_section2_txt(url, keystr)
+                else:
+                    articles = self.parse_section2(url, keystr)
                 if articles:
                     feeds.append((title, articles))

+            if __InclPremium__ == True:
+                # parse column section articles directly from .txt files
+                for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
+                                           ]:
+                    articles = self.parse_section2_txt(url, keystr)
+                    if articles:
+                        feeds.append((title, articles))
+
             for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                                (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
                 articles = self.parse_section(url)
@@ -244,15 +277,16 @@ class MPRecipe(BasicNewsRecipe):
         else:
             for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
                                (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
-                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
+                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
+                               (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
                 articles = self.parse_section(url)
                 if articles:
                     feeds.append((title, articles))

             # special- editorial
-            ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
-            if ed_articles:
-                feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
+            #ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
+            #if ed_articles:
+            #    feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))

             for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                                (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
@@ -263,32 +297,46 @@ class MPRecipe(BasicNewsRecipe):

             # special - finance
             #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
-            fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
-            if fin_articles:
-                feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
-            for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
-                               (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
-                articles = self.parse_section(url)
+            #fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
+            #if fin_articles:
+            #    feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
+
+            for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
+                articles = self.parse_section2_txt(url, keystr)
             if articles:
                 feeds.append((title, articles))

+            #for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+            #                   (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
+            #    articles = self.parse_section(url)
+            #    if articles:
+            #        feeds.append((title, articles))

             # special - entertainment
-            ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
-            if ent_articles:
-                feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
+            #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+            #if ent_articles:
+            #    feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
+
+            for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
+                                       ]:
+                articles = self.parse_section2_txt(url, keystr)
+                if articles:
+                    feeds.append((title, articles))
+
+            if __InclPremium__ == True:
+                # parse column section articles directly from .txt files
+                for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
+                                           ]:
+                    articles = self.parse_section2_txt(url, keystr)
+                    if articles:
+                        feeds.append((title, articles))
+
             for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                                (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
                 articles = self.parse_section(url)
                 if articles:
                     feeds.append((title, articles))

-            # special- columns
-            col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
-            if col_articles:
-                feeds.append((u'\u5c08\u6b04 Columns', col_articles))
+
         elif __Region__ == 'Vancouver':
             for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
                                (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
@@ -332,6 +380,16 @@ class MPRecipe(BasicNewsRecipe):
                 title = self.tag_to_string(a)
                 url = a.get('href', False)
                 url = 'http://news.mingpao.com/' + dateStr + '/' +url
+                # replace the url to the print-friendly version
+                if __ParsePFF__ == True:
+                    if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
+                        url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
+                        url = re.sub('%2F.*%2F', '/', url)
+                        title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
+                        url = url.replace('%2Etxt', '_print.htm')
+                        url = url.replace('%5F', '_')
+                    else:
+                        url = url.replace('.htm', '_print.htm')
                 if url not in included_urls and url.rfind('Redirect') == -1:
                     current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
                     included_urls.append(url)
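The __ParsePFF__ branch above rewrites each article link to its printer-friendly form; premium Redirect links additionally have their duplicated date segment and percent-encoded path collapsed before the encoded .txt suffix is swapped for _print.htm. Applied to a made-up URL of the expected shape (the sample link is illustrative only, not a real article):

    import re

    dateStr = '20111218'
    # hypothetical premium link of the shape the branch above expects
    url = 'http://news.mingpao.com/20111218/Redirect.cfm?File=20111218%2Fgaa01%2Fgaa01a%2Etxt'

    url = re.sub(dateStr + '.*' + dateStr, dateStr, url)  # drop everything between the two dates
    url = re.sub('%2F.*%2F', '/', url)                    # collapse the encoded directory parts
    url = url.replace('%2Etxt', '_print.htm')             # request the printer-friendly page
    url = url.replace('%5F', '_')
    print(url)  # http://news.mingpao.com/20111218/gaa01a_print.htm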
@@ -340,6 +398,8 @@ class MPRecipe(BasicNewsRecipe):

     # parse from life.mingpao.com
     def parse_section2(self, url, keystr):
+        br = mechanize.Browser()
+        br.set_handle_redirect(False)
         self.get_fetchdate()
         soup = self.index_to_soup(url)
         a = soup.findAll('a', href=True)
@@ -350,12 +410,34 @@ class MPRecipe(BasicNewsRecipe):
             title = self.tag_to_string(i)
             url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
             if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
-                url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
+                try:
+                    br.open_novisit(url)
+                    url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
+                    current_articles.append({'title': title, 'url': url, 'description': ''})
+                    included_urls.append(url)
+                except:
+                    print 'skipping a premium article'
+        current_articles.reverse()
+        return current_articles
+
+    # parse from text file of life.mingpao.com
+    def parse_section2_txt(self, url, keystr):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        a.reverse()
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
+            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
+                url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
                 current_articles.append({'title': title, 'url': url, 'description': ''})
                 included_urls.append(url)
         current_articles.reverse()
         return current_articles

     # parse from www.mingpaovan.com
     def parse_section3(self, url, baseUrl):
         self.get_fetchdate()
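parse_section2 now probes each link with a redirect-disabled mechanize browser before accepting it: premium articles answer with a redirect to the paywall, which open_novisit then raises as an error, and the article is skipped. The probe in isolation; the helper name is illustrative, and whether a given URL redirects is of course site-specific:

    import mechanize

    br = mechanize.Browser()
    br.set_handle_redirect(False)  # a redirect now raises instead of being followed

    def is_free_article(url):
        # Premium content redirects to a payment page; with redirects disabled
        # that surfaces as an exception, which we read as "skip this one".
        try:
            br.open_novisit(url)
            return True
        except Exception:
            return False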
@@ -438,6 +520,162 @@ class MPRecipe(BasicNewsRecipe):
         current_articles.reverse()
         return current_articles

+    # preprocess those .txt and javascript based files
+    def preprocess_raw_html(self, raw_html, url):
+        new_html = raw_html
+        if url.rfind('ftp') <> -1 or url.rfind('_print.htm') <> -1:
+            if url.rfind('_print.htm') <> -1:
+                # javascript based file
+                splitter = re.compile(r'\n')
+                new_raw_html = '<html><head><title>Untitled</title></head>'
+                new_raw_html = new_raw_html + '<body>'
+                for item in splitter.split(raw_html):
+                    if item.startswith('var heading1 ='):
+                        heading = item.replace('var heading1 = \'', '')
+                        heading = heading.replace('\'', '')
+                        heading = heading.replace(';', '')
+                        new_raw_html = new_raw_html + '<div class="heading">' + heading
+                    if item.startswith('var heading2 ='):
+                        heading = item.replace('var heading2 = \'', '')
+                        heading = heading.replace('\'', '')
+                        heading = heading.replace(';', '')
+                        if heading <> '':
+                            new_raw_html = new_raw_html + '<br>' + heading + '</div>'
+                        else:
+                            new_raw_html = new_raw_html + '</div>'
+                    if item.startswith('var content ='):
+                        content = item.replace("var content = ", '')
+                        content = content.replace('\'', '')
+                        content = content.replace(';', '')
+                        new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
+                    if item.startswith('var photocontent ='):
+                        photo = item.replace('var photocontent = \'', '')
+                        photo = photo.replace('\'', '')
+                        photo = photo.replace(';', '')
+                        photo = photo.replace('<tr>', '')
+                        photo = photo.replace('<td>', '')
+                        photo = photo.replace('</tr>', '')
+                        photo = photo.replace('</td>', '<br>')
+                        photo = photo.replace('class="photo"', '')
+                        new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
+                new_html = new_raw_html + '</body></html>'
+            else:
+                # .txt based file
+                splitter = re.compile(r'\n')  # Match non-digits
+                new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
+                next_is_img_txt = False
+                title_started = False
+                title_break_reached = False
+                met_article_start_char = False
+                for item in splitter.split(raw_html):
+                    item = item.strip()
+                    # if title already reached but break between title and content not yet found, record title_break_reached
+                    if title_started == True and title_break_reached == False and item == '':
+                        title_break_reached = True
+                    # if title reached and title_break_reached and met_article_start_char == False and item is not empty
+                    # start content
+                    elif title_started == True and title_break_reached == True and met_article_start_char == False:
+                        if item <> '':
+                            met_article_start_char = True
+                            new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
+                        #if item.startswith(u'\u3010'):
+                        #    met_article_start_char = True
+                        #    new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
+                    else:
+                        if next_is_img_txt == False:
+                            if item.startswith("=@"):
+                                print 'skip movie link'
+                            elif item.startswith("=?"):
+                                next_is_img_txt = True
+                                new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
+                            elif item.startswith('=='):
+                                next_is_img_txt = True
+                                if False:
+                                    # TODO: check existence of .gif first
+                                    newimg = '_' + item[2:].strip() + '.jpg'
+                                    new_raw_html += '<img src="' + newimg + '" /><p>\n'
+                                else:
+                                    new_raw_html += '<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n'
+                            elif item.startswith('='):
+                                next_is_img_txt = True
+                                if False:
+                                    # TODO: check existence of .gif first
+                                    newimg = '_' + item[1:].strip() + '.jpg'
+                                    new_raw_html += '<img src="' + newimg + '" /><p>\n'
+                                else:
+                                    new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
+                            else:
+                                if next_is_img_txt == False and met_article_start_char == False:
+                                    if item <> '':
+                                        if title_started == False:
+                                            #print 'Title started at ', item
+                                            new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
+                                            title_started = True
+                                        else:
+                                            new_raw_html = new_raw_html + item + '\n'
+                                    else:
+                                        new_raw_html = new_raw_html + item + '<p>\n'
+                        else:
+                            next_is_img_txt = False
+                            new_raw_html = new_raw_html + item + '\n'
+                new_html = new_raw_html + '</div></body></html>'
+        #raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
+        if __HiResImg__ == True:
+            # TODO: add a _ in front of an image url
+            if url.rfind('news.mingpao.com') > -1:
+                imglist = re.findall('src="?.*?jpg"', new_html)
+                br = mechanize.Browser()
+                br.set_handle_redirect(False)
+                for img in imglist:
+                    gifimg = img.replace('jpg"', 'gif"')
+                    try:
+                        br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
+                        new_html = new_html.replace(img, gifimg)
+                    except:
+                        # find the location of the first _
+                        pos = img.find('_')
+                        if pos > -1:
+                            # if found, insert _ after the first _
+                            newimg = img[0:pos] + '_' + img[pos:]
+                            new_html = new_html.replace(img, newimg)
+                        else:
+                            # if not found, insert _ after "
+                            new_html = new_html.replace(img[1:], '"_' + img[1:])
+            elif url.rfind('life.mingpao.com') > -1:
+                imglist = re.findall('src=\'?.*?jpg\'', new_html)
+                br = mechanize.Browser()
+                br.set_handle_redirect(False)
+                #print 'Img list: ', imglist, '\n'
+                for img in imglist:
+                    #print 'Found img: ', img
+                    gifimg = img.replace('jpg\'', 'gif\'')
+                    try:
+                        gifurl = re.sub(r'dailynews.*txt', '', url)
+                        br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
+                        new_html = new_html.replace(img, gifimg)
+                    except:
+                        pos = img.rfind('/')
+                        newimg = img[0:pos+1] + '_' + img[pos+1:]
+                        new_html = new_html.replace(img, newimg)
+                # repeat with src quoted by double quotes, for text parsed from src txt
+                imglist = re.findall('src="?.*?jpg"', new_html)
+                for img in imglist:
+                    #print 'Found img: ', img
+                    gifimg = img.replace('jpg"', 'gif"')
+                    try:
+                        #print 'url', url
+                        pos = url.rfind('/')
+                        gifurl = url[:pos+1]
+                        #print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
+                        br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
+                        new_html = new_html.replace(img, gifimg)
+                    except:
+                        pos = img.find('"')
+                        newimg = img[0:pos+1] + '_' + img[pos+1:]
+                        #print 'Use hi-res img', newimg
+                        new_html = new_html.replace(img, newimg)
+        return new_html
+
     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
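Two tricks in the added preprocess_raw_html are worth calling out: the .txt and JavaScript sources are rebuilt into minimal HTML keyed off the same heading/content/images classes that keep_only_tags and extra_css already reference, and the __HiResImg__ branch first probes for a .gif variant of each image and otherwise guesses the hi-res name by inserting an extra underscore into the filename. The filename fallback, reduced to a few lines with illustrative sample names:

    def hires_candidates(img_src):
        # For src="_12ga1.jpg": try the .gif first; if that probe fails, double
        # the first underscore to get the hi-res JPEG name, _12ga1 -> __12ga1.
        yield img_src.replace('.jpg', '.gif')
        pos = img_src.find('_')
        if pos > -1:
            yield img_src[0:pos] + '_' + img_src[pos:]
        else:
            yield '_' + img_src

    for candidate in hires_candidates('_12ga1.jpg'):
        print(candidate)  # _12ga1.gif, then __12ga1.jpg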
@ -446,78 +684,154 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
for item in soup.findAll(stype=True):
|
for item in soup.findAll(stype=True):
|
||||||
del item['absmiddle']
|
del item['absmiddle']
|
||||||
return soup
|
return soup
|
||||||
+    def populate_article_metadata(self, article, soup, first):
+        # thumbnails shouldn't be available if using hi-res images
+        if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
+            img = soup.find('img')
+            if img is not None:
+                self.add_toc_thumbnail(article, img['src'])
+
+        try:
+            if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
+                # look for content
+                articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'class':'content'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div', attrs={'id':'font'})
+                if articlebodies:
+                    for articlebody in articlebodies:
+                        if articlebody:
+                            # the text may or may not be enclosed in <p></p> tag
+                            paras = articlebody.findAll('p')
+                            if not paras:
+                                paras = articlebody
+                            textFound = False
+                            for p in paras:
+                                if not textFound:
+                                    summary_candidate = self.tag_to_string(p).strip()
+                                    summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
+                                    if len(summary_candidate) > 0:
+                                        article.summary = article.text_summary = summary_candidate
+                                        textFound = True
+            else:
+                # display a simple text
+                #article.summary = article.text_summary = u'\u66f4\u591a......'
+                # display word counts
+                counts = 0
+                articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'class':'content'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div', attrs={'id':'font'})
+                if articlebodies:
+                    for articlebody in articlebodies:
+                        # the text may or may not be enclosed in <p></p> tag
+                        paras = articlebody.findAll('p')
+                        if not paras:
+                            paras = articlebody
+                        for p in paras:
+                            summary_candidate = self.tag_to_string(p).strip()
+                            counts += len(summary_candidate)
+                article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
+        except:
+            self.log("Error creating article descriptions")
+            return
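When no real summary is wanted, the fallback above labels each article with its character count wrapped in fullwidth parentheses, e.g. （123字）. A tiny sketch of just that label (hypothetical helper; the escapes are the same ones used above):

def word_count_label(paragraphs):
    # u'\uff08' and u'\uff09' are fullwidth parentheses, u'\u5b57' is the character 字
    counts = sum(len(p.strip()) for p in paragraphs)
    return u'\uff08' + str(counts) + u'\u5b57\uff09'

print word_count_label([u'\u7b2c\u4e00\u6bb5', u'\u7b2c\u4e8c\u6bb5'])  # prints （6字）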
+    # override from the one in version 0.8.31
    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
-        if __UseChineseTitle__ == True:
-            if __Region__ == 'Hong Kong':
-                title = u'\u660e\u5831 (\u9999\u6e2f)'
-            elif __Region__ == 'Vancouver':
-                title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
-            elif __Region__ == 'Toronto':
-                title = u'\u660e\u5831 (\u591a\u502b\u591a)'
-        else:
-            title = self.short_title()
-        # if not generating a periodical, force date to apply in title
-        if __MakePeriodical__ == False:
-            title = title + ' ' + self.get_fetchformatteddate()
-        if True:
-            mi = MetaInformation(title, [self.publisher])
-            mi.publisher = self.publisher
-            mi.author_sort = self.publisher
-            if __MakePeriodical__ == True:
-                mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
-            else:
-                mi.publication_type = self.publication_type+':'+self.short_title()
-            #mi.timestamp = nowf()
-            mi.timestamp = self.get_dtlocal()
-            mi.comments = self.description
-            if not isinstance(mi.comments, unicode):
-                mi.comments = mi.comments.decode('utf-8', 'replace')
-            #mi.pubdate = nowf()
-            mi.pubdate = self.get_dtlocal()
-            opf_path = os.path.join(dir, 'index.opf')
-            ncx_path = os.path.join(dir, 'index.ncx')
-            opf = OPFCreator(dir, mi)
-            # Add mastheadImage entry to <guide> section
-            mp = getattr(self, 'masthead_path', None)
-            if mp is not None and os.access(mp, os.R_OK):
-                from calibre.ebooks.metadata.opf2 import Guide
-                ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
-                ref.type = 'masthead'
-                ref.title = 'Masthead Image'
-                opf.guide.append(ref)
-            manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
-            manifest.append(os.path.join(dir, 'index.html'))
-            manifest.append(os.path.join(dir, 'index.ncx'))
-            # Get cover
-            cpath = getattr(self, 'cover_path', None)
-            if cpath is None:
-                pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
-                if self.default_cover(pf):
-                    cpath = pf.name
-            if cpath is not None and os.access(cpath, os.R_OK):
-                opf.cover = cpath
-                manifest.append(cpath)
-            # Get masthead
-            mpath = getattr(self, 'masthead_path', None)
-            if mpath is not None and os.access(mpath, os.R_OK):
-                manifest.append(mpath)
-            opf.create_manifest_from_files_in(manifest)
-            for mani in opf.manifest:
-                if mani.path.endswith('.ncx'):
-                    mani.id = 'ncx'
-                if mani.path.endswith('mastheadImage.jpg'):
-                    mani.id = 'masthead-image'
-            entries = ['index.html']
-            toc = TOC(base_path=dir)
-            self.play_order_counter = 0
-            self.play_order_map = {}
+        title = self.short_title()
+        # change 1: allow our own flag to tell if a periodical is to be generated
+        # also use customed date instead of current time
+        if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
+            title = title + ' ' + self.get_fetchformatteddate()
+        # end of change 1
+        # change 2: __appname__ replaced by newspaper publisher
+        __appname__ = self.publisher
+        mi = MetaInformation(title, [__appname__])
+        mi.publisher = __appname__
+        mi.author_sort = __appname__
+        # change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
+        if __MakePeriodical__ == True:
+            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+        else:
+            mi.publication_type = self.publication_type+':'+self.short_title()
+            #mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+        # change 4: in the following, all the nowf() are changed to adjusted time
+        # This one doesn't matter
+        mi.timestamp = nowf()
+        # change 5: skip listing the articles
+        #article_titles, aseen = [], set()
+        #for f in feeds:
+        #    for a in f:
+        #        if a.title and a.title not in aseen:
+        #            aseen.add(a.title)
+        #            article_titles.append(force_unicode(a.title, 'utf-8'))
+
+        #mi.comments = self.description
+        #if not isinstance(mi.comments, unicode):
+        #    mi.comments = mi.comments.decode('utf-8', 'replace')
+        #mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
+        #    '\n\n'.join(article_titles))
+
+        language = canonicalize_lang(self.language)
+        if language is not None:
+            mi.language = language
+        # This one affects the pub date shown in kindle title
+        #mi.pubdate = nowf()
+        # now appears to need the time field to be > 12.00noon as well
+        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
+        opf_path = os.path.join(dir, 'index.opf')
+        ncx_path = os.path.join(dir, 'index.ncx')
+
+        opf = OPFCreator(dir, mi)
+        # Add mastheadImage entry to <guide> section
+        mp = getattr(self, 'masthead_path', None)
+        if mp is not None and os.access(mp, os.R_OK):
+            from calibre.ebooks.metadata.opf2 import Guide
+            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+            ref.type = 'masthead'
+            ref.title = 'Masthead Image'
+            opf.guide.append(ref)
+
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+        manifest.append(os.path.join(dir, 'index.html'))
+        manifest.append(os.path.join(dir, 'index.ncx'))
+
+        # Get cover
+        cpath = getattr(self, 'cover_path', None)
+        if cpath is None:
+            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+            if self.default_cover(pf):
+                cpath = pf.name
+        if cpath is not None and os.access(cpath, os.R_OK):
+            opf.cover = cpath
+            manifest.append(cpath)
+
+        # Get masthead
+        mpath = getattr(self, 'masthead_path', None)
+        if mpath is not None and os.access(mpath, os.R_OK):
+            manifest.append(mpath)
+
+        opf.create_manifest_from_files_in(manifest)
+        for mani in opf.manifest:
+            if mani.path.endswith('.ncx'):
+                mani.id = 'ncx'
+            if mani.path.endswith('mastheadImage.jpg'):
+                mani.id = 'masthead-image'
+
+        entries = ['index.html']
+        toc = TOC(base_path=dir)
+        self.play_order_counter = 0
+        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
@@ -532,13 +846,16 @@ class MPRecipe(BasicNewsRecipe):
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
+                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
-                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
-                            play_order=po, author=auth, description=desc)
+                    parent.add_item('%sindex.html'%adir, None,
+                            a.title if a.title else _('Untitled Article'),
+                            play_order=po, author=auth,
+                            description=desc, toc_thumbnail=tt)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
@@ -555,7 +872,7 @@ class MPRecipe(BasicNewsRecipe):
                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                            not self.has_single_feed,
-                                            a.orig_url, self.publisher, prefix=prefix,
+                                            a.orig_url, __appname__, prefix=prefix,
                                            center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
@@ -578,7 +895,7 @@ class MPRecipe(BasicNewsRecipe):
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                    f.title, play_order=po, description=desc, author=auth))

        else:
            entries.append('feed_%d/index.html'%0)
@@ -591,4 +908,5 @@ class MPRecipe(BasicNewsRecipe):

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
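The pubdate line is the subtle part of change 4: per the comment above, the Kindle only displays the intended issue date when the time component is past noon, so the date is pinned to 12:30. A sketch of that construction in isolation (hypothetical helper; the recipe feeds it the get_fetchyear/month/day strings):

import datetime

def kindle_pubdate(year, month, day):
    # time deliberately set past 12.00 noon so the device shows the right day
    return datetime.datetime(int(year), int(month), int(day), 12, 30, 0)

print kindle_pubdate('2011', '12', '18')  # 2011-12-18 12:30:00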
@@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
# Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Vancouver'
# Users of Kindle 3 with limited system-level CJK support
-# please replace the following "True" with "False".
+# please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True
-# Turn below to true if your device supports display of CJK titles
+# Turn below to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = False
-# Set it to False if you want to skip images
+# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
-# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
+# Set it to True if you want to include a summary in Kindle's article view (Default: False)
+__IncludeSummary__ = False
+# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
+__IncludeThumbnails__ = True
+# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
+# (HK only) It is to disable premium content (Default: False)
+__InclPremium__ = False
+# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
+__ParsePFF__ = True
+# (HK only) Turn below to True if you wish hi-res images (Default: False)
+__HiResImg__ = False
+# Override the date returned by the program if specifying a YYYYMMDD below
+__Date__ = ''

'''
Change Log:
+2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
+            from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
+            download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
+2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
+2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
+2011/10/19: fix a bug in txt source parsing
+2011/10/17: disable fetching of premium content, also improved txt source parsing
+2011/10/04: option to get hi-res photos for the articles
+2011/09/21: fetching "column" section is made optional.
+2011/09/18: parse "column" section stuff from source text file directly.
+2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
            provide options to remove all images in the file
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
@@ -37,30 +60,39 @@ Change Log:
2010/10/31: skip repeated articles in section pages
'''
-import os, datetime, re
+from calibre import (browser, iswindows, __appname__, force_unicode, preferred_encoding, as_unicode)
+from calibre.utils.date import now as nowf
+import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.localization import canonicalize_lang

# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
    if __Region__ == 'Hong Kong':
-        title = 'Ming Pao - Hong Kong'
+        if __UseChineseTitle__ == True:
+            title = u'\u660e\u5831 (\u9999\u6e2f)'
+        else:
+            title = 'Ming Pao - Hong Kong'
        description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
        category = 'Chinese, News, Hong Kong'
-        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
        masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
        keep_only_tags = [dict(name='h1'),
                          dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                          dict(name='font', attrs={'color':['AA0000']}), # for column articles title
+                          dict(attrs={'class':['heading']}), # for heading from txt
                          dict(attrs={'id':['newscontent']}), # entertainment and column page content
                          dict(attrs={'id':['newscontent01','newscontent02']}),
+                          dict(attrs={'class':['content']}), # for content from txt
                          dict(attrs={'class':['photo']}),
                          dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
-                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
+                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
+                          dict(attrs={'class':['images']}) # for images from txt
                         ]
        if __KeepImages__:
            remove_tags = [dict(name='style'),
@@ -90,7 +122,10 @@ class MPRecipe(BasicNewsRecipe):
                          lambda match: "</b>")
            ]
    elif __Region__ == 'Vancouver':
-        title = 'Ming Pao - Vancouver'
+        if __UseChineseTitle__ == True:
+            title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
+        else:
+            title = 'Ming Pao - Vancouver'
        description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
        category = 'Chinese, News, Vancouver'
        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@@ -108,7 +143,10 @@ class MPRecipe(BasicNewsRecipe):
                          lambda match: ''),
            ]
    elif __Region__ == 'Toronto':
-        title = 'Ming Pao - Toronto'
+        if __UseChineseTitle__ == True:
+            title = u'\u660e\u5831 (\u591a\u502b\u591a)'
+        else:
+            title = 'Ming Pao - Toronto'
        description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
        category = 'Chinese, News, Toronto'
        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@@ -139,49 +177,12 @@ class MPRecipe(BasicNewsRecipe):
    conversion_options = {'linearize_tables':True}
    timefmt = ''
-
-    def image_url_processor(cls, baseurl, url):
-        # trick: break the url at the first occurance of digit, add an additional
-        # '_' at the front
-        # not working, may need to move this to preprocess_html() method
-        # minIdx = 10000
-        # i0 = url.find('0')
-        # if i0 >= 0 and i0 < minIdx:
-        #     minIdx = i0
-        # i1 = url.find('1')
-        # if i1 >= 0 and i1 < minIdx:
-        #     minIdx = i1
-        # i2 = url.find('2')
-        # if i2 >= 0 and i2 < minIdx:
-        #     minIdx = i2
-        # i3 = url.find('3')
-        # if i3 >= 0 and i0 < minIdx:
-        #     minIdx = i3
-        # i4 = url.find('4')
-        # if i4 >= 0 and i4 < minIdx:
-        #     minIdx = i4
-        # i5 = url.find('5')
-        # if i5 >= 0 and i5 < minIdx:
-        #     minIdx = i5
-        # i6 = url.find('6')
-        # if i6 >= 0 and i6 < minIdx:
-        #     minIdx = i6
-        # i7 = url.find('7')
-        # if i7 >= 0 and i7 < minIdx:
-        #     minIdx = i7
-        # i8 = url.find('8')
-        # if i8 >= 0 and i8 < minIdx:
-        #     minIdx = i8
-        # i9 = url.find('9')
-        # if i9 >= 0 and i9 < minIdx:
-        #     minIdx = i9
-        return url
    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        if __Region__ == 'Hong Kong':
-            # convert UTC to local hk time - at HKT 5.30am, all news are available
-            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
-            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
+            # convert UTC to local hk time - at HKT 4.30am, all news are available
+            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
+            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
        elif __Region__ == 'Vancouver':
            # convert UTC to local Vancouver time - at PST time 5.30am, all news are available
            dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@@ -193,13 +194,34 @@ class MPRecipe(BasicNewsRecipe):
        return dt_local

    def get_fetchdate(self):
-        return self.get_dtlocal().strftime("%Y%m%d")
+        if __Date__ <> '':
+            return __Date__
+        else:
+            return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
-        return self.get_dtlocal().strftime("%Y-%m-%d")
+        if __Date__ <> '':
+            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
+        else:
+            return self.get_dtlocal().strftime("%Y-%m-%d")
+
+    def get_fetchyear(self):
+        if __Date__ <> '':
+            return __Date__[0:4]
+        else:
+            return self.get_dtlocal().strftime("%Y")
+
+    def get_fetchmonth(self):
+        if __Date__ <> '':
+            return __Date__[4:6]
+        else:
+            return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
-        return self.get_dtlocal().strftime("%d")
+        if __Date__ <> '':
+            return __Date__[6:8]
+        else:
+            return self.get_dtlocal().strftime("%d")
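All of these getters share one pattern: a non-empty __Date__ string (YYYYMMDD) overrides the clock, and slicing pulls out the field each getter needs. (The recipe spells inequality with the Python 2 operator <>, equivalent to !=.) A sketch of the pattern with a fixed date (hypothetical standalone version; the real methods use get_dtlocal() rather than the raw UTC clock):

import datetime

__Date__ = '20111218'  # '' would mean "use the computed local date"

def get_fetchformatteddate():
    if __Date__ != '':
        return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8]
    return datetime.datetime.utcnow().strftime("%Y-%m-%d")

print get_fetchformatteddate()  # prints 2011-12-18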
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
if __Region__ == 'Hong Kong':
|
if __Region__ == 'Hong Kong':
|
||||||
@ -230,12 +252,23 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
|
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
|
||||||
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
|
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
|
||||||
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
|
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
|
||||||
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
|
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
|
||||||
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
|
]:
|
||||||
articles = self.parse_section2(url, keystr)
|
if __InclPremium__ == True:
|
||||||
|
articles = self.parse_section2_txt(url, keystr)
|
||||||
|
else:
|
||||||
|
articles = self.parse_section2(url, keystr)
|
||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
|
||||||
|
if __InclPremium__ == True:
|
||||||
|
# parse column section articles directly from .txt files
|
||||||
|
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
|
||||||
|
]:
|
||||||
|
articles = self.parse_section2_txt(url, keystr)
|
||||||
|
if articles:
|
||||||
|
feeds.append((title, articles))
|
||||||
|
|
||||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||||
articles = self.parse_section(url)
|
articles = self.parse_section(url)
|
||||||
@ -244,15 +277,16 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
else:
|
else:
|
||||||
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
|
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
|
||||||
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
|
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
|
||||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
|
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
|
||||||
|
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
|
||||||
articles = self.parse_section(url)
|
articles = self.parse_section(url)
|
||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
|
||||||
# special- editorial
|
# special- editorial
|
||||||
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
|
#ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
|
||||||
if ed_articles:
|
#if ed_articles:
|
||||||
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
# feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
||||||
|
|
||||||
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
|
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
|
||||||
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
|
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
|
||||||
@ -263,32 +297,46 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
|
|
||||||
# special - finance
|
# special - finance
|
||||||
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
|
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
|
||||||
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
|
#fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
|
||||||
if fin_articles:
|
#if fin_articles:
|
||||||
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||||
|
|
||||||
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
|
||||||
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
articles = self.parse_section2_txt(url, keystr)
|
||||||
articles = self.parse_section(url)
|
|
||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
|
||||||
|
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||||
|
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||||
|
# articles = self.parse_section(url)
|
||||||
|
# if articles:
|
||||||
|
# feeds.append((title, articles))
|
||||||
|
|
||||||
# special - entertainment
|
# special - entertainment
|
||||||
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
#ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
||||||
if ent_articles:
|
#if ent_articles:
|
||||||
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
|
# feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
|
||||||
|
|
||||||
|
for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
|
||||||
|
]:
|
||||||
|
articles = self.parse_section2_txt(url, keystr)
|
||||||
|
if articles:
|
||||||
|
feeds.append((title, articles))
|
||||||
|
|
||||||
|
if __InclPremium__ == True:
|
||||||
|
# parse column section articles directly from .txt files
|
||||||
|
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
|
||||||
|
]:
|
||||||
|
articles = self.parse_section2_txt(url, keystr)
|
||||||
|
if articles:
|
||||||
|
feeds.append((title, articles))
|
||||||
|
|
||||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||||
articles = self.parse_section(url)
|
articles = self.parse_section(url)
|
||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
|
||||||
|
|
||||||
# special- columns
|
|
||||||
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
|
|
||||||
if col_articles:
|
|
||||||
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
|
|
||||||
elif __Region__ == 'Vancouver':
|
elif __Region__ == 'Vancouver':
|
||||||
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
|
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
|
||||||
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
|
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
|
||||||
@ -332,6 +380,16 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
title = self.tag_to_string(a)
|
title = self.tag_to_string(a)
|
||||||
url = a.get('href', False)
|
url = a.get('href', False)
|
||||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
||||||
|
# replace the url to the print-friendly version
|
||||||
|
if __ParsePFF__ == True:
|
||||||
|
if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
|
||||||
|
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
|
||||||
|
url = re.sub('%2F.*%2F', '/', url)
|
||||||
|
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
|
||||||
|
url = url.replace('%2Etxt', '_print.htm')
|
||||||
|
url = url.replace('%5F', '_')
|
||||||
|
else:
|
||||||
|
url = url.replace('.htm', '_print.htm')
|
||||||
if url not in included_urls and url.rfind('Redirect') == -1:
|
if url not in included_urls and url.rfind('Redirect') == -1:
|
||||||
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
||||||
included_urls.append(url)
|
included_urls.append(url)
|
||||||
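The rewrite above turns a Redirect-wrapped premium link back into a plain printer-friendly URL: collapse the doubled date segment, decode the percent-encoded path, then map the encoded .txt name to its _print.htm sibling. A sketch of the same steps (hypothetical helper; dateStr stands in for the issue date used above):

import re

dateStr = '20111218'  # hypothetical issue date

def to_print_friendly(url):
    url = re.sub(dateStr + '.*' + dateStr, dateStr, url)  # drop the Redirect wrapper
    url = re.sub('%2F.*%2F', '/', url)                    # %2F is an encoded '/'
    url = url.replace('%2Etxt', '_print.htm')             # %2E is an encoded '.'
    return url.replace('%5F', '_')                        # %5F is an encoded '_'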
@@ -340,6 +398,8 @@ class MPRecipe(BasicNewsRecipe):

    # parse from life.mingpao.com
    def parse_section2(self, url, keystr):
+        br = mechanize.Browser()
+        br.set_handle_redirect(False)
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
@@ -350,12 +410,34 @@ class MPRecipe(BasicNewsRecipe):
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
-                url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
-                current_articles.append({'title': title, 'url': url, 'description': ''})
-                included_urls.append(url)
+                try:
+                    br.open_novisit(url)
+                    url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
+                    current_articles.append({'title': title, 'url': url, 'description': ''})
+                    included_urls.append(url)
+                except:
+                    print 'skipping a premium article'
        current_articles.reverse()
        return current_articles

+    # parse from text file of life.mingpao.com
+    def parse_section2_txt(self, url, keystr):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        a.reverse()
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
+            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
+                url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
+                current_articles.append({'title': title, 'url': url, 'description': ''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles

    # parse from www.mingpaovan.com
    def parse_section3(self, url, baseUrl):
        self.get_fetchdate()
@@ -438,6 +520,162 @@ class MPRecipe(BasicNewsRecipe):
        current_articles.reverse()
        return current_articles
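The try/except around br.open_novisit() in parse_section2 is the premium-content detection: with redirects disabled, a premium article (which answers with a redirect) makes open_novisit() raise, and the article is skipped. A sketch of that check on its own (hypothetical helper; assumes mechanize behaves as in the recipe above):

import mechanize

def is_premium(url):
    # Redirects are treated as errors, so a redirecting (premium) URL raises.
    br = mechanize.Browser()
    br.set_handle_redirect(False)
    try:
        br.open_novisit(url)
        return False
    except Exception:
        return True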
+    # preprocess those .txt and javascript based files
+    def preprocess_raw_html(self, raw_html, url):
+        new_html = raw_html
+        if url.rfind('ftp') <> -1 or url.rfind('_print.htm') <> -1:
+            if url.rfind('_print.htm') <> -1:
+                # javascript based file
+                splitter = re.compile(r'\n')
+                new_raw_html = '<html><head><title>Untitled</title></head>'
+                new_raw_html = new_raw_html + '<body>'
+                for item in splitter.split(raw_html):
+                    if item.startswith('var heading1 ='):
+                        heading = item.replace('var heading1 = \'', '')
+                        heading = heading.replace('\'', '')
+                        heading = heading.replace(';', '')
+                        new_raw_html = new_raw_html + '<div class="heading">' + heading
+                    if item.startswith('var heading2 ='):
+                        heading = item.replace('var heading2 = \'', '')
+                        heading = heading.replace('\'', '')
+                        heading = heading.replace(';', '')
+                        if heading <> '':
+                            new_raw_html = new_raw_html + '<br>' + heading + '</div>'
+                        else:
+                            new_raw_html = new_raw_html + '</div>'
+                    if item.startswith('var content ='):
+                        content = item.replace("var content = ", '')
+                        content = content.replace('\'', '')
+                        content = content.replace(';', '')
+                        new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
+                    if item.startswith('var photocontent ='):
+                        photo = item.replace('var photocontent = \'', '')
+                        photo = photo.replace('\'', '')
+                        photo = photo.replace(';', '')
+                        photo = photo.replace('<tr>', '')
+                        photo = photo.replace('<td>', '')
+                        photo = photo.replace('</tr>', '')
+                        photo = photo.replace('</td>', '<br>')
+                        photo = photo.replace('class="photo"', '')
+                        new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
+                new_html = new_raw_html + '</body></html>'
+            else:
+                # .txt based file
+                splitter = re.compile(r'\n')  # Match non-digits
+                new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
+                next_is_img_txt = False
+                title_started = False
+                title_break_reached = False
+                met_article_start_char = False
+                for item in splitter.split(raw_html):
+                    item = item.strip()
+                    # if title already reached but break between title and content not yet found, record title_break_reached
+                    if title_started == True and title_break_reached == False and item == '':
+                        title_break_reached = True
+                    # if title reached and title_break_reached and met_article_start_char == False and item is not empty
+                    # start content
+                    elif title_started == True and title_break_reached == True and met_article_start_char == False:
+                        if item <> '':
+                            met_article_start_char = True
+                            new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
+                        #if item.startswith(u'\u3010'):
+                        #    met_article_start_char = True
+                        #    new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
+                    else:
+                        if next_is_img_txt == False:
+                            if item.startswith("=@"):
+                                print 'skip movie link'
+                            elif item.startswith("=?"):
+                                next_is_img_txt = True
+                                new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
+                            elif item.startswith('=='):
+                                next_is_img_txt = True
+                                if False:
+                                    # TODO: check existence of .gif first
+                                    newimg = '_' + item[2:].strip() + '.jpg'
+                                    new_raw_html += '<img src="' + newimg + '" /><p>\n'
+                                else:
+                                    new_raw_html += '<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n'
+                            elif item.startswith('='):
+                                next_is_img_txt = True
+                                if False:
+                                    # TODO: check existence of .gif first
+                                    newimg = '_' + item[1:].strip() + '.jpg'
+                                    new_raw_html += '<img src="' + newimg + '" /><p>\n'
+                                else:
+                                    new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
+                            else:
+                                if next_is_img_txt == False and met_article_start_char == False:
+                                    if item <> '':
+                                        if title_started == False:
+                                            #print 'Title started at ', item
+                                            new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
+                                            title_started = True
+                                        else:
+                                            new_raw_html = new_raw_html + item + '\n'
+                                else:
+                                    new_raw_html = new_raw_html + item + '<p>\n'
+                        else:
+                            next_is_img_txt = False
+                            new_raw_html = new_raw_html + item + '\n'
+                new_html = new_raw_html + '</div></body></html>'
+        #raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
+        if __HiResImg__ == True:
+            # TODO: add a _ in front of an image url
+            if url.rfind('news.mingpao.com') > -1:
+                imglist = re.findall('src="?.*?jpg"', new_html)
+                br = mechanize.Browser()
+                br.set_handle_redirect(False)
+                for img in imglist:
+                    gifimg = img.replace('jpg"', 'gif"')
+                    try:
+                        br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
+                        new_html = new_html.replace(img, gifimg)
+                    except:
+                        # find the location of the first _
+                        pos = img.find('_')
+                        if pos > -1:
+                            # if found, insert _ after the first _
+                            newimg = img[0:pos] + '_' + img[pos:]
+                            new_html = new_html.replace(img, newimg)
+                        else:
+                            # if not found, insert _ after "
+                            new_html = new_html.replace(img[1:], '"_' + img[1:])
+            elif url.rfind('life.mingpao.com') > -1:
+                imglist = re.findall('src=\'?.*?jpg\'', new_html)
+                br = mechanize.Browser()
+                br.set_handle_redirect(False)
+                #print 'Img list: ', imglist, '\n'
+                for img in imglist:
+                    #print 'Found img: ', img
+                    gifimg = img.replace('jpg\'', 'gif\'')
+                    try:
+                        gifurl = re.sub(r'dailynews.*txt', '', url)
+                        br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
+                        new_html = new_html.replace(img, gifimg)
+                    except:
+                        pos = img.rfind('/')
+                        newimg = img[0:pos+1] + '_' + img[pos+1:]
+                        new_html = new_html.replace(img, newimg)
+                # repeat with src quoted by double quotes, for text parsed from src txt
+                imglist = re.findall('src="?.*?jpg"', new_html)
+                for img in imglist:
+                    #print 'Found img: ', img
+                    gifimg = img.replace('jpg"', 'gif"')
+                    try:
+                        #print 'url', url
+                        pos = url.rfind('/')
+                        gifurl = url[:pos+1]
+                        #print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
+                        br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
+                        new_html = new_html.replace(img, gifimg)
+                    except:
+                        pos = img.find('"')
+                        newimg = img[0:pos+1] + '_' + img[pos+1:]
+                        #print 'Use hi-res img', newimg
+                        new_html = new_html.replace(img, newimg)
+        return new_html
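The .txt branch above is driven by one-character markers at the start of each source line: '=@' flags a movie link (skipped), '=?' a .gif image, and '==' or '=' a .jpg image; anything else is heading or body text. A sketch of the marker dispatch alone (hypothetical helper mirroring the branch above):

def classify_txt_line(item):
    # returns the HTML fragment the recipe would emit for one .txt source line
    if item.startswith('=@'):
        return ''                                                  # movie link, skipped
    if item.startswith('=?'):
        return '<img src="' + item[2:].strip() + '.gif" /><p>\n'   # gif image
    if item.startswith('=='):
        return '<img src="' + item[2:].strip() + '.jpg" /><p>\n'   # jpg image
    if item.startswith('='):
        return '<img src="' + item[1:].strip() + '.jpg" /><p>\n'   # jpg image
    return item + '\n'                                             # heading/body text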
    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
@@ -446,78 +684,154 @@ class MPRecipe(BasicNewsRecipe):
        for item in soup.findAll(stype=True):
            del item['absmiddle']
        return soup