Sing Tao Daily by Eddie Lau

Kovid Goyal 2011-12-30 08:58:37 +05:30
parent d5f2c7cade
commit 890d4a6ad2
2 changed files with 496 additions and 2 deletions

recipes/singtaohk.recipe  (new file, 491 additions)

@@ -0,0 +1,491 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Eddie Lau'
# data source: normal, mobile
__Source__ = 'mobile'
# Set it to False if you do not want the articles assembled into a periodical. (Default: True)
__MakePeriodical__ = True
# Set it to True if your device supports displaying CJK titles (Default: False)
__UseChineseTitle__ = False
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
'''
Change Log:
2011/12/29 -- first version done
TODO:
* use alternative source at http://m.singtao.com/index.php
'''
from calibre.utils.date import now as nowf
import os, datetime, re
from datetime import date
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang
# MAIN CLASS
class STHKRecipe(BasicNewsRecipe):
if __UseChineseTitle__ == True:
title = u'\u661f\u5cf6\u65e5\u5831 (\u9999\u6e2f)'
else:
title = 'Sing Tao Daily - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://singtao.com)'
category = 'Chinese, News, Hong Kong'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:150%;}'
masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png'
if __Source__ == 'normal':
keep_only_tags = [dict(name='td', attrs={'class':['bodyhead','bodytext']})]
else:
keep_only_tags = [dict(name='td', attrs={'class':['stmobheadline']}),
dict(name='img', attrs={'width':['146']}),
dict(name='td', attrs={'class':['bodytextg']}),
]
if __KeepImages__:
remove_tags = [dict(name='hr')]
else:
remove_tags = [dict(name='hr'), dict(name='img')]
remove_attributes = ['align']
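# the regex below inserts two <br> tags before every <font class="bodytext">
# block, presumably so that successive paragraphs stay separated once the
# table layout is linearized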
preprocess_regexps = [
(re.compile(r'<font class="bodytext">', re.DOTALL|re.IGNORECASE),
lambda match: '<br><br><font class="bodytext">'),
]
oldest_article = 1
max_articles_per_feed = 200
__author__ = 'Eddie Lau'
publisher = 'Sing Tao Ltd.'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
language = 'zh'
encoding = 'Big5-HKSCS'
recursions = 0
conversion_options = {'linearize_tables':True}
timefmt = ''
auto_cleanup = False
def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow()
# convert UTC to local hk time - at HKT 4.00am, all news are available
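# (UTC+8 gives Hong Kong time; subtracting a further 4 hours means the fetch
# date only rolls over at 04:00 HKT, once the day's edition is available)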
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.0/24)
return dt_local
def get_fetchdate(self):
return self.get_dtlocal().strftime("%Y%m%d")
def get_fetchformatteddate(self):
return self.get_dtlocal().strftime("%Y-%m-%d")
def get_fetchyear(self):
return self.get_dtlocal().strftime("%Y")
def get_fetchmonth(self):
return self.get_dtlocal().strftime("%m")
def get_fetchday(self):
return self.get_dtlocal().strftime("%d")
def get_cover_url(self):
#cover = 'http://singtao.com/media/a/a(2660).jpg' # for 2011/12/29
base = 2660
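# the numeric id in the cover filename appears to advance by one per day, so
# derive today's id from the 2011-12-29 reference above and fall back to the
# site logo if that image cannot be fetched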
todaydate = date(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()))
diff = todaydate - date(2011, 12, 29)
base = base + int(diff.total_seconds()/(3600*24))
cover = 'http://singtao.com/media/a/a(' + str(base) +').jpg'
br = self.get_browser()
try:
br.open(cover)
except:
cover = 'http://singtao.com/images/stlogo.gif'
return cover
def parse_index(self):
feeds = []
dateStr = self.get_fetchdate()
if __Source__ == 'normal':
# single-item section
for title, url in [(u'\u793e\u8ad6 Editorial', 'http://singtao.com/yesterday/jou/j_index.html')]:
article = self.parse_singleitem_section(url)
if article:
feeds.append((title, article))
# multiple items
# for title, url in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://singtao.com/yesterday/loc/a_index.html'),
# (u'\u8ca1\u7d93 Finance', 'http://singtao.com/yesterday/fin/d_index.html'),
# (u'\u5730\u7522 Properties', 'http://singtao.com/yesterday/pro/h_index.html'),
# (u'\u6559\u80b2 Education', 'http://singtao.com/yesterday/edu/g_index.asp'),
# (u'\u5a1b\u6a02 Entertainment', 'http://singtao.com/yesterday/ent/f_index.html'),
# (u'\u9ad4\u80b2 Sports', 'http://singtao.com/yesterday/spo/c_index.html'),
# (u'\u99ac\u7d93 Horse Racing', 'http://singtao.com/yesterday/rac/n_index.html')
# ]:
# articles = self.parse_section(url)
# if articles:
# feeds.append((title, articles))
# special: supplement
# for title, url, baseurl in [(u'\u526f\u520a Supplements', 'http://singtao.com/yesterday/sup/m_index.html', '/')]:
# articles = self.parse_section_withouttext(url, baseurl)
# if articles:
# feeds.append((title, articles))
# multiple-item sections
# for title, url in [(u'\u570b\u969b World', 'http://singtao.com/yesterday/int/b_index.html'),
# (u'\u4e2d\u570b China', 'http://singtao.com/yesterday/chi/e_index.html')
# ]:
# articles = self.parse_section(url)
# if articles:
# feeds.append((title, articles))
for title, url, baseurl in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://singtao.com/yesterday/loc/a_index.html', '/'),
(u'\u8ca1\u7d93 Finance', 'http://singtao.com/yesterday/fin/d_index.html', '/'),
(u'\u5730\u7522 Properties', 'http://singtao.com/yesterday/pro/h_index.html', '/'),
(u'\u6559\u80b2 Education', 'http://singtao.com/yesterday/edu/g_index.asp', '/'),
(u'\u5a1b\u6a02 Entertainment', 'http://singtao.com/yesterday/ent/f_index.html', '/'),
(u'\u9ad4\u80b2 Sports', 'http://singtao.com/yesterday/spo/c_index.html', '/'),
(u'\u99ac\u7d93 Horse Racing', 'http://singtao.com/yesterday/rac/n_index.html', '/'),
(u'\u526f\u520a Supplements', 'http://singtao.com/yesterday/sup/m_index.html', '/'),
(u'\u570b\u969b World', 'http://singtao.com/yesterday/int/b_index.html', '/'),
(u'\u4e2d\u570b China', 'http://singtao.com/yesterday/chi/e_index.html', '/')]:
articles = self.parse_section_withouttext(url, baseurl)
if articles:
feeds.append((title, articles))
else: # use mobile
# single-item section
for title, url in [(u'\u793e\u8ad6 Editorial', 'http://m.singtao.com/showContent.php?main=paper&sub=0&title=0')]:
article = self.parse_singleitem_section_m(url)
if article:
feeds.append((title, article))
# multiple-item section
for title, url, baseurl in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://m.singtao.com/showTitle.php?main=paper&sub=1', 'http://m.singtao.com/'),
(u'\u8ca1\u7d93 Finance', 'http://m.singtao.com/showTitle.php?main=paper&sub=2', 'http://m.singtao.com/'),
(u'\u5730\u7522 Properties', 'http://m.singtao.com/showTitle.php?main=paper&sub=3', 'http://m.singtao.com/'),
(u'\u6559\u80b2 Education', 'http://m.singtao.com/showTitle.php?main=paper&sub=4', 'http://m.singtao.com/'),
(u'\u5a1b\u6a02 Entertainment', 'http://m.singtao.com/showTitle.php?main=paper&sub=5', 'http://m.singtao.com/'),
(u'\u99ac\u7d93 Horse Racing', 'http://m.singtao.com/showTitle.php?main=paper&sub=6', 'http://m.singtao.com/'),
(u'\u9ad4\u80b2 Sports', 'http://m.singtao.com/showTitle.php?main=paper&sub=7', 'http://m.singtao.com/'),
(u'\u526f\u520a Supplements', 'http://m.singtao.com/showTitle.php?main=paper&sub=8', 'http://m.singtao.com/'),
(u'\u570b\u969b World', 'http://m.singtao.com/showTitle.php?main=paper&sub=9', 'http://m.singtao.com/'),
(u'\u4e2d\u570b China', 'http://m.singtao.com/showTitle.php?main=paper&sub=10', 'http://m.singtao.com/')]:
articles = self.parse_multiitem_section_m(url, baseurl)
if articles:
feeds.append((title, articles))
return feeds
def parse_singleitem_section(self, url):
current_articles = []
current_articles.append({'title': '', 'url': url, 'description': '', 'date': ''})
return current_articles
def parse_singleitem_section_m(self, url):
current_articles = []
current_articles.append({'title': '', 'url': url, 'description': '', 'date': ''})
return current_articles
def parse_section(self, url):
soup = self.index_to_soup(url)
# find <table width=436 border=0 cellspacing=0 align=center cellpadding=0> tag
tables = soup.findAll(name={'table'}, attrs={'width': ['436']})
current_articles_all = []
for table in tables:
divs = table.findAll(name={'a'})
current_articles = []
included_urls = []
for i in divs:
title = self.tag_to_string(i)
urlstr = i.get('href', False)
urlstr = url + '/../' + urlstr
if urlstr not in included_urls:
current_articles.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
included_urls.append(urlstr)
current_articles_all.extend(current_articles)
return current_articles_all
def parse_section_withouttext(self, url, baseurl):
soup = self.index_to_soup(url)
# find all a tag
links = soup.findAll(name={'a'})
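# links carrying class 'secondhead' or 'second02' appear to be section
# headings rather than articles, so drop them before building the list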
linksexcluded = soup.findAll(name={'a'}, attrs={'class':'secondhead'})
for elink in linksexcluded:
links.remove(elink)
linksexcluded = soup.findAll(name={'a'}, attrs={'class':'second02'})
for elink in linksexcluded:
links.remove(elink)
current_articles_all = []
included_urls = []
for link in links:
title = self.tag_to_string(link)
if len(title.strip()) > 0:
urlstr = link.get('href', False)
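# hrefs here are relative; anything that already contains the base URL or is
# a mailto: link is not treated as an article, the rest are resolved against
# the section index URL and de-duplicated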
if urlstr.rfind(baseurl) == -1 and urlstr.rfind('mailto:') == -1:
urlstr = url + '/../' + urlstr
if urlstr not in included_urls:
current_articles_all.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
included_urls.append(urlstr)
return current_articles_all
def parse_multiitem_section_m(self, url, baseurl):
soup = self.index_to_soup(url)
# article links on the mobile pages are wrapped in <span class="urlurl"> elements
links = soup.findAll(name={'span'}, attrs={'class':'urlurl'})
current_articles_all = []
included_urls = []
for linkraw in links:
linkclean = linkraw.findAll(name={'a'})  # look for <a> tags inside this span, not across the whole page
for link in linkclean:
title = self.tag_to_string(link)
if len(title.strip()) > 0:
urlstr = link.get('href', False)
urlstr = baseurl + urlstr
if urlstr not in included_urls:
current_articles_all.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
included_urls.append(urlstr)
return current_articles_all
def populate_article_metadata(self, article, soup, first):
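# fill in whatever the index pages could not supply: recover the article
# title from the page itself, optionally register the first image as a TOC
# thumbnail, and set either a first-paragraph summary or a character count
# depending on the __IncludeSummary__ flag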
if __Source__ == 'normal':
# get title if not fetched in parse_section() function
if article.title == '' or len(article.title.strip()) == 0:
articletitle = soup.findAll('td',attrs={'class':'bodyhead'})
if articletitle:
articletitlemod = articletitle[0].find('font')
if articletitlemod:
article.title = articletitlemod.string.strip()
else:
article.title = articletitle[0].string.strip()
else:
# for the mobile source, always take the title from the article page itself
articletitle = soup.findAll('td', attrs={'class':'stmobheadline'})
if articletitle:
articletitle[0].br.extract()
article.title = articletitle[0].contents[0]
# get thumbnail image
if __IncludeThumbnails__ and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])
try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
if __Source__ == 'normal':
articlebodies = soup.findAll('font',attrs={'class':'bodytext'})
else:
articlebodies = soup.findAll('div', attrs={'class':'hkadj'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
if len(summary_candidate) > 0:
summary_candidate = summary_candidate.replace(u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1)
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
if __Source__ == 'normal':
articlebodies = soup.findAll('font',attrs={'class':'bodytext'})
else:
articlebodies = soup.findAll('div', attrs={'class':'hkadj'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return
# override from the one in version 0.8.31
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
title = self.short_title()
# change 1: allow our own flag to tell if a periodical is to be generated
# also use the customised fetch date instead of the current time
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
title = title + ' ' + self.get_fetchformatteddate()
# end of change 1
# change 2: __appname__ replaced by newspaper publisher
__appname__ = self.publisher
mi = MetaInformation(title, [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
# change 4: in the following, all the nowf() are changed to adjusted time
# This one doesn't matter
mi.timestamp = nowf()
# change 5: skip listing the articles
#article_titles, aseen = [], set()
#for f in feeds:
# for a in f:
# if a.title and a.title not in aseen:
# aseen.add(a.title)
# article_titles.append(force_unicode(a.title, 'utf-8'))
#mi.comments = self.description
#if not isinstance(mi.comments, unicode):
# mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))
language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
# This one affects the pub date shown in kindle title
#mi.pubdate = nowf()
# it now appears the time component also needs to be later than 12:00 noon
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
# Get cover
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
# Get masthead
mpath = getattr(self, 'masthead_path', None)
if mpath is not None and os.access(mpath, os.R_OK):
manifest.append(mpath)
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
def feed_index(num, parent):
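# build the TOC entries for one feed: register each downloaded article with
# its play order, author, description and thumbnail, then append the
# navigation bar to the last HTML page of every article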
f = feeds[num]
for j, a in enumerate(f):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(num, j)
auth = a.author
if not auth:
auth = None
desc = a.text_summary
if not desc:
desc = None
else:
desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None,
a.title if a.title else ('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
relp = sp[len(prefix):]
entries.append(relp.replace(os.sep, '/'))
last = sp
if os.path.exists(last):
with open(last, 'rb') as fi:
src = fi.read().decode('utf-8')
soup = BeautifulSoup(src)
body = soup.find('body')
if body is not None:
prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
with open(last, 'wb') as fi:
fi.write(unicode(soup).encode('utf-8'))
if len(feeds) == 0:
raise Exception('All feeds are empty, aborting.')
if len(feeds) > 1:
for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
auth = getattr(f, 'author', None)
if not auth:
auth = None
desc = getattr(f, 'description', None)
if not desc:
desc = None
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
f.title, play_order=po, description=desc, author=auth))
else:
entries.append('feed_%d/index.html'%0)
feed_index(0, toc)
for i, p in enumerate(entries):
entries[i] = os.path.join(dir, p.replace('/', os.sep))
opf.create_spine(entries)
opf.set_toc(toc)
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)


@@ -286,12 +286,15 @@ class PRST1(USBMS):
query = 'SELECT file_path, _id FROM books'
cursor.execute(query)
except DatabaseError:
-   raise DeviceError(('The SONY database is corrupted. '
+   import traceback
+   tb = traceback.format_exc()
+   raise DeviceError((('The SONY database is corrupted. '
    ' Delete the file %s on your reader and then disconnect '
    ' reconnect it. If you are using an SD card, you '
    ' should delete the file on the card as well. Note that '
    ' deleting this file will cause your reader to forget '
-   ' any notes/highlights, etc.')%dbpath)
+   ' any notes/highlights, etc.')%dbpath)+' Underlying error:'
+   '\n'+tb)
db_books = {}
for i, row in enumerate(cursor):