diff --git a/recipes/singtaohk.recipe b/recipes/singtaohk.recipe
new file mode 100644
index 0000000000..a038372693
--- /dev/null
+++ b/recipes/singtaohk.recipe
@@ -0,0 +1,491 @@
+__license__ = 'GPL v3'
+__copyright__ = '2011, Eddie Lau'
+
+# data source: normal, mobile
+__Source__ = 'mobile'
+# please replace the following "True" with "False". (Default: True)
+__MakePeriodical__ = True
+# Turn below to True if your device supports display of CJK titles (Default: False)
+__UseChineseTitle__ = False
+# Set it to False if you want to skip images (Default: True)
+__KeepImages__ = True
+# Set it to True if you want to include a summary in Kindle's article view (Default: False)
+__IncludeSummary__ = False
+# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
+__IncludeThumbnails__ = True
+
+
+'''
+Change Log:
+2011/12/29 -- first version done
+TODO:
+* use alternative source at http://m.singtao.com/index.php
+'''
+
+from calibre.utils.date import now as nowf
+import os, datetime, re
+from datetime import date
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from contextlib import nested
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.localization import canonicalize_lang
+
+# MAIN CLASS
+class STHKRecipe(BasicNewsRecipe):
+    if __UseChineseTitle__ == True:
+        title = u'\u661f\u5cf6\u65e5\u5831 (\u9999\u6e2f)'
+    else:
+        title = 'Sing Tao Daily - Hong Kong'
+    description = 'Hong Kong Chinese Newspaper (http://singtao.com)'
+    category = 'Chinese, News, Hong Kong'
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:150%;}'
+    masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png'
+    if __Source__ == 'normal':
+        keep_only_tags = [dict(name='td', attrs={'class':['bodyhead','bodytext']})]
+    else:
+        keep_only_tags = [dict(name='td', attrs={'class':['stmobheadline']}),
+                          dict(name='img', attrs={'width':['146']}),
+                          dict(name='td', attrs={'class':['bodytextg']}),
+                          ]
+    if __KeepImages__:
+        remove_tags = [dict(name='hr')]
+    else:
+        remove_tags = [dict(name='hr'), dict(name='img')]
+    remove_attributes = ['align']
+    preprocess_regexps = [
+        (re.compile(r'', re.DOTALL|re.IGNORECASE),
+         lambda match: ''),
+    ]
+
+    oldest_article = 1
+    max_articles_per_feed = 200
+    __author__ = 'Eddie Lau'
+    publisher = 'Sing Tao Ltd.'
+    remove_javascript = True
+    use_embedded_content = False
+    no_stylesheets = True
+    language = 'zh'
+    encoding = 'Big5-HKSCS'
+    recursions = 0
+    conversion_options = {'linearize_tables':True}
+    timefmt = ''
+    auto_cleanup = False
+
+    def get_dtlocal(self):
+        dt_utc = datetime.datetime.utcnow()
+        # convert UTC to local hk time - at HKT 4.00am, all news are available
+        dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.0/24)
+        return dt_local
+
+    def get_fetchdate(self):
+        return self.get_dtlocal().strftime("%Y%m%d")
+
+    def get_fetchformatteddate(self):
+        return self.get_dtlocal().strftime("%Y-%m-%d")
+
+    def get_fetchyear(self):
+        return self.get_dtlocal().strftime("%Y")
+
+    def get_fetchmonth(self):
+        return self.get_dtlocal().strftime("%m")
+
+    def get_fetchday(self):
+        return self.get_dtlocal().strftime("%d")
+
+    def get_cover_url(self):
+        #cover = 'http://singtao.com/media/a/a(2660).jpg' # for 2011/12/29
+        base = 2660
+        todaydate = date(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()))
+        diff = todaydate - date(2011, 12, 29)
+        base = base + int(diff.total_seconds()/(3600*24))
+        cover = 'http://singtao.com/media/a/a(' + str(base) +').jpg'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(cover)
+        except:
+            cover = 'http://singtao.com/images/stlogo.gif'
+        return cover
+
+    def parse_index(self):
+        feeds = []
+        dateStr = self.get_fetchdate()
+        dateStr
+
+        if __Source__ == 'normal':
+            # single-item section
+            for title, url in [(u'\u793e\u8ad6 Editorial', 'http://singtao.com/yesterday/jou/j_index.html')]:
+                article = self.parse_singleitem_section(url)
+                if article:
+                    feeds.append((title, article))
+
+            # multiple items
+            # for title, url in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://singtao.com/yesterday/loc/a_index.html'),
+            #                    (u'\u8ca1\u7d93 Finance', 'http://singtao.com/yesterday/fin/d_index.html'),
+            #                    (u'\u5730\u7522 Properties', 'http://singtao.com/yesterday/pro/h_index.html'),
+            #                    (u'\u6559\u80b2 Education', 'http://singtao.com/yesterday/edu/g_index.asp'),
+            #                    (u'\u5a1b\u6a02 Entertainment', 'http://singtao.com/yesterday/ent/f_index.html'),
+            #                    (u'\u9ad4\u80b2 Sports', 'http://singtao.com/yesterday/spo/c_index.html'),
+            #                    (u'\u99ac\u7d93 Horse Racing', 'http://singtao.com/yesterday/rac/n_index.html')
+            #                    ]:
+            #     articles = self.parse_section(url)
+            #     if articles:
+            #         feeds.append((title, articles))
+
+            # special: supplement
+            # for title, url, baseurl in [(u'\u526f\u520a Supplements', 'http://singtao.com/yesterday/sup/m_index.html', '/')]:
+            #     articles = self.parse_section_withouttext(url, baseurl)
+            #     if articles:
+            #         feeds.append((title, articles))
+
+            # multiple-item sections
+            # for title, url in [(u'\u570b\u969b World', 'http://singtao.com/yesterday/int/b_index.html'),
+            #                    (u'\u4e2d\u570b China', 'http://singtao.com/yesterday/chi/e_index.html')
+            #                    ]:
+            #     articles = self.parse_section(url)
+            #     if articles:
+            #         feeds.append((title, articles))
+
+            for title, url, baseurl in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://singtao.com/yesterday/loc/a_index.html', '/'),
+                                        (u'\u8ca1\u7d93 Finance', 'http://singtao.com/yesterday/fin/d_index.html', '/'),
+                                        (u'\u5730\u7522 Properties', 'http://singtao.com/yesterday/pro/h_index.html', '/'),
+                                        (u'\u6559\u80b2 Education', 'http://singtao.com/yesterday/edu/g_index.asp', '/'),
+                                        (u'\u5a1b\u6a02 Entertainment', 'http://singtao.com/yesterday/ent/f_index.html', '/'),
+                                        (u'\u9ad4\u80b2 Sports', 'http://singtao.com/yesterday/spo/c_index.html', '/'),
+                                        (u'\u99ac\u7d93 Horse Racing', 'http://singtao.com/yesterday/rac/n_index.html', '/'),
+                                        (u'\u526f\u520a Supplements', 'http://singtao.com/yesterday/sup/m_index.html', '/'),
+                                        (u'\u570b\u969b World', 'http://singtao.com/yesterday/int/b_index.html', '/'),
+                                        (u'\u4e2d\u570b China', 'http://singtao.com/yesterday/chi/e_index.html', '/')]:
+                articles = self.parse_section_withouttext(url, baseurl)
+                if articles:
+                    feeds.append((title, articles))
+        else: # use mobile
+            # single-item section
+            for title, url in [(u'\u793e\u8ad6 Editorial', 'http://m.singtao.com/showContent.php?main=paper&sub=0&title=0')]:
+                article = self.parse_singleitem_section_m(url)
+                if article:
+                    feeds.append((title, article))
+            # multiple-item section
+            for title, url, baseurl in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://m.singtao.com/showTitle.php?main=paper&sub=1', 'http://m.singtao.com/'),
+                                        (u'\u8ca1\u7d93 Finance', 'http://m.singtao.com/showTitle.php?main=paper&sub=2', 'http://m.singtao.com/'),
+                                        (u'\u5730\u7522 Properties', 'http://m.singtao.com/showTitle.php?main=paper&sub=3', 'http://m.singtao.com/'),
+                                        (u'\u6559\u80b2 Education', 'http://m.singtao.com/showTitle.php?main=paper&sub=4', 'http://m.singtao.com/'),
+                                        (u'\u5a1b\u6a02 Entertainment', 'http://m.singtao.com/showTitle.php?main=paper&sub=5', 'http://m.singtao.com/'),
+                                        (u'\u99ac\u7d93 Horse Racing', 'http://m.singtao.com/showTitle.php?main=paper&sub=6', 'http://m.singtao.com/'),
+                                        (u'\u9ad4\u80b2 Sports', 'http://m.singtao.com/showTitle.php?main=paper&sub=7', 'http://m.singtao.com/'),
+                                        (u'\u526f\u520a Supplements', 'http://m.singtao.com/showTitle.php?main=paper&sub=8', 'http://m.singtao.com/'),
+                                        (u'\u570b\u969b World', 'http://m.singtao.com/showTitle.php?main=paper&sub=9', 'http://m.singtao.com/'),
+                                        (u'\u4e2d\u570b China', 'http://m.singtao.com/showTitle.php?main=paper&sub=10', 'http://m.singtao.com/')]:
+                articles = self.parse_multiitem_section_m(url, baseurl)
+                if articles:
+                    feeds.append((title, articles))
+        return feeds
+
+    def parse_singleitem_section(self, url):
+        current_articles = []
+        current_articles.append({'title': '', 'url': url, 'description': '', 'date': ''})
+        return current_articles
+
+    def parse_singleitem_section_m(self, url):
+        current_articles = []
+        current_articles.append({'title': '', 'url': url, 'description': '', 'date': ''})
+        return current_articles
+
+    def parse_section(self, url):
+        soup = self.index_to_soup(url)
+        # find <table> tag
+        tables = soup.findAll(name={'table'}, attrs={'width': ['436']})
+        current_articles_all = []
+        for table in tables:
+            divs = table.findAll(name={'a'})
+            current_articles = []
+            included_urls = []
+            for i in divs:
+                title = self.tag_to_string(i)
+                urlstr = i.get('href', False)
+                urlstr = url + '/../' + urlstr
+                if urlstr not in included_urls:
+                    current_articles.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
+                    included_urls.append(urlstr)
+            current_articles_all.extend(current_articles)
+        return current_articles_all
+
+    def parse_section_withouttext(self, url, baseurl):
+        soup = self.index_to_soup(url)
+        # find all a tag
+        links = soup.findAll(name={'a'})
+        linksexcluded = soup.findAll(name={'a'}, attrs={'class':'secondhead'})
+        for elink in linksexcluded:
+            links.remove(elink)
+        linksexcluded = soup.findAll(name={'a'}, attrs={'class':'second02'})
+        for elink in linksexcluded:
+            links.remove(elink)
+        current_articles_all = []
+        included_urls = []
+        for link in links:
+            title = self.tag_to_string(link)
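+            # only links with visible text are collected; relative hrefs are
+            # rewritten against the section index URL below, and duplicates are
+            # dropped via included_urls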
+            if len(title.strip()) > 0:
+                urlstr = link.get('href', False)
+                if urlstr.rfind(baseurl) == -1 and urlstr.rfind('mailto:') == -1:
+                    urlstr = url + '/../' + urlstr
+                if urlstr not in included_urls:
+                    current_articles_all.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
+                    included_urls.append(urlstr)
+        return current_articles_all
+
+    def parse_multiitem_section_m(self, url, baseurl):
+        soup = self.index_to_soup(url)
+        # find all a tag
+        links = soup.findAll(name={'span'}, attrs={'class':'urlurl'})
+        current_articles_all = []
+        included_urls = []
+        for linkraw in links:
+            linkclean = soup.findAll(name={'a'})
+            for link in linkclean:
+                title = self.tag_to_string(link)
+                if len(title.strip()) > 0:
+                    urlstr = link.get('href', False)
+                    urlstr = baseurl + urlstr
+                    if urlstr not in included_urls:
+                        current_articles_all.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
+                        included_urls.append(urlstr)
+        return current_articles_all
+
+    def populate_article_metadata(self, article, soup, first):
+        if __Source__ == 'normal':
+            # get title if not fetched in parse_section() function
+            if article.title == '' or len(article.title.strip()) == 0:
+                articletitle = soup.findAll('td',attrs={'class':'bodyhead'})
+                if articletitle:
+                    articletitlemod = articletitle[0].find('font')
+                    if articletitlemod:
+                        article.title = articletitlemod.string.strip()
+                    else:
+                        article.title = articletitle[0].string.strip()
+        else:
+            # use the title in the text in any case
+            articletitle = soup.findAll('td', attrs={'class':'stmobheadline'})
+            if articletitle:
+                articletitle[0].br.extract()
+                article.title = articletitle[0].contents[0]
+        # get thumbnail image
+        if __IncludeThumbnails__ and first and hasattr(self, 'add_toc_thumbnail'):
+            img = soup.find('img')
+            if img is not None:
+                self.add_toc_thumbnail(article, img['src'])
+
+        try:
+            if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
+                # look for content
+                if __Source__ == 'normal':
+                    articlebodies = soup.findAll('font',attrs={'class':'bodytext'})
+                else:
+                    articlebodies = soup.findAll('div', attrs={'class':'hkadj'})
+                if articlebodies:
+                    for articlebody in articlebodies:
+                        if articlebody:
+                            # the text may or may not be enclosed in <p> tag
+                            paras = articlebody.findAll('p')
+                            if not paras:
+                                paras = articlebody
+                            textFound = False
+                            for p in paras:
+                                if not textFound:
+                                    summary_candidate = self.tag_to_string(p).strip()
+                                    if len(summary_candidate) > 0:
+                                        summary_candidate = summary_candidate.replace(u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1)
+                                        article.summary = article.text_summary = summary_candidate
+                                        textFound = True
+            else:
+                # display a simple text
+                #article.summary = article.text_summary = u'\u66f4\u591a......'
+                # display word counts
+                counts = 0
+                if __Source__ == 'normal':
+                    articlebodies = soup.findAll('font',attrs={'class':'bodytext'})
+                else:
+                    articlebodies = soup.findAll('div', attrs={'class':'hkadj'})
+                if articlebodies:
+                    for articlebody in articlebodies:
+                        # the text may or may not be enclosed in <p> tag
+                        paras = articlebody.findAll('p')
+                        if not paras:
+                            paras = articlebody
+                        for p in paras:
+                            summary_candidate = self.tag_to_string(p).strip()
+                            counts += len(summary_candidate)
+                article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
+        except:
+            self.log("Error creating article descriptions")
+            return
+
+    # override from the one in version 0.8.31
+    def create_opf(self, feeds, dir=None):
+        if dir is None:
+            dir = self.output_dir
+        title = self.short_title()
+        # change 1: allow our own flag to tell if a periodical is to be generated
+        # also use custom date instead of current time
+        if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
+            title = title + ' ' + self.get_fetchformatteddate()
+        # end of change 1
+        # change 2: __appname__ replaced by newspaper publisher
+        __appname__ = self.publisher
+        mi = MetaInformation(title, [__appname__])
+        mi.publisher = __appname__
+        mi.author_sort = __appname__
+        # change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
+        if __MakePeriodical__ == True:
+            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+        else:
+            mi.publication_type = self.publication_type+':'+self.short_title()
+        #mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+        # change 4: in the following, all the nowf() are changed to adjusted time
+        # This one doesn't matter
+        mi.timestamp = nowf()
+        # change 5: skip listing the articles
+        #article_titles, aseen = [], set()
+        #for f in feeds:
+        #    for a in f:
+        #        if a.title and a.title not in aseen:
+        #            aseen.add(a.title)
+        #            article_titles.append(force_unicode(a.title, 'utf-8'))
+
+        #mi.comments = self.description
+        #if not isinstance(mi.comments, unicode):
+        #    mi.comments = mi.comments.decode('utf-8', 'replace')
+        #mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
+        #                '\n\n'.join(article_titles))
+
+        language = canonicalize_lang(self.language)
+        if language is not None:
+            mi.language = language
+        # This one affects the pub date shown in kindle title
+        #mi.pubdate = nowf()
+        # now appears to need the time field to be > 12.00noon as well
+        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
+        opf_path = os.path.join(dir, 'index.opf')
+        ncx_path = os.path.join(dir, 'index.ncx')
+
+        opf = OPFCreator(dir, mi)
+        # Add mastheadImage entry to <guide> section
+        mp = getattr(self, 'masthead_path', None)
+        if mp is not None and os.access(mp, os.R_OK):
+            from calibre.ebooks.metadata.opf2 import Guide
+            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+            ref.type = 'masthead'
+            ref.title = 'Masthead Image'
+            opf.guide.append(ref)
+
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+        manifest.append(os.path.join(dir, 'index.html'))
+        manifest.append(os.path.join(dir, 'index.ncx'))
+
+        # Get cover
+        cpath = getattr(self, 'cover_path', None)
+        if cpath is None:
+            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+            if self.default_cover(pf):
+                cpath = pf.name
+        if cpath is not None and os.access(cpath, os.R_OK):
+            opf.cover = cpath
+            manifest.append(cpath)
+
+        # Get masthead
+        mpath = getattr(self, 'masthead_path', None)
+        if mpath is not None and os.access(mpath, os.R_OK):
+            manifest.append(mpath)
+
+        opf.create_manifest_from_files_in(manifest)
+        for mani in opf.manifest:
+            if mani.path.endswith('.ncx'):
+                mani.id = 'ncx'
+            if mani.path.endswith('mastheadImage.jpg'):
+                mani.id = 'masthead-image'
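+
+        # Next, assemble the NCX table of contents and the reading order (spine):
+        # feed_index() below adds one TOC entry per downloaded article under its feed.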
+        entries = ['index.html']
+        toc = TOC(base_path=dir)
+        self.play_order_counter = 0
+        self.play_order_map = {}
+
+
+        def feed_index(num, parent):
+            f = feeds[num]
+            for j, a in enumerate(f):
+                if getattr(a, 'downloaded', False):
+                    adir = 'feed_%d/article_%d/'%(num, j)
+                    auth = a.author
+                    if not auth:
+                        auth = None
+                    desc = a.text_summary
+                    if not desc:
+                        desc = None
+                    else:
+                        desc = self.description_limiter(desc)
+                    tt = a.toc_thumbnail if a.toc_thumbnail else None
+                    entries.append('%sindex.html'%adir)
+                    po = self.play_order_map.get(entries[-1], None)
+                    if po is None:
+                        self.play_order_counter += 1
+                        po = self.play_order_counter
+                    parent.add_item('%sindex.html'%adir, None,
+                            a.title if a.title else ('Untitled Article'),
+                            play_order=po, author=auth,
+                            description=desc, toc_thumbnail=tt)
+                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
+                    for sp in a.sub_pages:
+                        prefix = os.path.commonprefix([opf_path, sp])
+                        relp = sp[len(prefix):]
+                        entries.append(relp.replace(os.sep, '/'))
+                        last = sp
+
+                    if os.path.exists(last):
+                        with open(last, 'rb') as fi:
+                            src = fi.read().decode('utf-8')
+                        soup = BeautifulSoup(src)
+                        body = soup.find('body')
+                        if body is not None:
+                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
+                            templ = self.navbar.generate(True, num, j, len(f),
+                                            not self.has_single_feed,
+                                            a.orig_url, __appname__, prefix=prefix,
+                                            center=self.center_navbar)
+                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                            body.insert(len(body.contents), elem)
+                            with open(last, 'wb') as fi:
+                                fi.write(unicode(soup).encode('utf-8'))
+        if len(feeds) == 0:
+            raise Exception('All feeds are empty, aborting.')
+
+        if len(feeds) > 1:
+            for i, f in enumerate(feeds):
+                entries.append('feed_%d/index.html'%i)
+                po = self.play_order_map.get(entries[-1], None)
+                if po is None:
+                    self.play_order_counter += 1
+                    po = self.play_order_counter
+                auth = getattr(f, 'author', None)
+                if not auth:
+                    auth = None
+                desc = getattr(f, 'description', None)
+                if not desc:
+                    desc = None
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
+                    f.title, play_order=po, description=desc, author=auth))
+
+        else:
+            entries.append('feed_%d/index.html'%0)
+            feed_index(0, toc)
+
+        for i, p in enumerate(entries):
+            entries[i] = os.path.join(dir, p.replace('/', os.sep))
+        opf.create_spine(entries)
+        opf.set_toc(toc)
+
+        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
+            opf.render(opf_file, ncx_file)
+
+
diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py
index 9d2bfd67f1..0462e04a6e 100644
--- a/src/calibre/devices/prst1/driver.py
+++ b/src/calibre/devices/prst1/driver.py
@@ -286,12 +286,15 @@ class PRST1(USBMS):
                 query = 'SELECT file_path, _id FROM books'
                 cursor.execute(query)
             except DatabaseError:
-                raise DeviceError(('The SONY database is corrupted. '
+                import traceback
+                tb = traceback.format_exc()
+                raise DeviceError((('The SONY database is corrupted. '
                     ' Delete the file %s on your reader and then disconnect '
                     ' reconnect it. If you are using an SD card, you '
                     ' should delete the file on the card as well. Note that '
                     ' deleting this file will cause your reader to forget '
-                    ' any notes/highlights, etc.')%dbpath)
+                    ' any notes/highlights, etc.')%dbpath)+' Underlying error:'
+                    '\n'+tb)
 
             db_books = {}
             for i, row in enumerate(cursor):