Sing Tao Daily by Eddie Lau

Kovid Goyal 2011-12-30 08:58:37 +05:30
parent d5f2c7cade
commit 890d4a6ad2
2 changed files with 496 additions and 2 deletions

recipes/singtaohk.recipe  (new file, 491 additions)

@@ -0,0 +1,491 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Eddie Lau'
# data source: normal, mobile
__Source__ = 'mobile'
# Set it to False if you do not want the articles assembled into a periodical. (Default: True)
__MakePeriodical__ = True
# Set it to True if your device supports displaying CJK titles (Default: False)
__UseChineseTitle__ = False
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
'''
Change Log:
2011/12/29 -- first version done
TODO:
* use alternative source at http://m.singtao.com/index.php
'''
from calibre.utils.date import now as nowf
import os, datetime, re
from datetime import date
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang
# MAIN CLASS
class STHKRecipe(BasicNewsRecipe):
if __UseChineseTitle__ == True:
title = u'\u661f\u5cf6\u65e5\u5831 (\u9999\u6e2f)'
else:
title = 'Sing Tao Daily - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://singtao.com)'
category = 'Chinese, News, Hong Kong'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:150%;}'
masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png'
if __Source__ == 'normal':
keep_only_tags = [dict(name='td', attrs={'class':['bodyhead','bodytext']})]
else:
keep_only_tags = [dict(name='td', attrs={'class':['stmobheadline']}),
dict(name='img', attrs={'width':['146']}),
dict(name='td', attrs={'class':['bodytextg']}),
]
if __KeepImages__:
remove_tags = [dict(name='hr')]
else:
remove_tags = [dict(name='hr'), dict(name='img')]
remove_attributes = ['align']
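# the regex below inserts two <br> tags before every <font class="bodytext">
# block, presumably so that successive paragraphs stay separated once the
# table layout is linearized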
preprocess_regexps = [
(re.compile(r'<font class="bodytext">', re.DOTALL|re.IGNORECASE),
lambda match: '<br><br><font class="bodytext">'),
]
oldest_article = 1
max_articles_per_feed = 200
__author__ = 'Eddie Lau'
publisher = 'Sing Tao Ltd.'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
language = 'zh'
encoding = 'Big5-HKSCS'
recursions = 0
conversion_options = {'linearize_tables':True}
timefmt = ''
auto_cleanup = False
def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow()
# convert UTC to local hk time - at HKT 4.00am, all news are available
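# (UTC+8 gives Hong Kong time; subtracting a further 4 hours means the fetch
# date only rolls over at 04:00 HKT, once the day's edition is available)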
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.0/24)
return dt_local
def get_fetchdate(self):
return self.get_dtlocal().strftime("%Y%m%d")
def get_fetchformatteddate(self):
return self.get_dtlocal().strftime("%Y-%m-%d")
def get_fetchyear(self):
return self.get_dtlocal().strftime("%Y")
def get_fetchmonth(self):
return self.get_dtlocal().strftime("%m")
def get_fetchday(self):
return self.get_dtlocal().strftime("%d")
def get_cover_url(self):
#cover = 'http://singtao.com/media/a/a(2660).jpg' # for 2011/12/29
base = 2660
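# the numeric id in the cover filename appears to advance by one per day, so
# derive today's id from the 2011-12-29 reference above and fall back to the
# site logo if that image cannot be fetched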
todaydate = date(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()))
diff = todaydate - date(2011, 12, 29)
base = base + int(diff.total_seconds()/(3600*24))
cover = 'http://singtao.com/media/a/a(' + str(base) +').jpg'
br = self.get_browser()
try:
br.open(cover)
except:
cover = 'http://singtao.com/images/stlogo.gif'
return cover
def parse_index(self):
feeds = []
dateStr = self.get_fetchdate()
if __Source__ == 'normal':
# single-item section
for title, url in [(u'\u793e\u8ad6 Editorial', 'http://singtao.com/yesterday/jou/j_index.html')]:
article = self.parse_singleitem_section(url)
if article:
feeds.append((title, article))
# multiple items
# for title, url in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://singtao.com/yesterday/loc/a_index.html'),
# (u'\u8ca1\u7d93 Finance', 'http://singtao.com/yesterday/fin/d_index.html'),
# (u'\u5730\u7522 Properties', 'http://singtao.com/yesterday/pro/h_index.html'),
# (u'\u6559\u80b2 Education', 'http://singtao.com/yesterday/edu/g_index.asp'),
# (u'\u5a1b\u6a02 Entertainment', 'http://singtao.com/yesterday/ent/f_index.html'),
# (u'\u9ad4\u80b2 Sports', 'http://singtao.com/yesterday/spo/c_index.html'),
# (u'\u99ac\u7d93 Horse Racing', 'http://singtao.com/yesterday/rac/n_index.html')
# ]:
# articles = self.parse_section(url)
# if articles:
# feeds.append((title, articles))
# special: supplement
# for title, url, baseurl in [(u'\u526f\u520a Supplements', 'http://singtao.com/yesterday/sup/m_index.html', '/')]:
# articles = self.parse_section_withouttext(url, baseurl)
# if articles:
# feeds.append((title, articles))
# multiple-item sections
# for title, url in [(u'\u570b\u969b World', 'http://singtao.com/yesterday/int/b_index.html'),
# (u'\u4e2d\u570b China', 'http://singtao.com/yesterday/chi/e_index.html')
# ]:
# articles = self.parse_section(url)
# if articles:
# feeds.append((title, articles))
for title, url, baseurl in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://singtao.com/yesterday/loc/a_index.html', '/'),
(u'\u8ca1\u7d93 Finance', 'http://singtao.com/yesterday/fin/d_index.html', '/'),
(u'\u5730\u7522 Properties', 'http://singtao.com/yesterday/pro/h_index.html', '/'),
(u'\u6559\u80b2 Education', 'http://singtao.com/yesterday/edu/g_index.asp', '/'),
(u'\u5a1b\u6a02 Entertainment', 'http://singtao.com/yesterday/ent/f_index.html', '/'),
(u'\u9ad4\u80b2 Sports', 'http://singtao.com/yesterday/spo/c_index.html', '/'),
(u'\u99ac\u7d93 Horse Racing', 'http://singtao.com/yesterday/rac/n_index.html', '/'),
(u'\u526f\u520a Supplements', 'http://singtao.com/yesterday/sup/m_index.html', '/'),
(u'\u570b\u969b World', 'http://singtao.com/yesterday/int/b_index.html', '/'),
(u'\u4e2d\u570b China', 'http://singtao.com/yesterday/chi/e_index.html', '/')]:
articles = self.parse_section_withouttext(url, baseurl)
if articles:
feeds.append((title, articles))
else: # use mobile
# single-item section
for title, url in [(u'\u793e\u8ad6 Editorial', 'http://m.singtao.com/showContent.php?main=paper&sub=0&title=0')]:
article = self.parse_singleitem_section_m(url)
if article:
feeds.append((title, article))
# multiple-item section
for title, url, baseurl in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://m.singtao.com/showTitle.php?main=paper&sub=1', 'http://m.singtao.com/'),
(u'\u8ca1\u7d93 Finance', 'http://m.singtao.com/showTitle.php?main=paper&sub=2', 'http://m.singtao.com/'),
(u'\u5730\u7522 Properties', 'http://m.singtao.com/showTitle.php?main=paper&sub=3', 'http://m.singtao.com/'),
(u'\u6559\u80b2 Education', 'http://m.singtao.com/showTitle.php?main=paper&sub=4', 'http://m.singtao.com/'),
(u'\u5a1b\u6a02 Entertainment', 'http://m.singtao.com/showTitle.php?main=paper&sub=5', 'http://m.singtao.com/'),
(u'\u99ac\u7d93 Horse Racing', 'http://m.singtao.com/showTitle.php?main=paper&sub=6', 'http://m.singtao.com/'),
(u'\u9ad4\u80b2 Sports', 'http://m.singtao.com/showTitle.php?main=paper&sub=7', 'http://m.singtao.com/'),
(u'\u526f\u520a Supplements', 'http://m.singtao.com/showTitle.php?main=paper&sub=8', 'http://m.singtao.com/'),
(u'\u570b\u969b World', 'http://m.singtao.com/showTitle.php?main=paper&sub=9', 'http://m.singtao.com/'),
(u'\u4e2d\u570b China', 'http://m.singtao.com/showTitle.php?main=paper&sub=10', 'http://m.singtao.com/')]:
articles = self.parse_multiitem_section_m(url, baseurl)
if articles:
feeds.append((title, articles))
return feeds
def parse_singleitem_section(self, url):
current_articles = []
current_articles.append({'title': '', 'url': url, 'description': '', 'date': ''})
return current_articles
def parse_singleitem_section_m(self, url):
current_articles = []
current_articles.append({'title': '', 'url': url, 'description': '', 'date': ''})
return current_articles
def parse_section(self, url):
soup = self.index_to_soup(url)
# find <table width=436 border=0 cellspacing=0 align=center cellpadding=0> tag
tables = soup.findAll(name={'table'}, attrs={'width': ['436']})
current_articles_all = []
for table in tables:
divs = table.findAll(name={'a'})
current_articles = []
included_urls = []
for i in divs:
title = self.tag_to_string(i)
urlstr = i.get('href', False)
urlstr = url + '/../' + urlstr
if urlstr not in included_urls:
current_articles.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
included_urls.append(urlstr)
current_articles_all.extend(current_articles)
return current_articles_all
def parse_section_withouttext(self, url, baseurl):
soup = self.index_to_soup(url)
# find all a tag
links = soup.findAll(name={'a'})
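# links carrying class 'secondhead' or 'second02' appear to be section
# headings rather than articles, so drop them before building the list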
linksexcluded = soup.findAll(name={'a'}, attrs={'class':'secondhead'})
for elink in linksexcluded:
links.remove(elink)
linksexcluded = soup.findAll(name={'a'}, attrs={'class':'second02'})
for elink in linksexcluded:
links.remove(elink)
current_articles_all = []
included_urls = []
for link in links:
title = self.tag_to_string(link)
if len(title.strip()) > 0:
urlstr = link.get('href', False)
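# hrefs here are relative; anything that already contains the base URL or is
# a mailto: link is not treated as an article, the rest are resolved against
# the section index URL and de-duplicated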
if urlstr.rfind(baseurl) == -1 and urlstr.rfind('mailto:') == -1:
urlstr = url + '/../' + urlstr
if urlstr not in included_urls:
current_articles_all.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
included_urls.append(urlstr)
return current_articles_all
def parse_multiitem_section_m(self, url, baseurl):
soup = self.index_to_soup(url)
# article links on the mobile pages are wrapped in <span class="urlurl"> elements
links = soup.findAll(name={'span'}, attrs={'class':'urlurl'})
current_articles_all = []
included_urls = []
for linkraw in links:
linkclean = linkraw.findAll(name={'a'})  # look for <a> tags inside this span, not across the whole page
for link in linkclean:
title = self.tag_to_string(link)
if len(title.strip()) > 0:
urlstr = link.get('href', False)
urlstr = baseurl + urlstr
if urlstr not in included_urls:
current_articles_all.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
included_urls.append(urlstr)
return current_articles_all
def populate_article_metadata(self, article, soup, first):
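# fill in whatever the index pages could not supply: recover the article
# title from the page itself, optionally register the first image as a TOC
# thumbnail, and set either a first-paragraph summary or a character count
# depending on the __IncludeSummary__ flag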
if __Source__ == 'normal':
# get title if not fetched in parse_section() function
if article.title == '' or len(article.title.strip()) == 0:
articletitle = soup.findAll('td',attrs={'class':'bodyhead'})
if articletitle:
articletitlemod = articletitle[0].find('font')
if articletitlemod:
article.title = articletitlemod.string.strip()
else:
article.title = articletitle[0].string.strip()
else:
# for the mobile source, always take the title from the article page itself
articletitle = soup.findAll('td', attrs={'class':'stmobheadline'})
if articletitle:
articletitle[0].br.extract()
article.title = articletitle[0].contents[0]
# get thumbnail image
if __IncludeThumbnails__ and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])
try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
if __Source__ == 'normal':
articlebodies = soup.findAll('font',attrs={'class':'bodytext'})
else:
articlebodies = soup.findAll('div', attrs={'class':'hkadj'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
if len(summary_candidate) > 0:
summary_candidate = summary_candidate.replace(u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1)
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
if __Source__ == 'normal':
articlebodies = soup.findAll('font',attrs={'class':'bodytext'})
else:
articlebodies = soup.findAll('div', attrs={'class':'hkadj'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return
# override from the one in version 0.8.31
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
title = self.short_title()
# change 1: allow our own flag to tell if a periodical is to be generated
# also use the customised fetch date instead of the current time
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
title = title + ' ' + self.get_fetchformatteddate()
# end of change 1
# change 2: __appname__ replaced by newspaper publisher
__appname__ = self.publisher
mi = MetaInformation(title, [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
# change 4: in the following, all the nowf() are changed to adjusted time
# This one doesn't matter
mi.timestamp = nowf()
# change 5: skip listing the articles
#article_titles, aseen = [], set()
#for f in feeds:
# for a in f:
# if a.title and a.title not in aseen:
# aseen.add(a.title)
# article_titles.append(force_unicode(a.title, 'utf-8'))
#mi.comments = self.description
#if not isinstance(mi.comments, unicode):
# mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))
language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
# This one affects the pub date shown in kindle title
#mi.pubdate = nowf()
# it now appears the time component also needs to be later than 12:00 noon
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
# Get cover
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
# Get masthead
mpath = getattr(self, 'masthead_path', None)
if mpath is not None and os.access(mpath, os.R_OK):
manifest.append(mpath)
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
def feed_index(num, parent):
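# build the TOC entries for one feed: register each downloaded article with
# its play order, author, description and thumbnail, then append the
# navigation bar to the last HTML page of every article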
f = feeds[num]
for j, a in enumerate(f):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(num, j)
auth = a.author
if not auth:
auth = None
desc = a.text_summary
if not desc:
desc = None
else:
desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None,
a.title if a.title else ('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
relp = sp[len(prefix):]
entries.append(relp.replace(os.sep, '/'))
last = sp
if os.path.exists(last):
with open(last, 'rb') as fi:
src = fi.read().decode('utf-8')
soup = BeautifulSoup(src)
body = soup.find('body')
if body is not None:
prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
with open(last, 'wb') as fi:
fi.write(unicode(soup).encode('utf-8'))
if len(feeds) == 0:
raise Exception('All feeds are empty, aborting.')
if len(feeds) > 1:
for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
auth = getattr(f, 'author', None)
if not auth:
auth = None
desc = getattr(f, 'description', None)
if not desc:
desc = None
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
f.title, play_order=po, description=desc, author=auth))
else:
entries.append('feed_%d/index.html'%0)
feed_index(0, toc)
for i, p in enumerate(entries):
entries[i] = os.path.join(dir, p.replace('/', os.sep))
opf.create_spine(entries)
opf.set_toc(toc)
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)


@@ -286,12 +286,15 @@ class PRST1(USBMS):
query = 'SELECT file_path, _id FROM books'
cursor.execute(query)
except DatabaseError:
-   raise DeviceError(('The SONY database is corrupted. '
+   import traceback
+   tb = traceback.format_exc()
+   raise DeviceError((('The SONY database is corrupted. '
    ' Delete the file %s on your reader and then disconnect '
    ' reconnect it. If you are using an SD card, you '
    ' should delete the file on the card as well. Note that '
    ' deleting this file will cause your reader to forget '
-   ' any notes/highlights, etc.')%dbpath)
+   ' any notes/highlights, etc.')%dbpath)+' Underlying error:'
+   '\n'+tb)
db_books = {}
for i, row in enumerate(cursor):