Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Delete Apple Daily
It's no longer under publication.
This commit is contained in:
parent 10d995c26c
commit 693378d64e
@@ -1,305 +0,0 @@
# vim:fileencoding=UTF-8
from __future__ import unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2013-2015, Eddie Lau'
__Date__ = ''

from calibre import (__appname__, force_unicode, strftime)
from calibre.utils.date import now as nowf, utcnow
import os
import datetime
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang

class AppleDaily(BasicNewsRecipe):
    title = u'蘋果日報 (香港)'
    __author__ = 'Eddie Lau'
    publisher = '蘋果日報'
    publication_type = 'newspaper'
    oldest_article = 1
    max_articles_per_feed = 100
    auto_cleanup = False
    language = 'zh'
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    description = 'http://hkm.appledaily.com/'
    category = 'Chinese, News, Hong Kong'
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/8/86/Apple_Daily_Title.svg'

    extra_css = ('img {display: block; margin-left: auto; margin-right: auto; '
                 'margin-top: 10px; margin-bottom: 10px; max-height:90%;} '
                 'h1 {font-size:125%; text-align:left; font-weight:bold;} '
                 'p{font-size:90%;} '
                 'p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}')

    keep_only_tags = [dict(name='div', attrs={'id': 'content-article'})]
    remove_tags = [dict(name='div', attrs={'class': 'prev-next-btn'}),
                   dict(name='p', attrs={'class': 'next'}),
                   dict(name='meta'),
                   dict(name='link')]

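    # keep_only_tags isolates the story body (the content-article div);
    # remove_tags then strips the prev/next navigation and any stray
    # <meta> and <link> elements left inside it.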
    def get_dtlocal(self):
        dt_utc = utcnow()
        # Convert UTC to local HK time (UTC+8); by HKT 6am all of the day's
        # news is available, so shift back 6 hours to pick the issue date.
        return dt_utc + datetime.timedelta(8.0 / 24) - datetime.timedelta(6.0 / 24)

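    # A worked example of the offset above (times illustrative): the net
    # shift is +2h. At 20:00 UTC it is 04:00 HKT the next day, before the
    # 6am cutoff; 20:00 + 2h stays on the same date, so the previous day's
    # issue is fetched. At 23:00 UTC (07:00 HKT next day) 23:00 + 2h rolls
    # over to the next date, so the new issue is fetched.
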
    def get_fetchdate(self):
        if __Date__ != '':
            return __Date__
        else:
            return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        if __Date__ != '':
            return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchyear(self):
        if __Date__ != '':
            return __Date__[0:4]
        else:
            return self.get_dtlocal().strftime("%Y")

    def get_fetchmonth(self):
        if __Date__ != '':
            return __Date__[4:6]
        else:
            return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
        if __Date__ != '':
            return __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%d")

    # Note: does not work with a custom date given by __Date__
    def get_weekday(self):
        return self.get_dtlocal().weekday()

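    # To rebuild a specific back issue (illustrative), set the module-level
    # constant near the top of this file, e.g. __Date__ = '20150601':
    # get_fetchdate() then returns '20150601' and get_fetchformatteddate()
    # returns '2015-06-01'. As noted above, get_weekday() ignores __Date__.
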
    def get_cover_url(self):
        soup = self.index_to_soup('http://hkm.appledaily.com/')
        cover = soup.find(attrs={'class': 'top-news'}).get('src', False)
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except Exception:
            cover = None
        return cover

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, picdiv['src'])

    def parse_index(self):
        feeds = []
        soup = self.index_to_soup('http://hkm.appledaily.com/')
        ul = soup.find(attrs={'class': 'menu'})
        sectionList = []
        for li in ul.findAll('li'):
            relativea = li.find('a', href=True).get('href', False)
            a = 'http://hkm.appledaily.com/' + relativea
            title = li.find('a', text=True).strip()
            # if (time.tzname != 'HKT'):
            #     if (title == u'三藩市'):
            #         continue
            #     if (title == u'洛杉磯'):
            #         continue
            #     if (title == u'紐 約'):
            #         continue
            #     if (title == u'美 國'):
            #         continue
            # if (not title == u'動新聞') and (relativea.startswith('list.php')):
            if (relativea.find('category=daily') != -1) and (relativea.startswith('list.php')):
                sectionList.append((title, a))
        for title, url in sectionList:
            title = title.replace(" ", "")
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        soup = self.index_to_soup(url)
        ul = soup.find(attrs={'class': 'list'})
        current_articles = []
        if ul is None:
            return current_articles
        for li in ul.findAll('li'):
            a = li.find('a', href=True)
            title = li.find('p', text=True).strip()
            if a is not None:
                current_articles.append(
                    {'title': title, 'url': 'http://hkm.appledaily.com/' + a.get('href', False)})
        return current_articles

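    # Shape of the list parse_section returns (values illustrative, not
    # scraped):
    #   [{'title': u'要聞', 'url': 'http://hkm.appledaily.com/...'}, ...]
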
    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        if self.output_profile.periodical_date_in_title:
            title += strftime(self.timefmt)
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        if self.publication_type:
            mi.publication_type = 'periodical:' + \
                self.publication_type + ':' + self.short_title()
        mi.timestamp = nowf()
        article_titles, aseen = [], set()
        for f in feeds:
            for a in f:
                if a.title and a.title not in aseen:
                    aseen.add(a.title)
                    article_titles.append(force_unicode(a.title, 'utf-8'))

        mi.comments = self.description
        if not isinstance(mi.comments, type(u'')):
            mi.comments = mi.comments.decode('utf-8', 'replace')
        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
                        '\n\n'.join(article_titles))

        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This affects the publication date shown in the Kindle title;
        # it appears the time field also needs to be later than 12:00 noon.
        # mi.pubdate = nowf()
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(
            self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')

        opf = OPFCreator(dir, mi)
        # Add a mastheadImage entry to the <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(
                self.masthead_path), os.getcwd())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)

        manifest = [os.path.join(dir, 'feed_%d' % i)
                    for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))

        # Get the cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)

        # Get the masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)

        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'

        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/' % (num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html' % adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html' % adir, None,
                                    a.title if a.title else _('Untitled Article'),
                                    play_order=po, author=auth,
                                    description=desc, toc_thumbnail=tt)
                    last = os.path.join(
                        self.output_dir, ('%sindex.html' % adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        # Fix the flow-player div tag's parent
                        src = src.replace('height:260px !important;', '')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..' for i in range(
                                2 * len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(
                                True, num, j, len(f), not self.has_single_feed,
                                a.orig_url, __appname__, prefix=prefix,
                                center=self.center_navbar)
                            translatedTempl = re.sub(
                                '<hr.*<br',
                                '<hr>本篇由 ' + __appname__ +
                                ' 快取自 <a href="http://hkm.appledaily.com/">蘋果日報</a> ; ' +
                                '<a href="' + a.orig_url + '">本篇來源位置</a>。<br',
                                templ.render(doctype='xhtml').decode('utf-8'),
                                flags=re.S)
                            elem = BeautifulSoup(translatedTempl).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(type(u'')(soup).encode('utf-8'))

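        # The re.sub above swaps the stock navbar text for an attribution
        # line, so each article page ends with markup roughly like this
        # (illustrative; the exact output comes from navbar.generate()):
        #   <hr>本篇由 calibre 快取自 <a href="http://hkm.appledaily.com/">蘋果日報</a> ;
        #   <a href="...">本篇來源位置</a>。<br/>
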
        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html' % i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html' % i, None,
                                           f.title, play_order=po, description=desc, author=auth))
        else:
            entries.append('feed_%d/index.html' % 0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file:
            opf.render(opf_file, ncx_file)
@@ -1,5 +1,4 @@
 from calibre.web.feeds.news import BasicNewsRecipe, classes
-from calibre.ptempfile import PersistentTemporaryFile


 class STHKRecipe(BasicNewsRecipe):
     title = '星島日報 (香港)'
@@ -37,32 +36,23 @@ class STHKRecipe(BasicNewsRecipe):
         except Exception as e:
             url = e.hdrs.get('location')
         soup = self.index_to_soup(url)
-        link = soup.find('a', href=True)
-        skip_sections =[ # add sections you want to skip
+        link = soup.find('a', href=True)['href']
+        skip_sections = [  # add sections you want to skip
             '/video/', '/videos/', '/media/', 'podcast'
         ]
-        if any(x in link['href'] for x in skip_sections):
-            self.log('Aborting Article ', link['href'])
+        if any(x in link for x in skip_sections):
+            self.log('Aborting Article ', link)
             self.abort_article('skipping video links')
+        html = br.open(link).read()
+        return {'data': html, 'url': link}

-        self.log('Downloading ', link['href'])
-        html = br.open(link['href']).read()
-        pt = PersistentTemporaryFile('.html')
-        pt.write(html)
-        pt.close()
-        return pt.name
-
-    feeds = []
-
-    sections = [
-        'daily', 'realtime', 'education', 'property', 'racing', 'supplement', 'kol'
-    ]
+    feeds = [
+        ('日報', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com%2Fdaily%2F&hl=zh-HK&gl=HK&ceid=HK:zh'),
+        ('即時', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com%2Frealtime%2F&hl=zh-HK&gl=HK&ceid=HK:zh'),
+        ('副刊', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com%2Fsupplement%2F&hl=zh-HK&gl=HK&ceid=HK:zh'),
+        ('其他的新聞', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com&hl=zh-HK&gl=HK&ceid=HK:zh')
+    ]

-    for sec in sections:
-        a = 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com{}&hl=zh-HK&gl=HK&ceid=HK:zh'
-        feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F')))
-    feeds.append(('Others', a.format('')))

     def populate_article_metadata(self, article, soup, first):
         article.title = article.title.replace(' - 星島頭條', '')
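All four new feed URLs follow one Google News RSS pattern: "when:27h" limits results to roughly the last 27 hours, and "allinurl:" restricts matches to URLs under a given std.stheadline.com path. A minimal sketch of how such a URL can be assembled (the helper name and defaults below are illustrative, not part of the recipe):

    from urllib.parse import quote

    def google_news_rss(site, section='', hours=27, hl='zh-HK', gl='HK', ceid='HK:zh'):
        # Percent-encode the site/section URL so it survives inside the q= query.
        path = 'https://' + site + ('/' + section + '/' if section else '')
        q = 'when:{}h+allinurl:{}'.format(hours, quote(path, safe=''))
        return ('https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}'
                .format(q, hl, gl, ceid))

    # e.g. google_news_rss('std.stheadline.com', 'daily') reproduces the
    # 日報 feed URL above.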