Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00
Update zaobao.recipe
articles from front page
parent 41b0ca1461
commit 0c7c9b041d
@@ -1,185 +1,82 @@
 #!/usr/bin/env python
 
-__license__ = 'GPL v3'
-__copyright__ = '2009, Pu Bo <pubo at pubolab.com>'
-'''
-zaobao.com
-'''
-from calibre.web.feeds import feeds_from_index
-from calibre.web.feeds.news import BasicNewsRecipe
+"""
+zaobao.com.sg
+"""
+
+from calibre.web.feeds.news import BasicNewsRecipe, classes
 
 
 class ZAOBAO(BasicNewsRecipe):
-    title = u'\u8054\u5408\u65e9\u62a5\u7f51 zaobao.com'
-    __author__ = 'Pu Bo'
-    description = 'News from zaobao.com'
+    title = '联合早报'
+    __author__ = 'unkn0wn'
+    description = '联合早报 是全球华文用户信任的媒体,每天即时为你提供国际最新新闻和热门新闻。从财经、体育、生活娱乐资讯到评论分析,尽在zaobao.com.sg。'
     no_stylesheets = True
-    recursions = 1
     language = 'zh'
-    encoding = 'gbk'
-    masthead_url = 'http://www.zaobao.com/ssi/images1/zblogo_original.gif'
-    # multithreaded_fetch = True
-
-    keep_only_tags = [
-        dict(name='td', attrs={'class': 'text'}),
-        dict(name='span', attrs={'class': 'page'}),
-        dict(name='div', attrs={'id': 'content'})
-    ]
-
-    remove_tags = [
-        dict(name='table', attrs={'cellspacing': '9'}),
-        dict(name='fieldset'),
-        dict(name='div', attrs={'width': '30%'}),
-    ]
-
-    extra_css = '''
-    @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
-    body{font-family: serif1, serif}
-    .article_description{font-family: serif1, serif}
-    p{font-family: serif1, serif}
-    h1 {font-weight: bold; font-size: large;}
-    h2 {font-size: large;}
-    .title {font-size: large;}
-    .article {font-size:medium}
-    .navbar {font-size: small}
-    .feed{font-size: medium}
-    .small{font-size: small;padding-right: 8pt}
-    .text{padding-right: 8pt}
-    p{text-indent: 0cm}
-    div#content{padding-right: 10pt}'''
-
-    INDEXES = [
-        (u'\u65b0\u95fb\u56fe\u7247',
-         u'http://www.zaobao.com/photoweb/photoweb_idx.shtml')
-    ]
-    MAX_ITEMS_IN_INDEX = 10
-
-    DESC_SENSE = u'\u8054\u5408\u65e9\u62a5\u7f51'
-
-    feeds = [
-        (u'\u5373\u65f6\u62a5\u9053', u'http://realtime.zaobao.com/news.xml'),
-        (u'\u4e2d\u56fd\u65b0\u95fb',
-         u'http://www.zaobao.com/zg/zg.xml'),
-        (u'\u56fd\u9645\u65b0\u95fb',
-         u'http://www.zaobao.com/gj/gj.xml'),
-        (u'\u4e16\u754c\u62a5\u520a\u6587\u8403',
-         u'http://www.zaobao.com/wencui/wencui.xml'),
-        (u'\u4e1c\u5357\u4e9a\u65b0\u95fb',
-         u'http://www.zaobao.com/yx/yx.xml'),
-        (u'\u65b0\u52a0\u5761\u65b0\u95fb',
-         u'http://www.zaobao.com/sp/sp.xml'),
-        (u'\u4eca\u65e5\u89c2\u70b9',
-         u'http://www.zaobao.com/yl/yl.xml'),
-        (u'\u4e2d\u56fd\u8d22\u7ecf',
-         u'http://www.zaobao.com/cz/cz.xml'),
-        (u'\u72ee\u57ce\u8d22\u7ecf',
-         u'http://www.zaobao.com/cs/cs.xml'),
-        (u'\u5168\u7403\u8d22\u7ecf',
-         u'http://www.zaobao.com/cg/cg.xml'),
-        (u'\u65e9\u62a5\u4f53\u80b2',
-         u'http://www.zaobao.com/ty/ty.xml'),
-        (u'\u65e9\u62a5\u526f\u520a',
-         u'http://www.zaobao.com/fk/fk.xml'),
-    ]
-
-    def preprocess_html(self, soup):
-        for tag in soup.findAll(name='a', href=True):
-            tag_url = tag['href']
-            if tag_url.find('http://') != -1 and tag_url.find('zaobao.com') == -1:
-                del tag['href']
-        return soup
-
-    def postprocess_html(self, soup, first):
-        for tag in soup.findAll(name=['table', 'tr', 'td']):
-            tag.name = 'div'
-        return soup
-
-    def parse_feeds(self):
-        self.log(_('ZAOBAO overrode parse_feeds()'))
-        parsed_feeds = BasicNewsRecipe.parse_feeds(self)
-
-        for id, obj in enumerate(self.INDEXES):
-            title, url = obj
-            articles = []
-            soup = self.index_to_soup(url)
-
-            for i, item in enumerate(soup.findAll('li')):
-                if i >= self.MAX_ITEMS_IN_INDEX:
-                    break
-                a = item.find('a', href=True)
-                if a is not None:
-                    a_url = a['href']
-                    a_title = self.tag_to_string(a)
-                    date = ''
-                    description = ''
-                    self.log(_('adding %s at %s') % (a_title, a_url))
-                    articles.append({
-                        'title': a_title,
-                        'date': date,
-                        'url': a_url,
-                        'description': description
-                    })
-
-            pfeeds = feeds_from_index([(title, articles)], oldest_article=self.oldest_article,
-                                      max_articles_per_feed=self.max_articles_per_feed)
-
-            self.log(_('adding %s to feed') % (title))
-            for feed in pfeeds:
-                self.log(_('adding feed: %s') % (feed.title))
-                feed.description = self.DESC_SENSE
-                parsed_feeds.append(feed)
-                for a, article in enumerate(feed):
-                    self.log(_('added article %s from %s') %
-                             (article.title, article.url))
-                self.log(_('added feed %s') % (feed.title))
-
-        for i, feed in enumerate(parsed_feeds):
-            # workaround a strange problem: Sometimes the xml encoding is not
-            # applied correctly by parse()
-            weird_encoding_detected = False
-            if not isinstance(feed.description, type(u'')) and self.encoding and feed.description:
-                self.log(
-                    _('Feed %s is not encoded correctly, manually replace it') % (feed.title))
-                feed.description = feed.description.decode(
-                    self.encoding, 'replace')
-            elif feed.description.find(self.DESC_SENSE) == -1 and self.encoding and feed.description:
-                self.log(
-                    _('Feed %s is weirdly encoded, manually redo all') % (feed.title))
-                feed.description = feed.description.encode(
-                    'cp1252', 'replace').decode(self.encoding, 'replace')
-                weird_encoding_detected = True
-
-            for a, article in enumerate(feed):
-                if not isinstance(article.title, type(u'')) and self.encoding:
-                    article.title = article.title.decode(
-                        self.encoding, 'replace')
-                if not isinstance(article.summary, type(u'')) and self.encoding and article.summary:
-                    article.summary = article.summary.decode(
-                        self.encoding, 'replace')
-                    article.text_summary = article.summary
-                if not isinstance(article.text_summary, type(u'')) and self.encoding and article.text_summary:
-                    article.text_summary = article.text_summary.decode(
-                        self.encoding, 'replace')
-                    article.summary = article.text_summary
-                if weird_encoding_detected:
-                    if article.title:
-                        article.title = article.title.encode(
-                            'cp1252', 'replace').decode(self.encoding, 'replace')
-                    if article.summary:
-                        article.summary = article.summary.encode(
-                            'cp1252', 'replace').decode(self.encoding, 'replace')
-                    if article.text_summary:
-                        article.text_summary = article.text_summary.encode(
-                            'cp1252', 'replace').decode(self.encoding, 'replace')
-
-                if article.title == 'Untitled article':
-                    self.log(_('Removing empty article %s from %s') %
-                             (article.title, article.url))
-                    # remove the article
-                    feed.articles[a:a + 1] = []
-        return parsed_feeds
-
-    def get_browser(self):
-        br = BasicNewsRecipe.get_browser(self)
-        br.addheaders.append(('Pragma', 'no-cache'))
-        return br
+    encoding = 'utf-8'
+    masthead_url = (
+        'https://upload.wikimedia.org/wikipedia/en/2/29/Lianhe_Zaobao_new_logo.svg'
+    )
+    remove_javascript = True
+    ignore_duplicate_articles = {'url'}
+    remove_attributes = ['width', 'height', 'style']
+    resolve_internal_links = True
+    remove_empty_feeds = True
+
+    extra_css = """
+        .calibre-nuked-tag-figcaption {font-size:small; text-align:center; }
+        img {display:block; margin:0 auto;}
+    """
+
+    def get_cover_url(self):
+        soup = self.index_to_soup(
+            'https://frontpages.freedomforum.org/newspapers/sing_lz-Lianhe_Zaobao'
+        )
+        return soup.find(
+            'img',
+            attrs={
+                'alt': 'Front Page Image',
+                'src': lambda x: x and x.endswith('front-page-large.jpg'),
+            },
+        )['src'].replace('-large', '-medium')
+
+    keep_only_tags = [
+        dict(name='article', attrs={'class': 'max-w-full'}),
+    ]
+
+    remove_tags_after = [classes('articleBody')]
+
+    remove_tags = [classes('bff-google-ad bff-recommend-article')]
+
+    def parse_index(self):
+        index = 'https://www.zaobao.com.sg'
+        sections = ['realtime', 'news', 'forum', 'wencui', 'lifestyle', 'entertainment']
+
+        soup = self.index_to_soup(index)
+
+        feeds = []
+
+        for sec in sections:
+            sectn = sec.capitalize()
+            self.log(sectn)
+            articles = []
+            for a in soup.findAll(
+                'a',
+                attrs={
+                    'class': 'article-type-link',
+                    'href': lambda x: x and x.startswith('/' + sec),
+                },
+            ):
+                if a.find('img'):
+                    continue
+                title = self.tag_to_string(a)
+                url = index + a['href']
+                if url == index + '/' + sec:
+                    continue
+                self.log('\t', title, '\n\t\t', url)
+                articles.append({'title': title, 'url': url})
+            if articles:
+                feeds.append((sectn, articles))
+        return feeds
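For context (not part of the commit): in calibre, parse_index() must return a list of (section_title, articles) tuples, where each article is a dict with at least 'title' and 'url' keys; that is exactly the shape the new recipe builds from the front-page links instead of the old RSS feeds list. A minimal sketch of the return value, using hypothetical article entries:

    # Sketch only: the section names mirror the new recipe, but these
    # article entries are hypothetical, not scraped data.
    feeds = [
        ('Realtime', [
            {'title': 'An example headline',
             'url': 'https://www.zaobao.com.sg/realtime/story/example'},
        ]),
        ('News', [
            {'title': 'Another example headline',
             'url': 'https://www.zaobao.com.sg/news/story/example'},
        ]),
    ]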