Update AM 730 and Ming Pao (HK)

2025-07-09 03:04:10 -04:00 · 2013-10-03 09:28:55 +05:30 · 2013-10-03 09:28:55 +05:30 · fd77ad2c92
commit fd77ad2c92
parent ea9a2dfd8f
2 changed files with 33 additions and 53 deletions
--- a/recipes/am730.recipe
+++ b/recipes/am730.recipe
@ -3,10 +3,10 @@ from __future__ import unicode_literals
 __license__   = 'GPL v3'
 __copyright__ = '2013, Eddie Lau'
 __Date__ = ''
-__HiResImg__ = True

 '''
 Change Log:
+2013/09/28 -- update due to website redesign, add cover
 2013/03/30 -- first version
 '''

@ -32,18 +32,17 @@ class AppleDaily(BasicNewsRecipe):
    encoding = 'utf-8'
    auto_cleanup = False
    remove_javascript = True
-    use_embedded_content   = False
+    use_embedded_content = False
    no_stylesheets = True
    description = 'http://www.am730.com.hk'
    category    = 'Chinese, News, Hong Kong'
    masthead_url = 'http://www.am730.com.hk/images/logo.jpg'
-
-    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} photocaption {font-size:50%; margin-left:auto; margin-right:auto;}'
-    keep_only_tags = [dict(name='div', attrs={'id':'articleHeader'}),
-                      dict(name='div', attrs={'class':'thecontent wordsnap'}),
-                      dict(name='a', attrs={'class':'lightboximg'})]
-    remove_tags = [dict(name='img', attrs={'src':'/images/am730_article_logo.jpg'}),
-                   dict(name='img', attrs={'src':'/images/am_endmark.gif'})]
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 20px; margin-bottom: 20px; max-height:70%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} li {font-size:50%; margin-left:auto; margin-right:auto;}'
+    keep_only_tags = [dict(name='h2', attrs={'class':'printTopic'}),
+                      dict(name='div', attrs={'id':'article_content'}),
+                      dict(name='div', attrs={'id':'slider'})]
+    remove_tags = [dict(name='img', attrs={'src':'images/am730_article_logo.jpg'}),
+                   dict(name='img', attrs={'src':'images/am_endmark.gif'})]

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
@ -84,6 +83,16 @@ class AppleDaily(BasicNewsRecipe):
    def get_weekday(self):
        return self.get_dtlocal().weekday()

+    def get_cover_url(self):
+        '''
+        Return the URL of the cover image shown on the am730 home page,
+        or None when no usable image can be found.
+
+        The image is the first <img> inside the element whose id is
+        "mini_news_img". The URL is opened once to confirm that it can
+        be fetched before it is returned.
+        '''
+        soup = self.index_to_soup('http://www.am730.com.hk')
+        holder = soup.find(attrs={'id':'mini_news_img'})
+        img = holder.find('img') if holder is not None else None
+        src = img.get('src') if img is not None else None
+        if not src:
+            # Container, <img> or src attribute missing: report "no cover"
+            # instead of raising AttributeError/TypeError.
+            return None
+        cover = 'http://www.am730.com.hk/' + src
+        br = BasicNewsRecipe.get_browser(self)
+        try:
+            br.open(cover)
+        except Exception:
+            # Treat an unreachable image as "no cover"; unlike a bare
+            # except, this lets KeyboardInterrupt/SystemExit propagate.
+            cover = None
+        return cover
+        
    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
@ -93,48 +102,17 @@ class AppleDaily(BasicNewsRecipe):
    def parse_index(self):
+        '''
+        Build the feed list from the <optgroup>/<option> elements of the
+        am730 home page.
+
+        Each <optgroup> becomes one section titled by its "label"
+        attribute; each <option> inside it becomes one article whose URL
+        is the site root plus the option's "value" attribute.
+
+        Returns a list of (section title, list of article dicts) tuples,
+        where every article dict has the keys 'title' and 'url'.
+        '''
        feeds = []
        soup = self.index_to_soup('http://www.am730.com.hk/')
-        ul = soup.find(attrs={'class':'nav-section'})
-        sectionList = []
-        for li in ul.findAll('li'):
-            a = 'http://www.am730.com.hk/' + li.find('a', href=True).get('href', False)
-            title = li.find('a').get('title', False).strip()
-            sectionList.append((title, a))
-        for title, url in sectionList:
-            articles = self.parse_section(url)
-            if articles:
-                feeds.append((title, articles))
+        for optgroup in soup.findAll('optgroup'):
+            sectitle = optgroup.get('label')
+            articles = []
+            for option in optgroup.findAll('option'):
+                value = option.get('value')
+                if not value:
+                    # No "value" attribute (or an empty one): there is no
+                    # article path to link to, and str + None would raise.
+                    continue
+                # option.string is None when the tag has nested children;
+                # tag_to_string flattens the tag's text instead.
+                title = self.tag_to_string(option)
+                articlelink = "http://www.am730.com.hk/" + value
+                articles.append({'title': title, 'url': articlelink})
+            if articles:
+                # Skip sections that ended up with no articles.
+                feeds.append((sectitle, articles))
        return feeds
-
-    def parse_section(self, url):
-        soup = self.index_to_soup(url)
-        items = soup.findAll(attrs={'style':'padding-bottom: 15px;'})
-        current_articles = []
-        for item in items:
-            a = item.find(attrs={'class':'t6 f14'}).find('a', href=True)
-            articlelink = 'http://www.am730.com.hk/' + a.get('href', True)
-            title = self.tag_to_string(a)
-            description = self.tag_to_string(item.find(attrs={'class':'t3 f14'}))
-            current_articles.append({'title': title, 'url': articlelink, 'description': description})
-        return current_articles
-
-    def preprocess_html(self, soup):
-        multia = soup.findAll('a')
-        for a in multia:
-            if not (a == None):
-                image = a.find('img')
-                if not (image == None):
-                    if __HiResImg__:
-                        image['src'] = image.get('src').replace('/thumbs/', '/')
-                    caption = image.get('alt')
-                    tag = Tag(soup, "photo", [])
-                    tag2 = Tag(soup, "photocaption", [])
-                    tag.insert(0, image)
-                    if not caption == None:
-                        tag2.insert(0, caption)
-                        tag.insert(1, tag2)
-                    a.replaceWith(tag)
-        return soup
-
+        
    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
@ -288,3 +266,4 @@ class AppleDaily(BasicNewsRecipe):
        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)

+
--- a/recipes/ming_pao.recipe
+++ b/recipes/ming_pao.recipe
@ -1,5 +1,5 @@
 __license__   = 'GPL v3'
-__copyright__ = '2010-2011, Eddie Lau'
+__copyright__ = '2010-2013, Eddie Lau'

 # Region - Hong Kong, Vancouver, Toronto
 __Region__ = 'Hong Kong'
@ -32,6 +32,7 @@ __Date__ = ''

 '''
 Change Log:
+2013/09/28: allow thumbnails even with hi-res images
 2012/04/24: improved parsing of news.mingpao.com content
 2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
            from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
@ -846,8 +847,7 @@ class MPRecipe(BasicNewsRecipe):
        return soup

    def populate_article_metadata(self, article, soup, first):
-        # thumbnails shouldn't be available if using hi-res images
-        if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
+        if __IncludeThumbnails__ and first and hasattr(self, 'add_toc_thumbnail'):
            img = soup.find('img')
            if img is not None:
                self.add_toc_thumbnail(article, img['src'])
@ -1071,3 +1071,4 @@ class MPRecipe(BasicNewsRecipe):



+