Kovid Goyal 2024-10-10 11:18:11 +05:30
commit 3a273e8766
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 38 additions and 49 deletions

View File

@@ -1,59 +1,46 @@
-__license__ = 'GPL v3'
-__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
-'''
+#!/usr/bin/env python
+"""
 www.mainichi.jp
-'''
-
-import re
+"""
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
 
 class MainichiDailyNews(BasicNewsRecipe):
     title = u'\u6bce\u65e5\u65b0\u805e'
-    __author__ = 'Hiroshi Miura'
-    oldest_article = 2
-    max_articles_per_feed = 20
-    description = 'Japanese traditional newspaper Mainichi Daily News'
-    publisher = 'Mainichi Daily News'
-    category = 'news, japan'
-    language = 'ja'
-    index = 'http://mainichi.jp/select/'
+    __author__ = 'unkn0wn'
+    description = "Japanese traditional newspaper Mainichi Daily News"
+    publisher = "Mainichi News"
+    publication_type = "newspaper"
+    category = "news, japan"
+    language = "ja"
+
+    no_stylesheets = True
     remove_javascript = True
-    masthead_title = u'MAINICHI DAILY NEWS'
-
-    remove_tags_before = {'class': "NewsTitle"}
-    remove_tags_after = {'class': "NewsBody clr"}
-
-    def parse_feeds(self):
-
-        feeds = BasicNewsRecipe.parse_feeds(self)
-
-        for curfeed in feeds:
-            delList = []
-            for a, curarticle in enumerate(curfeed.articles):
-                if re.search(r'pheedo.jp', curarticle.url):
-                    delList.append(curarticle)
-                if re.search(r'rssad.jp', curarticle.url):
-                    delList.append(curarticle)
-            if len(delList) > 0:
-                for d in delList:
-                    index = curfeed.articles.index(d)
-                    curfeed.articles[index:index + 1] = []
-
-        return feeds
+    auto_cleanup = True
+    ignore_duplicate_articles = {'title', 'url'}
 
     def parse_index(self):
+        index = 'https://mainichi.jp'
+        sections = [
+            'articles'
+        ]
+
         feeds = []
-        soup = self.index_to_soup(self.index)
-        topstories = soup.find('ul', attrs={'class': 'MaiLink'})
-        if topstories:
-            newsarticles = []
-            for itt in topstories.findAll('li'):
-                itema = itt.find('a', href=True)
-                if itema:
-                    newsarticles.append({
-                        'title': itema.string, 'date': '', 'url': itema['href'], 'description': ''
-                    })
-            feeds.append(('latest', newsarticles))
+        soup = self.index_to_soup(index)
+        for sec in sections:
+            section = sec.capitalize()
+            self.log(section)
+            articles = []
+            for a in soup.findAll('a', attrs={'href':lambda x: x and 'articles' in x}):
+                if a.find('img'):
+                    continue
+                url = a['href']
+                if not url.startswith('http'):
+                    url = 'https:' + url
+                title = self.tag_to_string(a)
+                self.log('\t', title, '\n\t\t', url)
+                articles.append({'title': title, 'url': url})
+            if articles:
+                feeds.append((section, articles))
         return feeds

View File

@@ -36,10 +36,12 @@ class MainichiEnglishNews(BasicNewsRecipe):
             section = sec.capitalize()
             self.log(section)
             articles = []
-            for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith(index + sec + '/')}):
-                url = a['href']
-                if url in {index + sec + '/', index + sec}:
+            for a in soup.findAll('a', attrs={'href':lambda x: x and 'articles' in x}):
+                if a.find('img'):
                     continue
+                url = a['href']
+                if not url.startswith('http'):
+                    url = 'https:' + url
                 title = self.tag_to_string(a)
                 self.log('\t', title, '\n\t\t', url)
                 articles.append({'title': title, 'url': url})
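
Either recipe can be exercised end to end with calibre's ebook-convert, which accepts a recipe file directly; --test limits the fetch to a couple of articles per feed and -vv makes the per-article log lines above visible. The file name here is illustrative:

ebook-convert mainichi.recipe output.epub --test -vv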