mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-03 11:07:02 -05:00 
			
		
		
		
	
		
			
				
	
	
		
			100 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			100 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
import re
 | 
						|
from calibre.web.feeds.recipes import BasicNewsRecipe
 | 
						|
from calibre.constants import config_dir, CONFIG_DIR_MODE
 | 
						|
import os
 | 
						|
import os.path
 | 
						|
try:
 | 
						|
    from urllib.parse import quote
 | 
						|
except ImportError:
 | 
						|
    from urllib import quote
 | 
						|
from hashlib import md5
 | 
						|
 | 
						|
 | 
						|
class ModorosBlogHu(BasicNewsRecipe):
    """Recipe for the Modoros blog (modoros.blog.hu), fetched from its RSS feed.

    ``parse_feeds`` is overridden so that articles already seen in an earlier
    run are dropped. One fingerprint per article is stored, per feed, in a
    text file under ``<config_dir>/recipes/recipe_storage/<recipe title>/``.
    """

    __author__ = 'Zsolt Botykai'
    title = u'Modoros Blog'
    description = u"Modoros.blog.hu"
    # Very large limits; already-downloaded articles are filtered out in
    # parse_feeds() below rather than by these settings.
    oldest_article = 10000
    max_articles_per_feed = 10000
    reverse_article_order = True
    language = 'hu'
    remove_javascript = True
    remove_empty_feeds = True
    no_stylesheets = True
    feeds = [(u'Modoros Blog', u'http://modoros.blog.hu/rss')]
    use_embedded_content = False

    # Each entry is (compiled pattern, function returning the replacement).
    preprocess_regexps = [
        # Cut everything from the "megosztas" marker comment to the end of
        # the body.
        (re.compile(r'<!--megosztas -->.*?</body>', re.DOTALL | re.IGNORECASE),
         lambda match: '</body>'),
        # Drop the explicit left alignment from paragraphs.
        (re.compile(r'<p align="left"'), lambda m: '<p'),
        # Remove <noscript> sections.
        (re.compile(r'<noscript.+?noscript>', re.DOTALL | re.IGNORECASE), lambda m: ''),
        # Remove the absolutely positioned image placed at top:-10px.
        (re.compile(r'<img style="position: absolute;top:-10px.+?>',
                    re.DOTALL | re.IGNORECASE), lambda m: ''),
        # Remove paragraphs that contain only blanks.
        # NOTE(review): both alternatives render as a blank here; one of them
        # was presumably a non-breaking space before extraction - confirm.
        (re.compile(r'<p>( | )*?</p>', re.DOTALL | re.IGNORECASE), lambda match: ''),
    ]
    extra_css = '''
                    body { background-color: white; color: black }
                '''

    remove_tags = [
        dict(name='div', attrs={'id': ['csucs']}),
        dict(name='img', attrs={
             'style': ['position: absolute;top:-10px;left:-10px;']}),
        dict(name='div', attrs={'class': ['tovabb-is-van',
                                          'page-break',
                                          'clear']}),
        dict(name='span', attrs={'class': ['hozzaszolas-szamlalo']})
    ]

    masthead_url = 'http://modoros.blog.hu/media/skins/modoros-neon/img/modorosblog-felirat.png'

    def get_cover_url(self):
        """Return the cover image URL (the same image as ``masthead_url``)."""
        return 'http://modoros.blog.hu/media/skins/modoros-neon/img/modorosblog-felirat.png'

    # As seen here:
    # http://www.mobileread.com/forums/showpost.php?p=1295505&postcount=10
    def parse_feeds(self):
        """Parse the feeds and drop every article seen in a previous run.

        An article's fingerprint is the MD5 hex digest of its content and
        summary, prefixed with ``<url>:`` when the article has a URL. After
        filtering, the per-feed fingerprint file is rewritten with the
        fingerprints of all articles currently in the feed, so entries that
        left the feed are forgotten.

        Returns the list produced by ``BasicNewsRecipe.parse_feeds``, without
        the already-seen articles and, when ``remove_empty_feeds`` is set,
        without feeds that ended up empty.
        """
        recipe_dir = os.path.join(config_dir, 'recipes')
        hash_dir = os.path.join(recipe_dir, 'recipe_storage')
        # Keep the directory name as text. Encoding the title to bytes first
        # makes ``.replace('/', ':')`` raise TypeError on Python 3, and a
        # bytes directory could not be joined with the str file name below.
        feed_dir = os.path.join(hash_dir, self.title.replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)

        feeds = BasicNewsRecipe.parse_feeds(self)

        for feed in feeds:
            # Percent-encode the feed title so it is usable as a file name.
            feed_hash = quote(feed.title.encode('utf-8'), safe='')
            feed_fn = os.path.join(feed_dir, feed_hash)

            # Fingerprints recorded by the previous run, one per line.
            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    for h in f:
                        past_items.add(h.strip())

            cur_items = set()
            # Iterate over a copy because articles are removed from the list
            # while looping.
            for article in feed.articles[:]:
                item_hash = md5()
                if article.content:
                    item_hash.update(article.content.encode('utf-8'))
                if article.summary:
                    item_hash.update(article.summary.encode('utf-8'))
                item_hash = item_hash.hexdigest()
                if article.url:
                    item_hash = article.url + ':' + item_hash
                cur_items.add(item_hash)
                if item_hash in past_items:
                    feed.articles.remove(article)

            # Sorted only so that the file content is stable between runs;
            # it is read back into a set, so the order carries no meaning.
            with open(feed_fn, 'w') as f:
                f.writelines(h + '\n' for h in sorted(cur_items))

        if self.remove_empty_feeds:
            # Filter in place so the returned object is the same list.
            feeds[:] = [fd for fd in feeds if len(fd) != 0]

        return feeds
 |