calibre/recipes/modoros.recipe
2019-04-13 09:17:31 +05:30

100 lines
3.6 KiB
Plaintext

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.constants import config_dir, CONFIG_DIR_MODE
import os
import os.path
try:
from urllib.parse import quote
except ImportError:
from urllib import quote
from hashlib import md5
class ModorosBlogHu(BasicNewsRecipe):
    """Recipe for the Modoros blog (modoros.blog.hu).

    On top of the usual feed/cleanup configuration, :meth:`parse_feeds` keeps
    a per-feed file of article fingerprints under the calibre configuration
    directory and removes every article whose fingerprint was already
    recorded by the previous run, so only new or changed articles remain.
    """

    __author__ = 'Zsolt Botykai'
    title = u'Modoros Blog'
    description = u"Modoros.blog.hu"
    oldest_article = 10000
    max_articles_per_feed = 10000
    reverse_article_order = True
    language = 'hu'
    remove_javascript = True
    remove_empty_feeds = True
    no_stylesheets = True
    feeds = [(u'Modoros Blog', u'http://modoros.blog.hu/rss')]
    use_embedded_content = False

    # (pattern, replacement) pairs applied to the downloaded HTML.
    preprocess_regexps = [
        # Drop everything from the "megosztas" marker comment up to </body>.
        (re.compile(r'<!--megosztas -->.*?</body>', re.DOTALL | re.IGNORECASE),
         lambda match: '</body>'),
        # Strip the explicit left alignment from paragraphs.
        (re.compile(r'<p align="left"'), lambda m: '<p'),
        # Remove <noscript> sections.
        (re.compile(r'<noscript.+?noscript>', re.DOTALL | re.IGNORECASE),
         lambda m: ''),
        # Remove the absolutely positioned image placed at top:-10px.
        (re.compile(r'<img style="position: absolute;top:-10px.+?>',
                    re.DOTALL | re.IGNORECASE), lambda m: ''),
        # Remove paragraphs that contain only spaces / &nbsp; entities.
        (re.compile(r'<p>( |&nbsp;)*?</p>', re.DOTALL | re.IGNORECASE),
         lambda match: ''),
    ]

    extra_css = '''
body { background-color: white; color: black }
'''

    remove_tags = [
        dict(name='div', attrs={'id': ['csucs']}),
        dict(name='img', attrs={
            'style': ['position: absolute;top:-10px;left:-10px;']}),
        dict(name='div', attrs={'class': ['tovabb-is-van',
                                          'page-break',
                                          'clear']}),
        dict(name='span', attrs={'class': ['hozzaszolas-szamlalo']}),
    ]

    masthead_url = 'http://modoros.blog.hu/media/skins/modoros-neon/img/modorosblog-felirat.png'

    def get_cover_url(self):
        """Return the cover image URL (the same image as ``masthead_url``)."""
        return self.masthead_url

    # As seen here:
    # http://www.mobileread.com/forums/showpost.php?p=1295505&postcount=10
    def parse_feeds(self):
        """Parse the feeds and drop articles recorded by the previous run.

        Each article is fingerprinted as the MD5 hex digest of its content
        and summary, prefixed with ``<url>:`` when the article has a URL.
        Fingerprints are stored one per line, UTF-8 encoded, in
        ``<config_dir>/recipes/recipe_storage/<title>/<quoted feed title>``.
        The file is rewritten on every run with the fingerprints of the
        articles currently in the feed.

        Returns:
            The list produced by ``BasicNewsRecipe.parse_feeds`` with the
            already recorded articles removed and, when
            ``remove_empty_feeds`` is true, without feeds of length zero.
        """
        recipe_dir = os.path.join(config_dir, 'recipes')
        hash_dir = os.path.join(recipe_dir, 'recipe_storage')
        # Keep the title as text: encoding it first makes ``replace`` raise
        # TypeError on Python 3 (bytes method called with str arguments).
        feed_dir = os.path.join(hash_dir, self.title.replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)

        feeds = BasicNewsRecipe.parse_feeds(self)

        for feed in feeds:
            # Percent-quote the feed title so it is usable as a file name.
            feed_hash = quote(feed.title.encode('utf-8'), safe='')
            feed_fn = os.path.join(feed_dir, feed_hash)

            # Fingerprints written by the previous run. The file is handled
            # as UTF-8 bytes so the result does not depend on the locale;
            # undecodable legacy lines simply never match.
            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn, 'rb') as f:
                    for line in f:
                        past_items.add(line.decode('utf-8', 'replace').strip())

            cur_items = set()
            # Iterate over a copy because articles are removed from the list.
            for article in feed.articles[:]:
                digest = md5()
                if article.content:
                    digest.update(article.content.encode('utf-8'))
                if article.summary:
                    digest.update(article.summary.encode('utf-8'))
                item_hash = digest.hexdigest()
                if article.url:
                    item_hash = article.url + ':' + item_hash
                cur_items.add(item_hash)
                if item_hash in past_items:
                    feed.articles.remove(article)

            with open(feed_fn, 'wb') as f:
                for h in cur_items:
                    if not isinstance(h, bytes):
                        h = h.encode('utf-8')
                    f.write(h + b'\n')

        if self.remove_empty_feeds:
            # Collect first, then remove, to avoid mutating while iterating.
            for empty in [fl for fl in feeds if len(fl) == 0]:
                feeds.remove(empty)
        return feeds