mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
124 lines
4.5 KiB
Plaintext
124 lines
4.5 KiB
Plaintext
import re
|
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
|
from calibre.constants import config_dir, CONFIG_DIR_MODE
|
|
import os
|
|
import os.path
|
|
try:
|
|
from urllib.parse import quote
|
|
except ImportError:
|
|
from urllib import quote
|
|
from hashlib import md5
|
|
|
|
|
|
class OfficeSpaceBlogHu(BasicNewsRecipe):
    """Recipe for the Hungarian blog at officespace.blog.hu.

    Besides the usual clean-up of the downloaded pages, this recipe remembers
    which articles it has already delivered: ``parse_feeds`` stores one
    fingerprint per article in a file under the calibre configuration
    directory and drops every article whose fingerprint was recorded by the
    previous run.
    """

    __author__ = 'Zsolt Botykai'
    title = u'Irodai patkényok'
    description = u"officespace.blog.hu"
    language = 'hu'

    # Fetch the complete feed; already-seen articles are filtered out in
    # parse_feeds() rather than by age or count.
    oldest_article = 10000
    max_articles_per_feed = 10000
    reverse_article_order = True

    remove_javascript = True
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False

    feeds = [(u'Office Space', u'http://officespace.blog.hu/rss')]

    masthead_url = 'http://m.blog.hu/of/officespace/ipfejlec7.jpg'

    keep_only_tags = [
        dict(name='div', attrs={'id': ['mainWrapper']})
    ]

    # The substitutions are applied in order:
    # 1.: I like justified lines more
    # 2.: remove empty paragraphs
    # 3.: drop header and sidebar
    # 4.: drop comments counter
    # 5.: drop everything after article-tags
    # 6-8.: drop audit images
    preprocess_regexps = [
        (re.compile(r'<p align="left"'), lambda m: '<p'),
        # A paragraph counts as empty when it holds nothing but spaces,
        # written as the &nbsp; entity, a literal no-break space or a
        # plain space.
        (re.compile(r'<p>(?:&nbsp;|\xa0| )*?</p>', re.DOTALL | re.IGNORECASE),
         lambda match: ''),
        (re.compile(r'<body[^>]+>.*?<div id="mainIn"',
                    re.DOTALL | re.IGNORECASE),
         lambda match: '<body><div id="mainIn"'),
        (re.compile(r'<h3 class="comments">.*?</h3>',
                    re.DOTALL | re.IGNORECASE),
         lambda match: ''),
        # The match swallows the closing </body>, so put a closing tag back
        # (not an opening one) to keep the document well formed.
        (re.compile(r'<div class="related">.*?</body>',
                    re.DOTALL | re.IGNORECASE),
         lambda match: '</body>'),
        (re.compile(r'<img style="position: absolute;" src="[^"]+pixel\?uc.*?>',
                    re.DOTALL | re.IGNORECASE),
         lambda match: ''),
        (re.compile(r'<noscript.+?noscript>', re.DOTALL | re.IGNORECASE),
         lambda m: ''),
        (re.compile(r'<img style="position: absolute;top:-10px.+?>',
                    re.DOTALL | re.IGNORECASE),
         lambda m: ''),
    ]

    extra_css = '''
        body { background-color: white; color: black }
    '''

    def get_cover_url(self):
        """Return the cover image URL (the blog's masthead image)."""
        return self.masthead_url

    def preprocess_html(self, soup):
        """Replace hyperlinks with their plain text and return ``soup``.

        Links inside ``<h3 class="tags">`` are replaced by their text
        followed by a comma; every other link is replaced by its text
        alone. Links whose ``string`` is ``None`` are left untouched.
        """
        for tag_heading in soup.findAll('h3', attrs={'class': 'tags'}):
            for tag_link in tag_heading.findAll('a'):
                if tag_link.string is not None:
                    tag_link.replaceWith(tag_link.string + ',')

        for link in soup.findAll('a'):
            if link.string is not None:
                link.replaceWith(link.string)

        return soup

    @staticmethod
    def _article_fingerprint(article):
        """Return the string that identifies ``article`` between runs.

        It is the MD5 hex digest of the article's content and summary
        (whichever are non-empty), prefixed with ``"<url>:"`` when the
        article has a URL.
        """
        digest = md5()
        if article.content:
            digest.update(article.content.encode('utf-8'))
        if article.summary:
            digest.update(article.summary.encode('utf-8'))
        fingerprint = digest.hexdigest()
        if article.url:
            fingerprint = article.url + ':' + fingerprint
        return fingerprint

    # As seen here:
    # http://www.mobileread.com/forums/showpost.php?p=1295505&postcount=10
    def parse_feeds(self):
        """Parse the feeds and drop articles recorded by the previous run.

        For each feed a file named after the URL-quoted feed title is kept in
        ``<config_dir>/recipes/recipe_storage/<recipe title>/``. It lists the
        fingerprints of the articles present in the feed the last time the
        recipe ran. Articles whose fingerprint is in that file are removed,
        then the file is rewritten with the fingerprints of the current feed.
        Feeds left empty are removed when ``remove_empty_feeds`` is set.

        Returns the (possibly shortened) list of feeds produced by
        ``BasicNewsRecipe.parse_feeds``.
        """
        storage_dir = os.path.join(config_dir, 'recipes', 'recipe_storage')
        # Replace on the text title: calling .replace('/', ':') on the
        # UTF-8 encoded bytes raises TypeError on Python 3.
        feed_dir = os.path.join(storage_dir, self.title.replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)

        feeds = BasicNewsRecipe.parse_feeds(self)

        for feed in feeds:
            feed_fn = os.path.join(
                feed_dir, quote(feed.title.encode('utf-8'), safe=''))

            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    past_items = {line.strip() for line in f}

            cur_items = set()
            unseen = []
            for article in feed.articles:
                fingerprint = self._article_fingerprint(article)
                cur_items.add(fingerprint)
                if fingerprint not in past_items:
                    unseen.append(article)
            # Update the list in place so other references to it see the
            # filtered result too.
            feed.articles[:] = unseen

            # Sorted only to make the file's contents deterministic.
            with open(feed_fn, 'w') as f:
                for fingerprint in sorted(cur_items):
                    f.write(fingerprint + '\n')

        if self.remove_empty_feeds:
            feeds[:] = [fd for fd in feeds if len(fd) > 0]

        return feeds