import re from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.constants import config_dir, CONFIG_DIR_MODE import os import os.path try: from urllib.parse import quote except ImportError: from urllib import quote from hashlib import md5 class OfficeSpaceBlogHu(BasicNewsRecipe): __author__ = 'Zsolt Botykai' title = u'Office Space Blog' description = u"officespace.blog.hu" oldest_article = 10000 max_articles_per_feed = 10000 reverse_article_order = True language = 'hu' remove_javascript = True remove_empty_feeds = True no_stylesheets = True feeds = [(u'Office Space Blog', u'http://officespace.blog.hu/rss')] remove_javascript = True use_embedded_content = False title = u'Irodai patkényok' feeds = [(u'Office Space', u'http://officespace.blog.hu/rss')] masthead_url = 'http://m.blog.hu/of/officespace/ipfejlec7.jpg' keep_only_tags = [ dict(name='div', attrs={'id': ['mainWrapper']}) ] # 1.: I like justified lines more # 2.: remove empty paragraphs # 3.: drop header and sidebar # 4.: drop comments counter # 5.: drop everything after article-tags # 6-8.: drop audit images preprocess_regexps = [ (re.compile(r'

( | )*?

', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(r']+>.*?
.*?', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(r'