# calibre/recipes/office_space.recipe

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.constants import config_dir, CONFIG_DIR_MODE
import os
import os.path
try:
    from urllib.parse import quote
except ImportError:
    from urllib import quote
from hashlib import md5


class OfficeSpaceBlogHu(BasicNewsRecipe):
    """Recipe for the Hungarian blog at officespace.blog.hu.

    Besides the usual HTML clean-up, this recipe remembers which articles
    it has already delivered (see ``parse_feeds``) so that repeated runs
    only contain entries that are new or whose content has changed.
    """

    __author__ = 'Zsolt Botykai'
    # NOTE(review): possibly a misspelling of "patkányok"; left untouched
    # because the title also names the on-disk history directory used by
    # parse_feeds().
    title = u'Irodai patkényok'
    description = u"officespace.blog.hu"
    oldest_article = 10000
    max_articles_per_feed = 10000
    reverse_article_order = True
    language = 'hu'
    remove_javascript = True
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    feeds = [(u'Office Space', u'http://officespace.blog.hu/rss')]

    masthead_url = 'http://m.blog.hu/of/officespace/ipfejlec7.jpg'

    keep_only_tags = [
        dict(name='div', attrs={'id': ['mainWrapper']})
    ]

    preprocess_regexps = [
        # 1: strip the explicit left alignment so paragraphs can be justified
        (re.compile(r'<p align="left"'), lambda m: '<p'),
        # 2: remove empty paragraphs
        (re.compile(r'<p>( |&nbsp;)*?</p>', re.DOTALL | re.IGNORECASE), lambda match: ''),
        # 3: drop header and sidebar (everything between <body> and #mainIn)
        (re.compile(r'<body[^>]+>.*?<div id="mainIn"', re.DOTALL |
                    re.IGNORECASE), lambda match: '<body><div id="mainIn"'),
        # 4: drop the comments counter
        (re.compile(r'<h3 class="comments">.*?</h3>',
                    re.DOTALL | re.IGNORECASE), lambda match: ''),
        # 5: drop everything after the article tags; the match swallows the
        #    closing body tag, so it has to be put back
        (re.compile(r'<div class="related">.*?</body>',
                    re.DOTALL | re.IGNORECASE), lambda match: '</body>'),
        # 6-8: drop audit images / tracking pixels
        (re.compile(
            r'<img style="position: absolute;" src="[^"]+pixel\?uc.*?>', re.DOTALL | re.IGNORECASE), lambda match: ''),
        (re.compile(r'<noscript.+?noscript>', re.DOTALL | re.IGNORECASE), lambda m: ''),
        (re.compile(r'<img style="position: absolute;top:-10px.+?>',
                    re.DOTALL | re.IGNORECASE), lambda m: ''),
    ]
    extra_css = '''
                    body { background-color: white; color: black }
                '''

    def get_cover_url(self):
        """Return the URL of the image used as the cover."""
        return 'http://m.blog.hu/of/officespace/ipfejlec7.jpg'

    def preprocess_html(self, soup):
        """Replace hyperlinks with their plain text.

        Links inside ``<h3 class="tags">`` additionally get a trailing comma
        so the tags stay separated once the anchors are gone. Anchors without
        a single string child (``.string`` is ``None``) are left in place.

        :param soup: parsed article document
        :return: the same ``soup`` object, modified in place
        """
        for tag_header in soup.findAll('h3', attrs={'class': 'tags'}):
            for tag_link in tag_header.findAll('a'):
                if tag_link.string is not None:
                    tag_link.replaceWith(tag_link.string + ',')

        for anchor in soup.findAll('a'):
            if anchor.string is not None:
                anchor.replaceWith(anchor.string)

        return soup

    @staticmethod
    def _article_fingerprint(article):
        """Build the identity string for *article*: ``[url:]md5(content+summary)``."""
        digest = md5()
        if article.content:
            digest.update(article.content.encode('utf-8'))
        if article.summary:
            digest.update(article.summary.encode('utf-8'))
        fingerprint = digest.hexdigest()
        if article.url:
            fingerprint = article.url + ':' + fingerprint
        return fingerprint

    # As seen here:
    # http://www.mobileread.com/forums/showpost.php?p=1295505&postcount=10
    def parse_feeds(self):
        """Parse the feeds and drop articles already seen on a previous run.

        For every feed a file is kept under
        ``<config_dir>/recipes/recipe_storage/<recipe title>/<quoted feed title>``
        holding one fingerprint per line for each article present in the feed
        at the last run. Articles whose fingerprint is found in that file are
        removed from the feed; the file is then rewritten with the fingerprints
        of the current feed contents.

        :return: the list produced by ``BasicNewsRecipe.parse_feeds``, minus
                 feeds left empty when ``remove_empty_feeds`` is set
        """
        storage_root = os.path.join(config_dir, 'recipes', 'recipe_storage')
        # Replace on the text title: calling bytes.replace() with str
        # arguments raises TypeError on Python 3, and a bytes component
        # cannot be joined to a str path either.
        feed_dir = os.path.join(storage_root, self.title.replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)

        feeds = BasicNewsRecipe.parse_feeds(self)

        for feed in feeds:
            # Percent-encode everything (safe='') so the feed title is a
            # valid single file name.
            feed_fn = os.path.join(
                feed_dir, quote(feed.title.encode('utf-8'), safe=''))

            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    past_items = {line.strip() for line in f}

            cur_items = set()
            # Iterate over a copy because articles are removed while looping.
            for article in feed.articles[:]:
                fingerprint = self._article_fingerprint(article)
                cur_items.add(fingerprint)
                if fingerprint in past_items:
                    feed.articles.remove(article)

            with open(feed_fn, 'w') as f:
                f.writelines(h + '\n' for h in cur_items)

        if self.remove_empty_feeds:
            for empty in [fl for fl in feeds if len(fl) == 0]:
                feeds.remove(empty)

        return feeds