Office Space and Modoros by Zsolt Botykai

2025-07-09 03:04:10 -04:00 · 2011-03-15 10:50:23 -06:00 · 2011-03-15 10:50:23 -06:00 · ef1f808d5b
commit ef1f808d5b
parent f2e1962902
2 changed files with 198 additions and 0 deletions
--- a/resources/recipes/modoros.recipe
+++ b/resources/recipes/modoros.recipe
@ -0,0 +1,89 @@
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.constants import config_dir, CONFIG_DIR_MODE
+import os, os.path, urllib
+from hashlib import md5
+
+class ModorosBlogHu(BasicNewsRecipe):
+    """Calibre news recipe for the Hungarian blog modoros.blog.hu.
+
+    The age and per-feed count limits are both set to 10000, and
+    ``parse_feeds`` filters out every article whose fingerprint was already
+    recorded by an earlier run of this recipe.
+    """
+
+    __author__ = 'Zsolt Botykai'
+    title = u'Modoros Blog'
+    description = u"Modoros.blog.hu"
+    language = 'hu'
+    feeds = [(u'Modoros Blog', u'http://modoros.blog.hu/rss')]
+    masthead_url = 'http://modoros.blog.hu/media/skins/modoros-neon/img/modorosblog-felirat.png'
+
+    oldest_article = 10000
+    max_articles_per_feed = 10000
+    reverse_article_order = True
+    use_embedded_content = False
+    remove_empty_feeds = True
+    remove_javascript = True
+    no_stylesheets = True
+
+    # Regex substitutions for the downloaded HTML of each page.
+    preprocess_regexps = [
+        # Cut everything from the "megosztas" marker comment up to and
+        # including </body>, then put the closing tag back.
+        (re.compile(r'<!--megosztas -->.*?</body>', re.DOTALL | re.IGNORECASE),
+         lambda found: '</body>'),
+        # Strip the explicit left alignment from paragraphs.
+        (re.compile(r'<p align="left"'),
+         lambda found: '<p'),
+        # Remove <noscript> blocks.
+        (re.compile(r'<noscript.+?noscript>', re.DOTALL | re.IGNORECASE),
+         lambda found: ''),
+        # Remove the images absolutely positioned at top:-10px.
+        (re.compile(r'<img style="position: absolute;top:-10px.+?>', re.DOTALL | re.IGNORECASE),
+         lambda found: ''),
+        # Remove paragraphs that hold nothing but spaces or &nbsp;.
+        (re.compile(r'<p>( |&nbsp;)*?</p>', re.DOTALL | re.IGNORECASE),
+         lambda found: ''),
+    ]
+
+    extra_css = '''
+                    body { background-color: white; color: black }
+                '''
+
+    # Elements removed from every page.
+    remove_tags = [
+        {'name': 'div', 'attrs': {'id': ['csucs']}},
+        {'name': 'img', 'attrs': {'style': ['position: absolute;top:-10px;left:-10px;']}},
+        {'name': 'div', 'attrs': {'class': ['tovabb-is-van', 'page-break', 'clear']}},
+        {'name': 'span', 'attrs': {'class': ['hozzaszolas-szamlalo']}},
+    ]
+
+    def get_cover_url(self):
+        """Return the cover image URL (the same picture as the masthead)."""
+        cover = 'http://modoros.blog.hu/media/skins/modoros-neon/img/modorosblog-felirat.png'
+        return cover
+
+    # As seen here: http://www.mobileread.com/forums/showpost.php?p=1295505&postcount=10
+    def parse_feeds(self):
+        """Parse the feeds and drop the articles seen by a previous run.
+
+        One text file per feed, kept under
+        ``<config_dir>/recipes/recipe_storage/<recipe title>/``, lists the
+        fingerprints of the articles the feed contained last time.  Articles
+        whose fingerprint is listed there are removed from the feed, and the
+        file is then rewritten with the fingerprints of the current articles.
+
+        Returns the list produced by ``BasicNewsRecipe.parse_feeds``, without
+        the feeds left empty when ``remove_empty_feeds`` is set.
+        """
+        storage_root = os.path.join(config_dir, 'recipes', 'recipe_storage')
+        # A '/' in the title would otherwise create nested directories.
+        safe_title = self.title.encode('utf-8').replace('/', ':')
+        seen_dir = os.path.join(storage_root, safe_title)
+        if not os.path.isdir(seen_dir):
+            os.makedirs(seen_dir, mode=CONFIG_DIR_MODE)
+
+        parsed = BasicNewsRecipe.parse_feeds(self)
+
+        for feed in parsed:
+            # Percent-encode the whole feed title so it is a valid file name.
+            seen_name = urllib.quote(feed.title.encode('utf-8'), safe='')
+            seen_path = os.path.join(seen_dir, seen_name)
+
+            known = set()
+            if os.path.exists(seen_path):
+                with open(seen_path) as stored:
+                    known = set(line.strip() for line in stored)
+
+            current = set()
+            # Walk a copy because entries are removed from the list itself.
+            for article in list(feed.articles):
+                digest = md5()
+                for text in (article.content, article.summary):
+                    if text:
+                        digest.update(text.encode('utf-8'))
+                fingerprint = digest.hexdigest()
+                if article.url:
+                    fingerprint = article.url + ':' + fingerprint
+                current.add(fingerprint)
+                if fingerprint in known:
+                    feed.articles.remove(article)
+
+            with open(seen_path, 'w') as stored:
+                stored.writelines(entry + '\n' for entry in current)
+
+        if self.remove_empty_feeds:
+            for empty in [fd for fd in parsed if len(fd) == 0]:
+                parsed.remove(empty)
+
+        return parsed
+
--- a/resources/recipes/office_space.recipe
+++ b/resources/recipes/office_space.recipe
@ -0,0 +1,109 @@
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.constants import config_dir, CONFIG_DIR_MODE
+import os, os.path, urllib
+from hashlib import md5
+
+class OfficeSpaceBlogHu(BasicNewsRecipe):
+    """Calibre news recipe for the Hungarian blog officespace.blog.hu.
+
+    Only the ``mainWrapper`` div of each page is kept, links holding a single
+    text node are flattened to plain text, and ``parse_feeds`` drops every
+    article whose fingerprint was already recorded by an earlier run of this
+    recipe.
+    """
+
+    __author__              = 'Zsolt Botykai'
+    # The title also names this recipe's storage directory (see parse_feeds).
+    title                   = u'Irodai patkányok'
+    description             = u"officespace.blog.hu"
+    oldest_article          = 10000
+    max_articles_per_feed   = 10000
+    reverse_article_order   = True
+    language                = 'hu'
+    remove_javascript       = True
+    remove_empty_feeds      = True
+    no_stylesheets          = True
+    use_embedded_content    = False
+    feeds                   = [(u'Office Space', u'http://officespace.blog.hu/rss')]
+
+    masthead_url = 'http://m.blog.hu/of/officespace/ipfejlec7.jpg'
+
+    keep_only_tags = [
+        dict(name='div', attrs={'id': ['mainWrapper']}),
+    ]
+
+    # Regex substitutions for the downloaded HTML of each page.
+    preprocess_regexps = [
+        # Drop the explicit left alignment (the author prefers justified lines).
+        (re.compile(r'<p align="left"'),
+         lambda match: '<p'),
+        # Remove paragraphs that hold nothing but spaces or &nbsp;.
+        (re.compile(r'<p>( |&nbsp;)*?</p>', re.DOTALL | re.IGNORECASE),
+         lambda match: ''),
+        # Drop the header and sidebar: everything between <body ...> and the
+        # "mainIn" div.
+        (re.compile(r'<body[^>]+>.*?<div id="mainIn"', re.DOTALL | re.IGNORECASE),
+         lambda match: '<body><div id="mainIn"'),
+        # Drop the comments counter.
+        (re.compile(r'<h3 class="comments">.*?</h3>', re.DOTALL | re.IGNORECASE),
+         lambda match: ''),
+        # Drop everything after the article tags.  The match consumes
+        # </body>, so the closing tag (not an opening one) is put back.
+        (re.compile(r'<div class="related">.*?</body>', re.DOTALL | re.IGNORECASE),
+         lambda match: '</body>'),
+        # The last three entries drop audit images and their <noscript>
+        # fallbacks.
+        (re.compile(r'<img style="position: absolute;" src="[^"]+pixel\?uc.*?>', re.DOTALL | re.IGNORECASE),
+         lambda match: ''),
+        (re.compile(r'<noscript.+?noscript>', re.DOTALL | re.IGNORECASE),
+         lambda match: ''),
+        (re.compile(r'<img style="position: absolute;top:-10px.+?>', re.DOTALL | re.IGNORECASE),
+         lambda match: ''),
+    ]
+
+    extra_css = '''
+                    body { background-color: white; color: black }
+                '''
+
+    def get_cover_url(self):
+        """Return the cover image URL (the same picture as the masthead)."""
+        return 'http://m.blog.hu/of/officespace/ipfejlec7.jpg'
+
+    def preprocess_html(self, soup):
+        """Replace links by their text and return the modified soup.
+
+        Links inside ``<h3 class="tags">`` get a trailing comma so the tags
+        stay separated once the markup is gone.  Links whose ``string`` is
+        ``None`` are left untouched.
+        """
+        for tag_header in soup.findAll('h3', attrs={'class': 'tags'}):
+            for tag_link in tag_header.findAll('a'):
+                if tag_link.string is not None:
+                    tag_link.replaceWith(tag_link.string + ',')
+
+        for link in soup.findAll('a'):
+            if link.string is not None:
+                link.replaceWith(link.string)
+
+        return soup
+
+    # As seen here: http://www.mobileread.com/forums/showpost.php?p=1295505&postcount=10
+    def parse_feeds(self):
+        """Parse the feeds and drop the articles seen by a previous run.
+
+        One text file per feed, kept under
+        ``<config_dir>/recipes/recipe_storage/<recipe title>/``, lists the
+        fingerprints of the articles the feed contained last time.  Articles
+        whose fingerprint is listed there are removed from the feed, and the
+        file is then rewritten with the fingerprints of the current articles.
+
+        Returns the list produced by ``BasicNewsRecipe.parse_feeds``, without
+        the feeds left empty when ``remove_empty_feeds`` is set.
+        """
+        recipe_dir = os.path.join(config_dir, 'recipes')
+        hash_dir = os.path.join(recipe_dir, 'recipe_storage')
+        # A '/' in the title would otherwise create nested directories.
+        feed_dir = os.path.join(hash_dir, self.title.encode('utf-8').replace('/', ':'))
+        if not os.path.isdir(feed_dir):
+            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)
+
+        feeds = BasicNewsRecipe.parse_feeds(self)
+
+        for feed in feeds:
+            # Percent-encode the whole feed title so it is a valid file name.
+            feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='')
+            feed_fn = os.path.join(feed_dir, feed_hash)
+
+            past_items = set()
+            if os.path.exists(feed_fn):
+                with open(feed_fn) as f:
+                    past_items = set(line.strip() for line in f)
+
+            cur_items = set()
+            # Walk a copy because entries are removed from the list itself.
+            for article in feed.articles[:]:
+                digest = md5()
+                for text in (article.content, article.summary):
+                    if text:
+                        digest.update(text.encode('utf-8'))
+                item_hash = digest.hexdigest()
+                if article.url:
+                    item_hash = article.url + ':' + item_hash
+                cur_items.add(item_hash)
+                if item_hash in past_items:
+                    feed.articles.remove(article)
+
+            with open(feed_fn, 'w') as f:
+                f.writelines(h + '\n' for h in cur_items)
+
+        if self.remove_empty_feeds:
+            for empty in [fd for fd in feeds if len(fd) == 0]:
+                feeds.remove(empty)
+
+        return feeds
+