From 7fdce6b2fd321b58d8c9f7aef5b5e38c64888d3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Sat, 23 Mar 2013 13:15:06 +0100 Subject: [PATCH] rewritten interia recipes --- recipes/interia_fakty.recipe | 47 +++++++++++++++++------ recipes/interia_sport.recipe | 74 ++++++++++++++++-------------------- 2 files changed, 67 insertions(+), 54 deletions(-) diff --git a/recipes/interia_fakty.recipe b/recipes/interia_fakty.recipe index 74cf56b267..baedd35d0c 100644 --- a/recipes/interia_fakty.recipe +++ b/recipes/interia_fakty.recipe @@ -1,7 +1,7 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = u'2010, Tomasz Dlugosz ' +__copyright__ = u'2010-2013, Tomasz Dlugosz ' ''' fakty.interia.pl ''' @@ -12,12 +12,13 @@ class InteriaFakty(BasicNewsRecipe): title = u'Interia.pl - Fakty' description = u'Fakty ze strony interia.pl' language = 'pl' - oldest_article = 7 + oldest_article = 1 __author__ = u'Tomasz D\u0142ugosz' - simultaneous_downloads = 2 no_stylesheets = True remove_javascript = True - max_articles_per_feed = 100 + remove_empty_feeds= True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} feeds = [(u'Kraj', u'http://kanaly.rss.interia.pl/kraj.xml'), (u'\u015awiat', u'http://kanaly.rss.interia.pl/swiat.xml'), @@ -26,14 +27,36 @@ class InteriaFakty(BasicNewsRecipe): (u'Wywiady', u'http://kanaly.rss.interia.pl/wywiady.xml'), (u'Ciekawostki', u'http://kanaly.rss.interia.pl/ciekawostki.xml')] - keep_only_tags = [dict(name='div', attrs={'id':'article'})] + keep_only_tags = [ + dict(name='h1'), + dict(name='div', attrs={'class': ['lead textContent', 'text textContent', 'source']})] - remove_tags = [ - dict(name='div', attrs={'class':'box fontSizeSwitch'}), - dict(name='div', attrs={'class':'clear'}), - dict(name='div', attrs={'class':'embed embedLeft articleEmbedArticleList articleEmbedArticleListTitle'}), - dict(name='span', attrs={'class':'keywords'})] + remove_tags = [dict(name='div', attrs={'class':['embed embedAd', 'REMOVE', 'boxHeader']})] + + preprocess_regexps = [ + (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in + [ + (r'embed embed(Left|Right|Center) articleEmbed(Audio|Wideo articleEmbedVideo|ArticleFull|ArticleTitle|ArticleListTitle|AlbumHorizontal)">', lambda match: 'REMOVE">'), + (r'
', lambda match: ''), + (r'

' +__copyright__ = u'2010-2013, Tomasz Dlugosz ' ''' sport.interia.pl ''' @@ -13,61 +13,51 @@ class InteriaSport(BasicNewsRecipe): title = u'Interia.pl - Sport' description = u'Sport ze strony interia.pl' language = 'pl' - oldest_article = 7 + oldest_article = 1 __author__ = u'Tomasz D\u0142ugosz' - simultaneous_downloads = 3 no_stylesheets = True remove_javascript = True - max_articles_per_feed = 100 + remove_empty_feeds= True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} feeds = [(u'Wydarzenia sportowe', u'http://kanaly.rss.interia.pl/sport.xml'), (u'Pi\u0142ka no\u017cna', u'http://kanaly.rss.interia.pl/pilka_nozna.xml'), - (u'Siatk\xf3wka', u'http://kanaly.rss.interia.pl/siatkowka.xml'), (u'Koszyk\xf3wka', u'http://kanaly.rss.interia.pl/koszykowka.xml'), - (u'NBA', u'http://kanaly.rss.interia.pl/nba.xml'), - (u'Kolarstwo', u'http://kanaly.rss.interia.pl/kolarstwo.xml'), - (u'\u017bu\u017cel', u'http://kanaly.rss.interia.pl/zuzel.xml'), (u'Tenis', u'http://kanaly.rss.interia.pl/tenis.xml')] - keep_only_tags = [dict(name='div', attrs={'id':'article'})] + keep_only_tags = [ + dict(name='h1'), + dict(name='div', attrs={'class': ['lead textContent', 'text textContent', 'source']})] - remove_tags = [dict(name='div', attrs={'class':'object gallery'}), - dict(name='div', attrs={'class':'box fontSizeSwitch'})] - - extra_css = ''' - .articleDate { - font-size: 0.5em; - color: black; - } - - .articleFoto { - display: block; - font-family: sans; - font-size: 0.5em; - text-indent: 0 - color: black; - } - - .articleText { - display: block; - margin-bottom: 1em; - margin-left: 0; - margin-right: 0; - margin-top: 1em - color: black; - } - - .articleLead { - font-size: 1.2em; - } - ''' + remove_tags = [dict(name='div', attrs={'class':['embed embedAd', 'REMOVE', 'boxHeader']})] preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ (r'

', lambda match: ''), - # FIXME - #(r'(
)(.*?)()(.*?)()', lambda match: '\1\2\4'), - (r'

()?(ZOBACZ|CZYTAJ) T.*?

', lambda match: '
') + (r'

()?(ZOBACZ|CZYTAJ) T.*?', lambda match: ''), + (r'embed embed(Left|Right|Center) articleEmbed(Audio|Wideo articleEmbedVideo|ArticleFull|ArticleTitle|ArticleListTitle|AlbumHorizontal)">', lambda match: 'REMOVE">'), + (r'

', lambda match: ''), + (r'