diff --git a/recipes/android_com_pl.recipe b/recipes/android_com_pl.recipe index beefd9f28d..39eeb39582 100644 --- a/recipes/android_com_pl.recipe +++ b/recipes/android_com_pl.recipe @@ -15,5 +15,5 @@ class Android_com_pl(BasicNewsRecipe): remove_tags_after = [{'class': 'post-content'}] remove_tags = [dict(name='ul', attrs={'class': 'tags small-tags'}), dict(name='a', attrs={'onclick': 'return ss_plugin_loadpopup_js(this);'})] preprocess_regexps = [ - (re.compile(ur'

.{,1}

', re.DOTALL), lambda match: '')] + (re.compile(u'

.{,1}

', re.DOTALL), lambda match: '')] feeds = [(u'Android', u'http://android.com.pl/feed/')] diff --git a/recipes/appledaily_tw.recipe b/recipes/appledaily_tw.recipe index 4a2949a3ec..14e5be20a0 100644 --- a/recipes/appledaily_tw.recipe +++ b/recipes/appledaily_tw.recipe @@ -104,7 +104,7 @@ class AppledailyTW(BasicNewsRecipe): ] def preprocess_raw_html(self, raw_html, url): - raw_html = re.sub(ur'
.*?<\/a>'), '', raw_html) raw_html = re.sub( - ur'(.*?)[\s]+\|.*<\/title>', '<title>\1<\/title>', raw_html) + unicode(r'<title>(.*?)[\s]+\|.*<\/title>'), r'<title>\1</title>', raw_html) return raw_html diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe index 1501c6f5d2..6433742978 100644 --- a/recipes/benchmark_pl.recipe +++ b/recipes/benchmark_pl.recipe @@ -16,8 +16,8 @@ class BenchmarkPl(BasicNewsRecipe): extra_css = 'ul {list-style-type: none;}' no_stylesheets = True use_embedded_content = False - preprocess_regexps = [(re.compile(ur'<h3><span style="font-size: small;"> Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', # noqa - re.DOTALL | re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'Więcej o .*?</ul>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa + preprocess_regexps = [(re.compile(u'<h3><span style="font-size: small;"> Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', # noqa + re.DOTALL | re.IGNORECASE), lambda match: '</body>'), (re.compile(u'Więcej o .*?</ul>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa keep_only_tags = [dict(id=['articleHeader', 'articleGallery']), dict( name='div', attrs={'class': ['m_zwykly', 'gallery']}), dict(id='article')] diff --git a/recipes/ciekawostki_historyczne.recipe b/recipes/ciekawostki_historyczne.recipe index a62b383833..7df7b61e9d 100644 --- a/recipes/ciekawostki_historyczne.recipe +++ b/recipes/ciekawostki_historyczne.recipe @@ -14,8 +14,8 @@ class Ciekawostki_Historyczne(BasicNewsRecipe): max_articles_per_feed = 100 extra_css = 'img.alignleft {float:left; margin-right:5px;} .alignright {float:right; margin-left:5px;}' oldest_article = 12 - preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?</fb:like>', re.DOTALL), - lambda match: ''), (re.compile(ur'<h2>Zobacz też:</h2>.*?</ol>', re.DOTALL), lambda match: '')] + 
preprocess_regexps = [(re.compile(u'Ten artykuł ma kilka stron.*?</fb:like>', re.DOTALL), + lambda match: ''), (re.compile(u'<h2>Zobacz też:</h2>.*?</ol>', re.DOTALL), lambda match: '')] no_stylesheets = True remove_empty_feeds = True keep_only_tags = [dict(name='div', attrs={'class': 'post'})] diff --git a/recipes/cnetjapan.recipe b/recipes/cnetjapan.recipe index c5e2b78abc..fa4494fc99 100644 --- a/recipes/cnetjapan.recipe +++ b/recipes/cnetjapan.recipe @@ -16,11 +16,11 @@ class CNetJapan(BasicNewsRecipe): remove_javascript = True preprocess_regexps = [ - (re.compile(ur'<!--\u25B2contents_left END\u25B2-->.*</body>', re.DOTALL | re.IGNORECASE | re.UNICODE), + (re.compile(unicode(r'<!--\u25B2contents_left END\u25B2-->.*</body>'), re.DOTALL | re.IGNORECASE | re.UNICODE), lambda match: '</body>'), (re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL | re.IGNORECASE), lambda match: '</body>'), - (re.compile(ur'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->', re.UNICODE), + (re.compile(unicode(r'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->'), re.UNICODE), lambda match: '<!-- removed -->'), ] diff --git a/recipes/cnetjapan_digital.recipe b/recipes/cnetjapan_digital.recipe index 485175ec1f..cb16741a5d 100644 --- a/recipes/cnetjapan_digital.recipe +++ b/recipes/cnetjapan_digital.recipe @@ -14,11 +14,11 @@ class CNetJapanDigital(BasicNewsRecipe): remove_javascript = True preprocess_regexps = [ - (re.compile(ur'<!--\u25B2contents_left END\u25B2-->.*</body>', re.DOTALL | re.IGNORECASE | re.UNICODE), + (re.compile(unicode(r'<!--\u25B2contents_left END\u25B2-->.*</body>'), re.DOTALL | re.IGNORECASE | re.UNICODE), lambda match: '</body>'), (re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL | re.IGNORECASE), lambda match: '</body>'), - (re.compile(ur'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->', re.UNICODE), + (re.compile(unicode(r'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- 
\u25B2ZDNet\u25B2 -->'), re.UNICODE), lambda match: '<!-- removed -->'), ] diff --git a/recipes/cnetjapan_release.recipe b/recipes/cnetjapan_release.recipe index 11a67ba814..a21d69e43b 100644 --- a/recipes/cnetjapan_release.recipe +++ b/recipes/cnetjapan_release.recipe @@ -14,11 +14,11 @@ class CNetJapanRelease(BasicNewsRecipe): remove_javascript = True preprocess_regexps = [ - (re.compile(ur'<!--\u25B2contents_left END\u25B2-->.*</body>', re.DOTALL | re.IGNORECASE | re.UNICODE), + (re.compile(unicode(r'<!--\u25B2contents_left END\u25B2-->.*</body>'), re.DOTALL | re.IGNORECASE | re.UNICODE), lambda match: '</body>'), (re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL | re.IGNORECASE), lambda match: '</body>'), - (re.compile(ur'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->', re.UNICODE), + (re.compile(unicode(r'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->'), re.UNICODE), lambda match: '<!-- removed -->'), ] diff --git a/recipes/computerworld_pl.recipe b/recipes/computerworld_pl.recipe index a8eac7a37a..0d7db5fbd8 100644 --- a/recipes/computerworld_pl.recipe +++ b/recipes/computerworld_pl.recipe @@ -16,7 +16,7 @@ class Computerworld_pl(BasicNewsRecipe): max_articles_per_feed = 100 use_embedded_content = False preprocess_regexps = [(re.compile(u'Zobacz również:', re.IGNORECASE), lambda m: ''), - (re.compile(ur'[*]+reklama[*]+', re.IGNORECASE), lambda m: ''), ] + (re.compile(u'[*]+reklama[*]+', re.IGNORECASE), lambda m: ''), ] keep_only_tags = [dict(name='article')] remove_tags = [dict(attrs={'class': ['share_tools nocontent', 'rec']}), dict(name='ul',attrs={'class':'tags'}), diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe index 457dd99be2..878ceb57c3 100644 --- a/recipes/dobreprogamy.recipe +++ b/recipes/dobreprogamy.recipe @@ -19,7 +19,7 @@ class Dobreprogramy_pl(BasicNewsRecipe): max_articles_per_feed = 100 remove_attrs = ['style', 'width', 'height'] preprocess_regexps = 
[(re.compile( - ur'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '')] + unicode(r'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>')), lambda match: '')] keep_only_tags = [dict(name='h1'), dict( attrs={'class': ['entry single']}), dict(id='phContent_divArticle')] remove_tags = [dict(attrs={'class': ['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master', 'social nested-grid grid-margin-px15-top clearfix no-mobile', 'page-info text-h4 font-heading grid-margin-px15-top color-annotation clearfix', 'series grid-margin-px30-top']}), dict(id='komentarze'), dict(id='phContent_ctl02_sBreadcrumb'), dict(name='iframe')] # noqa diff --git a/recipes/dziennik_wschodni.recipe b/recipes/dziennik_wschodni.recipe index 9f48511be7..ad1cc3ef96 100644 --- a/recipes/dziennik_wschodni.recipe +++ b/recipes/dziennik_wschodni.recipe @@ -19,8 +19,8 @@ class DziennikWschodni(BasicNewsRecipe): no_stylesheets = True ignore_duplicate_articles = {'title', 'url'} - preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa - (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa + preprocess_regexps = [(re.compile(u'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(u'Przeczytaj także:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa + (re.compile(u'Przeczytaj również:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'Zobacz też:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa keep_only_tags = [dict(id=['article', 'cover', 'photostory'])] remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 
'articleConnections', diff --git a/recipes/echo_dnia.recipe b/recipes/echo_dnia.recipe index 2c5c0e22b8..7c1805c103 100644 --- a/recipes/echo_dnia.recipe +++ b/recipes/echo_dnia.recipe @@ -20,8 +20,8 @@ class EchoDnia(BasicNewsRecipe): use_embedded_content = False ignore_duplicate_articles = {'title', 'url'} - preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa - (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa + preprocess_regexps = [(re.compile(u'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(u'Przeczytaj także:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa + (re.compile(u'Przeczytaj również:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'Zobacz też:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa keep_only_tags = [dict(id=['article', 'cover', 'photostory'])] remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections', diff --git a/recipes/esenja.recipe b/recipes/esenja.recipe index 0dcf276d89..b4e38374f9 100644 --- a/recipes/esenja.recipe +++ b/recipes/esenja.recipe @@ -45,7 +45,7 @@ class Esensja(BasicNewsRecipe): preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''), (re.compile( - ur'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''), + u'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''), ] def parse_index(self): diff --git a/recipes/esensja_(rss).recipe b/recipes/esensja_(rss).recipe index 7253125dde..d8db4d8738 100644 --- a/recipes/esensja_(rss).recipe +++ b/recipes/esensja_(rss).recipe @@ -23,7 +23,7 @@ class EsensjaRSS(BasicNewsRecipe): ignore_duplicate_articles = {'title', 'url'} preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''), (re.compile( - 
ur'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''), + u'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''), ] remove_attributes = ['style', 'bgcolor', 'alt', 'color'] keep_only_tags = [dict(attrs={'class': 'sekcja'}), ] diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index 4b2e83d1bd..a3ae619068 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -23,7 +23,7 @@ class FilmWebPl(BasicNewsRecipe): 'ul.inline {padding:0px;} .vertical-align {display: inline-block;}') preprocess_regexps = [(re.compile(r'<body.+?</head>', re.DOTALL), lambda match: ''), # fix malformed HTML with 2 body tags... (re.compile(u'(?:<sup>)?\(kliknij\,\ aby powiększyć\)(?:</sup>)?', re.IGNORECASE), lambda m: ''), - (re.compile(ur'(<br ?/?>\s*?<br ?/?>\s*?)+', re.IGNORECASE), lambda m: '<br />') + (re.compile(unicode(r'(<br ?/?>\s*?<br ?/?>\s*?)+'), re.IGNORECASE), lambda m: '<br />') ] remove_tags = [dict(attrs={'class':['infoParent', 'likeBar', 'droptions-box pull-right', 'photoDesc', 'imageLicense', 'play big', 'shadow embed__icon--svg']})] diff --git a/recipes/forbes_pl.recipe b/recipes/forbes_pl.recipe index 41856aedcb..226cfdb574 100644 --- a/recipes/forbes_pl.recipe +++ b/recipes/forbes_pl.recipe @@ -17,8 +17,8 @@ class forbes_pl(BasicNewsRecipe): cover_url = 'http://www.forbes.pl/resources/front/images/logo.png' max_articles_per_feed = 100 extra_css = '.Block-Photo {float:left; max-width: 300px; margin-right: 5px;}' - preprocess_regexps = [(re.compile(ur'<p>(<strong>)?(Czytaj|Zobacz) (też|także):.*?</p>', re.DOTALL), - lambda match: ''), (re.compile(ur'<strong>Zobacz:.*?</strong>', re.DOTALL), lambda match: '')] + preprocess_regexps = [(re.compile(u'<p>(<strong>)?(Czytaj|Zobacz) (też|także):.*?</p>', re.DOTALL), + lambda match: ''), (re.compile(u'<strong>Zobacz:.*?</strong>', re.DOTALL), lambda match: '')] remove_javascript = True no_stylesheets = True now = datetime.datetime.now() diff --git a/recipes/forsal.recipe b/recipes/forsal.recipe index 
e302d121f3..69efcfb203 100644 --- a/recipes/forsal.recipe +++ b/recipes/forsal.recipe @@ -43,7 +43,7 @@ class ForsalPL(BasicNewsRecipe): (u'Moja firma', u'http://forsal.pl/atom/tagi/moja_firma')] def print_version(self, url): - url_id = re.search(ur'/[0-9]+,', url) + url_id = re.search(u'/[0-9]+,', url) if url_id: return 'http://forsal.pl/drukowanie' + url_id.group(0)[:-1] else: diff --git a/recipes/gildia_pl.recipe b/recipes/gildia_pl.recipe index 2a9c446645..0cc92013ca 100644 --- a/recipes/gildia_pl.recipe +++ b/recipes/gildia_pl.recipe @@ -16,7 +16,7 @@ class Gildia(BasicNewsRecipe): no_stylesheets = True use_embedded_content = False ignore_duplicate_articles = {'title', 'url'} - preprocess_regexps = [(re.compile(ur'</?sup>'), lambda match: '')] + preprocess_regexps = [(re.compile(u'</?sup>'), lambda match: '')] ignore_duplicate_articles = {'title', 'url'} remove_tags = [dict(name='div', attrs={'class': [ 'backlink', 'im_img', 'addthis_toolbox addthis_default_style', 'banner-bottom']})] diff --git a/recipes/in4_pl.recipe b/recipes/in4_pl.recipe index 9b48279bae..ab4bffd50c 100644 --- a/recipes/in4_pl.recipe +++ b/recipes/in4_pl.recipe @@ -15,7 +15,7 @@ class in4(BasicNewsRecipe): no_stylesheets = True remove_empty_feeds = True preprocess_regexps = [ - (re.compile(ur'<a title="translate into.*?</a>', re.DOTALL), lambda match: '')] + (re.compile(u'<a title="translate into.*?</a>', re.DOTALL), lambda match: '')] keep_only_tags = [dict(name='div', attrs={'class': 'left_alone'})] remove_tags_after = dict(name='img', attrs={'title': 'komentarze'}) remove_tags = [dict(name='img', attrs={'title': 'komentarze'})] diff --git a/recipes/ksiazka_pl.recipe b/recipes/ksiazka_pl.recipe index 0dba469ebb..ba2105d43e 100644 --- a/recipes/ksiazka_pl.recipe +++ b/recipes/ksiazka_pl.recipe @@ -14,7 +14,7 @@ class Ksiazka_net_pl(BasicNewsRecipe): no_stylesheets = True remove_empty_feeds = True preprocess_regexps = [ - (re.compile(ur'Podoba mi się, kupuję:'), lambda match: '<br />')] + 
(re.compile(u'Podoba mi się, kupuję:'), lambda match: '<br />')] remove_tags_before = dict(name='div', attrs={'class': 'm-body'}) remove_tags_after = dict(name='div', attrs={'class': 'm-body-link'}) remove_tags = [ diff --git a/recipes/natemat_pl.recipe b/recipes/natemat_pl.recipe index 4c0ec1c9c5..0ed257f9e2 100644 --- a/recipes/natemat_pl.recipe +++ b/recipes/natemat_pl.recipe @@ -10,8 +10,8 @@ class NaTemat(BasicNewsRecipe): description = u'informacje, komentarze, opinie' category = 'news' language = 'pl' - preprocess_regexps = [(re.compile(ur'Czytaj też\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(ur'Zobacz też\:.*?</a>', re.IGNORECASE), lambda m: ''), # noqa - (re.compile(ur'Czytaj więcej\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj również\:.*?</a>', re.IGNORECASE), lambda m: '')] # noqa + preprocess_regexps = [(re.compile(u'Czytaj też\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Zobacz też\\:.*?</a>', re.IGNORECASE), lambda m: ''), # noqa + (re.compile(u'Czytaj więcej\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Czytaj również\\:.*?</a>', re.IGNORECASE), lambda m: '')] # noqa cover_url = 'http://blog.plona.pl/wp-content/uploads/2012/05/natemat.png' no_stylesheets = True keep_only_tags = [dict(id='main')] diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe index 120c1132c8..36bf58a8ef 100644 --- a/recipes/tablety_pl.recipe +++ b/recipes/tablety_pl.recipe @@ -14,8 +14,8 @@ class Tablety_pl(BasicNewsRecipe): no_stylesheets = True oldest_article = 8 max_articles_per_feed = 100 - preprocess_regexps = [(re.compile(ur'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), - (re.compile(ur'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')] + preprocess_regexps = [(re.compile(u'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), + (re.compile(u'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), 
lambda match: '')] keep_only_tags = [dict(id='news_block')] remove_tags = [dict(attrs={'class': ['comments_icon', 'wp-polls', 'entry-comments', 'wp-polls-loading', 'ts-fab-wrapper', 'entry-footer', 'social-custom']})] diff --git a/recipes/tanuki.recipe b/recipes/tanuki.recipe index 49c67091d7..fbcad64d20 100644 --- a/recipes/tanuki.recipe +++ b/recipes/tanuki.recipe @@ -12,8 +12,8 @@ class tanuki(BasicNewsRecipe): max_articles_per_feed = 100 encoding = 'utf-8' extra_css = 'ul {list-style: none; padding: 0; margin: 0;} .kadr{float: left;} .dwazdania {float: right;}' - preprocess_regexps = [(re.compile(ur'<h3><a class="screen".*?</h3>', re.DOTALL), lambda match: ''), (re.compile( - ur'<div><a href="/strony/((manga)|(anime))/[0-9]+?/oceny(\-redakcji){0,1}">Zobacz jak ocenili</a></div>', re.DOTALL), lambda match: '')] + preprocess_regexps = [(re.compile(u'<h3><a class="screen".*?</h3>', re.DOTALL), lambda match: ''), (re.compile( + unicode(r'<div><a href="/strony/((manga)|(anime))/[0-9]+?/oceny(\-redakcji){0,1}">Zobacz jak ocenili</a></div>'), re.DOTALL), lambda match: '')] remove_empty_feeds = True no_stylesheets = True keep_only_tags = [dict(attrs={'class': ['animename', 'storyname', 'nextarrow', 'sideinfov', 'sidelinfov', 'sideinfo', 'sidelinfo']}), dict(name='table', attrs={ 'summary': 'Technikalia'}), dict(attrs={'class': ['chaptername', 'copycat']}), dict(id='rightcolumn'), dict(attrs={'class': ['headn_tt', 'subtable']})] # noqa diff --git a/recipes/tawernarpg_pl.recipe b/recipes/tawernarpg_pl.recipe index c0ebe90ccf..0b0b3340ce 100644 --- a/recipes/tawernarpg_pl.recipe +++ b/recipes/tawernarpg_pl.recipe @@ -11,7 +11,7 @@ class TawernaRPG(BasicNewsRecipe): language = 'pl' extra_css = '.slajd {list-style-type: none; padding-left: 0px; margin-left: 0px;} .lewanc {float: left; margin-right: 5px;} .srodek {display: block; margin-left: auto; margin-right: auto;}' # noqa cover_url = 'http://www.tawerna.rpg.pl/img/logo.png' - preprocess_regexps = 
[(re.compile(ur'<h2>Dodaj komentarz</h2>.*</body>', + preprocess_regexps = [(re.compile(u'<h2>Dodaj komentarz</h2>.*</body>', re.DOTALL | re.IGNORECASE), lambda match: '</body>')] use_embedded_content = False oldest_article = 7 diff --git a/recipes/trojmiasto_pl.recipe b/recipes/trojmiasto_pl.recipe index bef9a8672d..2977211191 100644 --- a/recipes/trojmiasto_pl.recipe +++ b/recipes/trojmiasto_pl.recipe @@ -20,8 +20,8 @@ class Trojmiasto(BasicNewsRecipe): remove_attributes = ['style', 'font'] ignore_duplicate_articles = {'title', 'url'} - preprocess_regexps = [(re.compile(ur'<strong>Czytaj więcej.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(ur'<strong>Zobacz też.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa - (re.compile(ur'<b>[A-ZĄĆĘŁŃÓŚŹŻ \-,.:]*?</b>', re.DOTALL), lambda match: ''), ] + preprocess_regexps = [(re.compile(u'<strong>Czytaj więcej.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'<strong>Zobacz też.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa + (re.compile(u'<b>[A-ZĄĆĘŁŃÓŚŹŻ ,.:-]*?</b>', re.DOTALL), lambda match: ''), ] remove_tags = [ dict(id=['logo', 'font_small', 'font_big']), diff --git a/recipes/wnp.recipe b/recipes/wnp.recipe index e30965a700..200b004717 100644 --- a/recipes/wnp.recipe +++ b/recipes/wnp.recipe @@ -9,8 +9,8 @@ class WNP(BasicNewsRecipe): description = u'Wirtualny Nowy Przemysł' category = 'economy' language = 'pl' - preprocess_regexps = [(re.compile(ur'Czytaj też:.*?</a>', re.DOTALL), lambda match: ''), - (re.compile(ur'Czytaj więcej:.*?</a>', re.DOTALL), lambda match: '')] + preprocess_regexps = [(re.compile(u'Czytaj też:.*?</a>', re.DOTALL), lambda match: ''), + (re.compile(u'Czytaj więcej:.*?</a>', re.DOTALL), lambda match: '')] oldest_article = 8 max_articles_per_feed = 100 no_stylesheets = True