Fix unicode string syntax errors in recipes

This commit is contained in:
Flaviu Tamas 2018-09-04 18:14:34 -04:00 committed by Kovid Goyal
parent e44a10560e
commit c011243859
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
25 changed files with 39 additions and 39 deletions

View File

@ -15,5 +15,5 @@ class Android_com_pl(BasicNewsRecipe):
remove_tags_after = [{'class': 'post-content'}] remove_tags_after = [{'class': 'post-content'}]
remove_tags = [dict(name='ul', attrs={'class': 'tags small-tags'}), dict(name='a', attrs={'onclick': 'return ss_plugin_loadpopup_js(this);'})] remove_tags = [dict(name='ul', attrs={'class': 'tags small-tags'}), dict(name='a', attrs={'onclick': 'return ss_plugin_loadpopup_js(this);'})]
preprocess_regexps = [ preprocess_regexps = [
(re.compile(ur'<p>.{,1}</p>', re.DOTALL), lambda match: '')] (re.compile(u'<p>.{,1}</p>', re.DOTALL), lambda match: '')]
feeds = [(u'Android', u'http://android.com.pl/feed/')] feeds = [(u'Android', u'http://android.com.pl/feed/')]

View File

@ -104,7 +104,7 @@ class AppledailyTW(BasicNewsRecipe):
] ]
def preprocess_raw_html(self, raw_html, url):
    """Clean up the downloaded page markup with two regex substitutions.

    The substitutions run in this order:

    1. Remove every span that starts at ``<a href="``, contains
       ``<br><br>`` and ends at the next ``</a>``.  ``.`` does not match
       newlines here, so such a span has to sit on a single line.
    2. Shorten the ``<title>`` element to the text that precedes the
       first run of whitespace followed by ``|``.

    :param raw_html: page markup as text.
    :param url: not used by this method.
    :return: the markup after both substitutions.
    """
    # The patterns are pure ASCII, so plain raw strings are enough and no
    # unicode() wrapper is needed around them.
    raw_html = re.sub(r'<a href=".*?<br><br>.*?<\/a>', '', raw_html)
    # The replacement is its own (third from last) argument to re.sub, and
    # it is a raw string so that ``\1`` reaches ``re`` as a backreference
    # to the captured title text instead of being read by Python as the
    # control character ``\x01``.  The closing tag is written without a
    # backslash because ``\/`` in a replacement would be emitted literally.
    raw_html = re.sub(
        r'<title>(.*?)[\s]+\|.*<\/title>', r'<title>\1</title>', raw_html)
    return raw_html

View File

@ -16,8 +16,8 @@ class BenchmarkPl(BasicNewsRecipe):
extra_css = 'ul {list-style-type: none;}' extra_css = 'ul {list-style-type: none;}'
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
preprocess_regexps = [(re.compile(ur'<h3><span style="font-size: small;">&nbsp;Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', # noqa preprocess_regexps = [(re.compile(u'<h3><span style="font-size: small;">&nbsp;Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', # noqa
re.DOTALL | re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'Więcej o .*?</ul>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa re.DOTALL | re.IGNORECASE), lambda match: '</body>'), (re.compile(u'Więcej o .*?</ul>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa
keep_only_tags = [dict(id=['articleHeader', 'articleGallery']), dict( keep_only_tags = [dict(id=['articleHeader', 'articleGallery']), dict(
name='div', attrs={'class': ['m_zwykly', 'gallery']}), dict(id='article')] name='div', attrs={'class': ['m_zwykly', 'gallery']}), dict(id='article')]

View File

@ -14,8 +14,8 @@ class Ciekawostki_Historyczne(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
extra_css = 'img.alignleft {float:left; margin-right:5px;} .alignright {float:right; margin-left:5px;}' extra_css = 'img.alignleft {float:left; margin-right:5px;} .alignright {float:right; margin-left:5px;}'
oldest_article = 12 oldest_article = 12
preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?</fb:like>', re.DOTALL), preprocess_regexps = [(re.compile(u'Ten artykuł ma kilka stron.*?</fb:like>', re.DOTALL),
lambda match: ''), (re.compile(ur'<h2>Zobacz też:</h2>.*?</ol>', re.DOTALL), lambda match: '')] lambda match: ''), (re.compile(u'<h2>Zobacz też:</h2>.*?</ol>', re.DOTALL), lambda match: '')]
no_stylesheets = True no_stylesheets = True
remove_empty_feeds = True remove_empty_feeds = True
keep_only_tags = [dict(name='div', attrs={'class': 'post'})] keep_only_tags = [dict(name='div', attrs={'class': 'post'})]

View File

@ -16,11 +16,11 @@ class CNetJapan(BasicNewsRecipe):
remove_javascript = True remove_javascript = True
preprocess_regexps = [ preprocess_regexps = [
(re.compile(ur'<!--\u25B2contents_left END\u25B2-->.*</body>', re.DOTALL | re.IGNORECASE | re.UNICODE), (re.compile(unicode(r'<!--\u25B2contents_left END\u25B2-->.*</body>'), re.DOTALL | re.IGNORECASE | re.UNICODE),
lambda match: '</body>'), lambda match: '</body>'),
(re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL | re.IGNORECASE), (re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL | re.IGNORECASE),
lambda match: '</body>'), lambda match: '</body>'),
(re.compile(ur'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->', re.UNICODE), (re.compile(unicode(r'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->'), re.UNICODE),
lambda match: '<!-- removed -->'), lambda match: '<!-- removed -->'),
] ]

View File

@ -14,11 +14,11 @@ class CNetJapanDigital(BasicNewsRecipe):
remove_javascript = True remove_javascript = True
preprocess_regexps = [ preprocess_regexps = [
(re.compile(ur'<!--\u25B2contents_left END\u25B2-->.*</body>', re.DOTALL | re.IGNORECASE | re.UNICODE), (re.compile(unicode(r'<!--\u25B2contents_left END\u25B2-->.*</body>'), re.DOTALL | re.IGNORECASE | re.UNICODE),
lambda match: '</body>'), lambda match: '</body>'),
(re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL | re.IGNORECASE), (re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL | re.IGNORECASE),
lambda match: '</body>'), lambda match: '</body>'),
(re.compile(ur'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->', re.UNICODE), (re.compile(unicode(r'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->'), re.UNICODE),
lambda match: '<!-- removed -->'), lambda match: '<!-- removed -->'),
] ]

View File

@ -14,11 +14,11 @@ class CNetJapanRelease(BasicNewsRecipe):
remove_javascript = True remove_javascript = True
preprocess_regexps = [ preprocess_regexps = [
(re.compile(ur'<!--\u25B2contents_left END\u25B2-->.*</body>', re.DOTALL | re.IGNORECASE | re.UNICODE), (re.compile(unicode(r'<!--\u25B2contents_left END\u25B2-->.*</body>'), re.DOTALL | re.IGNORECASE | re.UNICODE),
lambda match: '</body>'), lambda match: '</body>'),
(re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL | re.IGNORECASE), (re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL | re.IGNORECASE),
lambda match: '</body>'), lambda match: '</body>'),
(re.compile(ur'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->', re.UNICODE), (re.compile(unicode(r'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->'), re.UNICODE),
lambda match: '<!-- removed -->'), lambda match: '<!-- removed -->'),
] ]

View File

@ -16,7 +16,7 @@ class Computerworld_pl(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
use_embedded_content = False use_embedded_content = False
preprocess_regexps = [(re.compile(u'Zobacz również:', re.IGNORECASE), lambda m: ''), preprocess_regexps = [(re.compile(u'Zobacz również:', re.IGNORECASE), lambda m: ''),
(re.compile(ur'[*]+reklama[*]+', re.IGNORECASE), lambda m: ''), ] (re.compile(u'[*]+reklama[*]+', re.IGNORECASE), lambda m: ''), ]
keep_only_tags = [dict(name='article')] keep_only_tags = [dict(name='article')]
remove_tags = [dict(attrs={'class': ['share_tools nocontent', 'rec']}), remove_tags = [dict(attrs={'class': ['share_tools nocontent', 'rec']}),
dict(name='ul',attrs={'class':'tags'}), dict(name='ul',attrs={'class':'tags'}),

View File

@ -19,7 +19,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_attrs = ['style', 'width', 'height'] remove_attrs = ['style', 'width', 'height']
preprocess_regexps = [(re.compile( preprocess_regexps = [(re.compile(
ur'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '')] unicode(r'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>')), lambda match: '')]
keep_only_tags = [dict(name='h1'), dict( keep_only_tags = [dict(name='h1'), dict(
attrs={'class': ['entry single']}), dict(id='phContent_divArticle')] attrs={'class': ['entry single']}), dict(id='phContent_divArticle')]
remove_tags = [dict(attrs={'class': ['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master', 'social nested-grid grid-margin-px15-top clearfix no-mobile', 'page-info text-h4 font-heading grid-margin-px15-top color-annotation clearfix', 'series grid-margin-px30-top']}), dict(id='komentarze'), dict(id='phContent_ctl02_sBreadcrumb'), dict(name='iframe')] # noqa remove_tags = [dict(attrs={'class': ['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master', 'social nested-grid grid-margin-px15-top clearfix no-mobile', 'page-info text-h4 font-heading grid-margin-px15-top color-annotation clearfix', 'series grid-margin-px30-top']}), dict(id='komentarze'), dict(id='phContent_ctl02_sBreadcrumb'), dict(name='iframe')] # noqa

View File

@ -19,8 +19,8 @@ class DziennikWschodni(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa preprocess_regexps = [(re.compile(u'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(u'Przeczytaj także:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa (re.compile(u'Przeczytaj również:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'Zobacz też:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])] keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections', remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',

View File

@ -20,8 +20,8 @@ class EchoDnia(BasicNewsRecipe):
use_embedded_content = False use_embedded_content = False
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa preprocess_regexps = [(re.compile(u'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(u'Przeczytaj także:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa (re.compile(u'Przeczytaj również:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'Zobacz też:.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: '')] # noqa
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])] keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections', remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',

View File

@ -45,7 +45,7 @@ class Esensja(BasicNewsRecipe):
preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''), preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''),
(re.compile( (re.compile(
ur'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''), u'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''),
] ]
def parse_index(self): def parse_index(self):

View File

@ -23,7 +23,7 @@ class EsensjaRSS(BasicNewsRecipe):
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''), preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''),
(re.compile( (re.compile(
ur'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''), u'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''),
] ]
remove_attributes = ['style', 'bgcolor', 'alt', 'color'] remove_attributes = ['style', 'bgcolor', 'alt', 'color']
keep_only_tags = [dict(attrs={'class': 'sekcja'}), ] keep_only_tags = [dict(attrs={'class': 'sekcja'}), ]

View File

@ -23,7 +23,7 @@ class FilmWebPl(BasicNewsRecipe):
'ul.inline {padding:0px;} .vertical-align {display: inline-block;}') 'ul.inline {padding:0px;} .vertical-align {display: inline-block;}')
preprocess_regexps = [(re.compile(r'<body.+?</head>', re.DOTALL), lambda match: ''), # fix malformed HTML with 2 body tags... preprocess_regexps = [(re.compile(r'<body.+?</head>', re.DOTALL), lambda match: ''), # fix malformed HTML with 2 body tags...
(re.compile(u'(?:<sup>)?\(kliknij\,\ aby powiększyć\)(?:</sup>)?', re.IGNORECASE), lambda m: ''), (re.compile(u'(?:<sup>)?\(kliknij\,\ aby powiększyć\)(?:</sup>)?', re.IGNORECASE), lambda m: ''),
(re.compile(ur'(<br ?/?>\s*?<br ?/?>\s*?)+', re.IGNORECASE), lambda m: '<br />') (re.compile(unicode(r'(<br ?/?>\s*?<br ?/?>\s*?)+'), re.IGNORECASE), lambda m: '<br />')
] ]
remove_tags = [dict(attrs={'class':['infoParent', 'likeBar', remove_tags = [dict(attrs={'class':['infoParent', 'likeBar',
'droptions-box pull-right', 'photoDesc', 'imageLicense', 'play big', 'shadow embed__icon--svg']})] 'droptions-box pull-right', 'photoDesc', 'imageLicense', 'play big', 'shadow embed__icon--svg']})]

View File

@ -17,8 +17,8 @@ class forbes_pl(BasicNewsRecipe):
cover_url = 'http://www.forbes.pl/resources/front/images/logo.png' cover_url = 'http://www.forbes.pl/resources/front/images/logo.png'
max_articles_per_feed = 100 max_articles_per_feed = 100
extra_css = '.Block-Photo {float:left; max-width: 300px; margin-right: 5px;}' extra_css = '.Block-Photo {float:left; max-width: 300px; margin-right: 5px;}'
preprocess_regexps = [(re.compile(ur'<p>(<strong>)?(Czytaj|Zobacz) (też|także):.*?</p>', re.DOTALL), preprocess_regexps = [(re.compile(u'<p>(<strong>)?(Czytaj|Zobacz) (też|także):.*?</p>', re.DOTALL),
lambda match: ''), (re.compile(ur'<strong>Zobacz:.*?</strong>', re.DOTALL), lambda match: '')] lambda match: ''), (re.compile(u'<strong>Zobacz:.*?</strong>', re.DOTALL), lambda match: '')]
remove_javascript = True remove_javascript = True
no_stylesheets = True no_stylesheets = True
now = datetime.datetime.now() now = datetime.datetime.now()

View File

@ -43,7 +43,7 @@ class ForsalPL(BasicNewsRecipe):
(u'Moja firma', u'http://forsal.pl/atom/tagi/moja_firma')] (u'Moja firma', u'http://forsal.pl/atom/tagi/moja_firma')]
def print_version(self, url): def print_version(self, url):
url_id = re.search(ur'/[0-9]+,', url) url_id = re.search(u'/[0-9]+,', url)
if url_id: if url_id:
return 'http://forsal.pl/drukowanie' + url_id.group(0)[:-1] return 'http://forsal.pl/drukowanie' + url_id.group(0)[:-1]
else: else:

View File

@ -16,7 +16,7 @@ class Gildia(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(ur'</?sup>'), lambda match: '')] preprocess_regexps = [(re.compile(u'</?sup>'), lambda match: '')]
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
remove_tags = [dict(name='div', attrs={'class': [ remove_tags = [dict(name='div', attrs={'class': [
'backlink', 'im_img', 'addthis_toolbox addthis_default_style', 'banner-bottom']})] 'backlink', 'im_img', 'addthis_toolbox addthis_default_style', 'banner-bottom']})]

View File

@ -15,7 +15,7 @@ class in4(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
remove_empty_feeds = True remove_empty_feeds = True
preprocess_regexps = [ preprocess_regexps = [
(re.compile(ur'<a title="translate into.*?</a>', re.DOTALL), lambda match: '')] (re.compile(u'<a title="translate into.*?</a>', re.DOTALL), lambda match: '')]
keep_only_tags = [dict(name='div', attrs={'class': 'left_alone'})] keep_only_tags = [dict(name='div', attrs={'class': 'left_alone'})]
remove_tags_after = dict(name='img', attrs={'title': 'komentarze'}) remove_tags_after = dict(name='img', attrs={'title': 'komentarze'})
remove_tags = [dict(name='img', attrs={'title': 'komentarze'})] remove_tags = [dict(name='img', attrs={'title': 'komentarze'})]

View File

@ -14,7 +14,7 @@ class Ksiazka_net_pl(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
remove_empty_feeds = True remove_empty_feeds = True
preprocess_regexps = [ preprocess_regexps = [
(re.compile(ur'Podoba mi się, kupuję:'), lambda match: '<br />')] (re.compile(u'Podoba mi się, kupuję:'), lambda match: '<br />')]
remove_tags_before = dict(name='div', attrs={'class': 'm-body'}) remove_tags_before = dict(name='div', attrs={'class': 'm-body'})
remove_tags_after = dict(name='div', attrs={'class': 'm-body-link'}) remove_tags_after = dict(name='div', attrs={'class': 'm-body-link'})
remove_tags = [ remove_tags = [

View File

@ -10,8 +10,8 @@ class NaTemat(BasicNewsRecipe):
description = u'informacje, komentarze, opinie' description = u'informacje, komentarze, opinie'
category = 'news' category = 'news'
language = 'pl' language = 'pl'
preprocess_regexps = [(re.compile(ur'Czytaj też\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(ur'Zobacz też\:.*?</a>', re.IGNORECASE), lambda m: ''), # noqa preprocess_regexps = [(re.compile(u'Czytaj też\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Zobacz też\\:.*?</a>', re.IGNORECASE), lambda m: ''), # noqa
(re.compile(ur'Czytaj więcej\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj również\:.*?</a>', re.IGNORECASE), lambda m: '')] # noqa (re.compile(u'Czytaj więcej\\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(u'Czytaj również\\:.*?</a>', re.IGNORECASE), lambda m: '')] # noqa
cover_url = 'http://blog.plona.pl/wp-content/uploads/2012/05/natemat.png' cover_url = 'http://blog.plona.pl/wp-content/uploads/2012/05/natemat.png'
no_stylesheets = True no_stylesheets = True
keep_only_tags = [dict(id='main')] keep_only_tags = [dict(id='main')]

View File

@ -14,8 +14,8 @@ class Tablety_pl(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), preprocess_regexps = [(re.compile(u'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''),
(re.compile(ur'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')] (re.compile(u'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')]
keep_only_tags = [dict(id='news_block')] keep_only_tags = [dict(id='news_block')]
remove_tags = [dict(attrs={'class': ['comments_icon', 'wp-polls', 'entry-comments', remove_tags = [dict(attrs={'class': ['comments_icon', 'wp-polls', 'entry-comments',
'wp-polls-loading', 'ts-fab-wrapper', 'entry-footer', 'social-custom']})] 'wp-polls-loading', 'ts-fab-wrapper', 'entry-footer', 'social-custom']})]

View File

@ -12,8 +12,8 @@ class tanuki(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
encoding = 'utf-8' encoding = 'utf-8'
extra_css = 'ul {list-style: none; padding: 0; margin: 0;} .kadr{float: left;} .dwazdania {float: right;}' extra_css = 'ul {list-style: none; padding: 0; margin: 0;} .kadr{float: left;} .dwazdania {float: right;}'
preprocess_regexps = [(re.compile(ur'<h3><a class="screen".*?</h3>', re.DOTALL), lambda match: ''), (re.compile( preprocess_regexps = [(re.compile(u'<h3><a class="screen".*?</h3>', re.DOTALL), lambda match: ''), (re.compile(
ur'<div><a href="/strony/((manga)|(anime))/[0-9]+?/oceny(\-redakcji){0,1}">Zobacz jak ocenili</a></div>', re.DOTALL), lambda match: '')] unicode(r'<div><a href="/strony/((manga)|(anime))/[0-9]+?/oceny(\-redakcji){0,1}">Zobacz jak ocenili</a></div>'), re.DOTALL), lambda match: '')]
remove_empty_feeds = True remove_empty_feeds = True
no_stylesheets = True no_stylesheets = True
keep_only_tags = [dict(attrs={'class': ['animename', 'storyname', 'nextarrow', 'sideinfov', 'sidelinfov', 'sideinfo', 'sidelinfo']}), dict(name='table', attrs={ 'summary': 'Technikalia'}), dict(attrs={'class': ['chaptername', 'copycat']}), dict(id='rightcolumn'), dict(attrs={'class': ['headn_tt', 'subtable']})] # noqa keep_only_tags = [dict(attrs={'class': ['animename', 'storyname', 'nextarrow', 'sideinfov', 'sidelinfov', 'sideinfo', 'sidelinfo']}), dict(name='table', attrs={ 'summary': 'Technikalia'}), dict(attrs={'class': ['chaptername', 'copycat']}), dict(id='rightcolumn'), dict(attrs={'class': ['headn_tt', 'subtable']})] # noqa

View File

@ -11,7 +11,7 @@ class TawernaRPG(BasicNewsRecipe):
language = 'pl' language = 'pl'
extra_css = '.slajd {list-style-type: none; padding-left: 0px; margin-left: 0px;} .lewanc {float: left; margin-right: 5px;} .srodek {display: block; margin-left: auto; margin-right: auto;}' # noqa extra_css = '.slajd {list-style-type: none; padding-left: 0px; margin-left: 0px;} .lewanc {float: left; margin-right: 5px;} .srodek {display: block; margin-left: auto; margin-right: auto;}' # noqa
cover_url = 'http://www.tawerna.rpg.pl/img/logo.png' cover_url = 'http://www.tawerna.rpg.pl/img/logo.png'
preprocess_regexps = [(re.compile(ur'<h2>Dodaj komentarz</h2>.*</body>', preprocess_regexps = [(re.compile(u'<h2>Dodaj komentarz</h2>.*</body>',
re.DOTALL | re.IGNORECASE), lambda match: '</body>')] re.DOTALL | re.IGNORECASE), lambda match: '</body>')]
use_embedded_content = False use_embedded_content = False
oldest_article = 7 oldest_article = 7

View File

@ -20,8 +20,8 @@ class Trojmiasto(BasicNewsRecipe):
remove_attributes = ['style', 'font'] remove_attributes = ['style', 'font']
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(ur'<strong>Czytaj więcej.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(ur'<strong>Zobacz też.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa preprocess_regexps = [(re.compile(u'<strong>Czytaj więcej.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(u'<strong>Zobacz też.*?</a>', re.DOTALL | re.IGNORECASE), lambda match: ''), # noqa
(re.compile(ur'<b>[A-ZĄĆĘŁŃÓŚŹŻ \-,.:]*?</b>', re.DOTALL), lambda match: ''), ] (re.compile(u'<b>[A-ZĄĆĘŁŃÓŚŹŻ ,.:-]*?</b>', re.DOTALL), lambda match: ''), ]
remove_tags = [ remove_tags = [
dict(id=['logo', 'font_small', 'font_big']), dict(id=['logo', 'font_small', 'font_big']),

View File

@ -9,8 +9,8 @@ class WNP(BasicNewsRecipe):
description = u'Wirtualny Nowy Przemysł' description = u'Wirtualny Nowy Przemysł'
category = 'economy' category = 'economy'
language = 'pl' language = 'pl'
preprocess_regexps = [(re.compile(ur'Czytaj też:.*?</a>', re.DOTALL), lambda match: ''), preprocess_regexps = [(re.compile(u'Czytaj też:.*?</a>', re.DOTALL), lambda match: ''),
(re.compile(ur'Czytaj więcej:.*?</a>', re.DOTALL), lambda match: '')] (re.compile(u'Czytaj więcej:.*?</a>', re.DOTALL), lambda match: '')]
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True