Update aktualne.cz. Fixes #1419375 [Fetching aktualne.cz results in wrong encoding in article details](https://bugs.launchpad.net/calibre/+bug/1419375)

This commit is contained in:
Kovid Goyal 2015-02-08 09:55:18 +05:30
parent 05ee6b477e
commit e1c136e3c3

View File

@@ -10,6 +10,7 @@ class aktualneRecipe(BasicNewsRecipe):
description = 'aktuálně.cz'
oldest_article = 1
max_articles_per_feed = 20
encoding = 'utf-8'
feeds = [
(u'Domácí', u'http://aktualne.centrum.cz/feeds/rss/domaci/?photo=0'),
@@ -20,7 +21,6 @@ class aktualneRecipe(BasicNewsRecipe):
(u'Blogy a názory', u'http://blog.aktualne.centrum.cz/export-all.php')
]
language = 'cs'
cover_url = 'http://img.aktualne.centrum.cz/design/akt4/o/l/logo-akt-ciste.png'
remove_javascript = True
@@ -43,7 +43,8 @@ class aktualneRecipe(BasicNewsRecipe):
dict(name='div', attrs={'class':'boxP'}),
dict(name='div', attrs={'class':'box2'})]
preprocess_regexps = [
(re.compile(r'<div class="(contenttitle"|socialni-site|wiki|facebook-promo|facebook-like-button"|meta-akce).*', re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
(re.compile(r'<div class="(contenttitle"|socialni-site|wiki|facebook-promo|facebook-like-button"|meta-akce).*',
re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
(re.compile(r'<div class="[^"]*poutak-clanek-trojka".*', re.DOTALL|re.IGNORECASE), lambda match: '</body>')]
keep_only_tags = []
@@ -58,12 +59,3 @@ class aktualneRecipe(BasicNewsRecipe):
self.visited_urls[url] = True
self.log.debug('Accepting: ' + url)
return url
def encoding(self, source):
if source.newurl.find('blog.aktualne') >= 0:
enc = 'utf-8'
else:
enc = 'iso-8859-2'
self.log.debug('Called encoding ' + enc + " " + str(source.newurl))
return source.decode(enc, 'replace')