Fix Orlando Sentinel

This commit is contained in:
Kovid Goyal 2012-04-16 08:18:44 +05:30
parent 49115aa77e
commit 3aa377145c

View File

@ -1,3 +1,4 @@
import urllib, re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1279258912(BasicNewsRecipe): class AdvancedUserRecipe1279258912(BasicNewsRecipe):
@ -27,12 +28,30 @@ class AdvancedUserRecipe1279258912(BasicNewsRecipe):
encoding = 'utf-8' encoding = 'utf-8'
conversion_options = {'linearize_tables':True} conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.orlandosentinel.com/media/graphic/2009-07/46844851.gif' masthead_url = 'http://www.orlandosentinel.com/media/graphic/2009-07/46844851.gif'
keep_only_tags = [
dict(name='div', attrs={'class':'story'}) auto_cleanup = True
]
def get_article_url(self, article):
    """Return the article URL for a feed entry, or None if none is found.

    Two sources are tried, in order:

    1. ``article.summary``: the first ``href`` that points at a
       ``bookmark.cfm`` page and carries a ``link=`` parameter; that
       parameter is percent-decoded and used as the URL.
    2. ``article.get('feedburner_origlink')``: used as-is, unless its last
       path segment is ``story01.htm``, in which case the second-to-last
       segment is treated as an escaped URL and decoded with the two
       character ``0X`` escape table below.

    Any ``?track=rss`` substring is stripped from the result.

    :param article: feed entry; must provide ``.get()`` and may provide a
        ``summary`` attribute (presumably a feedparser entry -- confirm
        against the caller).
    :return: the cleaned URL string, or ``None`` when neither source
        yields a link.
    """
    # Function-local import so the method works with both the Python 2
    # (``urllib.unquote``) and Python 3 (``urllib.parse.unquote``) layouts
    # without touching the file-level imports.
    try:
        from urllib import unquote
    except ImportError:
        from urllib.parse import unquote

    ans = None

    # A missing or empty summary is expected; fall through to the
    # feedburner link rather than hiding every error behind a bare except.
    summary = getattr(article, 'summary', None)
    if summary:
        try:
            match = re.search(
                r'href=".+?bookmark.cfm.+?link=(.+?)"', summary)
        except TypeError:
            # summary is not text; treat it as "no bookmark link".
            match = None
        if match is not None:
            ans = unquote(match.group(1))

    if ans is None:
        link = article.get('feedburner_origlink', None)
        parts = link.split('/') if link else []
        if len(parts) >= 2 and parts[-1] == "story01.htm":
            escapes = {'0B': '.', '0C': '/', '0A': '0', '0F': '=',
                       '0G': '&', '0D': '?', '0E': '-', '0N': '.com',
                       '0L': 'http:', '0S': '//'}
            # Decode in one left-to-right pass. Chained str.replace calls
            # depend on dict iteration order and can re-interpret the
            # output of an earlier replacement (e.g. '0AF' -> '0F' -> '=').
            pattern = '|'.join(re.escape(key) for key in escapes)
            ans = re.sub(pattern, lambda m: escapes[m.group(0)], parts[-2])
        elif link:
            ans = link

    if ans is None:
        return None
    return ans.replace('?track=rss', '')