import re

try:
    from urllib.parse import unquote
except ImportError:
    from urllib import unquote

from calibre.web.feeds.news import BasicNewsRecipe

# Captures the URL-quoted target carried in the ``link=`` parameter of a
# ``bookmark.cfm`` anchor found inside an article summary.
_BOOKMARK_LINK_RE = re.compile(r'href=".+?bookmark.cfm.+?link=(.+?)"')

# Two-character escape codes (and their plain-text meaning) used in the path
# segment that precedes ``story01.htm`` in a ``feedburner_origlink`` value.
_FEEDBURNER_ESCAPES = {
    '0B': '.',
    '0C': '/',
    '0A': '0',
    '0F': '=',
    '0G': '&',
    '0D': '?',
    '0E': '-',
    '0N': '.com',
    '0L': 'http:',
    '0S': '//',
    '0H': ',',
}

# One alternation over every escape code so decoding is a single left-to-right
# pass.  Chained ``str.replace`` calls are order dependent and can re-interpret
# the output of an earlier replacement (``0AF`` -> ``0F`` -> ``=``).
_FEEDBURNER_ESCAPE_RE = re.compile(
    '|'.join(re.escape(code) for code in _FEEDBURNER_ESCAPES))


def _decode_feedburner_segment(segment):
    """Return *segment* with every escape code replaced exactly once."""
    return _FEEDBURNER_ESCAPE_RE.sub(
        lambda match: _FEEDBURNER_ESCAPES[match.group(0)], segment)


class AdvancedUserRecipe1279258912(BasicNewsRecipe):
    """Recipe that downloads the Orlando Sentinel feedburner feeds."""

    title = u'Orlando Sentinel'
    oldest_article = 3
    max_articles_per_feed = 100

    # (section title, feed URL) pairs.
    feeds = [
        (u'News', u'http://feeds.feedburner.com/orlandosentinel/news'),
        (u'Opinion', u'http://feeds.feedburner.com/orlandosentinel/news/opinion'),
        (u'Business', u'http://feeds.feedburner.com/orlandosentinel/business'),
        (u'Technology', u'http://feeds.feedburner.com/orlandosentinel/technology'),
        (u'Space and Science', u'http://feeds.feedburner.com/orlandosentinel/news/space'),
        (u'Entertainment', u'http://feeds.feedburner.com/orlandosentinel/entertainment'),
        (u'Life and Family', u'http://feeds.feedburner.com/orlandosentinel/features/lifestyle'),
    ]

    __author__ = 'rty'
    publisher = 'OrlandoSentinel.com'
    # The attribute was originally misspelled; the old name is kept as an
    # alias so anything that happened to read it keeps working.
    pubisher = publisher
    description = 'Orlando, Florida, Newspaper'
    category = 'News, Orlando, Florida'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'en'
    encoding = 'utf-8'
    conversion_options = {'linearize_tables': True}
    remove_empty_feeds = True
    auto_cleanup = True

    def get_article_url(self, article):
        """Work out the URL to download for *article*.

        The summary is searched first for a ``bookmark.cfm`` anchor whose
        ``link=`` parameter holds the URL-quoted article address.  When that
        is not available the ``feedburner_origlink`` entry is used instead;
        if it ends in ``/story01.htm`` the preceding path segment is decoded
        with the escape table, otherwise the value is taken as is.

        :param article: feed entry exposing a ``summary`` attribute and a
            dict-style ``get`` method.
        :return: the URL with any ``?track=rss`` removed, or ``None`` when no
            URL could be determined.
        """
        url = None

        # Best effort: a missing/non-text summary or a summary without the
        # bookmark anchor simply falls through to the feedburner link.
        try:
            url = unquote(_BOOKMARK_LINK_RE.search(article.summary).group(1))
        except (AttributeError, TypeError):
            pass

        if url is None:
            link = article.get('feedburner_origlink', None)
            if link:
                parts = link.split('/')
                # Require a segment before ``story01.htm`` so a bare
                # ``story01.htm`` value cannot raise IndexError.
                if len(parts) >= 2 and parts[-1] == 'story01.htm':
                    url = _decode_feedburner_segment(parts[-2])
                else:
                    url = link

        if url is None:
            return None
        return url.replace('?track=rss', '')