Merge branch 'master' of https://github.com/CoderAllan/calibre

commit 96947fd01a
@@ -18,35 +18,6 @@ class EdgeConversationRSS(BasicNewsRecipe):
     oldest_article = 60
     max_articles_per_feed = 100
     no_stylesheets = True
+    auto_cleanup = True
 
-    keep_only_tags = [
-        dict(name='div', attrs={'class': 'HomeLeftPannel IMGCTRL'})]
-    remove_tags = [
-        dict(name='div', attrs={'class': 'Logo'})
-    ]
-
-    feeds = [(u'Edge RSS', u'http://edge.org/feeds/')]
-
-    def print_version(self, url):
-        return url.replace('conversation/', 'conversation.php?cid=')
-
-    def parse_feeds(self):
-
-        # Call parent's method.
-        feeds = BasicNewsRecipe.parse_feeds(self)
-
-        # Loop through all feeds.
-        for feed in feeds:
-
-            # Loop through all articles in feed.
-            for article in feed.articles[:]:
-
-                # Remove anything that is not a conversation, and remove PDF
-                # files as well...
-
-                if not ('CONVERSATION' in article.title):
-                    feed.articles.remove(article)
-                elif 'pdf' in article.url:
-                    feed.articles.remove(article)
-
-        return feeds
+    feeds = [(u'Edge RSS', u'http://edge.org/feed')]
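Note: this hunk shows the pattern repeated throughout the commit: the hand-written keep_only_tags / remove_tags / print_version / parse_feeds scaffolding is dropped in favour of calibre's built-in content extraction. A minimal recipe of the resulting shape looks like the sketch below (class name and feed URL are illustrative, not taken from this commit):

from calibre.web.feeds.news import BasicNewsRecipe


class ExampleAutoCleanupRecipe(BasicNewsRecipe):
    # With auto_cleanup enabled, calibre extracts the main article body
    # heuristically, so no keep_only_tags/remove_tags lists are needed.
    title = 'Example feed'
    oldest_article = 60          # skip items older than 60 days
    max_articles_per_feed = 100
    no_stylesheets = True
    auto_cleanup = True

    feeds = [('Example RSS', 'http://example.org/feed')]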
@@ -1,53 +0,0 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class EuropeanVoice(BasicNewsRecipe):
-    title = u'European Voice'
-    __author__ = 'malfi'
-    oldest_article = 14
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    cover_url = 'http://www.europeanvoice.com/Css/images/logo.gif'
-    language = 'en'
-    keep_only_tags = [dict(name='div', attrs={'id': 'articleLeftColumn'})]
-    remove_tags = [dict(name='div', attrs={'id': 'BreadCrump'})]
-    feeds = [
-        (u'Whole site ', u'http://www.europeanvoice.com/Rss/2.xml'),
-        (u'News and analysis', u'http://www.europeanvoice.com/Rss/6.xml'),
-        (u'Comment', u'http://www.europeanvoice.com/Rss/7.xml'),
-        (u'Special reports', u'http://www.europeanvoice.com/Rss/5.xml'),
-        (u'People', u'http://www.europeanvoice.com/Rss/8.xml'),
-        (u'Career', u'http://www.europeanvoice.com/Rss/11.xml'),
-        (u'Policies', u'http://www.europeanvoice.com/Rss/4.xml'),
-        (u'EVents', u'http://www.europeanvoice.com/Rss/10.xml'),
-        (u'Policies - Economics', u'http://www.europeanvoice.com/Rss/31.xml'),
-        (u'Policies - Business', u'http://www.europeanvoice.com/Rss/19.xml'),
-        (u'Policies - Trade', u'http://www.europeanvoice.com/Rss/25.xml'),
-        (u'Policies - Information society',
-         u'http://www.europeanvoice.com/Rss/20.xml'),
-        (u'Policies - Energy', u'http://www.europeanvoice.com/Rss/15.xml'),
-        (u'Policies - Transport', u'http://www.europeanvoice.com/Rss/18.xml'),
-        (u'Policies - Climate change', u'http://www.europeanvoice.com/Rss/16.xml'),
-        (u'Policies - Environment', u'http://www.europeanvoice.com/Rss/17.xml'),
-        (u'Policies - Farming & food', u'http://www.europeanvoice.com/Rss/23.xml'),
-        (u'Policies - Health & society', u'http://www.europeanvoice.com/Rss/24.xml'),
-        (u'Policies - Justice', u'http://www.europeanvoice.com/Rss/29.xml'),
-        (u'Policies - Foreign affairs', u'http://www.europeanvoice.com/Rss/27.xml')
-    ]
-    extra_css = '''
-        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
-        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
-        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
-        '''
-
-    def print_version(self, url):
-        return url + '?bPrint=1'
-
-    def preprocess_html(self, soup):
-        denied = soup.findAll(True, text='Subscribers')
-        if denied:
-            raise Exception(
-                'Article skipped, because content can only be seen with subscription')
-        return soup
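Note: the deleted European Voice recipe aborted subscriber-only pages by raising a bare Exception from preprocess_html, which fails the article download outright. If that behaviour is ever needed again, newer calibre versions expose BasicNewsRecipe.abort_article() for skipping a single article cleanly; a hedged sketch (the 'Subscribers' marker is just the string the old recipe looked for):

    def preprocess_html(self, soup):
        # Sketch only: skip paywalled pages instead of raising a bare Exception.
        if soup.find(text='Subscribers'):
            self.abort_article('Subscription-only article, skipping')
        return soup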
@@ -15,29 +15,17 @@ class Kitsapsun(BasicNewsRecipe):
     publisher = 'Scripps Interactive Newspapers Group'
     category = 'news, Kitsap county, USA'
     language = 'en'
-    oldest_article = 2
-    max_articles_per_feed = 100
+    oldest_article = 7
+    max_articles_per_feed = 50
     no_stylesheets = True
     encoding = 'cp1252'
     use_embedded_content = False
+    auto_cleanup = True
 
-    conversion_options = {
-        'comments': description, 'tags': category, 'language': language, 'publisher': publisher
-    }
-
-    keep_only_tags = [
-        dict(name='div', attrs={'id': ['story_meta', 'story_content']})]
-
-    remove_tags = [dict(name=['object', 'link', 'embed', 'form', 'iframe'])]
-
-    feeds = [
-
-    (u'News', u'http://www.kitsapsun.com/rss/headlines/news/'),
-    (u'Business', u'http://www.kitsapsun.com/rss/headlines/business/'),
-    (u'Communities', u'http://www.kitsapsun.com/rss/headlines/communities/'),
-    (u'Entertainment', u'http://www.kitsapsun.com/rss/headlines/entertainment/'),
-    (u'Lifestyles', u'http://www.kitsapsun.com/rss/headlines/lifestyles/')
+    feeds = [(u'News', u'http://www.kitsapsun.com/feeds/rss/news'),
+             (u'Sports', u'http://www.kitsapsun.com/feeds/rss/sports'),
+             (u'Entertainment',
+              u'http://www.kitsapsun.com/feeds/rss/entertainment'),
+             (u'Lifestyles', u'http://www.kitsapsun.com/feeds/rss/lifestyle'),
+             (u'Opinion', u'http://www.kitsapsun.com/feeds/rss/opinion'),
              ]
-
-    def print_version(self, url):
-        return url.rpartition('/')[0] + '/?print=1'
@@ -14,5 +14,5 @@ class HindustanTimes(BasicNewsRecipe):
 
     feeds = [
         ('News',
-         'http://www.mobilenations.com/rss/mb.xml'),
+         'http://www.mobilenations.com/about?format=RSS'),
     ]
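Note: a quick way to sanity-check an updated feed URL such as the one above, before running the full recipe, is to parse it directly. A sketch, assuming the standalone feedparser package is installed:

import feedparser  # assumed to be installed separately

d = feedparser.parse('http://www.mobilenations.com/about?format=RSS')
print(d.get('bozo'), len(d.entries))  # a non-zero bozo flag hints at a malformed feed
for entry in d.entries[:5]:
    print(entry.get('title'), entry.get('link'))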
@@ -1,8 +1,4 @@
-import re
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre import browser
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre.ebooks.BeautifulSoup import Tag
 
 
 class AdvancedUserRecipe1306061239(BasicNewsRecipe):
@@ -31,383 +27,10 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     simultaneous_downloads = 20
     use_embedded_content = False
     recursions = 0
+    auto_cleanup = True
-    conversion_options = {
-        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
-    }
 
     feeds = [
-        (u'NME News', u'http://www.nme.com/rss/news'),
-        (u'Reviews', u'http://www.nme.com/rss/reviews'),
-        (u'Blogs', u'http://www.nme.com/rss/blogs'),
+        (u'NME News', u'http://www.nme.com/news/feed'),
+        (u'Reviews', u'http://www.nme.com/reviews/feed/'),
+        (u'Blogs', u'http://www.nme.com/blogs/feed'),
     ]
-
-    keep_only_tags = [
-        dict(name='div', attrs={'id': 'content'}),
-    ]
-
-    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan',
-                         'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
-
-    remove_tags = [
-        dict(name='meta'),
-        dict(name='span', attrs={'class': 'article_info'}),
-        dict(name='div', attrs={'class': 'breadcrumbs'}),
-        dict(name='div', attrs={'class': 'mugshot'}),
-        dict(name='div', attrs={'class': 'header'}),
-        dict(name='div', attrs={'class': re.compile(
-            'youtube.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile(
-            'socialbuttons.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': 'clear_both'}),
-        dict(name='div', attrs={'class': re.compile(
-            'headline.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': 'member-signedout'}),
-        dict(name='div', attrs={'class': re.compile(
-            'prev_next.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile(
-            'article_related.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile(
-            'feature_bar.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile('ebay.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile(
-            'morenews.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile(
-            'ticketspopup.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile(
-            'ratemy_logprompt.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile(
-            'related_artist.*', re.IGNORECASE)}),
-        dict(name='img', attrs={'class': re.compile(
-            'video_play_large.*', re.IGNORECASE)}),
-        dict(name='ul', attrs={'class': re.compile(
-            'prev_next.*', re.IGNORECASE)}),
-        dict(name='ul', attrs={'class': re.compile(
-            'nme_store.*', re.IGNORECASE)}),
-        dict(name='p', attrs={'class': re.compile('top', re.IGNORECASE)}),
-        dict(name='table', attrs={
-             'class': re.compile('tickets.*', re.IGNORECASE)}),
-    ]
-
-    masthead_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
-
-    def get_cover_url(self):
-        magazine_page_raw = self.index_to_soup(
-            'http://www.nme.com/magazine', raw=True)
-        magazine_page_raw = re.sub(
-            r'<script\b.+?</script>', '', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
-        magazine_page_raw = re.sub(
-            r'\!\[if ', '!--[if ', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
-        magazine_page = self.index_to_soup(magazine_page_raw)
-        cov = magazine_page.find('img', attrs={'class': 'magcover'})
-
-        cov2 = str(cov['src'])
-
-        br = browser()
-        br.set_handle_redirect(False)
-        try:
-            br.open_novisit(cov2)
-            cover_url = str(cov2)
-        except:
-            cover_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
-        return cover_url
-
-    def preprocess_raw_html(self, raw_html, url):
-        '''
-        Need this for a bug on site that prevents blogg post being parsed correctly
-        '''
-        raw_html = re.sub(r'\!\[if ', '!--[if ', raw_html,
-                          flags=re.DOTALL | re.IGNORECASE)
-
-        return raw_html
-
-    def preprocess_html(self, soup):
-        youtube_regex = re.compile(
-            r'(?:youtube\.com/(?:[^/]+/.+/|(?:v|e(?:mbed)?)/|.*[?&]v=)|youtu\.be/)(?P<id>[^"&?/ ]{11})', re.DOTALL | re.IGNORECASE)
-        instagram_regex = re.compile(
-            r'.*?instagram.*?', re.DOTALL | re.IGNORECASE)
-        twitter_regex = re.compile(r'.*?twitter.*?', re.DOTALL | re.IGNORECASE)
-        visualise_regex = re.compile(
-            r'.*?visualise.*?', re.DOTALL | re.IGNORECASE)
-        soundcloud_regex = re.compile(
-            r'(?P<url>.*?(w|api)\.soundcloud.*?com\/(tracks|playlists)\/\d{8,9})', re.DOTALL | re.IGNORECASE)
-        dailymotion_regex = re.compile(
-            r'.*?dailymotion.*?', re.DOTALL | re.IGNORECASE)
-        spotify_regex = re.compile(r'.*?spotify.*?', re.DOTALL | re.IGNORECASE)
-        vine_regex = re.compile(r'.*?vine.*?', re.DOTALL | re.IGNORECASE)
-        doubleHtmlEntities = re.compile(
-            ur'(&)(?P<e>[\d\w\#]*;)', re.DOTALL | re.IGNORECASE | re.UNICODE)
-        for iframe in soup.findAll('iframe'):
-            if iframe.has_key('src') and youtube_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ YouTube ]')
-                pq.insert(1, br)
-                m = youtube_regex.search(iframe['src'])
-                if m.group('id') is not None:
-                    imgTag = Tag(soup, 'img', [
-                                 ('src', 'http://img.youtube.com/vi/' + m.group('id') + '/0.jpg')])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and soundcloud_regex.search(iframe['src']) is not None:  # noqa
-                m = soundcloud_regex.search(iframe['src'])
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ SoundCloud ]')
-                pq.insert(1, br)
-                pq.insert(2, m.group('url'))
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and dailymotion_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ DailyMotion ]')
-                pq.insert(1, br)
-                imgUrl = self.get_dailymotion_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and spotify_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Spotify ]')
-                pq.insert(1, br)
-                imgUrl = self.get_spotify_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and vine_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Vine ]')
-                pq.insert(1, br)
-                imgUrl = self.get_vine_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and visualise_regex.search(iframe['src']) is not None:  # noqa
-                imgUrl = self.get_visualise_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    iframe.replaceWith(imgTag)
-        for blockquote in soup.findAll('blockquote'):
-            if blockquote.has_key('class') and twitter_regex.search(blockquote['class']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Twitter ]')
-                pq.insert(len(pq.contents), br)
-                match = re.search(
-                    "(?P<url>pic\.twitter[^\s<]+)", str(blockquote))
-                if match is not None:
-                    img = self.get_twitter_pic(str(match.group("url")))
-                    if img is not None:
-                        pq.insert(len(pq.contents), img)
-                for p in blockquote.findAll(name='p'):
-                    x = 0
-                    plen = len(p.contents)
-                    while True:
-                        c = len(pq.contents)
-                        if p.contents[x].string is not None:
-                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(
-                                2), str(p.contents[x].string), re.IGNORECASE | re.UNICODE))
-                        else:
-                            pq.insert(c, p.contents[x].content)
-                        x += 1
-                        if x == plen:
-                            break
-                    br = Tag(soup, 'br')
-                    pq.insert(len(pq.contents), br)
-                    p.extract()
-                if len(blockquote.contents) > 0:
-                    x = 0
-                    xlen = len(blockquote.contents)
-                    while True:
-                        c = len(pq.contents)
-                        if blockquote.contents[x].string is not None:
-                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(
-                                2), str(blockquote.contents[x].string), re.IGNORECASE | re.UNICODE))
-                        else:
-                            pq.insert(c, blockquote.contents[x].content)
-                        x += 1
-                        if x == xlen:
-                            break
-                blockquote.replaceWith(pq)
-            elif blockquote.has_key('class') and instagram_regex.search(blockquote['class']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Instagram ]')
-                pq.insert(1, br)
-                a = blockquote.find(name='a', attrs={'href': instagram_regex})
-                imgUrl = None
-                if a is not None:
-                    imgUrl = self.get_instagram_pic(str(a['href']))
-                if imgUrl is not None:
-                    img = Tag(soup, 'img', [('src', imgUrl)])
-                    pq.insert(len(pq.contents), img)
-                for p in blockquote.findAll(name='p'):
-                    x = 0
-                    plen = len(p.contents)
-                    while x < plen:
-                        c = len(pq.contents)
-                        if p.contents[x].string is not None:
-                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(
-                                2), str(p.contents[x].string), re.IGNORECASE | re.UNICODE))
-                        # else:
-                            # pq.insert(c, p.contents[x].content)
-                        x += 1
-                    br = Tag(soup, 'br')
-                    c = len(pq.contents)
-                    pq.insert(c, br)
-                blockquote.replaceWith(pq)
-        for alink in soup.findAll('a'):
-            if alink.string is not None:
-                tstr = alink.string
-                alink.replaceWith(tstr)
-            elif alink.img is not None:
-                tstr = alink.img
-                alink.replaceWith(tstr)
-            elif alink.span is not None:
-                tstr = alink.span
-                alink.replaceWith(tstr)
-        return soup
-
-    def get_visualise_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        bs = BeautifulSoup(raw)
-        imgRaw = bs.find(name='meta', attrs={'property': 'og:image'})
-        if imgRaw is not None:
-            returnValue = str(imgRaw['content'])
-        return returnValue
-
-    def get_twitter_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open('https://' + url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        bs = BeautifulSoup(raw)
-        refresh = bs.find('meta', {'http-equiv': 'refresh'})
-        if refresh is not None:
-            content = refresh.get('content').partition('=')[2]
-            try:
-                raw = self.browser.open(content).read()
-            except:
-                print '404: ' + url
-                return returnValue
-            bs = BeautifulSoup(raw)
-        img = bs.find(name='img', attrs={
-                      'alt': re.compile('.*permalink.*', re.IGNORECASE)})
-        if img is not None:
-            returnValue = img
-        return returnValue
-
-    def get_soundcloud_pic(self, url):
-        # content loaded via javascript and require an login and/or registered application identification
-        # returnValue = None
-        # raw = self.browser.open(soundcloudUrl + '&visual=true').read()
-        # bs = BeautifulSoup(raw)
-        # imgRaw = bs.find(name='div', attrs={'style':re.compile(r'backgroud-image:*?',re.IGNORECASE)})
-        # if imgRaw is not None:
-            # returnValue = str(imgRaw['style'])
-        return None  # returnValue
-
-    def get_instagram_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('\"display_src\":\"(?P<url>http[s]?:.*?)\"', str(raw))
-        if m is not None:
-            returnValue = re.sub(r'\\', '', m.group(
-                "url"), flags=re.DOTALL | re.IGNORECASE)
-        return returnValue
-
-    def get_dailymotion_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('("thumbnail_url\"\:\")(?P<url>http.*?)(\")', str(raw))
-        if m is not None:
-            returnValue = re.sub(r'\\', '', m.group(
-                "url"), flags=re.DOTALL | re.IGNORECASE)
-        return returnValue
-
-    def get_spotify_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('data-ca=\"(?P<url>.*?)\"', str(raw))
-        if m is not None:
-            returnValue = m.group("url")
-        return returnValue
-
-    def get_vine_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('"thumbnail.*?src=\"(?P<url>.*?)\"', str(raw))
-        if m is not None:
-            returnValue = m.group("url")
-        return returnValue
-
-    preprocess_regexps = [
-        (re.compile(r'<script\b.+?</script>', re.DOTALL | re.IGNORECASE), lambda h1: ''),
-        (re.compile(r'<a.* id="buy-tickets-button".*</a>', re.IGNORECASE), lambda h2: ''),
-        (re.compile(r'<a.* class="gallery.*</a>', re.IGNORECASE), lambda h2: ''),
-    ]
-
-    extra_css = '''
-                    h1 h2 {
-                        font-family:Arial,Helvetica,sans-serif;
-                        font-weight:bold;font-size:large;
-                    }
-                    h3 {
-                        font-family:Arial,Helvetica,sans-serif;
-                        font-weight:normal;
-                        font-size:small;
-                        font-style:italic;
-                        display:inline;
-                    }
-                    body {
-                        font-family:Helvetica,Arial,sans-serif;
-                        font-size:small;
-                    }
-                    blockquote {
-                        font-family:"Courier New",
-                        Courier, monospace;
-                        font-size:90%;
-                    }
-                    img {
-                        display:block;
-                    }
-                    .date{
-                        font-style:italic;
-                        font-weight:normal;
-                    }
-                    .article_header>p:not(.date){
-                        font-weight:bold;
-                    }
-                '''
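Note: the removed preprocess_html rewrote embedded YouTube / SoundCloud / Twitter / Instagram widgets into plain blockquotes with thumbnail images; with auto_cleanup those embeds are simply discarded by the readability-style pass. If particular markup has to survive the cleanup, BasicNewsRecipe also supports an auto_cleanup_keep XPath; the expression below is an illustration, not part of this commit:

    auto_cleanup = True
    # Illustrative only: keep <object> and <embed> elements that the
    # automatic cleanup would otherwise discard.
    auto_cleanup_keep = '//object|//embed'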
@@ -1,22 +0,0 @@
-__license__ = 'GPL v3'
-__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class TheResurgence(BasicNewsRecipe):
-    title = u'The Resurgence'
-    __author__ = 'Peter Grungi'
-    language = 'en'
-
-    oldest_article = 7
-    max_articles_per_feed = 10
-    auto_cleanup = True
-    cover_url = 'http://cdn.theresurgence.com/images/logo.png'
-    masthead_url = 'http://cdn.theresurgence.com/images/logo.png'
-    language = 'en'
-    publisher = 'The Resurgence'
-    author = 'The Resurgence'
-
-    feeds = [
-        (u'The Resurgence', u'http://feeds.theresurgence.com/TheResurgence?format=xml')]
@@ -10,30 +10,10 @@ class SecurityWatch(BasicNewsRecipe):
     oldest_article = 14
     max_articles_per_feed = 100
     use_embedded_content = False
-    filter_regexps = [r'feedads\.googleadservices\.com']
-    filter_regexps = [r'ad\.doubleclick']
-    filter_regexps = [r'advert']
     language = 'en'
+    auto_cleanup = True
 
-    extra_css = 'div {text-align:left}'
-
-    remove_tags = [dict(id='topBannerContainer'),
-                   dict(id='topBannerSmall'),
-                   dict(id='topSearchBar'),
-                   dict(id='topSearchForm'),
-                   dict(id='rtBannerMPU'),
-                   dict(id='topNavBar'),
-                   dict(id='breadcrumbs'),
-                   # dict(id='entry-28272'),
-                   dict(id='topSearchLinks'),
-                   dict(name='span', attrs={'class': 'date'})]
-
-    remove_tags_after = [dict(id='googlemp')]
-
     feeds = [
-        (u'securitywatch', u'http://feeds.ziffdavisenterprise.com/RSS/security_watch/')]
-
-    def postprocess_html(self, soup, first_fetch):
-        for t in soup.findAll(['table', 'tr', 'td']):
-            t.name = 'div'
-        return soup
+        (u'securitywatch',
+         u'http://feeds.pcmag.com/Rss.aspx/SectionArticles?sectionId=28026')
+        ]
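Note: incidentally, the removed lines assigned filter_regexps three times, so only the last pattern (r'advert') ever took effect; had the filters been kept, the patterns would need to live in a single list, e.g.:

    # Sketch: the three patterns combined into one assignment; the removed
    # code overwrote the attribute twice, leaving only r'advert' active.
    filter_regexps = [
        r'feedads\.googleadservices\.com',
        r'ad\.doubleclick',
        r'advert',
    ]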
@@ -17,39 +17,38 @@ class AdvancedUserRecipe1315899507(BasicNewsRecipe):
     auto_cleanup = True
     remove_empty_feeds = True
     publication_type = 'newspaper'
-    masthead_url = 'http://media.signonsandiego.com/e2/sosd/images/sosd_logo.png'
 
     feeds = [
-        (u'Latest News', u'http://www.signonsandiego.com/rss/headlines/'),
-        (u'Local News', u'http://www.signonsandiego.com/rss/headlines/metro/'),
-        (u'Business', u'http://www.signonsandiego.com/rss/headlines/business/'),
-        (u'Politics', u'http://www.signonsandiego.com/rss/headlines/local/politics/'),
-        (u'Border & Immigration', u'http://www.signonsandiego.com/rss/headlines/border/'),
-        (u'Courts', u'http://www.signonsandiego.com/rss/headlines/courts/'),
-        (u'Education', u'http://www.signonsandiego.com/news/education/'),
-        (u'Sports', u'http://www.signonsandiego.com/rss/headlines/sports/'),
-        (u'Chargers', u'http://www.signonsandiego.com/rss/headlines/sports/chargers/'),
-        (u'Padres', u'http://www.signonsandiego.com/rss/headlines/sports/padres/'),
-        (u'NFL', u'http://www.signonsandiego.com/rss/headlines/sports/nfl/'),
-        (u'NBA', u'http://www.signonsandiego.com/rss/headlines/sports/nba/'),
-        (u'Nick Canepa', u'http://www.signonsandiego.com/rss/authors/nick-canepa/'),
-        (u'Tim Sullivan', u'http://www.signonsandiego.com/rss/authors/tim-sullivan/'),
-        (u'Ruben Navarrette', u'http://www.signonsandiego.com/rss/authors/ruben-navarrette/'),
-        (u'Diane Bell', u'http://www.signonsandiego.com/rss/authors/diane-bell/'),
-        (u'Smart Living', u'http://www.signonsandiego.com/rss/headlines/smart-living/'),
-        (u'Photos', u'http://www.signonsandiego.com/rss/photos/'),
-        (u'Arts', u'http://www.signonsandiego.com/rss/headlines/night-and-day/theater-arts/'),
-        (u'Books', u'http://www.signonsandiego.com/rss/headlines/lifestyle/books/'),
-        (u'Currents-Passages',
-         u'http://www.signonsandiego.com/rss/headlines/lifestyle/currents/passages/'),
-        (u'Currents-Weekend',
-         u'http://www.signonsandiego.com/news/rss2/daily/currentsweekend.xml'),
-        (u'Dialog', u'http://www.signonsandiego.com/news/rss2/daily/dialog.xml'),
-        (u'Home', u'http://www.signonsandiego.com/rss/headlines/home/'),
-        (u'Homescape', u'http://www.signonsandiego.com/rss/headlines/lifestyle/homescape/'),
-        (u'Night & Day', u'http://www.signonsandiego.com/news/rss2/daily/nightday.xml'),
-        (u'Opinion', u'http://www.signonsandiego.com/rss/headlines/opinion/'),
-        (u'Quest', u'http://www.signonsandiego.com/news/rss2/daily/quest.xml'),
-        (u'Travel', u'http://www.signonsandiego.com/news/rss2/daily/travel.xml'),
-        (u'Wheels', u'http://www.signonsandiego.com/news/rss2/daily/wheels.xml')
+        (u'Latest News',
+         u'http://www.sandiegouniontribune.com/latest/rss2.0.xml'),
+        (u'Business',
+         u'http://www.sandiegouniontribune.com/business/rss2.0.xml'),
+        (u'Politics',
+         u'http://www.sandiegouniontribune.com/news/politics/rss2.0.xml'),
+        (u'Immigration',
+         u'http://www.sandiegouniontribune.com/news/immigration/rss2.0.xml'),
+        (u'Courts',
+         u'http://www.sandiegouniontribune.com/news/public-safety/rss2.0.xml'),
+        (u'Education',
+         u'http://www.sandiegouniontribune.com/news/education/rss2.0.xml'),
+        (u'Sports',
+         u'http://www.sandiegouniontribune.com/sports/rss2.0.xml'),
+        (u'Chargers',
+         u'http://www.sandiegouniontribune.com/sports/chargers/rss2.0.xml'),
+        (u'Padres',
+         u'http://www.sandiegouniontribune.com/sports/padres/rss2.0.xml'),
+        (u'NFL',
+         u'http://www.sandiegouniontribune.com/sports/nfl/rss2.0.xml'),
+        (u'NBA',
+         u'http://www.sandiegouniontribune.com/sports/nba/rss2.0.xml'),
+        (u'Photos',
+         u'http://www.sandiegouniontribune.com/visuals/rss2.0.xml'),
+        (u'Entertainment',
+         u'http://www.sandiegouniontribune.com/entertainment/rss2.0.xml'),
+        (u'Books',
+         u'http://www.sandiegouniontribune.com/entertainment/books/rss2.0.xml'),
+        (u'Opinion',
+         u'http://www.sandiegouniontribune.com/opinion/rss2.0.xml'),
+        (u'Travel',
+         u'http://www.sandiegouniontribune.com/lifestyle/travel/rss2.0.xml'),
     ]
@@ -28,5 +28,5 @@ class Starbulletin(BasicNewsRecipe):
         (u'Business', u'http://www.staradvertiser.com/business/feed/'),
         (u'Sports', u'http://www.staradvertiser.com/sports/feed/'),
         (u'Features',
-         u'http://www.staradvertiser.com/featurespremium/index.rss')
+         u'http://www.staradvertiser.com/features/feed/')
     ]
@@ -1,97 +0,0 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-import re
-
-
-class TelevisionWithoutPity(BasicNewsRecipe):
-    title = u'Television Without Pity'
-    language = 'en'
-    __author__ = 'Snarkastica'
-    # Used for pulling down an entire show, not just the RSS feed
-    SHOW = 'http://www.televisionwithoutpity.com/show/SHOW-NAME-HERE/recaps/'
-    oldest_article = 7  # days
-    max_articles_per_feed = 25
-    # reverse_article_order=True # Useful for entire show, to display in episode order
-    use_embedded_content = False
-
-    preprocess_regexps = [(re.compile(r'<span class="headline_recap_title .*?>',
-                                      re.DOTALL | re.IGNORECASE), lambda match: '<span class="headline_recap_title">')]
-    keep_only_tags = [dict(name='span', attrs={'class': 'headline_recap_title'}), dict(
-        name='p', attrs={'class': 'byline'}), dict(name='div', attrs={'class': 'body_recap'}), dict(name='h1')]
-    no_stylesheets = True
-
-    # Comment this out and configure process_index() to retrieve a single show
-    feeds = [
-        ('Ltest Recaps',
-         'http://www.televisionwithoutpity.com/rss.xml'),
-    ]
-
-    '''
-    This method can be used to grab all recaps for a single show
-    Set the SHOW constant at the beginning of this file to the URL for a show's recap page
-    (the page listing all recaps, usually of the form:
-    http://www.televisionwithoutpity.com/show/SHOW-NAME/recaps/"
-    Where SHOW-NAME is the hyphenated name of the show.
-
-    To use:
-    1. Comment out feeds = [...] earlier in this file
-    2. Set the SHOW constant to the show's recap page
-    3. Uncomment the following function
-    '''
-
-    '''
-    def parse_index(self):
-        soup = self.index_to_soup(self.SHOW)
-        feeds = []
-        articles = []
-        showTitle = soup.find('h1').string
-        recaps = soup.find('table')
-        for ep in recaps.findAll('tr'):
-            epData = ep.findAll('td')
-            epNum = epData[0].find(text=True).strip()
-            if not epNum == "Ep.":
-                epT = self.tag_to_string(epData[1].find('em')).strip()
-                epST = " (or " + self.tag_to_string(epData[1].find('h3')).strip() + ")"
-                epTitle = epNum + ": " + epT + epST
-                epData[1].find('em').extract()
-                epURL = epData[1].find('a', href=True)
-                epURL = epURL['href']
-                epSum = self.tag_to_string(epData[1].find('p')).strip()
-                epDate = epData[2].find(text=True).strip()
-                epAuthor = self.tag_to_string(epData[4].find('p')).strip()
-                articles.append({'title':epTitle, 'url':epURL, 'description':epSum, 'date':epDate, 'author':epAuthor})
-        feeds.append((showTitle, articles))
-        #self.abort_recipe_processing("test")
-        return feeds
-    '''
-
-    # This will add subsequent pages of multipage recaps to a single article
-    # page
-    def append_page(self, soup, appendtag, position):
-        # If false, will still grab single-page recaplets
-        if (soup.find('p', attrs={'class': 'pages'})):
-            pager = soup.find('p', attrs={'class': 'pages'}).find(text='Next')
-            if pager:
-                nexturl = pager.parent['href']
-                soup2 = self.index_to_soup(nexturl)
-                texttag = soup2.find('div', attrs={'class': 'body_recap'})
-                for it in texttag.findAll(style=True):
-                    del it['style']
-                newpos = len(texttag.contents)
-                self.append_page(soup2, texttag, newpos)
-                texttag.extract()
-                appendtag.insert(position, texttag)
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body, 3)
-        return soup
-
-    # Remove the multi page links (we had to keep these in for append_page(), but they can go away now
-    # Could have used CSS to hide, but some readers ignore CSS.
-    def postprocess_html(self, soup, first_fetch):
-        paginator = soup.findAll('p', attrs={'class': 'pages'})
-        if paginator:
-            for p in paginator:
-                p.extract()
-
-                # TODO: Fix this so it converts the headline class into a heading 1
-        return soup