mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-24 15:28:53 -04:00 
			
		
		
		
	Update New Musical Express
This commit is contained in:
		
							parent
							
								
									aafeb58d58
								
							
						
					
					
						commit
						6f986d0488
					
				| @ -1,25 +1,91 @@ | |||||||
|  | import re | ||||||
| from calibre.web.feeds.news import BasicNewsRecipe | from calibre.web.feeds.news import BasicNewsRecipe | ||||||
| from calibre import browser | from calibre import browser | ||||||
|  | from calibre.ebooks.BeautifulSoup import BeautifulSoup | ||||||
|  | from calibre.ebooks.BeautifulSoup import Tag | ||||||
| class AdvancedUserRecipe1306061239(BasicNewsRecipe): | class AdvancedUserRecipe1306061239(BasicNewsRecipe): | ||||||
|     title          = u'New Musical Express Magazine' |     title          = u'New Musical Express Magazine' | ||||||
|     description = 'Author D.Asbury. UK Rock & Pop Mag. ' |     description = 'UK Rock & Pop Mag.' | ||||||
|     __author__ = 'Dave Asbury' |     __author__ = 'Dave Asbury, Inge Aning' | ||||||
|     # last updated 17/5/13 News feed url altered |     category = 'Music, Film, Tv' | ||||||
|  |     publisher = 'Time Inc. (UK) Ltd.' | ||||||
|  |     ''' | ||||||
|  |     ' updated 11/3/2015 | ||||||
|  |     ' feeds url | ||||||
|  |     ' cover and masthead url | ||||||
|  |     ' fix for a bug that prevents some pages from rendering | ||||||
|  |     ' changes to website | ||||||
|  |     ''' | ||||||
|  | 
 | ||||||
|     remove_empty_feeds = True |     remove_empty_feeds = True | ||||||
|     remove_javascript     = True |     encoding  = 'utf-8' | ||||||
|  |     remove_javascript = True | ||||||
|     no_stylesheets = True |     no_stylesheets = True | ||||||
|     oldest_article = 7 |     oldest_article = 7 | ||||||
|     max_articles_per_feed = 20 |     max_articles_per_feed = 20 | ||||||
|     #auto_cleanup = True |     auto_cleanup = False | ||||||
|     language = 'en_GB' |     language = 'en' | ||||||
|     compress_news_images = True |     compress_news_images = True | ||||||
|  |     simultaneous_downloads  = 20 | ||||||
|  |     use_embedded_content = False | ||||||
|  |     recursions = 0 | ||||||
|  | 
 | ||||||
|  |     conversion_options = { | ||||||
|  |                           'comment'   : description | ||||||
|  |                         , 'tags'      : category | ||||||
|  |                         , 'publisher' : publisher | ||||||
|  |                         , 'language'  : language | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     feeds          = [ | ||||||
|  |                     (u'NME News',u'http://www.nme.com/rss/news'), | ||||||
|  |                     (u'Reviews',u'http://www.nme.com/rss/reviews'), | ||||||
|  |                     (u'Blogs',u'http://www.nme.com/rss/blogs'), | ||||||
|  |     ] | ||||||
|  | 
 | ||||||
|  |     keep_only_tags = [ | ||||||
|  |         dict(name='div',attrs={'id':'content'}), | ||||||
|  |     ] | ||||||
|  | 
 | ||||||
|  |     remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', | ||||||
|  |                           'valign', 'vspace', 'hspace', 'alt', 'width', 'height'] | ||||||
|  | 
 | ||||||
|  |     remove_tags = [ | ||||||
|  |             dict(name='meta'), | ||||||
|  |             dict(name='span',attrs={'class':'article_info'}), | ||||||
|  |             dict(name='div',attrs={'class':'breadcrumbs'}), | ||||||
|  |             dict(name='div',attrs={'class':'mugshot'}), | ||||||
|  |             dict(name='div',attrs={'class':'header'}), | ||||||
|  |             dict(name='div',attrs={'class':re.compile('youtube.*',re.IGNORECASE)}), | ||||||
|  |             dict(name='div',attrs={'class':re.compile('socialbuttons.*',re.IGNORECASE)}), | ||||||
|  |             dict(name='div',attrs={'class':'clear_both'}), | ||||||
|  |             dict(name='div',attrs={'class':re.compile('headline.*',re.IGNORECASE)}), | ||||||
|  |             dict(name='div',attrs={'class':'member-signedout'}), | ||||||
|  |             dict(name='div',attrs={'class':re.compile('prev_next.*',re.IGNORECASE)}), | ||||||
|  |             dict(name='div',attrs={'class':re.compile('article_related.*',re.IGNORECASE)}), | ||||||
|  |             dict(name='div',attrs={'class':re.compile('feature_bar.*',re.IGNORECASE)}), | ||||||
|  |             dict(name='div',attrs={'class':re.compile('ebay.*',re.IGNORECASE)}), | ||||||
|  |             dict(name='div',attrs={'id':re.compile('morenews.*',re.IGNORECASE)}), | ||||||
|  |             dict(name='div',attrs={'id':re.compile('ticketspopup.*',re.IGNORECASE)}), | ||||||
|  |             dict(name='div',attrs={'id':re.compile('ratemy_logprompt.*',re.IGNORECASE)}), | ||||||
|  |             dict(name='div',attrs={'id':re.compile('related_artist.*',re.IGNORECASE)}), | ||||||
|  |             dict(name='img',attrs={'class':re.compile('video_play_large.*',re.IGNORECASE)}), | ||||||
|  |             dict(name='ul',attrs={'class':re.compile('prev_next.*',re.IGNORECASE)}), | ||||||
|  |             dict(name='ul',attrs={'class':re.compile('nme_store.*',re.IGNORECASE)}), | ||||||
|  |             dict(name='p',attrs={'class':re.compile('top',re.IGNORECASE)}), | ||||||
|  |             dict(name='table',attrs={'class':re.compile('tickets.*',re.IGNORECASE)}), | ||||||
|  |     ] | ||||||
|  | 
 | ||||||
|  |     masthead_url   = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg' | ||||||
|  | 
 | ||||||
|     def get_cover_url(self): |     def get_cover_url(self): | ||||||
|         soup = self.index_to_soup('http://www.nme.com/component/subscribe') |         magazine_page_raw = self.index_to_soup('http://www.nme.com/magazine', raw=True) | ||||||
|         cov = soup.find(attrs={'id' : 'magazine_cover'}) |         magazine_page_raw = re.sub(r'<script\b.+?</script>', '', magazine_page_raw, flags=re.DOTALL|re.IGNORECASE) | ||||||
|  |         magazine_page_raw = re.sub(r'\!\[if ', '!--[if ', magazine_page_raw, flags=re.DOTALL|re.IGNORECASE) | ||||||
|  |         magazine_page = self.index_to_soup(magazine_page_raw) | ||||||
|  |         cov = magazine_page.find('img',attrs={'class':'magcover'}) | ||||||
| 
 | 
 | ||||||
|         cov2 = str(cov['src']) |         cov2 = str(cov['src']) | ||||||
|         # print '**** Cov url =*', cover_url,'***' |  | ||||||
|         #print '**** Cov url =*','http://www.magazinesdirect.com/article_images/articledir_3138/1569221/1_largelisting.jpg','***' |  | ||||||
| 
 | 
 | ||||||
|         br = browser() |         br = browser() | ||||||
|         br.set_handle_redirect(False) |         br.set_handle_redirect(False) | ||||||
| @ -27,43 +93,291 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe): | |||||||
|             br.open_novisit(cov2) |             br.open_novisit(cov2) | ||||||
|             cover_url = str(cov2) |             cover_url = str(cov2) | ||||||
|         except: |         except: | ||||||
|                 cover_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg' |             cover_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg' | ||||||
|         return cover_url |         return cover_url | ||||||
|     masthead_url   = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg' |  | ||||||
| 
 | 
 | ||||||
|     remove_tags = [ |     def preprocess_raw_html(self, raw_html, url): | ||||||
|             dict(attrs={'class':'clear_icons'}), |         ''' | ||||||
|             dict(attrs={'class':'share_links'}), |         Needed to work around a bug on the site that prevents blog posts from being parsed correctly | ||||||
|                             dict(attrs={'id':'right_panel'}), |         ''' | ||||||
|             dict(attrs={'class':'today box'}), |         raw_html = re.sub(r'\!\[if ', '!--[if ', raw_html, flags=re.DOTALL|re.IGNORECASE) | ||||||
| 
 | 
 | ||||||
|  |         return raw_html | ||||||
| 
 | 
 | ||||||
|                       ] |     def preprocess_html(self, soup): | ||||||
|  |         youtube_regex = re.compile(r'(?:youtube\.com/(?:[^/]+/.+/|(?:v|e(?:mbed)?)/|.*[?&]v=)|youtu\.be/)(?P<id>[^"&?/ ]{11})', re.DOTALL|re.IGNORECASE) | ||||||
|  |         instagram_regex = re.compile(r'.*?instagram.*?', re.DOTALL|re.IGNORECASE) | ||||||
|  |         twitter_regex = re.compile(r'.*?twitter.*?', re.DOTALL|re.IGNORECASE) | ||||||
|  |         visualise_regex = re.compile(r'.*?visualise.*?', re.DOTALL|re.IGNORECASE) | ||||||
|  |         soundcloud_regex = re.compile(r'(?P<url>.*?(w|api)\.soundcloud.*?com\/(tracks|playlists)\/\d{8,9})', re.DOTALL|re.IGNORECASE) | ||||||
|  |         dailymotion_regex = re.compile(r'.*?dailymotion.*?', re.DOTALL|re.IGNORECASE) | ||||||
|  |         spotify_regex = re.compile(r'.*?spotify.*?', re.DOTALL|re.IGNORECASE) | ||||||
|  |         vine_regex = re.compile(r'.*?vine.*?', re.DOTALL|re.IGNORECASE) | ||||||
|  |         doubleHtmlEntities = re.compile(ur'(&)(?P<e>[\d\w\#]*;)', re.DOTALL|re.IGNORECASE|re.UNICODE) | ||||||
|  |         for iframe in soup.findAll('iframe'): | ||||||
|  |             if iframe.has_key('src') and youtube_regex.search(iframe['src']) is not None : | ||||||
|  |                 pq = Tag(soup, 'blockquote') | ||||||
|  |                 br = Tag(soup, 'br') | ||||||
|  |                 pq.insert(0, '[ YouTube ]') | ||||||
|  |                 pq.insert(1, br) | ||||||
|  |                 m = youtube_regex.search(iframe['src']) | ||||||
|  |                 if m.group('id') is not None: | ||||||
|  |                     imgTag = Tag(soup, 'img', [('src','http://img.youtube.com/vi/' + m.group('id') + '/0.jpg')]) | ||||||
|  |                     pq.insert(len(pq.contents), imgTag) | ||||||
|  |                 pq.insert(len(pq.contents), iframe['src']) | ||||||
|  |                 iframe.replaceWith(pq) | ||||||
|  |             elif iframe.has_key('src') and soundcloud_regex.search(iframe['src']) is not None: | ||||||
|  |                 m = soundcloud_regex.search(iframe['src']) | ||||||
|  |                 pq = Tag(soup, 'blockquote') | ||||||
|  |                 br = Tag(soup, 'br') | ||||||
|  |                 pq.insert(0, '[ SoundCloud ]') | ||||||
|  |                 pq.insert(1, br) | ||||||
|  |                 pq.insert(2, m.group('url')) | ||||||
|  |                 #imgUrl = self.get_soundcloud_pic(iframe['src']) | ||||||
|  |                 iframe.replaceWith(pq) | ||||||
|  |             elif iframe.has_key('src') and dailymotion_regex.search(iframe['src']) is not None: | ||||||
|  |                 pq = Tag(soup, 'blockquote') | ||||||
|  |                 br = Tag(soup, 'br') | ||||||
|  |                 pq.insert(0, '[ DailyMotion ]') | ||||||
|  |                 pq.insert(1, br) | ||||||
|  |                 imgUrl = self.get_dailymotion_pic(iframe['src']) | ||||||
|  |                 if imgUrl is not None: | ||||||
|  |                     imgTag = Tag(soup, 'img', [('src',imgUrl)]) | ||||||
|  |                     pq.insert(len(pq.contents), imgTag) | ||||||
|  |                 pq.insert(len(pq.contents), iframe['src']) | ||||||
|  |                 iframe.replaceWith(pq) | ||||||
|  |             elif iframe.has_key('src') and spotify_regex.search(iframe['src']) is not None: | ||||||
|  |                 pq = Tag(soup, 'blockquote') | ||||||
|  |                 br = Tag(soup, 'br') | ||||||
|  |                 pq.insert(0, '[ Spotify ]') | ||||||
|  |                 pq.insert(1, br) | ||||||
|  |                 imgUrl = self.get_spotify_pic(iframe['src']) | ||||||
|  |                 if imgUrl is not None: | ||||||
|  |                     imgTag = Tag(soup, 'img', [('src',imgUrl)]) | ||||||
|  |                     pq.insert(len(pq.contents), imgTag) | ||||||
|  |                 pq.insert(len(pq.contents), iframe['src']) | ||||||
|  |                 iframe.replaceWith(pq) | ||||||
|  |             elif iframe.has_key('src') and vine_regex.search(iframe['src']) is not None: | ||||||
|  |                 pq = Tag(soup, 'blockquote') | ||||||
|  |                 br = Tag(soup, 'br') | ||||||
|  |                 pq.insert(0, '[ Vine ]') | ||||||
|  |                 pq.insert(1, br) | ||||||
|  |                 imgUrl = self.get_vine_pic(iframe['src']) | ||||||
|  |                 if imgUrl is not None: | ||||||
|  |                     imgTag = Tag(soup, 'img', [('src',imgUrl)]) | ||||||
|  |                     pq.insert(len(pq.contents), imgTag) | ||||||
|  |                 pq.insert(len(pq.contents), iframe['src']) | ||||||
|  |                 iframe.replaceWith(pq) | ||||||
|  |             elif iframe.has_key('src') and visualise_regex.search(iframe['src']) is not None : | ||||||
|  |                 imgUrl = self.get_visualise_pic(iframe['src']) | ||||||
|  |                 if imgUrl is not None: | ||||||
|  |                     imgTag = Tag(soup, 'img', [('src',imgUrl)]) | ||||||
|  |                     iframe.replaceWith(imgTag) | ||||||
|  |         for blockquote in soup.findAll('blockquote'): | ||||||
|  |             if blockquote.has_key('class') and twitter_regex.search(blockquote['class']) is not None: | ||||||
|  |                 pq = Tag(soup, 'blockquote') | ||||||
|  |                 br = Tag(soup, 'br') | ||||||
|  |                 pq.insert(0, '[ Twitter ]') | ||||||
|  |                 pq.insert(len(pq.contents), br) | ||||||
|  |                 match = re.search("(?P<url>pic\.twitter[^\s<]+)", str(blockquote)) | ||||||
|  |                 if match is not None: | ||||||
|  |                     img = self.get_twitter_pic(str(match.group("url"))) | ||||||
|  |                     if img is not None: | ||||||
|  |                         pq.insert(len(pq.contents),img) | ||||||
|  |                 for p in blockquote.findAll(name='p'): | ||||||
|  |                     x = 0 | ||||||
|  |                     plen = len(p.contents) | ||||||
|  |                     while True: | ||||||
|  |                         c = len(pq.contents) | ||||||
|  |                         if p.contents[x].string is not None: | ||||||
|  |                             pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2), str(p.contents[x].string), re.IGNORECASE | re.UNICODE)) | ||||||
|  |                         else: | ||||||
|  |                             pq.insert(c, p.contents[x].content) | ||||||
|  |                         x += 1 | ||||||
|  |                         if x == plen: | ||||||
|  |                             break | ||||||
|  |                     br = Tag(soup, 'br') | ||||||
|  |                     pq.insert(len(pq.contents), br) | ||||||
|  |                     p.extract() | ||||||
|  |                 if len(blockquote.contents) > 0: | ||||||
|  |                     x = 0 | ||||||
|  |                     xlen = len(blockquote.contents) | ||||||
|  |                     while True: | ||||||
|  |                         c = len(pq.contents) | ||||||
|  |                         if blockquote.contents[x].string is not None: | ||||||
|  |                             pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2), str(blockquote.contents[x].string), re.IGNORECASE | re.UNICODE)) | ||||||
|  |                         else: | ||||||
|  |                             pq.insert(c, blockquote.contents[x].content) | ||||||
|  |                         x += 1 | ||||||
|  |                         if x == xlen: | ||||||
|  |                             break | ||||||
|  |                 blockquote.replaceWith(pq) | ||||||
|  |             elif blockquote.has_key('class') and instagram_regex.search(blockquote['class']) is not None: | ||||||
|  |                 pq = Tag(soup, 'blockquote') | ||||||
|  |                 br = Tag(soup, 'br') | ||||||
|  |                 pq.insert(0, '[ Instagram ]') | ||||||
|  |                 pq.insert(1, br) | ||||||
|  |                 a = blockquote.find(name='a',attrs={'href':instagram_regex}) | ||||||
|  |                 imgUrl = None | ||||||
|  |                 if a is not None: | ||||||
|  |                     imgUrl = self.get_instagram_pic(str(a['href'])) | ||||||
|  |                 if imgUrl is not None: | ||||||
|  |                     img = Tag(soup, 'img', [('src',imgUrl)]) | ||||||
|  |                     pq.insert(len(pq.contents),img) | ||||||
|  |                 for p in blockquote.findAll(name='p'): | ||||||
|  |                     x = 0 | ||||||
|  |                     plen = len(p.contents) | ||||||
|  |                     while x < plen: | ||||||
|  |                         c = len(pq.contents) | ||||||
|  |                         if p.contents[x].string is not None: | ||||||
|  |                             pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2), str(p.contents[x].string), re.IGNORECASE | re.UNICODE)) | ||||||
|  |                         # else: | ||||||
|  |                             # pq.insert(c, p.contents[x].content) | ||||||
|  |                         x += 1 | ||||||
|  |                     br = Tag(soup, 'br') | ||||||
|  |                     c = len(pq.contents) | ||||||
|  |                     pq.insert(c, br) | ||||||
|  |                 blockquote.replaceWith(pq) | ||||||
|  |         for alink in soup.findAll('a'): | ||||||
|  |             if alink.string is not None: | ||||||
|  |                 tstr = alink.string | ||||||
|  |                 alink.replaceWith(tstr) | ||||||
|  |             elif alink.img is not None: | ||||||
|  |                 tstr = alink.img | ||||||
|  |                 alink.replaceWith(tstr) | ||||||
|  |             elif alink.span is not None: | ||||||
|  |                 tstr = alink.span | ||||||
|  |                 alink.replaceWith(tstr) | ||||||
|  |         return soup | ||||||
| 
 | 
 | ||||||
|     keep_only_tags = [ |     def get_visualise_pic(self, url): | ||||||
|  |         returnValue = None | ||||||
|  |         try: | ||||||
|  |             raw = self.browser.open(url).read() | ||||||
|  |         except: | ||||||
|  |             print '404: ' + url | ||||||
|  |             return returnValue | ||||||
|  |         bs = BeautifulSoup(raw) | ||||||
|  |         imgRaw = bs.find(name='meta', attrs={'property':'og:image'}) | ||||||
|  |         if imgRaw is not None: | ||||||
|  |             returnValue = str(imgRaw['content']) | ||||||
|  |         return returnValue | ||||||
| 
 | 
 | ||||||
|         dict(name='h1'), |     def get_twitter_pic(self, url): | ||||||
|         #dict(name='h3'), |         returnValue = None | ||||||
|         dict(attrs={'class' :  'BText'}), |         try: | ||||||
|         dict(attrs={'class' :  'Bmore'}), |             raw = self.browser.open('https://' + url).read() | ||||||
|         dict(attrs={'class' : 'bPosts'}), |         except: | ||||||
|         dict(attrs={'class' :  'text'}), |             print '404: ' + url | ||||||
|         dict(attrs={'id' :  'article_gallery'}), |             return returnValue | ||||||
|                         #dict(attrs={'class' :  'image'}), |         bs = BeautifulSoup(raw) | ||||||
|         dict(attrs={'class' :  'article_text'}) |         refresh = bs.find('meta', {'http-equiv':'refresh'}) | ||||||
|  |         if refresh is not None: | ||||||
|  |             content = refresh.get('content').partition('=')[2] | ||||||
|  |             try: | ||||||
|  |                 raw = self.browser.open(content).read() | ||||||
|  |             except: | ||||||
|  |                 print '404: ' + url | ||||||
|  |                 return returnValue | ||||||
|  |             bs = BeautifulSoup(raw) | ||||||
|  |         img = bs.find(name='img',attrs={'alt':re.compile('.*permalink.*',re.IGNORECASE)}) | ||||||
|  |         if img is not None: | ||||||
|  |             returnValue = img | ||||||
|  |         return returnValue | ||||||
| 
 | 
 | ||||||
|  |     def get_soundcloud_pic(self, url): | ||||||
|  |         # content is loaded via JavaScript and requires a login and/or a registered application identifier | ||||||
|  |         # returnValue = None | ||||||
|  |         # raw = self.browser.open(soundcloudUrl + '&visual=true').read() | ||||||
|  |         # bs = BeautifulSoup(raw) | ||||||
|  |         # imgRaw = bs.find(name='div', attrs={'style':re.compile(r'backgroud-image:*?',re.IGNORECASE)}) | ||||||
|  |         # if imgRaw is not None: | ||||||
|  |             # returnValue = str(imgRaw['style']) | ||||||
|  |         return None  # returnValue | ||||||
|  | 
 | ||||||
|  |     def get_instagram_pic(self, url): | ||||||
|  |         returnValue = None | ||||||
|  |         try: | ||||||
|  |             raw = self.browser.open(url).read() | ||||||
|  |         except: | ||||||
|  |             print '404: ' + url | ||||||
|  |             return returnValue | ||||||
|  |         m = re.search('\"display_src\":\"(?P<url>http[s]?:.*?)\"', str(raw)) | ||||||
|  |         if m is not None: | ||||||
|  |             returnValue = re.sub(r'\\','',m.group("url"), flags=re.DOTALL|re.IGNORECASE) | ||||||
|  |         return returnValue | ||||||
|  | 
 | ||||||
|  |     def get_dailymotion_pic(self, url): | ||||||
|  |         returnValue = None | ||||||
|  |         try: | ||||||
|  |             raw = self.browser.open(url).read() | ||||||
|  |         except: | ||||||
|  |             print '404: ' + url | ||||||
|  |             return returnValue | ||||||
|  |         m = re.search('("thumbnail_url\"\:\")(?P<url>http.*?)(\")', str(raw)) | ||||||
|  |         if m is not None: | ||||||
|  |             returnValue = re.sub(r'\\','',m.group("url"), flags=re.DOTALL|re.IGNORECASE) | ||||||
|  |         return returnValue | ||||||
|  | 
 | ||||||
|  |     def get_spotify_pic(self, url): | ||||||
|  |         returnValue = None | ||||||
|  |         try: | ||||||
|  |             raw = self.browser.open(url).read() | ||||||
|  |         except: | ||||||
|  |             print '404: ' + url | ||||||
|  |             return returnValue | ||||||
|  |         m = re.search('data-ca=\"(?P<url>.*?)\"', str(raw)) | ||||||
|  |         if m is not None: | ||||||
|  |             returnValue = m.group("url") | ||||||
|  |         return returnValue | ||||||
|  | 
 | ||||||
|  |     def get_vine_pic(self, url): | ||||||
|  |         returnValue = None | ||||||
|  |         try: | ||||||
|  |             raw = self.browser.open(url).read() | ||||||
|  |         except: | ||||||
|  |             print '404: ' + url | ||||||
|  |             return returnValue | ||||||
|  |         m = re.search('"thumbnail.*?src=\"(?P<url>.*?)\"', str(raw)) | ||||||
|  |         if m is not None: | ||||||
|  |             returnValue = m.group("url") | ||||||
|  |         return returnValue | ||||||
|  | 
 | ||||||
|  |     preprocess_regexps = [ | ||||||
|  |         (re.compile(r'<script\b.+?</script>', re.DOTALL|re.IGNORECASE), lambda h1: ''), | ||||||
|  |         (re.compile(r'<a.* id="buy-tickets-button".*</a>',re.IGNORECASE), lambda h2: ''), | ||||||
|  |         (re.compile(r'<a.* class="gallery.*</a>',re.IGNORECASE), lambda h2: ''), | ||||||
|     ] |     ] | ||||||
| 
 | 
 | ||||||
|     feeds          = [ |  | ||||||
|         (u'NME News', u'http://www.nme.com/news?alt=rss' ), #http://feeds.feedburner.com/nmecom/rss/newsxml?format=xml'), |  | ||||||
|         #(u'Reviews', u'http://feeds2.feedburner.com/nme/SdML'), |  | ||||||
|         (u'Reviews',u'http://feed43.com/1817687144061333.xml'), |  | ||||||
|                         (u'Bloggs',u'http://feed43.com/3326754333186048.xml'), |  | ||||||
| 
 |  | ||||||
| 	] |  | ||||||
|     extra_css = ''' |     extra_css = ''' | ||||||
|                     h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} |                     h1 h2 { | ||||||
|                     h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} |                         font-family:Arial,Helvetica,sans-serif; | ||||||
|                     p{font-family:Arial,Helvetica,sans-serif;font-size:small;} |                         font-weight:bold;font-size:large; | ||||||
|                     body{font-family:Helvetica,Arial,sans-serif;font-size:small;} |                     } | ||||||
| 		''' |                     h3 { | ||||||
|  |                         font-family:Arial,Helvetica,sans-serif; | ||||||
|  |                         font-weight:normal; | ||||||
|  |                         font-size:small; | ||||||
|  |                         font-style:italic; | ||||||
|  |                         display:inline; | ||||||
|  |                     } | ||||||
|  |                     body { | ||||||
|  |                         font-family:Helvetica,Arial,sans-serif; | ||||||
|  |                         font-size:small; | ||||||
|  |                     } | ||||||
|  |                     blockquote { | ||||||
|  |                         font-family:"Courier New", | ||||||
|  |                         Courier, monospace; | ||||||
|  |                         font-size:90%; | ||||||
|  |                     } | ||||||
|  |                     img { | ||||||
|  |                         display:block; | ||||||
|  |                     } | ||||||
|  |                     .date{ | ||||||
|  |                         font-style:italic; | ||||||
|  |                         font-weight:normal; | ||||||
|  |                     } | ||||||
|  |                     .article_header>p:not(.date){ | ||||||
|  |                         font-weight:bold; | ||||||
|  |                     } | ||||||
|  |                 ''' | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user