diff --git a/recipes/independent.recipe b/recipes/independent.recipe
index fc6bacce57..6bbdfec8be 100644
--- a/recipes/independent.recipe
+++ b/recipes/independent.recipe
@@ -1,504 +1,34 @@
 # adapted from old recipe by Darko Miletic
-import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag, NavigableString
-
 class TheIndependentNew(BasicNewsRecipe):
-    # flag to enable/disable article graphics on business pages/some others
-    # eg http://www.independent.co.uk/news/world/europe/berlusconi-departure-fails-to-calm-the-markets-6259682.html
-    # -max dimensions can be altered using the .pictureContainer img selector in the css
-    _FETCH_ARTICLE_GRAPHICS = True
-
-    #Flag to enable/disable image fetching (not business)
-    _FETCH_IMAGES = True
-
-    #Set max gallery images here (respects _FETCH_IMAGES)
-    # -1 for infinite
-    _MAX_GALLERY_IMAGES = -1
-
-
-    #used for converting rating to stars
+    # used for converting rating to stars
     _STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png'
     _NO_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star_grey.png'
-
     title = u'The Independent'
-    __author__ = 'Will'
+    __author__ = 'Krittika Goyal'
     description = 'The latest in UK News and World News from The \
 Independent. Wide range of international and local news, sports \
 news, commentary and opinion pieces.Independent News - Breaking news \
 that matters. Your daily comprehensive news source - The \
 Independent Newspaper'
     publisher = 'The Independent'
+    oldest_article = 2.0
+    ignore_duplicate_articles = {'title', 'url'}
+    remove_empty_feeds = True
     category = 'news, UK'
     no_stylesheets = True
     use_embedded_content = False
     remove_empty_feeds = True
+    auto_cleanup = True
     language = 'en_GB'
     publication_type = 'newspaper'
     masthead_url = 'http://www.independent.co.uk/independent.co.uk/editorial/logo/independent_Masthead.png'
     encoding = 'utf-8'
     compress_news_images = True
-    remove_tags =[
-        dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
-        dict(attrs={'class' : ['autoplay','openBiogPopup']}),
-        dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
-        dict(name='img',attrs={'alt' : ['view gallery']}),
-        dict(attrs={'style' : re.compile('.*')}),
-        dict(attrs={'class':lambda x: x and 'voicesRelatedTopics' in x.split()}),
-    ]
-
-    keep_only_tags =[dict(attrs={'id':['main','top']})]
-    recursions = 0
-
-    # fixes non compliant html nesting and 'marks' article graphics links
-    preprocess_regexps = [
-        (re.compile('(?P<nested>.*?)', re.DOTALL),
-         lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
-        (re.compile('(.*?[Cc]lick.*?)', re.DOTALL),
-         lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
-    ]
-
-
-    conversion_options = {
-        'comment'   : description
-        , 'tags'     : category
-        , 'publisher' : publisher
-        , 'language'  : language
-    }
-
-    extra_css = """
-        h1{font-family: Georgia,serif ; font-size: x-large; }
-        body{font-family: Verdana,Arial,Helvetica,sans-serif}
-        img{margin-bottom: 0.4em; display:block}
-        .starRating img {float: left}
-        .starRating {margin-top:0.4em; display: block}
-        .image {clear:left; font-size: x-small; color:#888888;}
-        .articleByTimeLocation {font-size: x-small; color:#888888;
-            margin-bottom:0.2em ; margin-top:0.2em ; display:block}
-        .subtitle {clear:left ;}
-        .column-1 h1 { color: #191919}
-        .column-1 h2 { color: #333333}
-        .column-1 h3 { color: #444444}
-        .subtitle { color: #777777; font-size: medium;}
-        .column-1 a,h1,h2,h3 { margin: 0; }
-        .column-1 div{margin: 0;}
-        .articleContent {display: block; clear:left;}
-        .articleContent {color: #000000; font-size: medium;}
-        .ivDrip-section {color: #000000; font-size: medium;}
-        .datetime {color: #888888}
-        .title {font-weight:bold;}
-        .storyTop{}
-        .pictureContainer img { max-width: 400px; max-height: 400px;}
-        .image img { max-width: 400px; max-height: 400px;}
-    """
-
-    oldest_article = 1
-    max_articles_per_feed = 100
-
-    _processed_urls = []
-
-
-    def get_article_url(self, article):
-        url = super(self.__class__,self).get_article_url(article)
-
-        title = article.get('title', None)
-        if title and re.search("^Video:",title):
-            return None
-
-        #remove duplicates
-        if not (url in self._processed_urls):
-            self._processed_urls.append(url)
-        else:
-            url = None
-        return url
-
-    def populate_article_metadata(self, article, soup, first):
-        if first and hasattr(self, 'add_toc_thumbnail'):
-            picdiv = soup.find('img')
-            if picdiv is not None:
-                self.add_toc_thumbnail(article,picdiv['src'])
-
-    def preprocess_html(self, soup):
-
-        #remove 'advertorial articles'
-        strapline = soup.find('div',attrs={'class' : re.compile('.*strapLine.*')})
-        if strapline:
-            for para in strapline.findAll('p'):
-                if len(para.contents) and isinstance(para.contents[0],NavigableString) \
-                        and para.contents[0] == 'ADVERTORIAL FEATURE':
-                    return None
-
-        # remove Suggested Topics
-        items_to_extract = []
-
-        for item in soup.findAll('div',attrs={'class' : re.compile('.*RelatedArtTag.*')}):
-            items_to_extract.append(item)
-
-        for item in items_to_extract:
-            item.extract()
-
-        items_to_extract = []
-        slideshow_elements = []
-
-        for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
-            remove = True
-            pattern = re.compile('((articleContent)|(title))$')
-            if (pattern.search(item['class'])) is not None:
-                remove = False
-
-            # corrections
-            # story content always good
-            pattern = re.compile('storyContent')
-            if (pattern.search(item['class'])) is not None:
-                remove = False
-
-            #images
-            pattern = re.compile('slideshow')
-            if (pattern.search(item['class'])) is not None:
-                if self._FETCH_IMAGES:
-                    remove = False
-                    slideshow_elements.append(item)
-                else:
-                    remove = True
-
-            #social widgets always bad
-            pattern = re.compile('socialwidget')
-            if (pattern.search(item['class'])) is not None:
-                remove = True
-
-            if remove:
-                items_to_extract.append(item)
-
-        for item in items_to_extract:
-            item.extract()
-
-        items_to_extract = []
-
-        if self._FETCH_IMAGES:
-            for element in slideshow_elements:
-                for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
-                    if item.img is not None:
-                        #use full size image
-                        images = []
-
-                        img = item.findNext('img')
-
-                        if not '?action=gallery' in item['href']:
-                            img['src'] = item['href']
-                            tag = Tag(soup,'h3')
-                            text = ''
-                            try:
-                                text = img['data-title']
-                            except:
-                                pass
-
-                            if img.get('title') and (len(img['title']) > 1):
-                                text = NavigableString(img['title'])
-                            tag.insert(0,text)
-                            images.append((img, tag))
-                        else:
-                            gallery_images, remove_link = self._get_gallery_images(item['href'])
-                            images = images + gallery_images
-                            if remove_link:
-                                gal_link = soup.find('a',attrs={'id' : 'view-gallery'})
-                                if gal_link:
-                                    gal_link.extract()
-                            img.extract()
-                        for (img, title) in images:
-                            #insert caption if available
-                            if title:
-                                #picture before text
-                                img.extract()
-                                item.insert(0,img)
-                                item.insert(1,title)
-
-                        # remove link
-                        item.name = "div"
-                        item["class"]='image'
-                        del item["href"]
-
-
-        #remove empty subtitles
-        """
-        currently the subtitle is located in first paragraph after
-        sibling <h3 class="subtitle"> tag. This may be 'fixed' at
-        some point.
-        """
-        subtitle = soup.find('h3',attrs={'class' : 'subtitle'})
-        if subtitle is not None:
-            subtitleText = subtitle.findNext('p')
-            if subtitleText is not None:
-                if len(subtitleText.contents[0]) <= 1 :
-                    subtitleText.extract()
-                    subtitle.extract()
-
-
-        #replace rating numbers with stars
-        for item in soup.findAll('div',attrs={ 'class' : 'starRating'}):
-            if item is not None:
-                soup2 = self._insertRatingStars(soup,item)
-                if soup2 is not None:
-                    soup = soup2
-
-
-        #remove empty paragraph tags in storyTop which can leave a space
-        #between first paragraph and rest of story
-        nested_content = False
-        storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
-        for item in storyTop.findAll('p'):
-            for nested in item:
-                if isinstance(nested, Tag):
-                    nested_content = True
-                    break
-            if not nested_content and item.contents is not None and len(item.contents[0]) <= 1 :
-                items_to_extract.append(item)
-
-        for item in items_to_extract:
-            item.extract()
-
-        items_to_extract = []
-
-
-        #remove line breaks immediately next to tags with default margins
-        #to prevent double line spacing and narrow columns of text
-        storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
-        self._remove_undesired_line_breaks_from_tag(storyTop,soup)
-
-
-        #replace article graphics link with the graphics themselves
-        if self._FETCH_ARTICLE_GRAPHICS:
-            items_to_insert = []
-            for item in soup.findAll('div', attrs={'class' : ['article-graphic']}):
-                strong = item.find('strong')
-                if not strong:
-                    continue
-                for child in strong:
-                    if isinstance(child,Tag):
-                        if str(child.name) == 'a':
-                            items_to_insert.extend(self._get_article_graphic(strong,child['href'],soup))
-
-            for item in items_to_insert:
-                item[0].replaceWith(item[1])
-
-        for item in items_to_extract:
-            item.extract()
-
-        return soup
-
-
-    def _get_article_graphic(self,old_item,url,soup):
-
-        items_to_insert = []
-
-        if re.search('\.jpg$',str(url)):
-            div = Tag(soup,'div')
-            div['class'] = 'pictureContainer'
-            img = Tag(soup,'img')
-            img['src'] = url
-            img['alt'] = 'article graphic'
-            div.insert(0,img)
-            items_to_insert.append((old_item,div,))
-            return items_to_insert
-
-        soup2 = self.index_to_soup(url)
-        for item in soup2.findAll('div',attrs={'class' : re.compile("widget picture article.*")}):
-            items_to_insert.append((old_item,item),)
-        return items_to_insert
-
-
-    def _insertRatingStars(self,soup,item):
-        if item.contents is None or len(item.contents) < 1:
-            return
-        rating = item.contents[0]
-
-        try:
-            rating = float(item.contents[0])
-        except:
-            print 'Could not convert decimal rating to star: malformatted float.'
-            return
-        for i in range(1,6):
-            star = Tag(soup,'img')
-            if i <= rating:
-                star['src'] = self._STAR_URL
-            else:
-                star['src'] = self._NO_STAR_URL
-            star['alt'] = 'star number ' + str(i)
-            item.insert(i,star)
-        #item.contents[0] = NavigableString('(' + str(rating) + ')')
-        item.contents[0] = ''
-
-    def postprocess_html(self,soup, first_fetch):
-
-        #mark subtitle parent as non-compliant nesting causes
-        # p's to be 'popped out' of the h3 tag they are nested in.
-        subtitle = soup.find('h3', attrs={'class' : 'subtitle'})
-        subtitle_div = None
-        if subtitle:
-            subtitle_div = subtitle.parent
-        if subtitle_div:
-            clazz = ''
-            if 'class' in subtitle_div:
-                clazz = subtitle_div['class'] + ' '
-            clazz = clazz + 'subtitle'
-            subtitle_div['class'] = clazz
-
-        #find broken images and remove captions
-        items_to_extract = []
-        for item in soup.findAll('div', attrs={'class' : 'image'}):
-            img = item.findNext('img')
-            if img and img.get('src'):
-                # broken images still point to remote url
-                pattern = re.compile('http://www.independent.co.uk.*')
-                if pattern.match(img["src"]) is not None:
-                    caption = img.findNextSibling('h3')
-                    if caption is not None:
-                        items_to_extract.append(caption)
-                    items_to_extract.append(img)
-
-        for item in items_to_extract:
-            item.extract()
-
-        # nickredding's fix for non-justified text
-        for ptag in soup.findAll('p',attrs={'align':'left'}):
-            del(ptag['align'])
-
-        return soup
-
-    def _get_gallery_images(self,url):
-        gallery_soup = self.index_to_soup(url)
-        images = []
-        remove_link = True
-        total = 1
-        try:
-            counter = gallery_soup.find('div',attrs={'id' : ['counter']})
-            total = counter.contents[0].split('/')
-            total = int(total[1].rstrip())
-        except:
-            total = 1
-
-        if self._MAX_GALLERY_IMAGES >= 0 and total > self._MAX_GALLERY_IMAGES:
-            total = self._MAX_GALLERY_IMAGES
-            remove_link = False
-
-        for i in range(1, total +1):
-            image, title = self._get_image_from_gallery(gallery_soup)
-            if image:
-                images.append((image,title))
-            next = url + '&ino=' + str(i + 1)
-            gallery_soup = self.index_to_soup(next)
-        images.reverse()
-        return images, remove_link
-
-    def _get_image_from_gallery(self,soup):
-        try:
-            container = soup.find('div',attrs={'id' : ['main-image']})
-            image = container.find('img')
-            if image:
-                title = soup.find('div',attrs={'id' : ['image-title']})
-                return image, title
-        except:
-            print 'error fetching gallery image'
-        return None
-
-
-
-    def _recurisvely_linearise_tag_tree(
-        self,
-        item,
-        linearised= None,
-        count=0,
-        limit = 100
-    ):
-        linearised = linearised or []
-        count = count + 1
-        if count > limit:
-            return linearised
-        if not (isinstance(item,Tag)):
-            return linearised
-        for nested in item:
-            linearised.append(nested)
-            linearised = self._recurisvely_linearise_tag_tree(nested,linearised, count)
-        return linearised
-
-
-    def _get_previous_tag(self,current_index, tag_tree):
-        if current_index == 0:
-            return None
-        else:
-            return tag_tree[current_index - 1]
-
-
-    def _get_next_tag(self,current_index, tag_tree):
-        if current_index < len(tag_tree) - 1:
-            return tag_tree[current_index + 1]
-        else:
-            return None
-
-
-    def _list_match(self,test_str, list_regex):
-        for regex in list_regex:
-            match = re.match(regex, test_str)
-            if match is not None:
-                return True
-        return False
-
-    def _remove_undesired_line_breaks_from_tag(self,parent,soup):
-
-        if parent is None:
-            return
-
-
-        tag_tree = self._recurisvely_linearise_tag_tree(parent)
-        items_to_remove = []
-
-
-        for item in tag_tree:
-            if item == u'\n':
-                items_to_remove.append(item)
-                continue;
-
-        for item in items_to_remove:
-            tag_tree.remove(item)
-
-
-        spaced_tags = [r'p', r'h\d', r'blockquote']
-        tags_to_extract = []
-        tags_to_replace = []
-        for (i, tag) in enumerate(tag_tree):
-            if isinstance(tag, Tag):
-                if str(tag) == '<br />':
-                    previous_tag = self._get_previous_tag(i, tag_tree)
-
-                    if isinstance(previous_tag, Tag):
-                        previous_tag_is_spaced = previous_tag is not None\
-                            and self._list_match(str(previous_tag.name),
-                                spaced_tags)
-                    else:
-                        previous_tag_is_spaced = False
-
-                    next_tag = self._get_next_tag(i, tag_tree)
-
-                    if isinstance(next_tag, Tag):
-                        next_tag_is_spaced = next_tag is not None\
-                            and self._list_match(str(next_tag.name), spaced_tags)
-                    else:
-                        next_tag_is_spaced = False
-
-                    if previous_tag_is_spaced or next_tag_is_spaced or i == 0\
-                        or i == len(tag_tree) - 1:
-                        tags_to_extract.append(tag)
-                    else:
-                        tags_to_replace.append((tag,NavigableString(' '),))
-
-
-        for pair in tags_to_replace:
-            pair[0].replaceWith(pair[1])
-        for tag in tags_to_extract:
-            tag.extract()
     feeds = [
         (u'News - UK',
@@ -610,3 +140,4 @@ class TheIndependentNew(BasicNewsRecipe):
          u'http://www.independent.co.uk/extras/indybest/?service=rss'),
     ]
+