# adapted from old recipe by Darko Miletic

import re

from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString


class TheIndependentNew(BasicNewsRecipe):
    """calibre recipe that downloads The Independent (independent.co.uk).

    Articles are fetched from the RSS feeds listed in ``feeds``.  Each page
    is cleaned up in ``preprocess_html`` (widgets stripped, slideshow images
    expanded, star ratings rendered as images, article graphics embedded)
    and broken images are dropped in ``postprocess_html``.
    """

    # flag to enable/disable article graphics on business pages/some others
    # eg http://www.independent.co.uk/news/world/europe/berlusconi-departure-fails-to-calm-the-markets-6259682.html
    # -max dimensions can be altered using the .pictureContainer img selector
    #  in the css
    _FETCH_ARTICLE_GRAPHICS = True

    # Flag to enable/disable image fetching (not business)
    _FETCH_IMAGES = True

    # used for converting rating to stars
    _STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png'
    _NO_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star_grey.png'

    title = u'The Independent'
    __author__ = 'Will'
    description = (
        'The latest in UK News and World News from The '
        'Independent. Wide range of international and local news, sports '
        'news, commentary and opinion pieces.Independent News - Breaking news '
        'that matters. Your daily comprehensive news source - The '
        'Independent Newspaper'
    )
    publisher = 'The Independent'
    category = 'news, UK'
    no_stylesheets = True
    use_embedded_content = False
    remove_empty_feeds = True
    language = 'en_GB'
    publication_type = 'newspaper'
    masthead_url = 'http://www.independent.co.uk/independent.co.uk/editorial/logo/independent_Masthead.png'
    encoding = 'utf-8'

    remove_tags = [
        dict(attrs={'id': ['RelatedArtTag', 'renderBiography']}),
        dict(attrs={'class': ['autoplay', 'openBiogPopup']}),
        dict(name='img', attrs={'alt': ['Get Adobe Flash player']}),
        dict(attrs={'style': re.compile('.*')}),
    ]

    keep_only_tags = [dict(attrs={'id': 'main'})]
    recursions = 0

    # fixes non compliant html nesting and 'marks' article graphics links
    # NOTE(review): the HTML inside these literals was lost in the copy of
    # the file this was restored from.  The replacement markup is what the
    # rest of the recipe looks up (div.storyTop, div.article-graphic holding
    # <strong><a ...>); the input patterns are a best reconstruction and
    # should be confirmed against the upstream recipe.
    preprocess_regexps = [
        (re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
         lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
        (re.compile('(<strong>.*?[Cc]lick.*?<a.*?((HERE)|([Hh]ere)).*?</strong>)', re.DOTALL),
         lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
    ]

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    extra_css = """
        h1{font-family: Georgia,serif }
        body{font-family: Verdana,Arial,Helvetica,sans-serif}
        img{margin-bottom: 0.4em; display:block}
        .starRating img {float: left}
        .starRating {margin-top:0.4em; display: block}
        .image {clear:left; font-size: x-small; color:#888888;}
        .articleByTimeLocation {font-size: x-small; color:#888888;
            margin-bottom:0.2em ; margin-top:0.2em ; display:block}
        .subtitle {clear:left}
        .column-1 h1 { color: #191919}
        .column-1 h2 { color: #333333}
        .column-1 h3 { color: #444444}
        .column-1 p { color: #777777}
        .column-1 p,a,h1,h2,h3 { margin: 0; }
        .column-1 div{color:#888888; margin: 0;}
        .articleContent {display: block; clear:left;}
        .storyTop{}
        .pictureContainer img { max-width: 400px; max-height: 400px;}
        """

    oldest_article = 1
    max_articles_per_feed = 100

    # URLs already handed out by get_article_url; used to drop duplicates
    # that appear in more than one feed.
    _processed_urls = []

    def get_article_url(self, article):
        """Return the article URL, or None for videos and duplicates.

        An article is skipped when its title starts with ``Video:`` or when
        its URL was already returned by an earlier call.
        """
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this recipe is subclassed.
        url = super(TheIndependentNew, self).get_article_url(article)

        title = article.get('title', None)
        if title and re.search("^Video:", title):
            return None

        # remove duplicates
        if url in self._processed_urls:
            return None
        self._processed_urls.append(url)
        return url

    def populate_article_metadata(self, article, soup, first):
        """Use the first image of the first page as the TOC thumbnail."""
        if not (first and hasattr(self, 'add_toc_thumbnail')):
            return
        picdiv = soup.find('img')
        if picdiv is None:
            return
        src = picdiv.get('src')
        if src:  # an <img> without src cannot serve as a thumbnail
            self.add_toc_thumbnail(article, src)

    def preprocess_html(self, soup):
        """Clean up a downloaded article page.

        Returns the modified soup, or None when the page is an advertorial
        feature.
        """
        # remove 'advertorial articles'
        if self._is_advertorial(soup):
            return None

        slideshow_elements = self._strip_unwanted_widgets(soup)
        if self._FETCH_IMAGES:
            self._expand_slideshow_images(soup, slideshow_elements)

        self._remove_empty_subtitle(soup)

        # replace rating numbers with stars
        for item in soup.findAll('div', attrs={'class': 'starRating'}):
            self._insertRatingStars(soup, item)

        story_top = soup.find('div', attrs={'class': ['storyTop']})
        if story_top is not None:
            self._remove_empty_story_top_paragraphs(story_top)
        # remove line breaks immediately next to tags with default margins
        # to prevent double line spacing and narrow columns of text
        self._remove_undesired_line_breaks_from_tag(story_top, soup)

        # replace article graphics link with the graphics themselves
        if self._FETCH_ARTICLE_GRAPHICS:
            self._embed_article_graphics(soup)

        return soup

    def _is_advertorial(self, soup):
        """Return True if the strapline marks the page 'ADVERTORIAL FEATURE'."""
        strapline = soup.find('div', attrs={'class': re.compile('.*strapLine.*')})
        if strapline is None:
            return False
        for para in strapline.findAll('p'):
            contents = para.contents
            if (contents and isinstance(contents[0], NavigableString)
                    and contents[0] == 'ADVERTORIAL FEATURE'):
                return True
        return False

    def _strip_unwanted_widgets(self, soup):
        """Extract unwanted 'widget' elements; return the slideshow widgets."""
        keep_pattern = re.compile('((articleContent)|(title))$')
        story_pattern = re.compile('storyContent')
        slideshow_pattern = re.compile('slideshow')
        social_pattern = re.compile('socialwidget')

        slideshow_elements = []
        items_to_extract = []
        for item in soup.findAll(attrs={'class': re.compile("widget.*")}):
            css_class = item['class']
            remove = keep_pattern.search(css_class) is None

            # corrections
            # story content always good
            if story_pattern.search(css_class) is not None:
                remove = False
            # images
            if slideshow_pattern.search(css_class) is not None:
                if self._FETCH_IMAGES:
                    remove = False
                    slideshow_elements.append(item)
                else:
                    remove = True
            # social widgets always bad
            if social_pattern.search(css_class) is not None:
                remove = True

            if remove:
                items_to_extract.append(item)

        # extract only after the scan so the traversal is not disturbed
        for item in items_to_extract:
            item.extract()
        return slideshow_elements

    def _expand_slideshow_images(self, soup, slideshow_elements):
        """Turn slideshow image links into captioned full-size image divs."""
        any_href = re.compile('.*')
        for element in slideshow_elements:
            for item in element.findAll('a', attrs={'href': any_href}):
                if item.img is None:
                    continue
                # use full size image
                img = item.findNext('img')
                img['src'] = item['href']

                # insert caption if available
                if img.get('title') and (len(img['title']) > 1):
                    tag = Tag(soup, 'h3')
                    tag.insert(0, NavigableString(img['title']))
                    # picture before text
                    img.extract()
                    item.insert(0, img)
                    item.insert(1, tag)

                # remove link
                item.name = "div"
                item["class"] = 'image'
                del item["href"]

    def _remove_empty_subtitle(self, soup):
        """Drop the subtitle heading when the paragraph after it is empty."""
        # currently the subtitle is located in first paragraph after the
        # sibling subtitle heading tag. This may be 'fixed' at some point.
        subtitle = soup.find('h3', attrs={'class': 'subtitle'})
        if subtitle is None:
            return
        subtitle_text = subtitle.findNext('p')
        if subtitle_text is None:
            return
        contents = subtitle_text.contents
        # a paragraph with no children counts as empty (avoids IndexError)
        if not contents or len(contents[0]) <= 1:
            subtitle_text.extract()
            subtitle.extract()

    def _remove_empty_story_top_paragraphs(self, story_top):
        """Remove empty <p> tags in storyTop.

        These can leave a space between the first paragraph and the rest of
        the story.
        """
        items_to_extract = []
        for para in story_top.findAll('p'):
            # evaluated per paragraph so one paragraph with markup does not
            # shield every following empty paragraph from removal
            has_nested_tag = any(isinstance(child, Tag) for child in para)
            if has_nested_tag:
                continue
            contents = para.contents
            if not contents or len(contents[0]) <= 1:
                items_to_extract.append(para)
        for para in items_to_extract:
            para.extract()

    def _embed_article_graphics(self, soup):
        """Replace marked 'article graphic' links with the graphics."""
        replacements = []
        for container in soup.findAll('div', attrs={'class': ['article-graphic']}):
            strong = container.find('strong')
            if strong is None:
                continue
            for child in strong:
                if isinstance(child, Tag) and str(child.name) == 'a':
                    href = child.get('href')
                    if href:
                        replacements.extend(
                            self._get_article_graphic(strong, href, soup))

        for old_item, new_item in replacements:
            # the same placeholder may be listed more than once; skip it
            # once an earlier replacement has detached it from the tree
            if getattr(old_item, 'parent', None) is None:
                continue
            old_item.replaceWith(new_item)

    def _get_article_graphic(self, old_item, url, soup):
        """Return ``(old_item, replacement)`` pairs for a graphic link.

        A URL ending in ``.jpg`` yields one new ``div.pictureContainer``
        wrapping an ``img``.  Any other URL is fetched with
        ``index_to_soup`` and every div whose class matches
        ``widget picture article`` on that page is returned.
        """
        if re.search(r'\.jpg$', str(url)):
            div = Tag(soup, 'div')
            div['class'] = 'pictureContainer'
            img = Tag(soup, 'img')
            img['src'] = url
            img['alt'] = 'article graphic'
            div.insert(0, img)
            return [(old_item, div)]

        soup2 = self.index_to_soup(url)
        picture_class = re.compile("widget picture article.*")
        return [(old_item, item)
                for item in soup2.findAll('div', attrs={'class': picture_class})]

    def _insertRatingStars(self, soup, item):
        """Replace the numeric rating inside ``item`` with five star images.

        Does nothing when ``item`` has no contents or its first child cannot
        be converted to a float.  Always returns None.
        """
        if item.contents is None or len(item.contents) < 1:
            return
        try:
            rating = float(item.contents[0])
        except (TypeError, ValueError):
            print('Could not convert decimal rating to star: malformatted float.')
            return
        for i in range(1, 6):
            star = Tag(soup, 'img')
            if i <= rating:
                star['src'] = self._STAR_URL
            else:
                star['src'] = self._NO_STAR_URL
            star['alt'] = 'star number ' + str(i)
            item.insert(i, star)
        # blank out the numeric rating now that the stars follow it
        item.contents[0] = ''

    def postprocess_html(self, soup, first_fetch):
        """Find broken images and remove them together with their captions."""
        # broken images still point to remote url
        remote_pattern = re.compile('http://www.independent.co.uk.*')
        items_to_extract = []
        for item in soup.findAll('div', attrs={'class': 'image'}):
            img = item.findNext('img')
            if not (img and img.get('src')):
                continue
            if remote_pattern.match(img["src"]) is None:
                continue
            caption = img.findNextSibling('h3')
            if caption is not None:
                items_to_extract.append(caption)
            items_to_extract.append(img)

        for item in items_to_extract:
            item.extract()
        return soup

    def _recurisvely_linearise_tag_tree(
            self, item, linearised=None, count=0, limit=100):
        """Flatten the descendants of ``item`` into a list, depth first.

        ``count`` tracks the recursion depth; descent stops once it exceeds
        ``limit``.  Children are appended to ``linearised`` (a new list when
        None), which is also returned.
        """
        if linearised is None:
            linearised = []
        count = count + 1
        if count > limit:
            return linearised
        if not isinstance(item, Tag):
            return linearised
        for nested in item:
            linearised.append(nested)
            # pass limit along so a caller-supplied value applies at
            # every depth, not just the first call
            linearised = self._recurisvely_linearise_tag_tree(
                nested, linearised, count, limit)
        return linearised

    def _get_previous_tag(self, current_index, tag_tree):
        """Return the element before ``current_index``, or None at the start."""
        if current_index == 0:
            return None
        return tag_tree[current_index - 1]

    def _get_next_tag(self, current_index, tag_tree):
        """Return the element after ``current_index``, or None at the end."""
        if current_index < len(tag_tree) - 1:
            return tag_tree[current_index + 1]
        return None

    def _list_match(self, test_str, list_regex):
        """Return True if any regex in ``list_regex`` matches at the start of ``test_str``."""
        return any(re.match(regex, test_str) is not None
                   for regex in list_regex)

    def _remove_undesired_line_breaks_from_tag(self, parent, soup):
        """Remove or soften line-break tags inside ``parent``.

        A break next to a p/h*/blockquote tag, or at either end of the
        flattened tree, is removed; any other break is replaced by a single
        space.  Does nothing when ``parent`` is None.  ``soup`` is unused.
        """
        if parent is None:
            return

        # ignore bare newline strings so neighbours are judged on real nodes
        tag_tree = [node for node in self._recurisvely_linearise_tag_tree(parent)
                    if not (node == u'\n')]

        spaced_tags = [r'p', r'h\d', r'blockquote']

        def is_spaced(node):
            return isinstance(node, Tag) and \
                self._list_match(str(node.name), spaced_tags)

        tags_to_extract = []
        tags_to_replace = []
        last_index = len(tag_tree) - 1
        for (i, tag) in enumerate(tag_tree):
            if not isinstance(tag, Tag):
                continue
            # NOTE(review): this literal was lost in the copy of the file
            # this was restored from; '<br />' is reconstructed from the
            # method's purpose - confirm against the upstream recipe.
            if str(tag) != '<br />':
                continue
            previous_tag_is_spaced = is_spaced(self._get_previous_tag(i, tag_tree))
            next_tag_is_spaced = is_spaced(self._get_next_tag(i, tag_tree))

            if previous_tag_is_spaced or next_tag_is_spaced or i == 0 \
                    or i == last_index:
                tags_to_extract.append(tag)
            else:
                tags_to_replace.append((tag, NavigableString(' ')))

        for old_tag, replacement in tags_to_replace:
            old_tag.replaceWith(replacement)
        for tag in tags_to_extract:
            tag.extract()

    feeds = [
        (u'News - UK',
         u'http://www.independent.co.uk/news/uk/?service=rss'),
        (u'News - World',
         u'http://www.independent.co.uk/news/world/?service=rss'),
        (u'News - Business',
         u'http://www.independent.co.uk/news/business/?service=rss'),
        (u'News - People',
         u'http://www.independent.co.uk/news/people/?service=rss'),
        (u'News - Science',
         u'http://www.independent.co.uk/news/science/?service=rss'),
        (u'News - Media',
         u'http://www.independent.co.uk/news/media/?service=rss'),
        (u'News - Education',
         u'http://www.independent.co.uk/news/education/?service=rss'),
        (u'News - Obituaries',
         u'http://www.independent.co.uk/news/obituaries/?service=rss'),
        (u'News - Corrections',
         u'http://www.independent.co.uk/news/corrections/?service=rss'),
        (u'Opinion',
         u'http://www.independent.co.uk/opinion/?service=rss'),
        (u'Environment',
         u'http://www.independent.co.uk/environment/?service=rss'),
        (u'Sport - Athletics',
         u'http://www.independent.co.uk/sport/general/athletics/?service=rss'),
        (u'Sport - Cricket',
         u'http://www.independent.co.uk/sport/cricket/?service=rss'),
        (u'Sport - Football',
         u'http://www.independent.co.uk/sport/football/?service=rss'),
        (u'Sport - Golf',
         u'http://www.independent.co.uk/sport/golf/?service=rss'),
        (u'Sport - Motor racing',
         u'http://www.independent.co.uk/sport/motor-racing/?service=rss'),
        (u'Sport - Olympics',
         u'http://www.independent.co.uk/sport/olympics/?service=rss'),
        (u'Sport - Racing',
         u'http://www.independent.co.uk/sport/racing/?service=rss'),
        (u'Sport - Rugby League',
         u'http://www.independent.co.uk/sport/general/rugby-league/?service=rss'),
        (u'Sport - Rugby Union',
         u'http://www.independent.co.uk/sport/rugby/rugby-union/?service=rss'),
        (u'Sport - Sailing',
         u'http://www.independent.co.uk/sport/general/sailing/?service=rss'),
        (u'Sport - Tennis',
         u'http://www.independent.co.uk/sport/tennis/?service=rss'),
        (u'Sport - Others',
         u'http://www.independent.co.uk/sport/general/others/?service=rss'),
        (u'Life & Style - Fashion',
         u'http://www.independent.co.uk/life-style/fashion/?service=rss'),
        (u'Life & Style -Food & Drink',
         u'http://www.independent.co.uk/life-style/food-and-drink/?service=rss'),
        (u'Life & Style - Health and Families',
         u'http://www.independent.co.uk/life-style/health-and-families/?service=rss'),
        # '?service=rss' added here: this was the only entry without it
        (u'Life & Style - House & Home',
         u'http://www.independent.co.uk/life-style/house-and-home/?service=rss'),
        (u'Life & Style - History',
         u'http://www.independent.co.uk/life-style/history/?service=rss'),
        (u'Life & Style - Gadgets & Tech',
         u'http://www.independent.co.uk/life-style/gadgets-and-tech/?service=rss'),
        (u'Life & Style - Motoring',
         u'http://www.independent.co.uk/life-style/motoring/?service=rss'),
        (u'Arts & Ents - Art',
         u'http://www.independent.co.uk/arts-entertainment/art/?service=rss'),
        (u'Arts & Ents - Architecture',
         u'http://www.independent.co.uk/arts-entertainment/architecture/?service=rss'),
        (u'Arts & Ents - Music',
         u'http://www.independent.co.uk/arts-entertainment/music/?service=rss'),
        (u'Arts & Ents - Classical',
         u'http://www.independent.co.uk/arts-entertainment/classical/?service=rss'),
        (u'Arts & Ents - Films',
         u'http://www.independent.co.uk/arts-entertainment/films/?service=rss'),
        (u'Arts & Ents - TV',
         u'http://www.independent.co.uk/arts-entertainment/tv/?service=rss'),
        (u'Arts & Ents - Theatre and Dance',
         u'http://www.independent.co.uk/arts-entertainment/theatre-dance/?service=rss'),
        (u'Arts & Ents - Comedy',
         u'http://www.independent.co.uk/arts-entertainment/comedy/?service=rss'),
        (u'Arts & Ents - Books',
         u'http://www.independent.co.uk/arts-entertainment/books/?service=rss'),
        (u'Travel',
         u'http://www.independent.co.uk/travel/?service=rss'),
        (u'Money',
         u'http://www.independent.co.uk/money/?service=rss'),
        (u'IndyBest',
         u'http://www.independent.co.uk/extras/indybest/?service=rss'),
    ]