Update The Independent

2026-04-19 23:38:49 -04:00 · 2013-08-10 21:30:04 +05:30 · 2013-08-10 21:30:04 +05:30 · e5f761998f
commit e5f761998f
parent 7ed19123e4
1 changed files with 7 additions and 476 deletions
--- a/recipes/independent.recipe
+++ b/recipes/independent.recipe
@ -1,504 +1,34 @@
 # adapted from old recipe by Darko Miletic <darko.miletic at gmail.com>

-import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag, NavigableString
-

 class TheIndependentNew(BasicNewsRecipe):

-    # flag to enable/disable article graphics on business pages/some others
-    # eg http://www.independent.co.uk/news/world/europe/berlusconi-departure-fails-to-calm-the-markets-6259682.html
-    # -max dimensions can be altered using the .pictureContainer img selector in the css
-    _FETCH_ARTICLE_GRAPHICS = True
-
-    #Flag to enable/disable image fetching (not business)
-    _FETCH_IMAGES = True
-
-    #Set max gallery images here (respects _FETCH_IMAGES)
-    # -1 for infinite
-    _MAX_GALLERY_IMAGES = -1
-
-
-     #used for converting rating to stars
+     # used for converting rating to stars
    _STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png'
    _NO_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star_grey.png'

-
    title                   = u'The Independent'
-    __author__              = 'Will'
+    __author__              = 'Krittika Goyal'
    description             = 'The latest in UK News and World News from The \
                               Independent. Wide range of international and local news, sports \
                               news, commentary and opinion pieces.Independent News - Breaking news \
                               that matters. Your daily comprehensive news source - The \
                               Independent Newspaper'
    publisher               = 'The Independent'
+    oldest_article          = 2.0
+    ignore_duplicate_articles = {'title', 'url'}
+    remove_empty_feeds      = True
    category                = 'news, UK'
    no_stylesheets          = True
    use_embedded_content    = False
    remove_empty_feeds      = True
+    auto_cleanup = True
    language                = 'en_GB'
    publication_type        = 'newspaper'
    masthead_url            = 'http://www.independent.co.uk/independent.co.uk/editorial/logo/independent_Masthead.png'
    encoding                = 'utf-8'
    compress_news_images    = True
-    remove_tags             =[
-                               dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
-                               dict(attrs={'class' : ['autoplay','openBiogPopup']}),
-                               dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
-                               dict(name='img',attrs={'alt' : ['view gallery']}),
-                               dict(attrs={'style' : re.compile('.*')}),
-                               dict(attrs={'class':lambda x: x and 'voicesRelatedTopics' in x.split()}),
-                             ]
-
-    keep_only_tags          =[dict(attrs={'id':['main','top']})]
-    recursions = 0
-
-    # fixes non compliant html nesting and 'marks' article graphics links
-    preprocess_regexps      = [
-                                (re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
-                                lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
-                                (re.compile('(<strong>.*?[Cc]lick.*?<a.*?((HERE)|([Hh]ere)).*?</strong>)', re.DOTALL),
-                                lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
-                              ]
-
-
-    conversion_options = {
-                          'comment'   : description
-                        , 'tags'      : category
-                        , 'publisher' : publisher
-                        , 'language'  : language
-                        }
-
-    extra_css             = """
-                               h1{font-family: Georgia,serif ; font-size: x-large; }
-                               body{font-family: Verdana,Arial,Helvetica,sans-serif}
-                               img{margin-bottom: 0.4em; display:block}
-                               .starRating img {float: left}
-                               .starRating {margin-top:0.4em; display: block}
-                               .image {clear:left; font-size: x-small; color:#888888;}
-                               .articleByTimeLocation {font-size: x-small; color:#888888;
-                                margin-bottom:0.2em ; margin-top:0.2em ; display:block}
-                                .subtitle {clear:left ;}
-                               .column-1 h1 { color: #191919}
-                               .column-1 h2 { color: #333333}
-                               .column-1 h3 { color: #444444}
-                               .subtitle { color: #777777; font-size: medium;}
-                               .column-1 a,h1,h2,h3 { margin: 0; }
-                               .column-1 div{margin: 0;}
-                               .articleContent {display: block; clear:left;}
-                               .articleContent {color: #000000; font-size: medium;}
-                               .ivDrip-section {color: #000000; font-size: medium;}
-                               .datetime {color: #888888}
-                               .title {font-weight:bold;}
-                               .storyTop{}
-                               .pictureContainer img { max-width: 400px; max-height: 400px;}
-                               .image img { max-width: 400px; max-height: 400px;}
-                            """
-
-    oldest_article = 1
-    max_articles_per_feed = 100
-
-    _processed_urls = []
-
-
-    def get_article_url(self, article):
-        url = super(self.__class__,self).get_article_url(article)
-
-        title = article.get('title', None)
-        if title and re.search("^Video:",title):
-            return None
-
-        #remove duplicates
-        if not (url in self._processed_urls):
-            self._processed_urls.append(url)
-        else:
-            url = None
-        return url
-
-    def populate_article_metadata(self, article, soup, first):
-        if first and hasattr(self, 'add_toc_thumbnail'):
-            picdiv = soup.find('img')
-            if picdiv is not None:
-                self.add_toc_thumbnail(article,picdiv['src'])
-
-    def preprocess_html(self, soup):
-
-        #remove 'advertorial articles'
-        strapline = soup.find('div',attrs={'class' : re.compile('.*strapLine.*')})
-        if strapline:
-            for para in strapline.findAll('p'):
-                if len(para.contents) and isinstance(para.contents[0],NavigableString) \
-                and para.contents[0] == 'ADVERTORIAL FEATURE':
-                    return None
-
-        # remove Suggested Topics
-        items_to_extract = []
-
-        for item in soup.findAll('div',attrs={'class' : re.compile('.*RelatedArtTag.*')}):
-            items_to_extract.append(item)
-
-        for item in items_to_extract:
-            item.extract()
-
-        items_to_extract = []
-        slideshow_elements = []
-
-        for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
-            remove = True
-            pattern = re.compile('((articleContent)|(title))$')
-            if (pattern.search(item['class'])) is not None:
-                remove = False
-
-            # corrections
-            # story content always good
-            pattern = re.compile('storyContent')
-            if (pattern.search(item['class'])) is not None:
-                remove = False
-
-            #images
-            pattern = re.compile('slideshow')
-            if (pattern.search(item['class'])) is not None:
-                if self._FETCH_IMAGES:
-                    remove = False
-                    slideshow_elements.append(item)
-                else:
-                    remove = True
-
-            #social widgets always bad
-            pattern = re.compile('socialwidget')
-            if (pattern.search(item['class'])) is not None:
-                remove = True
-
-            if remove:
-                items_to_extract.append(item)
-
-        for item in items_to_extract:
-            item.extract()
-
-        items_to_extract = []
-
-        if self._FETCH_IMAGES:
-            for element in slideshow_elements:
-                for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
-                    if item.img is not None:
-                        #use full size image
-                        images = []
-
-                        img = item.findNext('img')
-
-                        if  not '?action=gallery' in item['href']:
-                            img['src'] = item['href']
-                            tag = Tag(soup,'h3')
-                            text = ''
-                            try:
-                                text = img['data-title']
-                            except:
-                                pass
-
-                            if img.get('title') and (len(img['title']) > 1):
-                                text = NavigableString(img['title'])
-                            tag.insert(0,text)
-                            images.append((img, tag))
-                        else:
-                            gallery_images, remove_link = self._get_gallery_images(item['href'])
-                            images = images + gallery_images
-                            if remove_link:
-                                gal_link = soup.find('a',attrs={'id' : 'view-gallery'})
-                                if gal_link:
-                                    gal_link.extract()
-                            img.extract()
-                        for (img, title) in images:
-                            #insert caption if available
-                            if title:
-                                #picture before text
-                                img.extract()
-                                item.insert(0,img)
-                                item.insert(1,title)
-
-                            # remove link
-                            item.name = "div"
-                            item["class"]='image'
-                            del item["href"]
-
-
-        #remove empty subtitles
-        """
-        currently the subtitle is located in first paragraph after
-        sibling <h3 class="subtitle"> tag. This may be 'fixed' at
-        some point.
-        """
-        subtitle = soup.find('h3',attrs={'class' : 'subtitle'})
-        if subtitle is not None:
-            subtitleText = subtitle.findNext('p')
-            if subtitleText is not None:
-                if len(subtitleText.contents[0]) <= 1 :
-                    subtitleText.extract()
-                    subtitle.extract()
-
-
-        #replace rating numbers with stars
-        for item in soup.findAll('div',attrs={ 'class' : 'starRating'}):
-            if item is not None:
-                soup2 = self._insertRatingStars(soup,item)
-            if soup2 is not None:
-                soup = soup2
-
-
-        #remove empty paragraph tags in storyTop which can leave a space
-        #between first paragraph and rest of story
-        nested_content = False
-        storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
-        for item in storyTop.findAll('p'):
-            for nested in item:
-                if isinstance(nested, Tag):
-                    nested_content = True
-                    break
-            if not nested_content and item.contents is not None and len(item.contents[0]) <= 1 :
-                items_to_extract.append(item)
-
-        for item in items_to_extract:
-            item.extract()
-
-        items_to_extract = []
-
-
-        #remove line breaks immediately next to tags with default margins
-        #to prevent double line spacing and narrow columns of text
-        storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
-        self._remove_undesired_line_breaks_from_tag(storyTop,soup)
-
-
-        #replace article graphics link with the graphics themselves
-        if self._FETCH_ARTICLE_GRAPHICS:
-            items_to_insert = []
-            for item in soup.findAll('div', attrs={'class' : ['article-graphic']}):
-                strong = item.find('strong')
-                if not strong:
-                    continue
-                for child in strong:
-                    if isinstance(child,Tag):
-                        if str(child.name) == 'a':
-                            items_to_insert.extend(self._get_article_graphic(strong,child['href'],soup))
-
-            for item in items_to_insert:
-                item[0].replaceWith(item[1])
-
-        for item in items_to_extract:
-            item.extract()
-
-        return soup
-
-
-    def _get_article_graphic(self,old_item,url,soup):
-
-        items_to_insert = []
-
-        if re.search('\.jpg$',str(url)):
-            div = Tag(soup,'div')
-            div['class'] = 'pictureContainer'
-            img = Tag(soup,'img')
-            img['src'] = url
-            img['alt'] = 'article graphic'
-            div.insert(0,img)
-            items_to_insert.append((old_item,div,))
-            return items_to_insert
-
-        soup2 = self.index_to_soup(url)
-        for item in soup2.findAll('div',attrs={'class' : re.compile("widget picture article.*")}):
-            items_to_insert.append((old_item,item),)
-        return items_to_insert
-
-
-    def _insertRatingStars(self,soup,item):
-        if item.contents is None or len(item.contents) < 1:
-            return
-        rating = item.contents[0]
-
-        try:
-            rating = float(item.contents[0])
-        except:
-            print 'Could not convert decimal rating to star: malformatted float.'
-            return
-        for i in range(1,6):
-            star = Tag(soup,'img')
-            if i <= rating:
-                star['src'] = self._STAR_URL
-            else:
-                star['src'] = self._NO_STAR_URL
-            star['alt'] = 'star number ' +  str(i)
-            item.insert(i,star)
-        #item.contents[0] = NavigableString('(' + str(rating) + ')')
-        item.contents[0] = ''
-
-    def postprocess_html(self,soup, first_fetch):
-
-        #mark subtitle parent as non-compliant nesting causes
-        # p's to be 'popped out' of the h3 tag they are nested in.
-        subtitle = soup.find('h3', attrs={'class' : 'subtitle'})
-        subtitle_div = None
-        if subtitle:
-            subtitle_div = subtitle.parent
-        if subtitle_div:
-            clazz = ''
-            if 'class' in subtitle_div:
-                clazz = subtitle_div['class'] + ' '
-            clazz = clazz + 'subtitle'
-            subtitle_div['class'] = clazz
-
-        #find broken images and remove captions
-        items_to_extract = []
-        for item in soup.findAll('div', attrs={'class' : 'image'}):
-            img = item.findNext('img')
-            if img and img.get('src'):
-                # broken images still point to remote url
-                pattern = re.compile('http://www.independent.co.uk.*')
-                if pattern.match(img["src"]) is not None:
-                    caption = img.findNextSibling('h3')
-                    if caption is not None:
-                        items_to_extract.append(caption)
-                    items_to_extract.append(img)
-
-        for item in items_to_extract:
-            item.extract()
-
-        # nickredding's fix for non-justified text
-        for ptag in soup.findAll('p',attrs={'align':'left'}):
-            del(ptag['align'])
-
-        return soup
-
-    def _get_gallery_images(self,url):
-        gallery_soup = self.index_to_soup(url)
-        images = []
-        remove_link = True
-        total = 1
-        try:
-            counter = gallery_soup.find('div',attrs={'id' : ['counter']})
-            total = counter.contents[0].split('/')
-            total = int(total[1].rstrip())
-        except:
-            total = 1
-
-        if self._MAX_GALLERY_IMAGES >= 0 and total > self._MAX_GALLERY_IMAGES:
-            total = self._MAX_GALLERY_IMAGES
-            remove_link = False
-
-        for i in range(1, total +1):
-            image, title = self._get_image_from_gallery(gallery_soup)
-            if image:
-                images.append((image,title))
-            next = url + '&ino=' + str(i + 1)
-            gallery_soup = self.index_to_soup(next)
-        images.reverse()
-        return images, remove_link
-
-    def _get_image_from_gallery(self,soup):
-        try:
-            container = soup.find('div',attrs={'id' : ['main-image']})
-            image = container.find('img')
-            if image:
-                title = soup.find('div',attrs={'id' : ['image-title']})
-            return image, title
-        except:
-            print 'error fetching gallery image'
-            return None
-
-
-
-    def _recurisvely_linearise_tag_tree(
-        self,
-        item,
-        linearised= None,
-        count=0,
-        limit = 100
-        ):
-        linearised = linearised or []
-        count = count + 1
-        if count > limit:
-            return linearised
-        if not (isinstance(item,Tag)):
-            return linearised
-        for nested in item:
-            linearised.append(nested)
-            linearised = self._recurisvely_linearise_tag_tree(nested,linearised, count)
-        return linearised
-
-
-    def _get_previous_tag(self,current_index, tag_tree):
-        if current_index == 0:
-            return None
-        else:
-            return tag_tree[current_index - 1]
-
-
-    def _get_next_tag(self,current_index, tag_tree):
-        if current_index < len(tag_tree) - 1:
-            return tag_tree[current_index + 1]
-        else:
-            return None
-
-
-    def _list_match(self,test_str, list_regex):
-        for regex in list_regex:
-            match = re.match(regex, test_str)
-            if match is not None:
-                return True
-        return False
-
-    def _remove_undesired_line_breaks_from_tag(self,parent,soup):
-
-        if parent is None:
-            return
-
-
-        tag_tree = self._recurisvely_linearise_tag_tree(parent)
-        items_to_remove = []
-
-
-        for item in tag_tree:
-            if item == u'\n':
-               items_to_remove.append(item)
-               continue;
-
-        for item in items_to_remove:
-            tag_tree.remove(item)
-
-
-        spaced_tags = [r'p', r'h\d', r'blockquote']
-        tags_to_extract = []
-        tags_to_replace = []
-        for (i, tag) in enumerate(tag_tree):
-            if isinstance(tag, Tag):
-                if str(tag) == '<br />':
-                    previous_tag = self._get_previous_tag(i, tag_tree)
-
-                    if isinstance(previous_tag, Tag):
-                        previous_tag_is_spaced = previous_tag is not None\
-                             and self._list_match(str(previous_tag.name),
-                                spaced_tags)
-                    else:
-                        previous_tag_is_spaced = False
-
-                    next_tag = self._get_next_tag(i, tag_tree)
-
-                    if isinstance(next_tag, Tag):
-                        next_tag_is_spaced = next_tag is not None\
-                             and self._list_match(str(next_tag.name), spaced_tags)
-                    else:
-                        next_tag_is_spaced = False
-
-                    if previous_tag_is_spaced or next_tag_is_spaced or i == 0\
-                         or i == len(tag_tree) - 1:
-                        tags_to_extract.append(tag)
-                    else:
-                        tags_to_replace.append((tag,NavigableString(' '),))
-
-
-        for pair in tags_to_replace:
-            pair[0].replaceWith(pair[1])
-        for tag in tags_to_extract:
-            tag.extract()

    feeds = [
        (u'News - UK',
@ -610,3 +140,4 @@ class TheIndependentNew(BasicNewsRecipe):
         u'http://www.independent.co.uk/extras/indybest/?service=rss'),
        ]

+