...

2025-07-09 03:04:10 -04:00 · 2011-11-13 20:13:21 +05:30 · 2011-11-13 20:13:21 +05:30 · 62764e1bbe
commit 62764e1bbe
parent 8b3af7ce8e
1 changed files with 123 additions and 107 deletions
--- a/recipes/independent.recipe
+++ b/recipes/independent.recipe
@ -1,8 +1,9 @@
 # adapted from old recipe by Darko Miletic <darko.miletic at gmail.com>

-import re
+import string, re
+from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag, NavigableString
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString


 class TheIndependentNew(BasicNewsRecipe):
@ -49,7 +50,7 @@ class TheIndependentNew(BasicNewsRecipe):
    preprocess_regexps      = [
                                (re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
                                lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
-                                (re.compile('<strong>.*?Click.*?to view graphic.*?</strong>', re.DOTALL),
+                                (re.compile('(<strong>.*?[Cc]lick.*?<a.*?((HERE)|([Hh]ere)).*?</strong>)', re.DOTALL),
                                lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
                              ] 
    
@ -104,6 +105,14 @@ class TheIndependentNew(BasicNewsRecipe):

    def preprocess_html(self, soup):
        
+        #remove 'advertorial articles'
+        strapline = soup.find('div',attrs={'class' : re.compile('.*strapLine.*')})
+        if strapline:
+            for para in strapline.findAll('p'):
+                if len(para.contents) and isinstance(para.contents[0],NavigableString) \
+                and para.contents[0] == 'ADVERTORIAL FEATURE':
+                    return None                          
+        
        items_to_extract = []
        
        for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
@ -189,9 +198,14 @@ class TheIndependentNew(BasicNewsRecipe):
       
        #remove empty paragraph tags in storyTop which can leave a space
        #between first paragraph and rest of story
+        #(a paragraph that contains a nested tag is never treated as empty)
        storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
        for item in storyTop.findAll('p'):
-            if item.contents is not None and len(item.contents[0]) <= 1 :
+            #evaluated per paragraph: a flag set once before the loop stays
+            #True after the first nested tag and shields later empty paragraphs
+            nested_content = any(isinstance(nested, Tag) for nested in item)
+            #a paragraph with no children at all is empty too (avoids IndexError)
+            if not nested_content and (not item.contents or len(item.contents[0]) <= 1):
                items_to_extract.append(item)
                
        for item in items_to_extract:
@ -211,6 +225,8 @@ class TheIndependentNew(BasicNewsRecipe):
            items_to_insert = []
            for item in soup.findAll('div', attrs={'class' : ['article-graphic']}):
                strong = item.find('strong')
+                if not strong:
+                    continue
                for child in strong:
                    if isinstance(child,Tag):
                        if str(child.name) == 'a':