This commit is contained in:
Kovid Goyal 2011-11-13 20:13:21 +05:30
parent 8b3af7ce8e
commit 62764e1bbe

View File

@@ -1,8 +1,9 @@
# adapted from old recipe by Darko Miletic <darko.miletic at gmail.com>
import re
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
class TheIndependentNew(BasicNewsRecipe):
@@ -49,7 +50,7 @@ class TheIndependentNew(BasicNewsRecipe):
preprocess_regexps = [
(re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
(re.compile('<strong>.*?Click.*?to view graphic.*?</strong>', re.DOTALL),
(re.compile('(<strong>.*?[Cc]lick.*?<a.*?((HERE)|([Hh]ere)).*?</strong>)', re.DOTALL),
lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
]
@@ -104,6 +105,14 @@ class TheIndependentNew(BasicNewsRecipe):
def preprocess_html(self, soup):
#remove 'advertorial articles'
strapline = soup.find('div',attrs={'class' : re.compile('.*strapLine.*')})
if strapline:
for para in strapline.findAll('p'):
if len(para.contents) and isinstance(para.contents[0],NavigableString) \
and para.contents[0] == 'ADVERTORIAL FEATURE':
return None
items_to_extract = []
for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
@@ -189,9 +198,14 @@ class TheIndependentNew(BasicNewsRecipe):
#remove empty paragraph tags in storyTop which can leave a space
#between first paragraph and rest of story
nested_content = False
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
for item in storyTop.findAll('p'):
if item.contents is not None and len(item.contents[0]) <= 1 :
for nested in item:
if isinstance(nested, Tag):
nested_content = True
break
if not nested_content and item.contents is not None and len(item.contents[0]) <= 1 :
items_to_extract.append(item)
for item in items_to_extract:
@@ -211,6 +225,8 @@ class TheIndependentNew(BasicNewsRecipe):
items_to_insert = []
for item in soup.findAll('div', attrs={'class' : ['article-graphic']}):
strong = item.find('strong')
if not strong:
continue
for child in strong:
if isinstance(child,Tag):
if str(child.name) == 'a':