mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add recipe for Outlook India
This commit is contained in:
parent
84d42bfe3e
commit
cc06c421c3
@ -18,7 +18,7 @@
|
||||
Builtin recipes.
|
||||
'''
|
||||
recipes = ['newsweek', 'atlantic', 'economist', 'dilbert', 'portfolio',
|
||||
'nytimes', 'usatoday']
|
||||
'nytimes', 'usatoday', 'outlook_india']
|
||||
|
||||
import re, time
|
||||
from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe
|
||||
|
94
src/libprs500/web/feeds/recipes/outlook_india.py
Normal file
94
src/libprs500/web/feeds/recipes/outlook_india.py
Normal file
@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
|
||||
## This program is free software; you can redistribute it and/or modify
|
||||
## it under the terms of the GNU General Public License as published by
|
||||
## the Free Software Foundation; either version 2 of the License, or
|
||||
## (at your option) any later version.
|
||||
##
|
||||
## This program is distributed in the hope that it will be useful,
|
||||
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
## GNU General Public License for more details.
|
||||
##
|
||||
## You should have received a copy of the GNU General Public License along
|
||||
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
'''
|
||||
outlookindia.com
|
||||
'''
|
||||
|
||||
from libprs500.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
|
||||
class OutlookIndia(BasicNewsRecipe):
|
||||
|
||||
title = 'Outlook India'
|
||||
recursions = 1
|
||||
match_regexp = r'full.asp.*&pn=\d+'
|
||||
html2lrf_options = ['--ignore-tables']
|
||||
|
||||
remove_tags = [
|
||||
dict(name='img', src="images/space.gif"),
|
||||
dict(name=lambda tag: tag.name == 'tr' and tag.find('img', src="image/tl.gif") is not None ),
|
||||
dict(name=lambda tag: tag.name == 'table' and tag.find('font', attrs={'class':'fontemailfeed'}) is not None),
|
||||
]
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<body.*?<!--Add Banner ends from here-->', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: '<body>'),
|
||||
|
||||
(re.compile(r'>More Stories:.*', re.DOTALL),
|
||||
lambda match: '></body></html>'),
|
||||
|
||||
(re.compile(r'<!-- Google panel start -->.*', re.DOTALL),
|
||||
lambda match: '</body></html>'),
|
||||
]
|
||||
|
||||
def parse_index(self):
|
||||
soup = self.index_to_soup('http://www.outlookindia.com/archivecontents.asp')
|
||||
feeds = []
|
||||
title = None
|
||||
bogus = True
|
||||
for table in soup.findAll('table'):
|
||||
if title is None:
|
||||
td = table.find('td', background="images/content_band1.jpg")
|
||||
if td is not None:
|
||||
title = self.tag_to_string(td, False)
|
||||
title = title.replace(u'\xa0', u'').strip()
|
||||
if 'Cover Story' in title and bogus:
|
||||
bogus = False
|
||||
title = None
|
||||
else:
|
||||
articles = []
|
||||
for a in table.findAll('a', href=True):
|
||||
if a.find('img') is not None:
|
||||
continue
|
||||
atitle = self.tag_to_string(a, use_alt=False)
|
||||
desc = a.findNextSibling('font', attrs={'class':'fontintro'})
|
||||
if desc is not None:
|
||||
desc = self.tag_to_string(desc)
|
||||
if not desc:
|
||||
desc = ''
|
||||
articles.append({
|
||||
'title':atitle,
|
||||
'description': desc,
|
||||
'content': '',
|
||||
'url':'http://www.outlookindia.com/'+a['href'],
|
||||
'date': '',
|
||||
})
|
||||
feeds.append((title, articles))
|
||||
title = None
|
||||
|
||||
|
||||
return feeds
|
||||
|
||||
def postprocess_html(self, soup):
|
||||
bad = []
|
||||
for table in soup.findAll('table'):
|
||||
if table.find(text=re.compile(r'\(\d+ of \d+\)')):
|
||||
bad.append(table)
|
||||
for b in bad:
|
||||
b.extract()
|
||||
return soup
|
||||
|
Loading…
x
Reference in New Issue
Block a user