Fix #1090902 (Updated recipe for Nin online)

This commit is contained in:
Kovid Goyal 2012-12-16 21:26:06 +05:30
parent 042bde4ada
commit 391a58f9e9

View File

@ -15,7 +15,7 @@ class Nin(BasicNewsRecipe):
publisher = 'NIN d.o.o. - Ringier d.o.o.' publisher = 'NIN d.o.o. - Ringier d.o.o.'
category = 'news, politics, Serbia' category = 'news, politics, Serbia'
no_stylesheets = True no_stylesheets = True
oldest_article = 15 oldest_article = 180
encoding = 'utf-8' encoding = 'utf-8'
needs_subscription = True needs_subscription = True
remove_empty_feeds = True remove_empty_feeds = True
@ -25,7 +25,7 @@ class Nin(BasicNewsRecipe):
use_embedded_content = False use_embedded_content = False
language = 'sr' language = 'sr'
publication_type = 'magazine' publication_type = 'magazine'
masthead_url = 'http://www.nin.co.rs/img/head/logo.jpg' masthead_url = 'http://www.nin.co.rs/img/logo_print.jpg'
extra_css = """ extra_css = """
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
body{font-family: Verdana, Lucida, sans1, sans-serif} body{font-family: Verdana, Lucida, sans1, sans-serif}
@ -42,11 +42,11 @@ class Nin(BasicNewsRecipe):
, 'tags' : category , 'tags' : category
, 'publisher' : publisher , 'publisher' : publisher
, 'language' : language , 'language' : language
, 'linearize_tables': True
} }
preprocess_regexps = [ preprocess_regexps = [
(re.compile(r'</body>.*?<html>', re.DOTALL|re.IGNORECASE),lambda match: '</body>') (re.compile(r'<div class="standardFont">.*', re.DOTALL|re.IGNORECASE),lambda match: '')
,(re.compile(r'</html>.*?</html>', re.DOTALL|re.IGNORECASE),lambda match: '</html>')
,(re.compile(u'\u0110'), lambda match: u'\u00D0') ,(re.compile(u'\u0110'), lambda match: u'\u00D0')
] ]
@ -60,42 +60,21 @@ class Nin(BasicNewsRecipe):
br.submit() br.submit()
return br return br
keep_only_tags =[dict(name='td', attrs={'width':'520'})] remove_tags_before = dict(name='div', attrs={'class':'titleFont'})
remove_tags_before =dict(name='span', attrs={'class':'izjava'}) remove_tags_after = dict(name='div', attrs={'class':'standardFont'})
remove_tags_after =dict(name='html') remove_tags = [dict(name=['object','link','iframe','meta','base'])]
remove_tags = [ remove_attributes = ['border','background','height','width','align','valign']
dict(name=['object','link','iframe','meta','base'])
,dict(attrs={'class':['fb-like','twitter-share-button']})
,dict(attrs={'rel':'nofollow'})
]
remove_attributes=['border','background','height','width','align','valign']
def get_cover_url(self): def get_cover_url(self):
cover_url = None cover_url = None
soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(self.INDEX)
for item in soup.findAll('a', href=True): cover = soup.find('img', attrs={'class':'issueImg'})
if item['href'].startswith('/pages/issue.php?id='): if cover:
simg = item.find('img') return self.PREFIX + cover['src']
if simg:
return self.PREFIX + item.img['src']
return cover_url return cover_url
feeds = [(u'NIN Online', u'http://www.nin.co.rs/misc/rss.php?feed=RSS2.0')] feeds = [(u'NIN Online', u'http://www.nin.co.rs/misc/rss.php?feed=RSS2.0')]
def preprocess_html(self, soup): def print_version(self, url):
for item in soup.findAll(style=True): return url + '&pf=1'
del item['style']
for item in soup.findAll('div'):
if len(item.contents) == 0:
item.extract()
for item in soup.findAll(['td','tr']):
item.name='div'
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
for tbl in soup.findAll('table'):
img = tbl.find('img')
if img:
img.extract()
tbl.replaceWith(img)
return soup