From bbb6698e6b5fbac50487b7d9d4797daa6e1a379e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 14 Feb 2011 11:44:23 -0700 Subject: [PATCH] Fix #8984 (Updated recipe for Sydney Morning Herald) --- resources/recipes/smh.recipe | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/resources/recipes/smh.recipe b/resources/recipes/smh.recipe index b5c7f4d54e..220cd7faf3 100644 --- a/resources/recipes/smh.recipe +++ b/resources/recipes/smh.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' +__copyright__ = '2010-2011, Darko Miletic ' ''' smh.com.au ''' @@ -22,7 +22,11 @@ class Smh_au(BasicNewsRecipe): remove_empty_feeds = True masthead_url = 'http://images.smh.com.au/2010/02/02/1087188/smh-620.jpg' publication_type = 'newspaper' - extra_css = ' h1{font-family: Georgia,"Times New Roman",Times,serif } body{font-family: Arial,Helvetica,sans-serif} .cT-imageLandscape{font-size: x-small} ' + extra_css = """ + h1{font-family: Georgia,"Times New Roman",Times,serif } + body{font-family: Arial,Helvetica,sans-serif} + .cT-imageLandscape,.cT-imagePortrait{font-size: x-small} + """ conversion_options = { 'comment' : description @@ -38,7 +42,11 @@ class Smh_au(BasicNewsRecipe): ] remove_tags_after = [dict(name='div',attrs={'class':'articleBody'})] keep_only_tags = [dict(name='div',attrs={'id':'content'})] - remove_attributes = ['width','height'] + remove_tags = [ + dict(attrs={'class':'hidden'}), + dict(name=['link','meta','base','embed','object','iframe']) + ] + remove_attributes = ['width','height','lang'] def parse_index(self): articles = [] @@ -66,3 +74,14 @@ class Smh_au(BasicNewsRecipe): ,'description':description }) return [(self.tag_to_string(soup.find('title')), articles)] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('bod'): + item.name = 'div' + for item in soup.findAll('img'): + if not item.has_key('alt'): + 
item['alt'] = 'image' + return soup + \ No newline at end of file