diff --git a/resources/recipes/the_sun.recipe b/resources/recipes/the_sun.recipe index f9905a61dc..5699ec106c 100644 --- a/resources/recipes/the_sun.recipe +++ b/resources/recipes/the_sun.recipe @@ -1,5 +1,6 @@ import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag class AdvancedUserRecipe1268409464(BasicNewsRecipe): title = u'The Sun' @@ -14,24 +15,27 @@ class AdvancedUserRecipe1268409464(BasicNewsRecipe): remove_javascript = True keep_only_tags = [ - dict(name='div', attrs={'class':'medium-centered'}) - ,dict(name='div', attrs={'class':'article'}) - ,dict(name='div', attrs={'class':'clear-left'}) - ,dict(name='div', attrs={'class':'text-center'}) + dict(id='column-print') ] remove_tags = [ - dict(name='div', attrs={'class':'slideshow'}) - ,dict(name='div', attrs={'class':'float-left'}) - ,dict(name='div', attrs={'class':'ltbx-slideshow ltbx-btn-ss'}) - ,dict(name='a', attrs={'class':'add_a_comment'}) - ,dict(name='div', attrs={'id':'vxFlashPlayerContent'}) - ,dict(name='div', attrs={'id':'k1006094r1c1t5w380h529'}) - ,dict(name='div', attrs={'id':'tum_login_form_container'}) - ,dict(name='div', attrs={'class':'discHeader'}) - ,dict(name='div', attrs={'class':'margin-bottom-neg-2'}) + dict(name='div', attrs={'class':[ + 'clear text-center small padding-left-right-5 text-999 padding-top-5 padding-bottom-10 grey-solid-line', + 'clear width-625 bg-fff padding-top-10' + ]}), + dict(name='video'), ] + def preprocess_html(self, soup): + h1 = soup.find('h1') + if h1 is not None: + text = self.tag_to_string(h1) + nh = Tag(soup, 'h1') + nh.insert(0, text) + h1.replaceWith(nh) + + return soup + feeds = [(u'News', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article312900.ece') ,(u'Sport', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article247732.ece')