diff --git a/src/calibre/web/feeds/recipes/recipe_harpers.py b/src/calibre/web/feeds/recipes/recipe_harpers.py index 6370f6e0ea..6cdcfe800d 100644 --- a/src/calibre/web/feeds/recipes/recipe_harpers.py +++ b/src/calibre/web/feeds/recipes/recipe_harpers.py @@ -6,6 +6,7 @@ __copyright__ = '2008-2009, Darko Miletic ' harpers.org ''' from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag class Harpers(BasicNewsRecipe): title = u"Harper's Magazine" @@ -18,23 +19,30 @@ class Harpers(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - remove_javascript = True html2lrf_options = [ '--comment', description , '--category', category , '--publisher', publisher ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' - - + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' + + keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ] remove_tags = [ - dict(name='table', attrs={'class':'rcnt'}) - ,dict(name='table', attrs={'class':'rcnt topline'}) + dict(name='table', attrs={'class':['rcnt','rcnt topline']}) ,dict(name=['link','object','embed']) ] feeds = [(u"Harper's Magazine", u'http://www.harpers.org/rss/frontpage-rss20.xml')] + def preprocess_html(self, soup): + mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) + soup.head.insert(1,mcharset) + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll(xmlns=True): + del item['xmlns'] + return soup + diff --git a/src/calibre/web/feeds/recipes/recipe_harpers_full.py b/src/calibre/web/feeds/recipes/recipe_harpers_full.py index 69ec9d54f5..bec16cd5c4 100644 --- a/src/calibre/web/feeds/recipes/recipe_harpers_full.py +++ b/src/calibre/web/feeds/recipes/recipe_harpers_full.py @@ -9,40 +9,38 @@ images and pdf's are ignored ''' from calibre import strftime - from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag class Harpers_full(BasicNewsRecipe): title = u"Harper's Magazine - articles from printed edition" __author__ = u'Darko Miletic' description = u"Harper's Magazine: Founded June 1850." publisher = "Harpers's" - category = 'news, politics, USA' + category = 'news, politics, USA' oldest_article = 30 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - simultaneous_downloads = 1 delay = 1 language = _('English') needs_subscription = True INDEX = strftime('http://www.harpers.org/archive/%Y/%m') LOGIN = 'http://www.harpers.org' cover_url = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif') - remove_javascript = True - + html2lrf_options = [ '--comment', description , '--category', category , '--publisher', publisher ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ] remove_tags = [ - dict(name='table', attrs={'class':'rcnt'}) - ,dict(name='table', attrs={'class':'rcnt topline'}) + dict(name='table', attrs={'class':['rcnt','rcnt topline']}) + ,dict(name='link') ] def get_browser(self): @@ -54,13 +52,13 @@ class Harpers_full(BasicNewsRecipe): br['password'] = self.password br.submit() return br - + def parse_index(self): articles = [] print 'Processing ' + self.INDEX soup = self.index_to_soup(self.INDEX) for item in soup.findAll('div', attrs={'class':'title'}): - text_link = item.parent.find('img',attrs={'alt':'Text'}) + text_link = item.parent.find('img',attrs={'alt':'Text'}) if text_link: url = self.LOGIN + item.a['href'] title = item.a.contents[0] @@ -72,4 +70,12 @@ class Harpers_full(BasicNewsRecipe): ,'description':'' }) return [(soup.head.title.string, articles)] - + + def preprocess_html(self, soup): + mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) + soup.head.insert(1,mcharset) + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll(xmlns=True): + del item['xmlns'] + return soup