From f179a74a079ca8904333e860ba81b233039e8b40 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 19 Feb 2009 10:24:48 -0800 Subject: [PATCH] Fix #1891 (Updated various recipes for better EPUB support) --- .../web/feeds/recipes/recipe_harpers.py | 69 +++++++++++-------- .../web/feeds/recipes/recipe_harpers_full.py | 19 ++--- .../web/feeds/recipes/recipe_pobjeda.py | 19 +++-- .../web/feeds/recipes/recipe_pressonline.py | 7 +- 4 files changed, 58 insertions(+), 56 deletions(-) diff --git a/src/calibre/web/feeds/recipes/recipe_harpers.py b/src/calibre/web/feeds/recipes/recipe_harpers.py index e15263730d..6370f6e0ea 100644 --- a/src/calibre/web/feeds/recipes/recipe_harpers.py +++ b/src/calibre/web/feeds/recipes/recipe_harpers.py @@ -1,29 +1,40 @@ -#!/usr/bin/env python - -__license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' -''' -harpers.org -''' -from calibre.web.feeds.news import BasicNewsRecipe - -class Harpers(BasicNewsRecipe): - title = u"Harper's Magazine" - __author__ = u'Darko Miletic' - language = _('English') - description = u"Harper's Magazine: Founded June 1850." - oldest_article = 30 - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - timefmt = ' [%A, %d %B, %Y]' - - keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ] - remove_tags = [ - dict(name='table', attrs={'class':'rcnt'}) - ,dict(name='table', attrs={'class':'rcnt topline'}) - ] - - feeds = [ - (u"Harper's Magazine", u'http://www.harpers.org/rss/frontpage-rss20.xml') - ] +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2008-2009, Darko Miletic ' +''' +harpers.org +''' +from calibre.web.feeds.news import BasicNewsRecipe + +class Harpers(BasicNewsRecipe): + title = u"Harper's Magazine" + __author__ = u'Darko Miletic' + language = _('English') + description = u"Harper's Magazine: Founded June 1850." + publisher = "Harper's Magazine " + category = 'news, politics, USA' + oldest_article = 30 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + remove_javascript = True + + html2lrf_options = [ + '--comment', description + , '--category', category + , '--publisher', publisher + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' + + + keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ] + remove_tags = [ + dict(name='table', attrs={'class':'rcnt'}) + ,dict(name='table', attrs={'class':'rcnt topline'}) + ,dict(name=['link','object','embed']) + ] + + feeds = [(u"Harper's Magazine", u'http://www.harpers.org/rss/frontpage-rss20.xml')] + diff --git a/src/calibre/web/feeds/recipes/recipe_harpers_full.py b/src/calibre/web/feeds/recipes/recipe_harpers_full.py index 72e633bde0..69ec9d54f5 100644 --- a/src/calibre/web/feeds/recipes/recipe_harpers_full.py +++ b/src/calibre/web/feeds/recipes/recipe_harpers_full.py @@ -10,8 +10,8 @@ images and pdf's are ignored from calibre import strftime -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe + class Harpers_full(BasicNewsRecipe): title = u"Harper's Magazine - articles from printed edition" __author__ = u'Darko Miletic' @@ -23,7 +23,8 @@ class Harpers_full(BasicNewsRecipe): no_stylesheets = True use_embedded_content = False simultaneous_downloads = 1 - delay = 1 + delay = 1 + language = _('English') needs_subscription = True INDEX = strftime('http://www.harpers.org/archive/%Y/%m') LOGIN = 'http://www.harpers.org' @@ -31,12 +32,12 @@ class Harpers_full(BasicNewsRecipe): remove_javascript = True html2lrf_options = [ - '--comment', description + '--comment', description , '--category', category , '--publisher', publisher ] - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ] remove_tags = [ @@ -71,10 +72,4 @@ class Harpers_full(BasicNewsRecipe): ,'description':'' }) return [(soup.head.title.string, articles)] - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - return soup - - language = _('English') \ No newline at end of file + diff --git a/src/calibre/web/feeds/recipes/recipe_pobjeda.py b/src/calibre/web/feeds/recipes/recipe_pobjeda.py index 9a4dbb0eee..5afb2b3f6a 100644 --- a/src/calibre/web/feeds/recipes/recipe_pobjeda.py +++ b/src/calibre/web/feeds/recipes/recipe_pobjeda.py @@ -17,9 +17,6 @@ class Pobjeda(BasicNewsRecipe): description = 'News from Montenegro' publisher = 'Pobjeda a.d.' category = 'news, politics, Montenegro' - language = _('Serbian') - oldest_article = 2 - max_articles_per_feed = 100 no_stylesheets = True remove_javascript = True encoding = 'utf8' @@ -30,12 +27,14 @@ class Pobjeda(BasicNewsRecipe): html2lrf_options = [ '--comment', description + , '--base-font-size', '10' , '--category', category , '--publisher', publisher ] - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' - + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' + + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] keep_only_tags = [dict(name='div', attrs={'class':'vijest'})] @@ -64,8 +63,6 @@ class Pobjeda(BasicNewsRecipe): soup.html['lang'] = 'sr-Latn-ME' mtag = '' soup.head.insert(0,mtag) - for item in soup.findAll(style=True): - del item['style'] return soup def get_cover_url(self): @@ -81,16 +78,16 @@ class Pobjeda(BasicNewsRecipe): lfeeds = self.get_feeds() for feedobj in lfeeds: feedtitle, feedurl = feedobj - self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl)) + self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl)) articles = [] - soup = self.index_to_soup(feedurl) + soup = self.index_to_soup(feedurl) for item in soup.findAll('div', attrs={'class':'vijest'}): description = self.tag_to_string(item.h2) atag = item.h1.find('a') - if atag: + if atag and atag.has_key('href'): url = self.INDEX + '/' + atag['href'] title = self.tag_to_string(atag) - date = strftime(self.timefmt) + date = strftime(self.timefmt) articles.append({ 'title' :title ,'date' :date diff --git a/src/calibre/web/feeds/recipes/recipe_pressonline.py b/src/calibre/web/feeds/recipes/recipe_pressonline.py index 41525cfc5f..71f69b9169 100644 --- a/src/calibre/web/feeds/recipes/recipe_pressonline.py +++ b/src/calibre/web/feeds/recipes/recipe_pressonline.py @@ -32,7 +32,7 @@ class PressOnline(BasicNewsRecipe): , '--publisher', publisher ] - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -61,7 +61,6 @@ class PressOnline(BasicNewsRecipe): soup.html['lang'] = 'sr-Latn-RS' mtag = '\n' soup.head.insert(0,mtag) - img = soup.find('img') - if img: - del img['align'] + for img in soup.findAll('img', align=True): + del img['align'] return soup \ No newline at end of file