# Source path (taken from the original patch header): recipes/sunday_times_magazine
__license__ = 'GPL v3'
__copyright__ = '2010-2013, Darko Miletic '
'''
www.thetimes.co.uk
'''
import urllib

try:  # Python 3
    from urllib.parse import urlencode, urljoin
except ImportError:  # Python 2
    from urlparse import urljoin
    urlencode = urllib.urlencode

from calibre.web.feeds.news import BasicNewsRecipe


class TimesOnline(BasicNewsRecipe):
    """Recipe for The Sunday Times Magazine and Sunday Times Style sections.

    Article lists are scraped from the section index pages named in
    ``feeds`` (see ``parse_index``).  When a username and password are
    configured, ``get_browser`` submits them to the site's login URL before
    any page is fetched.
    """

    title = 'The Sunday Times Magazine UK'
    __author__ = 'Bobby Steel & Darko Miletic'
    description = 'newsmagazine from United Kingdom and World'
    language = 'en_GB'
    publisher = 'Times Newspapers Ltd'
    category = 'news, politics, UK'
    oldest_article = 3
    max_articles_per_feed = 500
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    delay = 1
    # Credentials are consumed by get_browser() below.
    needs_subscription = True
    publication_type = 'newspaper'
    # Base URL: sent as the post-login target and used to resolve article
    # links found on the index pages.
    INDEX = 'http://www.thetimes.co.uk/'
    # Not referenced anywhere in this file; kept for backward compatibility.
    PREFIX = u'http://www.thetimes.co.uk/'
    extra_css = """
        .author-name,.authorName{font-style: italic}
        .published-date,.multi-position-photo-text{font-family: Arial,Helvetica,sans-serif;
                                                   font-size: small; color: gray;
                                                   display:block; margin-bottom: 0.5em}
        body{font-family: Georgia,"Times New Roman",Times,serif}
        """

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    def get_browser(self):
        """Return a browser, logged in when credentials are configured.

        The home page is opened first; if both ``self.username`` and
        ``self.password`` are set, they are POSTed (form-encoded, together
        with ``gotoUrl``) to the login URL.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://www.thetimes.co.uk/')
        if self.username is not None and self.password is not None:
            # urlencode is resolved at import time so this works on both
            # Python 2 (urllib.urlencode) and Python 3 (urllib.parse).
            data = urlencode({
                'gotoUrl': self.INDEX,
                'username': self.username,
                'password': self.password,
            })
            br.open('https://login.thetimes.co.uk/', data)
        return br

    remove_tags = [
        {'name': ['object', 'link', 'iframe', 'base', 'meta', 'script']},
        {'attrs': {'class': [
            'tools comments-parent',
            'u-hide',
            'Tooltip',
            'Toolbar Toolbar--bottom',
            'Comments Article-container',
            'ArticlePager',
            'Media-caption',
            'RelatedLinks',
        ]}},
        # Matches any element whose class attribute contains 'Toolbar'.
        {'attrs': {'class': lambda x: x and 'Toolbar' in x}},
    ]

    remove_attributes = ['lang']
    keep_only_tags = [
        dict(attrs={'class': 'Article Article--default'}),
        dict(attrs={'class': 'f-author'}),
        dict(attrs={'id': 'bodycopy'}),
    ]
    remove_tags_after = dict(attrs={'class': 'Article-content'})

    # (section title, index page URL) pairs scraped by parse_index().
    feeds = [
        (u'The Sunday Times Magazine', u'http://www.thetimes.co.uk/magazine/the-sunday-times-magazine/'),
        (u'Sunday Times Style', u'http://www.thetimes.co.uk/magazine/style/'),
    ]

    def preprocess_html(self, soup):
        """Strip inline ``style`` attributes, then hand off to adeify_images.

        Returns whatever ``self.adeify_images(soup)`` returns.
        """
        for item in soup.findAll(style=True):
            del item['style']
        return self.adeify_images(soup)

    def parse_index(self):
        """Build the article list for every configured feed.

        For each ``(title, url)`` pair from ``self.get_feeds()`` the index
        page is fetched and every ``<a href=...>`` whose direct parent is an
        ``<h2>`` or ``<h3>`` and whose text is non-empty becomes an article.

        Returns:
            A list of ``(feed_title, articles)`` tuples, where ``articles``
            is a list of dicts with keys ``title``, ``date``, ``url`` and
            ``description``.
        """
        totalfeeds = []
        for feedtitle, feedurl in self.get_feeds():
            self.report_progress(0, _('Fetching feed') + ' %s...' %
                                 (feedtitle or feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            for atag in soup.findAll('a', href=True):
                # Only headline links (anchors directly inside h2/h3) count.
                if atag.parent.name not in ('h2', 'h3'):
                    continue
                title = self.tag_to_string(atag).strip()
                if not title:
                    continue
                articles.append({
                    'title': title,
                    'date': '',
                    # urljoin avoids the '//' produced by naive concatenation
                    # with root-relative hrefs and leaves absolute hrefs
                    # untouched.
                    'url': urljoin(self.INDEX, atag['href']),
                    'description': '',
                })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds