From 5cc6465941057f1a3e2820f37f0750cd48774d26 Mon Sep 17 00:00:00 2001
From: Tom Scholl
Date: Mon, 4 Apr 2011 16:35:09 +0000
Subject: [PATCH 1/2] Added fast no image version of the daily mail recipe

---
Reviewer notes (placed after the three-dash line, so they are commentary
and not part of the commit message):

* Creates recipes/daily_mail_fast.recipe, a BasicNewsRecipe subclass named
  TheDailyMail whose title is "The Daily Mail (fast)".
* Pictures are dropped by the dict(name=['img']) entry in remove_tags; only
  the div with id "js-article-text" is kept (keep_only_tags) and
  no_stylesheets is set to True.
* Settings visible in the patch: oldest_article = 2,
  max_articles_per_feed = 50, simultaneous_downloads = 1, and eleven
  dailymail.co.uk RSS feeds.
* NOTE(review): extra_css styles "#js-article-text" (id) and
  ".js-article-text" (class, font-size:50%), while keep_only_tags matches the
  id only. Confirm the class selector is intended.
* NOTE(review): remove_tags_before and print_version exist only as
  commented-out code in the new file.
* NOTE(review): this copy of the series had lost its line breaks and
  indentation. Line boundaries were restored from the diff markers and hunk
  line counts; indentation (including inside the extra_css string) and any
  trailing whitespace are reconstructed. Verify against the original commit
  before applying.

 recipes/daily_mail_fast.recipe | 52 ++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 recipes/daily_mail_fast.recipe

diff --git a/recipes/daily_mail_fast.recipe b/recipes/daily_mail_fast.recipe
new file mode 100644
index 0000000000..9ddb6f77f8
--- /dev/null
+++ b/recipes/daily_mail_fast.recipe
@@ -0,0 +1,52 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TheDailyMail(BasicNewsRecipe):
+    title = u'The Daily Mail (fast)'
+    oldest_article = 2
+    language = 'en_GB'
+
+    author = 'RufusA and Sujata Raman'
+    description = 'Faster and smaller version of the Daily Mail that does not download pictures'
+    simultaneous_downloads= 1
+    max_articles_per_feed = 50
+
+    extra_css = '''#js-article-text{font-family:Arial,Helvetica,sans-serif;}
+                   h1{font-size:x-large; font-weight:bold;}
+                   a.author{color:#003580;}
+                   .js-article-text{font-size:50%;}
+                   .imageCaption{font-size:x-small; font-weight:bold}
+
+
+                '''
+
+    remove_tags = [ dict(name='div', attrs={'class':['article-icon-links-container','print-or-mail-links cleared',
+                                                     'social-links cleared','explore-links','relatedItems','intellicrumbs box','travel','moduleHalf']}),
+                    dict(name='div', attrs={'id':['link-unit-wrapper','pushdown-ad','reader-comments','googleAds',]}),
+                    dict(name='h3', attrs={'class':['social-links-title']}),
+                    dict(name='span', attrs={'class':['clickToEnlargeTop']}),
+                    dict(name=['img']),
+                  ]
+    #remove_tags_before = dict(name='div', attrs={'id':'content'})
+    keep_only_tags = [dict(name='div', attrs={'id':'js-article-text'})]
+
+    no_stylesheets = True
+
+    feeds = [
+        (u'Home', u'http://www.dailymail.co.uk/home/index.rss'),
+        (u'News', u'http://www.dailymail.co.uk/news/index.rss'),
+        (u'Sport', u'http://www.dailymail.co.uk/sport/index.rss'),
+        (u'TV and Showbiz', u'http://www.dailymail.co.uk/tvshowbiz/index.rss'),
+        (u'Femail', u'http://www.dailymail.co.uk/femail/index.rss'),
+        (u'Health', u'http://www.dailymail.co.uk/health/index.rss'),
+        (u'Science and Technology', u'http://www.dailymail.co.uk/sciencetech/index.rss'),
+        (u'Money', u'http://www.dailymail.co.uk/money/index.rss'),
+        (u'Property', u'http://www.dailymail.co.uk/property/index.rss'),
+        (u'Motoring', u'http://www.dailymail.co.uk/motoring/index.rss'),
+        (u'Travel', u'http://www.dailymail.co.uk/travel/index.rss')
+    ]
+
+    #def print_version(self, url):
+    #    main = url.partition('?')[0]
+    #    return main + '?printingPage=true'
+
+

From 43b57cb343edcf9f6d17f0b7013ec0c8e2b67d51 Mon Sep 17 00:00:00 2001
From: Tom Scholl
Date: Thu, 7 Apr 2011 22:16:12 +0000
Subject: [PATCH 2/2] More detailed sections for guardian recipe

---
Reviewer notes (placed after the three-dash line, so they are commentary
and not part of the commit message):

* Second hunk (a generator inside class Guardian; presumably find_sections,
  whose def line is outside the hunk): instead of yielding one
  (title, href) pair per link inside each <strong class="book">, the code
  now skips section titles listed in ignore_sections and yields one pair per
  <a class="book-section"> found under the parent of that <strong>. Each
  yielded title is prefixed with "<section title>: " unless the section
  title is 'Main section'.
* Third hunk (parse_index): the ignore_sections test is removed, so every
  pair returned by find_sections() is appended to feeds; the filter now
  lives in the generator changed by the second hunk.
* The other changed lines replace or delete lines that appear blank in this
  copy (whitespace clean-up).
* NOTE(review): "not section_title in self.ignore_sections" would read more
  idiomatically as "section_title not in self.ignore_sections".
* NOTE(review): parse_index keeps its bare "except:" (unchanged context
  lines), which turns any failure into NotImplementedError.
* NOTE(review): this copy of the series had lost its line breaks and
  indentation. Line boundaries were restored from the diff markers and hunk
  line counts; indentation of context lines and any trailing whitespace on
  the removed lines are reconstructed. Verify against the original commit
  before applying.

 recipes/guardian.recipe | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/recipes/guardian.recipe b/recipes/guardian.recipe
index 6211997b06..c5021cb91d 100644
--- a/recipes/guardian.recipe
+++ b/recipes/guardian.recipe
@@ -28,7 +28,7 @@ class Guardian(BasicNewsRecipe):
     # List of section titles to ignore
     # For example: ['Sport']
     ignore_sections = []
-
+
     timefmt = ' [%a, %d %b %Y]'
     keep_only_tags = [
         dict(name='div', attrs={'id':["content","article_header","main-article-info",]}),
@@ -87,8 +87,14 @@ class Guardian(BasicNewsRecipe):
         idx = soup.find('div', id='book-index')
         for s in idx.findAll('strong', attrs={'class':'book'}):
             a = s.find('a', href=True)
-            yield (self.tag_to_string(a), a['href'])
-
+            section_title = self.tag_to_string(a)
+            if not section_title in self.ignore_sections:
+                prefix = ''
+                if section_title != 'Main section':
+                    prefix = section_title + ': '
+                for subsection in s.parent.findAll('a', attrs={'class':'book-section'}):
+                    yield (prefix + self.tag_to_string(subsection), subsection['href'])
+
     def find_articles(self, url):
         soup = self.index_to_soup(url)
         div = soup.find('div', attrs={'class':'book-index'})
@@ -109,15 +115,12 @@ class Guardian(BasicNewsRecipe):
                         'title': title, 'url':url, 'description':desc,
                         'date' : strftime('%a, %d %b'),
                         }
-
+
     def parse_index(self):
         try:
             feeds = []
             for title, href in self.find_sections():
-                if not title in self.ignore_sections:
-                    feeds.append((title, list(self.find_articles(href))))
+                feeds.append((title, list(self.find_articles(href))))
             return feeds
         except:
             raise NotImplementedError
-
-