## Copyright (C) 2008 B.Scott Wxby [bswxby] & ## Copyright (C) 2007 David Chen SonyReaderDaveChenorg ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## Version 0.3-2008_2_28 ## Based on WIRED.py by David Chen, 2007, and newsweek.py, bbc.py, nytimes.py by Kovid Goyal ## ## Usage: ## >web2lrf --user-profile nasa.py ## Comment out the RSS feeds you don't want in the last section below ## ## Output: ## NASA [YearMonthDate Time].lrf ## ''' Custom User Profile to download RSS News Feeds and Articles from NASA ''' import re from calibre.web.feeds.news import BasicNewsRecipe class NASA(BasicNewsRecipe): title = 'NASA' timefmt = ' [%Y%b%d %H%M]' language = 'en' description = 'News from NASA' __author__ = 'Scott Wxby & David Chen' no_stylesheets = True ## Don't grab articles more than 30 days old oldest_article = 30 preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ ## Fix the encoding to UTF-8 (r')|(

)|(

)|())', lambda match: '

'), ## Remove any links/ads/comments/cruft from the end of the body of the article. (r'(()|(

)|(

©)|(

)|(

Via )).*?', lambda match : '

'), ## Correctly embed in-line images by removing the surrounding javascript that will be ignored in the conversion (r'.*?()', lambda match: match.group(1),), ## This removes header and footer information from each print version. (r'.*?', lambda match : ''), (r'

.*?', lambda match : ''), (r'.*?', lambda match : ''), ## This removes the "download image" of various sizes from the Image of the day. (r'

.*?