mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add NASA profile
This commit is contained in:
parent
1bff449a4c
commit
686fee381f
@ -36,10 +36,12 @@ from libprs500.ebooks.lrf.web.profiles.jutarnji import Jutarnji
|
||||
from libprs500.ebooks.lrf.web.profiles.usatoday import USAToday
|
||||
from libprs500.ebooks.lrf.web.profiles.upi import UnitedPressInternational
|
||||
from libprs500.ebooks.lrf.web.profiles.wash_post import WashingtonPost
|
||||
from libprs500.ebooks.lrf.web.profiles.nasa import NASA
|
||||
|
||||
|
||||
builtin_profiles = [Atlantic, AssociatedPress, Barrons, BBC,
|
||||
ChristianScienceMonitor, CNN, Dilbert, Economist, FazNet,
|
||||
JerusalemPost, Jutarnji, Newsweek, NewYorker,
|
||||
JerusalemPost, Jutarnji, NASA, Newsweek, NewYorker,
|
||||
NewYorkReviewOfBooks, NYTimes, UnitedPressInternational, USAToday,
|
||||
Portfolio, Reuters, SpiegelOnline, WallStreetJournal,
|
||||
WashingtonPost, ZeitNachrichten,
|
||||
|
87
src/libprs500/ebooks/lrf/web/profiles/automatic.py
Normal file
87
src/libprs500/ebooks/lrf/web/profiles/automatic.py
Normal file
@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env python
|
||||
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
|
||||
## This program is free software; you can redistribute it and/or modify
|
||||
## it under the terms of the GNU General Public License as published by
|
||||
## the Free Software Foundation; either version 2 of the License, or
|
||||
## (at your option) any later version.
|
||||
##
|
||||
## This program is distributed in the hope that it will be useful,
|
||||
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
## GNU General Public License for more details.
|
||||
##
|
||||
## You should have received a copy of the GNU General Public License along
|
||||
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
import os
|
||||
|
||||
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
|
||||
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from libprs500 import iswindows
|
||||
from libprs500.ebooks.chardet import xml_to_unicode
|
||||
|
||||
class AutomaticRSSProfile(DefaultProfile):
    '''
    Make downloading of RSS feeds completely automatic. The only input
    required is the URL of the feed.

    After the normal index is built, each linked article is fetched,
    reduced to its headings and paragraphs, written to a local
    ``content<N>.html`` file, and the index link is rewritten to point
    at the local copy.
    '''

    max_recursions = 2

    def __init__(self, *args, **kwargs):
        # Counter used to generate unique local content file names.
        self.cindex = 1
        # Fix: the original called DefaultProfile.__init__(*args, **kwargs)
        # without passing self, so the base class was never initialized.
        DefaultProfile.__init__(self, *args, **kwargs)

    def fetch_content(self, index):
        '''
        Post-process the downloaded index file at path ``index``.

        Links to readable local files are recursed into; remote links are
        downloaded, stripped to headings/paragraphs (content inside tables
        is skipped, since tables typically hold navigation/ads), saved
        next to the index, and the link is retargeted to the saved file.
        '''
        f = open(index, 'rb')
        try:
            raw = f.read()
        finally:
            f.close()
        if self.encoding:
            raw = raw.decode(self.encoding)
            enc = self.encoding
        else:
            # Let the chardet-based helper pick an encoding for us.
            raw, enc = xml_to_unicode(raw)
        isoup = BeautifulSoup(raw)
        for a in isoup.findAll('a', href=True):
            src = a['href']
            if src.startswith('file:'):
                src = src[5:]
            if os.access(src, os.R_OK):
                # Already a readable local file: recurse into it instead
                # of downloading.
                self.fetch_content(src)
                continue
            try:
                src = self.browser.open(src).read()
            except Exception:
                # Best effort: skip links that cannot be downloaded.
                # (Was a bare except, which also swallowed KeyboardInterrupt.)
                continue
            soup = BeautifulSoup(src)
            header, content = [], []
            head = soup.find('head')
            if head is not None:
                # Preserve inline stylesheets so the extracted content
                # keeps its original look.
                for style in head('style'):
                    header.append(unicode(style))
            body = soup.find('body')
            if body is None:
                continue
            for tag in body(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
                # Skip anything nested inside a table (usually chrome,
                # not article text).
                in_table = False
                c = tag.parent
                while c is not None:
                    if c.name == 'table':
                        in_table = True
                        break
                    c = c.parent
                if in_table:
                    continue
                content.append(unicode(tag))

            cfile = 'content%d.html'%self.cindex
            self.cindex += 1
            cfile = os.path.join(os.path.dirname(index), cfile)
            html = '<html>\n<head>%s</head>\n<body>%s</body></html>'%('\n'.join(header), '\n'.join(content))

            out = open(cfile, 'wb')
            try:
                out.write(html.encode(enc))
            finally:
                out.close()
            a['href'] = ('file:' if iswindows else '') + cfile
        # Persist the index with its links rewritten to local copies.
        out = open(index, 'wb')
        try:
            out.write(unicode(isoup).encode(enc))
        finally:
            out.close()

    def build_index(self):
        '''
        Build the index as usual, then localize the linked article content.

        Fix: the original dropped the index path instead of returning it,
        breaking callers that rely on DefaultProfile.build_index()'s
        return value.
        '''
        index = DefaultProfile.build_index(self)
        self.fetch_content(index)
        return index
|
||||
|
91
src/libprs500/ebooks/lrf/web/profiles/nasa.py
Normal file
91
src/libprs500/ebooks/lrf/web/profiles/nasa.py
Normal file
@ -0,0 +1,91 @@
|
||||
## Copyright (C) 2008 B.Scott Wxby [bswxby] &
|
||||
## Copyright (C) 2007 David Chen SonyReader<at>DaveChen<dot>org
|
||||
##
|
||||
## This program is free software; you can redistribute it and/or modify
|
||||
## it under the terms of the GNU General Public License as published by
|
||||
## the Free Software Foundation; either version 2 of the License, or
|
||||
## (at your option) any later version.
|
||||
##
|
||||
## This program is distributed in the hope that it will be useful,
|
||||
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
## GNU General Public License for more details.
|
||||
##
|
||||
## Version 0.3-2008_2_28
|
||||
## Based on WIRED.py by David Chen, 2007, and newsweek.py, bbc.py, nytimes.py by Kovid Goyal
|
||||
## https://libprs500.kovidgoyal.net/wiki/UserProfiles
|
||||
##
|
||||
## Usage:
|
||||
## >web2lrf --user-profile nasa.py
|
||||
## Comment out the RSS feeds you don't want in the last section below
|
||||
##
|
||||
## Output:
|
||||
## NASA [YearMonthDate Time].lrf
|
||||
##
|
||||
'''
|
||||
Custom User Profile to download RSS News Feeds and Articles from nasa.gov
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class NASA(DefaultProfile):
    '''
    Profile to download RSS news feeds and articles from nasa.gov,
    producing ``NASA [YearMonthDate Time].lrf``.
    '''

    title = 'NASA'
    max_recursions = 2
    timefmt = ' [%Y%b%d %H%M]'
    html_description = True
    no_stylesheets = True

    ## Don't grab articles more than 7 days old
    oldest_article = 7

    ## Every entry is a (raw pattern string, substitution callable) pair;
    ## the comprehension compiles them all with IGNORECASE | DOTALL.
    ## Fix: four entries were pre-compiled re.compile(...) objects, and
    ## re.compile() on an already-compiled pattern with flags raises
    ## ValueError at import time — they are now plain raw strings.
    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            ## Fix the encoding to UTF-8
            (r'<meta http-equiv="Content-Type" content="text/html; charset=(\S+)"', lambda match : match.group().replace(match.group(1), 'UTF-8')),

            ## Remove any banners/links/ads/cruft before the body of the article.
            (r'<body.*?((<div id="article_body">)|(<div id="st-page-maincontent">)|(<div id="containermain">)|(<p class="ap-story-p">)|(<!-- img_nav -->))', lambda match: '<body><div>'),

            ## Remove any links/ads/comments/cruft from the end of the body of the article.
            (r'((<!-- end article content -->)|(<div id="st-custom-afterpagecontent">)|(<p class="ap-story-p">©)|(<div class="entry-footer">)|(<div id="see_also">)|(<p>Via <a href=)|(<div id="ss_nav">)).*?</html>', lambda match : '</div></body></html>'),

            ## Correctly embed in-line images by removing the surrounding javascript that will be ignored in the conversion
            (r'<a.*?onclick.*?>.*?(<img .*?>)', lambda match: match.group(1),),

            ## This removes header and footer information from each print version.
            (r'<!-- Top Header starts -->.*?<!-- Body starts -->', lambda match : '<New Stuff>'),
            (r'<hr align="center" width="200"><p align="center">.*?<!-- Press Release standard text ends -->', lambda match : '<New Stuff>'),
            (r'<!-- Top Header starts -->.*?<!---->', lambda match : '<New Stuff>'),

            ## This removes the "download image" of various sizes from the Image of the day.
            (r'<div id="download_image_box_print">.*?<div id="caption_region_print">', lambda match : '<New Stuff>'),
        ]
    ]

    ## NASA's print pages differ only by the ending "_prt.htm", so
    ## rewrite each article URL to its print version.
    def print_version(self, url):
        return url.replace('.html', '_prt.htm')

    ## Comment out the feeds you don't want retrieved.
    ## Or add any new RSS feed URLs here; feeds are sorted alphabetically
    ## when converted to LRF.
    ## If you want one of these at the top, prepend a space to the name.
    def get_feeds(self):
        return [
            (' Breaking News', 'http://www.nasa.gov/rss/breaking_news.rss'),
            ('Image of the Day', 'http://www.nasa.gov/rss/image_of_the_day.rss'),
            ('Moon and Mars Exploration', 'http://www.nasa.gov/rss/moon_mars.rss'),
            ('Shuttle and Station News', 'http://www.nasa.gov/rss/shuttle_station.rss'),
            ('Solar System News', 'http://www.nasa.gov/rss/solar_system.rss'),
            ('Universe News', 'http://www.nasa.gov/rss/universe.rss'),
            ('Earth News', 'http://www.nasa.gov/rss/earth.rss'),
            ('Aeronautics News', 'http://www.nasa.gov/rss/aeronautics.rss'),
        ]
|
||||
|
@ -46,6 +46,7 @@
|
||||
<file>images/news/newsweek.png</file>
|
||||
<file>images/news/nytimes.png</file>
|
||||
<file>images/news/economist.png</file>
|
||||
<file>images/news/nasa.png</file>
|
||||
<file>images/news/newyorker.png</file>
|
||||
<file>images/news/zeitde.png</file>
|
||||
<file>images/news/spiegelde.png</file>
|
||||
|
Loading…
x
Reference in New Issue
Block a user