diff --git a/src/libprs500/ebooks/lrf/web/__init__.py b/src/libprs500/ebooks/lrf/web/__init__.py
index 0481618836..b89a157b44 100644
--- a/src/libprs500/ebooks/lrf/web/__init__.py
+++ b/src/libprs500/ebooks/lrf/web/__init__.py
@@ -36,10 +36,12 @@ from libprs500.ebooks.lrf.web.profiles.jutarnji import Jutarnji
 from libprs500.ebooks.lrf.web.profiles.usatoday import USAToday
 from libprs500.ebooks.lrf.web.profiles.upi import UnitedPressInternational
 from libprs500.ebooks.lrf.web.profiles.wash_post import WashingtonPost
+from libprs500.ebooks.lrf.web.profiles.nasa import NASA
+
 
 builtin_profiles = [Atlantic, AssociatedPress, Barrons, BBC,
                     ChristianScienceMonitor, CNN, Dilbert, Economist, FazNet,
-                    JerusalemPost, Jutarnji, Newsweek, NewYorker,
+                    JerusalemPost, Jutarnji, NASA, Newsweek, NewYorker,
                     NewYorkReviewOfBooks, NYTimes, UnitedPressInternational, USAToday,
                     Portfolio, Reuters, SpiegelOnline, WallStreetJournal,
                     WashingtonPost, ZeitNachrichten,
diff --git a/src/libprs500/ebooks/lrf/web/profiles/automatic.py b/src/libprs500/ebooks/lrf/web/profiles/automatic.py
new file mode 100644
index 0000000000..edacf12434
--- /dev/null
+++ b/src/libprs500/ebooks/lrf/web/profiles/automatic.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+import os
+
+from libprs500.ebooks.lrf.web.profiles import DefaultProfile
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+from libprs500 import iswindows
+from libprs500.ebooks.chardet import xml_to_unicode
+
+class AutomaticRSSProfile(DefaultProfile):
+    '''
+    Make downloading of RSS feeds completely automatic. Only input
+    required is the URL of the feed.
+
+    Once the default index has been built, every page it links to is
+    downloaded and reduced to its <style> elements plus the paragraphs
+    and headings found outside of tables. The reduced copy is stored next
+    to the index and the link in the index is pointed at it.
+    '''
+
+    max_recursions = 2
+
+    def __init__(self, *args, **kwargs):
+        '''
+        All arguments are handed to DefaultProfile unchanged.
+        '''
+        # Running number that gives each extracted page a unique file name.
+        self.cindex = 1
+        # The base initializer is invoked through the class, so the instance
+        # must be passed explicitly; leaving it out raises a TypeError.
+        DefaultProfile.__init__(self, *args, **kwargs)
+
+    def fetch_content(self, index):
+        '''
+        Localize the pages linked from the HTML file at the path `index`.
+
+        A link that already resolves to a readable local file is treated as
+        a nested index and processed recursively. A link that cannot be
+        downloaded, or whose page has no <body>, is left as it is. Every
+        other link is pointed at a stripped down local copy of its page.
+        The index file is rewritten in place.
+        '''
+        f = open(index, 'rb')
+        try:
+            raw = f.read()
+        finally:
+            f.close()
+        if self.encoding:
+            raw = raw.decode(self.encoding)
+            enc = self.encoding
+        else:
+            raw, enc = xml_to_unicode(raw)
+        isoup = BeautifulSoup(raw)
+        for a in isoup.findAll('a', href=True):
+            src = a['href']
+            if src.startswith('file:'):
+                src = src[5:]
+            if os.access(src, os.R_OK):
+                self.fetch_content(src)
+                continue
+            try:
+                page = self.browser.open(src).read()
+            except Exception:
+                # Best effort: an unreachable page keeps its original link.
+                # Catching Exception keeps KeyboardInterrupt/SystemExit alive.
+                continue
+            soup = BeautifulSoup(page)
+            header, content = [], []
+            head = soup.find('head')
+            if head is not None:
+                for style in head('style'):
+                    header.append(unicode(style))
+            body = soup.find('body')
+            if body is None:
+                continue
+            for tag in body(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
+                # Skip text that sits anywhere inside a table.
+                in_table = False
+                c = tag.parent
+                while c is not None:
+                    if c.name == 'table':
+                        in_table = True
+                        break
+                    c = c.parent
+                if in_table:
+                    continue
+                content.append(unicode(tag))
+
+            cfile = 'content%d.html'%self.cindex
+            self.cindex += 1
+            cfile = os.path.join(os.path.dirname(index), cfile)
+            # NOTE(review): template kept exactly as it appears in this patch;
+            # it has no html/head/body wrapper -- confirm against the original.
+            html = '\n%s\n%s'%('\n'.join(header), '\n'.join(content))
+
+            out = open(cfile, 'wb')
+            try:
+                out.write(html.encode(enc))
+            finally:
+                out.close()
+            a['href'] = ('file:' if iswindows else '') + cfile
+        out = open(index, 'wb')
+        try:
+            out.write(unicode(isoup).encode(enc))
+        finally:
+            out.close()
+
+    def build_index(self):
+        '''
+        Build the index the way DefaultProfile does, localize the pages it
+        links to and return the index.
+        '''
+        index = DefaultProfile.build_index(self)
+        self.fetch_content(index)
+        # Without an explicit return the override would yield None instead
+        # of the value produced by DefaultProfile.build_index.
+        return index
\ No newline at end of file
diff --git a/src/libprs500/ebooks/lrf/web/profiles/nasa.py b/src/libprs500/ebooks/lrf/web/profiles/nasa.py
new file mode 100644
index 0000000000..bdad3c6fa9
--- /dev/null
+++ b/src/libprs500/ebooks/lrf/web/profiles/nasa.py
@@ -0,0 +1,91 @@
+## Copyright (C) 2008 B.Scott Wxby [bswxby] &
+## Copyright (C) 2007 David Chen SonyReaderDaveChenorg
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## Version 0.3-2008_2_28
+## Based on WIRED.py by David Chen, 2007, and newsweek.py, bbc.py, nytimes.py by Kovid Goyal
+## https://libprs500.kovidgoyal.net/wiki/UserProfiles
+##
+## Usage:
+## >web2lrf --user-profile nasa.py
+## Comment out the RSS feeds you don't want in the last section below
+##
+## Output:
+## NASA [YearMonthDate Time].lrf
+##
+'''
+Custom User Profile to download RSS News Feeds and Articles from Wired.com
+'''
+
+import re
+
+from libprs500.ebooks.lrf.web.profiles import DefaultProfile
+
+class NASA(DefaultProfile):
+
+    title = 'NASA'
+    max_recursions = 2
+    timefmt = ' [%Y%b%d %H%M]'
+    html_description = True
+    no_stylesheets = True
+
+    ## Don't grab articles more than 7 days old
+    oldest_article = 7
+
+    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+        [
+        ## Fix the encoding to UTF-8
+        (r')|(
)|(
)|(

)|())', lambda match: '

'), + + ## Remove any links/ads/comments/cruft from the end of the body of the article. + (r'(()|(
)|(

©)|(