Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Add NASA profile

parent 1bff449a4c
commit 686fee381f
@@ -36,10 +36,12 @@ from libprs500.ebooks.lrf.web.profiles.jutarnji import Jutarnji
 from libprs500.ebooks.lrf.web.profiles.usatoday import USAToday
 from libprs500.ebooks.lrf.web.profiles.upi import UnitedPressInternational
 from libprs500.ebooks.lrf.web.profiles.wash_post import WashingtonPost
+from libprs500.ebooks.lrf.web.profiles.nasa import NASA
 
 builtin_profiles = [Atlantic, AssociatedPress, Barrons, BBC,
                     ChristianScienceMonitor, CNN, Dilbert, Economist, FazNet,
-                    JerusalemPost, Jutarnji, Newsweek, NewYorker,
+                    JerusalemPost, Jutarnji, NASA, Newsweek, NewYorker,
                     NewYorkReviewOfBooks, NYTimes, UnitedPressInternational, USAToday,
                     Portfolio, Reuters, SpiegelOnline, WallStreetJournal,
                     WashingtonPost, ZeitNachrichten,
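The hunk above registers the new profile: the NASA class is imported and appended to builtin_profiles, the list of built-in DefaultProfile subclasses that the news-download code consults. As a rough illustration only (not the actual web2lrf lookup code), a profile could be resolved from that list by its title attribute:

def find_profile(builtin_profiles, name):
    # Hypothetical helper: return the first registered profile class whose
    # title matches the requested name (case-insensitive), else None.
    for prof in builtin_profiles:
        if prof.title.lower() == name.lower():
            return prof
    return None

# e.g. find_profile(builtin_profiles, 'NASA') would return the NASA class added above.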
src/libprs500/ebooks/lrf/web/profiles/automatic.py (new file, 87 lines)
@@ -0,0 +1,87 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import os

from libprs500.ebooks.lrf.web.profiles import DefaultProfile
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500 import iswindows
from libprs500.ebooks.chardet import xml_to_unicode

class AutomaticRSSProfile(DefaultProfile):
    '''
    Make downloading of RSS feeds completely automatic. The only input
    required is the URL of the feed.
    '''

    max_recursions = 2
    def __init__(self, *args, **kwargs):
        self.cindex = 1
        DefaultProfile.__init__(self, *args, **kwargs)

    def fetch_content(self, index):
        # Rewrite the downloaded index so that every article link points at a
        # locally generated HTML file containing only the article text.
        raw = open(index, 'rb').read()
        if self.encoding:
            raw = raw.decode(self.encoding)
            enc = self.encoding
        else:
            raw, enc = xml_to_unicode(raw)
        isoup = BeautifulSoup(raw)
        for a in isoup.findAll('a', href=True):
            src = a['href']
            if src.startswith('file:'):
                src = src[5:]
            if os.access(src, os.R_OK):
                # The link already points to a local file; process it recursively.
                self.fetch_content(src)
                continue
            try:
                src = self.browser.open(src).read()
            except:
                continue
            soup = BeautifulSoup(src)
            header, content = [], []
            head = soup.find('head')
            if head is not None:
                for style in head('style'):
                    header.append(unicode(style))
            body = soup.find('body')
            if body is None:
                continue
            for tag in body(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
                # Skip headings and paragraphs that live inside a table, as
                # those are usually navigation or advertising, not article text.
                in_table = False
                c = tag.parent
                while c is not None:
                    if c.name == 'table':
                        in_table = True
                        break
                    c = c.parent
                if in_table:
                    continue
                content.append(unicode(tag))

            cfile = 'content%d.html' % self.cindex
            self.cindex += 1
            cfile = os.path.join(os.path.dirname(index), cfile)
            html = '<html>\n<head>%s</head>\n<body>%s</body></html>' % ('\n'.join(header), '\n'.join(content))

            open(cfile, 'wb').write(html.encode(enc))
            a['href'] = ('file:' if iswindows else '') + cfile
        open(index, 'wb').write(unicode(isoup).encode(enc))
    def build_index(self):
        index = DefaultProfile.build_index(self)
        self.fetch_content(index)
        return index
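As the docstring says, AutomaticRSSProfile is meant to need nothing beyond a feed URL: build_index() downloads the feed via DefaultProfile, then fetch_content() follows each article link, keeps only the headline and paragraph tags that are not buried inside layout tables, and rewrites the link to point at the locally generated contentN.html file. A minimal sketch of pointing such a profile at one feed, assuming the same title/get_feeds() interface used by the hand-written profiles (the class name and feed below are hypothetical):

from libprs500.ebooks.lrf.web.profiles.automatic import AutomaticRSSProfile

class MyFeed(AutomaticRSSProfile):
    # Hypothetical example profile: everything other than the feed URL is
    # left to the AutomaticRSSProfile / DefaultProfile defaults.
    title = 'My Feed'

    def get_feeds(self):
        # (name, URL) pairs, in the same format as the NASA profile below.
        return [('My Feed', 'http://example.com/rss.xml')]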
src/libprs500/ebooks/lrf/web/profiles/nasa.py (new file, 91 lines)
@@ -0,0 +1,91 @@
## Copyright (C) 2008 B.Scott Wxby [bswxby] &
## Copyright (C) 2007 David Chen SonyReader<at>DaveChen<dot>org
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## Version 0.3-2008_2_28
## Based on WIRED.py by David Chen, 2007, and newsweek.py, bbc.py, nytimes.py by Kovid Goyal
## https://libprs500.kovidgoyal.net/wiki/UserProfiles
##
## Usage:
## >web2lrf --user-profile nasa.py
## Comment out the RSS feeds you don't want in the last section below
##
## Output:
## NASA [YearMonthDate Time].lrf
##
'''
Custom User Profile to download RSS News Feeds and Articles from nasa.gov
'''
import re

from libprs500.ebooks.lrf.web.profiles import DefaultProfile

class NASA(DefaultProfile):

    title = 'NASA'
    max_recursions = 2
    timefmt = ' [%Y%b%d %H%M]'
    html_description = True
    no_stylesheets = True

    ## Don't grab articles more than 7 days old
    oldest_article = 7
    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
        ## Fix the encoding to UTF-8
        (r'<meta http-equiv="Content-Type" content="text/html; charset=(\S+)"', lambda match: match.group().replace(match.group(1), 'UTF-8')),

        ## Remove any banners/links/ads/cruft before the body of the article.
        (r'<body.*?((<div id="article_body">)|(<div id="st-page-maincontent">)|(<div id="containermain">)|(<p class="ap-story-p">)|(<!-- img_nav -->))', lambda match: '<body><div>'),

        ## Remove any links/ads/comments/cruft from the end of the body of the article.
        (r'((<!-- end article content -->)|(<div id="st-custom-afterpagecontent">)|(<p class="ap-story-p">©)|(<div class="entry-footer">)|(<div id="see_also">)|(<p>Via <a href=)|(<div id="ss_nav">)).*?</html>', lambda match: '</div></body></html>'),

        ## Correctly embed in-line images by removing the surrounding javascript that will be ignored in the conversion
        (r'<a.*?onclick.*?>.*?(<img .*?>)', lambda match: match.group(1)),

        ## This removes header and footer information from each print version.
        (r'<!-- Top Header starts -->.*?<!-- Body starts -->', lambda match: '<New Stuff>'),
        (r'<hr align="center" width="200"><p align="center">.*?<!-- Press Release standard text ends -->', lambda match: '<New Stuff>'),
        (r'<!-- Top Header starts -->.*?<!---->', lambda match: '<New Stuff>'),

        ## This removes the "download image" of various sizes from the Image of the day.
        (r'<div id="download_image_box_print">.*?<div id="caption_region_print">', lambda match: '<New Stuff>'),
        ]
    ]
    ## NASA's print pages differ only by the ending "_prt.htm", so I've replaced them below.
    def print_version(self, url):
        return url.replace('.html', '_prt.htm')

    ## Comment out the feeds you don't want retrieved.
    ## Or add any new RSS feed URLs here; they are sorted alphabetically when converted to LRF.
    ## If you want one of these at the top, append a space in front of the name.
    def get_feeds(self):
        return [
            (' Breaking News', 'http://www.nasa.gov/rss/breaking_news.rss'),
            ('Image of the Day', 'http://www.nasa.gov/rss/image_of_the_day.rss'),
            ('Moon and Mars Exploration', 'http://www.nasa.gov/rss/moon_mars.rss'),
            ('Shuttle and Station News', 'http://www.nasa.gov/rss/shuttle_station.rss'),
            ('Solar System News', 'http://www.nasa.gov/rss/solar_system.rss'),
            ('Universe News', 'http://www.nasa.gov/rss/universe.rss'),
            ('Earth News', 'http://www.nasa.gov/rss/earth.rss'),
            ('Aeronautics News', 'http://www.nasa.gov/rss/aeronautics.rss'),
            ]
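Each entry in preprocess_regexps pairs a regular expression with a callable that receives the match object and returns replacement text; the downloaded article HTML is run through every pair before conversion. A minimal sketch of that substitution step (illustrative only, not the actual DefaultProfile implementation):

import re

def preprocess(html, preprocess_regexps):
    # Apply every (compiled_pattern, replacement_callable) pair in order.
    for pattern, func in preprocess_regexps:
        html = pattern.sub(func, html)
    return html

# e.g. a rule in the spirit of NASA's first entry rewrites the charset declaration:
rules = [(re.compile(r'charset="(\S+)"', re.IGNORECASE),
          lambda m: m.group().replace(m.group(1), 'UTF-8'))]
print(preprocess('<meta charset="iso-8859-1">', rules))  # -> <meta charset="UTF-8">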
@@ -46,6 +46,7 @@
 <file>images/news/newsweek.png</file>
 <file>images/news/nytimes.png</file>
 <file>images/news/economist.png</file>
+<file>images/news/nasa.png</file>
 <file>images/news/newyorker.png</file>
 <file>images/news/zeitde.png</file>
 <file>images/news/spiegelde.png</file>