mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Fix Newsweek profile
This commit is contained in:
parent 2d2d59a1de
commit 65b565cb59
@@ -19,6 +19,21 @@ from libprs500 import iswindows
 from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
 from htmlentitydefs import name2codepoint
 
+DAY_MAP   = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
+MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
+FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
+                      July=7, August=8, September=9, October=10,
+                      November=11, December=12)
+
+def strptime(src):
+    src = src.strip().split()
+    src[0] = str(DAY_MAP[src[0][:-1]])+','
+    try:
+        src[2] = str(MONTH_MAP[src[2]])
+    except KeyError:
+        src[2] = str(FULL_MONTH_MAP[src[2]])
+    return time.strptime(' '.join(src), '%w, %d %m %Y %H:%M:%S %Z')
+
 def process_html_description(tag):
     src = '\n'.join(tag.contents)
     replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
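For context, the new strptime helper sidesteps Python's locale-dependent %a/%b directives by rewriting the day and month names to numbers before parsing. A minimal standalone sketch of the same idea (the sample pubdate string is made up):

    import time, calendar

    DAY_MAP   = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
    MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6,
                     Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)

    pubdate = 'Sun, 29 Jul 2007 10:30:00 GMT'      # hypothetical RSS timestamp
    parts = pubdate.strip().split()
    parts[0] = str(DAY_MAP[parts[0][:-1]]) + ','   # 'Sun,' -> '0,'
    parts[2] = str(MONTH_MAP[parts[2]])            # 'Jul'  -> '7'
    # with every name numeric, one locale-independent format string works
    secs = calendar.timegm(time.strptime(' '.join(parts),
                                         '%w, %d %m %Y %H:%M:%S %Z'))

The FULL_MONTH_MAP fallback covers feeds that spell the month out ('July' rather than 'Jul').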
@@ -41,7 +56,12 @@ def parse_feeds(feeds, browser, print_version,
     '''
     articles = {}
     for title, url in feeds:
-        src = browser.open(url).read()
+        try:
+            src = browser.open(url).read()
+        except Exception, err:
+            print 'Could not fetch feed: %s\nError: %s'%(url, err)
+            continue
+
         articles[title] = []
         soup = BeautifulStoneSoup(src)
         for item in soup.findAll('item'):
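The fetch is now wrapped so one dead feed no longer aborts the whole run. The same skip-on-failure pattern as a standalone sketch (fetch_pages is a made-up name; any mechanize-style object with an open() method fits the browser argument):

    def fetch_pages(feeds, browser):
        # returns {title: raw feed xml}, skipping feeds that fail to load
        pages = {}
        for title, url in feeds:
            try:
                pages[title] = browser.open(url).read()
            except Exception as err:
                print('Could not fetch feed: %s\nError: %s' % (url, err))
        return pages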
@@ -53,14 +73,14 @@ def parse_feeds(feeds, browser, print_version,
                 d = {
                     'title'    : item.find('title').string,
                     'url'      : print_version(item.find('guid').string),
-                    'timestamp': calendar.timegm(time.strptime(pubdate,
-                                 '%a, %d %b %Y %H:%M:%S %Z')),
+                    'timestamp': calendar.timegm(strptime(pubdate)),
                     'date'     : pubdate
                     }
                 delta = time.time() - d['timestamp']
                 if delta > oldest_article*3600*24:
                     continue
-            except:
+            except Exception, err:
                 continue
             try:
                 desc = item.find('description')
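The delta test above is a plain age cutoff; pulled out as a sketch (is_stale is a made-up name, oldest_article is in days):

    import time

    def is_stale(timestamp, oldest_article):
        # timestamp: seconds since the epoch (UTC), as calendar.timegm returns
        return time.time() - timestamp > oldest_article * 3600 * 24

With oldest_article=15, anything older than fifteen days is dropped before it reaches the index.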
@@ -72,6 +92,8 @@ def parse_feeds(feeds, browser, print_version,
         articles[title][max_articles_per_feed:] = []
         for item in articles[title]:
             item.pop('timestamp')
+        if not articles[title]:
+            articles.pop(title)
     return articles
 
 
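The new pruning step keeps build_index from emitting a section heading with no articles under it; it is equivalent to this one-line sketch:

    # drop feed titles whose article list came back empty
    articles = dict((t, a) for t, a in articles.items() if a)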
@@ -20,30 +20,33 @@ from libprs500.ebooks.lrf.web import build_index, parse_feeds
 from libprs500 import __appname__, iswindows, browser
 
 RSS_FEEDS = [
-             ('Cover Story', 'http://feeds.newsweek.com/CoverStory'),
+             ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
              ('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'),
+             ('Politics', 'http://feeds.newsweek.com/headlines/politics'),
+             ('Health', 'http://feeds.newsweek.com/headlines/health'),
+             ('Business', 'http://feeds.newsweek.com/headlines/business'),
+             ('Science and Technology', 'http://feeds.newsweek.com/headlines/technology/science'),
              ('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
              ('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
              ('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
-             ('Health', 'http://feeds.newsweek.com/sections/health'),
              ('Society', 'http://feeds.newsweek.com/newsweek/society'),
-             ('Business', 'http://feeds.newsweek.com/newsweek/business'),
-             ('Science and Technology', 'http://feeds.newsweek.com/newsweek/TechnologyScience'),
              ('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
-             ('Tip Sheet', 'http://feeds.newsweek.com/newsweek/TipSheet/Highlights'),
             ]
 
 
 def print_version(url):
     if '?' in url:
         url = url[:url.index('?')]
-    return url + 'print/1/displaymode/1098/'
+    if not url.endswith('/'):
+        url += '/'
+    return url + 'output/print'
 
 def initialize(profile):
     profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
     profile['browser'] = browser()
     articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,
-                           max_articles_per_feed=20, html_description=True)
+                           max_articles_per_feed=20, oldest_article=15,
+                           html_description=True)
     index = build_index('Newsweek', articles, profile['temp dir'])
     profile['url'] = 'file:'+ ('' if iswindows else '//') + index
     profile['timefmt'] = ' [%d %b %Y]'
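Newsweek's print pages moved from the print/1/displaymode/1098/ suffix to output/print appended to the article path. A standalone sketch of the rewrite, with a made-up article URL:

    def print_version(url):
        # drop any query string, ensure a trailing slash, add the print suffix
        if '?' in url:
            url = url[:url.index('?')]
        if not url.endswith('/'):
            url += '/'
        return url + 'output/print'

    # hypothetical input/output:
    # print_version('http://www.newsweek.com/id/32131?from=rss')
    #   -> 'http://www.newsweek.com/id/32131/output/print'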
@@ -82,7 +82,6 @@ profiles = {
     'newsweek' : {
                 'initialize'         : newsweek_initialize,
                 'finalize'           : newsweek_finalize,
-                'no_stylesheets'     : True,
                 'preprocess_regexps' :
                    [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
                    [
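The preprocess_regexps entries compile down to (pattern, substitution) pairs applied to each fetched page before conversion. A minimal made-up entry showing the shape:

    import re

    # hypothetical rule: collapse runs of &nbsp; entities to a single space
    preprocess_regexps = [
        (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [ (r'(&nbsp;)+', lambda match: ' ') ]
    ]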