# calibre/recipes/roger_ebert_blog.recipe
# 2019-04-01 13:57:21 +05:30
#
# 125 lines
# 5.4 KiB
# Plaintext

import re
import time
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
'''
Help Needed:
I still can't figure out why I'm getting strange characters, especially in the Great Movies descriptions in the TOC.
Can anyone help me figure that out?
Change Log:
2011-02-19: Version 2: Added "Oscars" section and fixed date problem
'''
class Ebert(BasicNewsRecipe):
    """Recipe that scrapes Roger Ebert's section pages on rogerebert.suntimes.com.

    Each entry in ``feeds`` is an HTML section page rather than a ready-made
    article list. ``parse_index`` downloads every page, extracts the
    headline/blurb pairs with the ``pattern*`` regexes defined below and
    returns them as per-feed article lists.
    """

    title = 'Roger Ebert'
    __author__ = 'Shane Erstad'
    version = 2
    description = 'Roger Ebert Movie Reviews'
    publisher = 'Chicago Sun Times'
    category = 'movies'
    oldest_article = 8
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'UTF-8'
    masthead_url = 'http://rogerebert.suntimes.com/graphics/global/roger.jpg'
    language = 'en'
    remove_empty_feeds = False

    # Site root; prepended to every href found on a section page.
    PREFIX = 'http://rogerebert.suntimes.com'

    # Section-page regexes. The review pattern captures (movie title, headline
    # markup, blurb); the other four capture (headline anchor, blurb).
    patternReviews = r'<span class="*?movietitle"*?>(.*?)</span>.*?<div class="*?headline"*?>(.*?)</div>(.*?)</div>'
    patternCommentary = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?COMMENTARY.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'  # noqa
    patternPeople = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?PEOPLE.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'  # noqa
    patternOscars = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?OSCARS.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'  # noqa
    patternGlossary = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?GLOSSARY.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'  # noqa

    # Feeds whose matches carry the extra leading "movie title" group.
    _REVIEW_FEEDS = ('Reviews', 'Great Movies')

    # Extracts the first path segment after "AID=" from an article URL;
    # parse_index tries to read it as a YYYYMMDD date. Compiled once here
    # instead of once per link.
    _AID_RE = re.compile(r'AID=\/(.*?)\/', re.IGNORECASE | re.DOTALL)

    conversion_options = {
        'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True
    }

    feeds = [
        (u'Reviews', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=reviews'),
        (u'Commentary', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=COMMENTARY'),
        (u'Great Movies', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=REVIEWS08'),
        (u'People', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=PEOPLE'),
        (u'Oscars', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=OSCARS'),
        (u'Glossary', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=GLOSSARY')
    ]

    # Strips the "This is a printer friendly..." banner (through the next <hr>).
    preprocess_regexps = [
        (re.compile(r'<font.*?>.*?This is a printer friendly.*?</font>.*?<hr>', re.DOTALL | re.IGNORECASE),
         lambda m: '')
    ]

    def print_version(self, url):
        """Return ``url`` with the ``template=printart`` query argument appended."""
        return url + '&template=printart'

    def _article_date(self, url):
        """Return a formatted date for ``url``.

        The ``AID=/<segment>/`` part of the URL is parsed as ``YYYYMMDD``. When
        the segment is absent or is not a valid date, the current date is used
        instead, so one odd link cannot abort the whole download.
        """
        found = self._AID_RE.search(url)
        if found:
            try:
                parsed = time.strptime(found.group(1), "%Y%m%d")
            except ValueError:
                # Segment is not a date; fall through to "today".
                pass
            else:
                return strftime("%A, %B %d, %Y", parsed)
        return strftime("%A, %B %d, %Y")

    def parse_index(self):
        """Build the article index by scraping every section page in ``feeds``.

        Returns:
            A list of ``(feed_title, articles)`` tuples. ``articles`` is a list
            of dicts with the keys ``title``, ``date``, ``url`` and
            ``description``. Feeds whose title has no associated scraping
            pattern are logged and left out.
        """
        # Looked up through ``self`` so a subclass overriding a pattern is honoured.
        patterns = {
            'Reviews': self.patternReviews,
            'Great Movies': self.patternReviews,
            'Commentary': self.patternCommentary,
            'People': self.patternPeople,
            'Glossary': self.patternGlossary,
            'Oscars': self.patternOscars,
        }

        totalfeeds = []
        for feedtitle, feedurl in self.get_feeds():
            self.log('\tFeedurl: ', feedurl)
            self.report_progress(0, _('Fetching feed') + ' %s...' %
                                 (feedtitle if feedtitle else feedurl))

            pattern = patterns.get(feedtitle)
            if pattern is None:
                # Previously this path hit an unbound (or stale) ``pattern``.
                self.log('\tNo scraping pattern for feed, skipping: ', feedtitle)
                continue
            is_review = feedtitle in self._REVIEW_FEEDS

            page = self.index_to_soup(feedurl, raw=True)
            if isinstance(page, bytes):
                # The patterns are text regexes, so undecoded bytes must be
                # converted first; this also keeps non-ASCII characters in
                # titles/descriptions intact. 'replace' avoids failing on a
                # stray invalid byte.
                page = page.decode(self.encoding or 'utf-8', 'replace')

            articles = []
            regex = re.compile(pattern, re.IGNORECASE | re.DOTALL)
            for match in regex.finditer(page):
                if is_review:
                    movietitle, thislink, description = match.group(1, 2, 3)
                else:
                    movietitle = None
                    thislink, description = match.group(1, 2)
                self.log(thislink)

                # ``thislink`` is an HTML fragment; parse it to reach its anchors.
                soup = self.index_to_soup(thislink)
                for link in soup.findAll('a', href=True):
                    thisurl = self.PREFIX + link['href']
                    thislinktext = self.tag_to_string(link)
                    # Reviews are titled by the movie; everything else by link text.
                    thistitle = movietitle if is_review else thislinktext
                    if not thistitle:
                        continue

                    mydate = self._article_date(thisurl)
                    self.log(mydate)
                    articles.append({
                        'title': thistitle, 'date': ' [' + mydate + ']', 'url': thisurl, 'description': description
                    })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds