.*?(
.*?).*?
(.*?)
' # noqa
conversion_options = {
'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True
}
feeds = [
(u'Reviews', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=reviews'),
(u'Commentary', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=COMMENTARY'),
(u'Great Movies', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=REVIEWS08'),
(u'People', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=PEOPLE'),
(u'Glossary', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=GLOSSARY')
]
preprocess_regexps = [
(re.compile(r'
.*?This is a printer friendly.*?.*?
', re.DOTALL | re.IGNORECASE),
lambda m: '')
]
def print_version(self, url):
return url + '&template=printart'
def parse_index(self):
totalfeeds = []
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.log('\tFeedurl: ', feedurl)
self.report_progress(0, _('Fetching feed') + ' %s...' %
(feedtitle if feedtitle else feedurl))
articles = []
page = self.index_to_soup(feedurl, raw=True)
if feedtitle == 'Reviews' or feedtitle == 'Great Movies':
pattern = self.patternReviews
elif feedtitle == 'Commentary':
pattern = self.patternCommentary
elif feedtitle == 'People':
pattern = self.patternPeople
elif feedtitle == 'Glossary':
pattern = self.patternGlossary
regex = re.compile(pattern, re.IGNORECASE | re.DOTALL)
for match in regex.finditer(page):
if feedtitle == 'Reviews' or feedtitle == 'Great Movies':
movietitle = match.group(1)
thislink = match.group(2)
description = match.group(3)
elif feedtitle == 'Commentary' or feedtitle == 'People' or feedtitle == 'Glossary':
thislink = match.group(1)
description = match.group(2)
self.log(thislink)
soup = self.index_to_soup(thislink)
for link in soup.findAll('a', href=True):
thisurl = self.PREFIX + link['href']
thislinktext = self.tag_to_string(link)
if feedtitle == 'Reviews' or feedtitle == 'Great Movies':
thistitle = movietitle
elif feedtitle == 'Commentary' or feedtitle == 'People' or feedtitle == 'Glossary':
thistitle = thislinktext
if thistitle == '':
thistitle = 'Ebert Journal Post'
r"""
pattern2 = r'AID=\/(.*?)\/'
reg2 = re.compile(pattern2, re.IGNORECASE|re.DOTALL)
match2 = reg2.search(thisurl)
date = match2.group(1)
c = time.strptime(match2.group(1),"%Y%m%d")
date=time.strftime("%a, %b %d, %Y", c)
self.log(date)
"""
articles.append({
'title': thistitle, 'date': '', 'url': thisurl, 'description': description
})
totalfeeds.append((feedtitle, articles))
return totalfeeds