Updated LWN weekly

This commit is contained in:
Kovid Goyal 2011-03-22 15:02:28 -06:00
parent b18cd40a4e
commit 377fd1d058

View File

@ -7,6 +7,7 @@ lwn.net
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import re
class WeeklyLWN(BasicNewsRecipe):
@ -23,6 +24,11 @@ class WeeklyLWN(BasicNewsRecipe):
remove_tags_after = dict(attrs={'class':'ArticleText'})
remove_tags = [dict(name=['h2', 'form'])]
preprocess_regexps = [
# Remove the <hr> and "Log in to post comments"
(re.compile(r'<hr.*?comments[)]', re.DOTALL), lambda m: ''),
]
conversion_options = { 'linearize_tables' : True }
oldest_article = 7.0
@ -40,15 +46,16 @@ class WeeklyLWN(BasicNewsRecipe):
def parse_index(self):
if self.username is not None and self.password is not None:
index_url = 'http://lwn.net/current/bigpage'
index_url = 'http://lwn.net/current/bigpage?format=printable'
else:
index_url = 'http://lwn.net/free/bigpage'
index_url = 'http://lwn.net/free/bigpage?format=printable'
soup = self.index_to_soup(index_url)
body = soup.body
articles = {}
ans = []
url_re = re.compile('^http://lwn.net/Articles/')
old_section = None
url_re = re.compile('^/Articles/')
while True:
tag_title = body.findNext(name='p', attrs={'class':'SummaryHL'})
@ -77,7 +84,7 @@ class WeeklyLWN(BasicNewsRecipe):
if tag_url == None:
break
body = tag_url
if tag_url.string == None:
if tag_url.string == None:
continue
elif tag_url.string == 'Full Story':
break
@ -88,10 +95,10 @@ class WeeklyLWN(BasicNewsRecipe):
if tag_url == None:
break
article = dict(
title=tag_title.string,
url=tag_url['href'].split('#')[0],
url= 'http://lwn.net' + tag_url['href'].split('#')[0] + '?format=printable',
description='', content='', date='')
articles[section].append(article)