mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Updated LWN weekly
This commit is contained in:
parent
b18cd40a4e
commit
377fd1d058
@ -7,6 +7,7 @@ lwn.net
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
import re
|
||||
|
||||
class WeeklyLWN(BasicNewsRecipe):
|
||||
@ -23,6 +24,11 @@ class WeeklyLWN(BasicNewsRecipe):
|
||||
remove_tags_after = dict(attrs={'class':'ArticleText'})
|
||||
remove_tags = [dict(name=['h2', 'form'])]
|
||||
|
||||
preprocess_regexps = [
|
||||
# Remove the <hr> and "Log in to post comments"
|
||||
(re.compile(r'<hr.*?comments[)]', re.DOTALL), lambda m: ''),
|
||||
]
|
||||
|
||||
conversion_options = { 'linearize_tables' : True }
|
||||
|
||||
oldest_article = 7.0
|
||||
@ -40,15 +46,16 @@ class WeeklyLWN(BasicNewsRecipe):
|
||||
|
||||
def parse_index(self):
|
||||
if self.username is not None and self.password is not None:
|
||||
index_url = 'http://lwn.net/current/bigpage'
|
||||
index_url = 'http://lwn.net/current/bigpage?format=printable'
|
||||
else:
|
||||
index_url = 'http://lwn.net/free/bigpage'
|
||||
index_url = 'http://lwn.net/free/bigpage?format=printable'
|
||||
soup = self.index_to_soup(index_url)
|
||||
body = soup.body
|
||||
|
||||
articles = {}
|
||||
ans = []
|
||||
url_re = re.compile('^http://lwn.net/Articles/')
|
||||
old_section = None
|
||||
url_re = re.compile('^/Articles/')
|
||||
|
||||
while True:
|
||||
tag_title = body.findNext(name='p', attrs={'class':'SummaryHL'})
|
||||
@ -77,7 +84,7 @@ class WeeklyLWN(BasicNewsRecipe):
|
||||
if tag_url == None:
|
||||
break
|
||||
body = tag_url
|
||||
if tag_url.string == None:
|
||||
if tag_url.string == None:
|
||||
continue
|
||||
elif tag_url.string == 'Full Story':
|
||||
break
|
||||
@ -88,10 +95,10 @@ class WeeklyLWN(BasicNewsRecipe):
|
||||
|
||||
if tag_url == None:
|
||||
break
|
||||
|
||||
|
||||
article = dict(
|
||||
title=tag_title.string,
|
||||
url=tag_url['href'].split('#')[0],
|
||||
url= 'http://lwn.net' + tag_url['href'].split('#')[0] + '?format=printable',
|
||||
description='', content='', date='')
|
||||
articles[section].append(article)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user