mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
As the security section has no URLs in article titles, findNext() boldly returns whatever next link is encountered after the anchor. This leads to heavy CVE reports being downloaded and included in the generated document, as links to them are usually placed after the article title. Instead we'd better search under the anchor tag only; this way we keep only the links of useful articles.
154 lines
4.8 KiB
Python
154 lines
4.8 KiB
Python
#!/usr/bin/env python2
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2011, Davide Cavalca <davide125 at tiscali.it>'
|
|
'''
|
|
lwn.net
|
|
'''
|
|
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
import re
|
|
import sys
|
|
|
|
|
|
class WeeklyLWN(BasicNewsRecipe):
    """Recipe that builds an e-book from the LWN.net Weekly Edition.

    The index is read from the printable "bigpage" of the edition:
    ``/current/bigpage`` when both a username and a password are configured,
    ``/free/bigpage`` otherwise.  Headlines on that page are grouped into
    sections (``Cat1HL``), optional subsections (``Cat2HL``) and articles
    (``SummaryHL``).
    """

    title = 'LWN.net Weekly Edition'
    description = 'Weekly summary of what has happened in the free software world.'
    __author__ = 'Davide Cavalca'
    language = 'en'
    # Prefix used by print_version() to turn site-relative paths into URLs.
    site_url = u'http://lwn.net'

    no_stylesheets = True
    remove_javascript = True
    extra_css = '''pre,code,samp,kbd,tt { font-size: 80% }
        blockquote {margin-left:0 }
        DIV.BigQuote,SPAN { font-style:oblique }
        DIV.FeatureByline,DIV.GaByline { background-color:#EEE }
        DIV.tlr { background-color:#EED; border-style:groove; }
        * { color: black }'''

    cover_url = site_url + '/images/lcorner.png'
    # masthead_url = 'http://lwn.net/images/lcorner.png'
    publication_type = 'magazine'

    keep_only_tags = [dict(attrs={'class': ['PageHeadline', 'ArticleText']})]
    remove_tags = [dict(name=['h2', 'form'])]

    preprocess_regexps = [
        # Remove the <hr> and "Log in to post comments"
        (re.compile(r'<hr [^>]+>\s*\n\s*.*?comments[)]'), lambda m: ''),
    ]

    conversion_options = {
        'no_inline_navbars': True,
    }

    oldest_article = 7.0
    # Credentials are optional: get_browser() only logs in, and parse_index()
    # only uses the subscriber page, when both username and password are set.
    needs_subscription = 'optional'

    def get_browser(self):
        """Return the browser used for downloads, logged in when possible.

        The login form is submitted only when both ``self.username`` and
        ``self.password`` are set; otherwise the base-class browser is
        returned untouched.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://lwn.net/login')
            br.select_form(name='loginform')
            br['Username'] = self.username
            br['Password'] = self.password
            br.submit()
        return br

    def print_version(self, url):
        """Return the printable-format URL for ``url``.

        The fragment (``#...``) is dropped, ``site_url`` is prepended to
        site-relative paths, and ``?format=printable`` is appended unless the
        URL already ends with it.

        :param url: site-relative path (e.g. ``/Articles/123/``) or absolute
            http/https URL.
        :return: the URL of the printable page.
        """
        # Strip off anchor
        url = url.split('#')[0]

        # Prepend site_url, but only to URLs that are not already absolute.
        # Checking for any http/https scheme (not just the exact site_url
        # prefix) keeps e.g. 'https://lwn.net/...' from being turned into
        # 'http://lwn.nethttps://lwn.net/...'.
        if not url.startswith((self.site_url, 'http://', 'https://')):
            url = self.site_url + url

        # Append printable URL parameter
        print_param = '?format=printable'
        if not url.endswith(print_param):
            url += print_param

        return url

    def parse_index(self):
        """Build the article index from the weekly edition's "bigpage".

        Walks every headline element (classes ``Cat1HL``, ``Cat2HL`` and
        ``SummaryHL``) in document order.  ``Cat1HL`` starts a new section,
        ``Cat2HL`` a new subsection inside it, and each ``SummaryHL`` is an
        article filed under ``"<section>: <subsection>"`` (or just
        ``"<section>"`` when no subsection is active).  Headlines without an
        ``/Articles/`` link of their own are skipped with a warning.

        :return: list of ``(section_title, articles)`` tuples in order of
            first appearance, where ``articles`` is a list of dicts with the
            keys ``url``, ``title``, ``description``, ``content`` and
            ``date``.
        """
        if self.username is not None and self.password is not None:
            index_url = self.print_version('/current/bigpage')
        else:
            index_url = self.print_version('/free/bigpage')
        soup = self.index_to_soup(index_url)

        headline_classes = ['SummaryHL', 'Cat1HL', 'Cat2HL']
        # Compiled once here instead of on every loop iteration.
        article_href = re.compile('^/Articles/')

        articles = {}       # section title -> list of article dicts
        section_order = []  # section titles in order of first appearance

        # Until the first Cat1HL headline, articles are filed under the page
        # title.
        section = self.tag_to_string(soup.title)
        subsection = None

        curr = soup.body
        while True:
            curr = curr.findNext(attrs={'class': headline_classes})
            if curr is None:
                break

            # An empty headline tag has no first child; treat it as empty
            # text instead of failing with IndexError.
            if curr.contents:
                text = self.tag_to_string(curr.contents[0])
            else:
                text = ''

            css_class = curr['class']

            if 'Cat2HL' in css_class:
                subsection = text
                continue

            if 'Cat1HL' in css_class:
                section = text
                subsection = None
                continue

            if 'SummaryHL' not in css_class:
                self.log.error("lwn_weekly.recipe: something bad happened; should not be able to reach this")
                continue

            article_title = text
            if not article_title:
                article_title = _('Undefined article title')

            if subsection:
                section_title = "%s: %s" % (section, subsection)
            else:
                section_title = section

            # Most articles have anchors in their titles, *except* the
            # security vulnerabilities.  Search only inside the headline tag:
            # a document-wide search would return whatever link follows the
            # title, which for security entries is typically a heavy CVE
            # report.
            article_anchor = curr.find(name='a', attrs={'href': article_href})
            if not article_anchor:
                self.log.warn('article_anchor is None for "%s"; skipping' % article_title)
                continue

            article_url = article_anchor.get('href')
            if not article_url:
                # Reported through the recipe log like the other diagnostics
                # (the former `print >>sys.stderr` statement is Python 2
                # only).
                self.log.warn('article_url is None for article_anchor "%s": "%s"'
                              % (str(article_anchor), article_title))
                continue

            if section_title not in articles:
                articles[section_title] = []
                section_order.append(section_title)

            articles[section_title].append({
                'url': article_url,
                'title': article_title,
                'description': '', 'content': '', 'date': '',
            })

        return [(name, articles[name]) for name in section_order]
|
|
|
|
# vim: expandtab:ts=4:sw=4
|