Mirror of https://github.com/kovidgoyal/calibre.git
Enhancements for The Age recipe: strip out useless links, grab the PDF front page, editorial and letters, and sort the feeds explicitly
This commit is contained in:
parent 257a82f06b
commit 23b01a98e0
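For context on the parse_index changes below: a calibre recipe's parse_index returns a list of (feed title, article list) pairs, one per section, where each article is a plain dict. A schematic sketch of the shape the new code builds (the headline and URL are invented, only the structure matches):

    # Shape of the parse_index return value; entries are invented.
    feeds = [
        ('National', [
            {'title': 'Some headline',
             'url': 'http://www.theage.com.au/national/some-story.html',
             'date': 'Mon, 01 Jan',
             'description': '',
             'content': ''},
        ]),
        ('World', []),  # one (title, [articles]) pair per section
    ]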
@@ -9,15 +9,19 @@ theage.com.au
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
+import re
 
 class TheAge(BasicNewsRecipe):
 
     title = 'The Age'
     description = 'Business News, World News and Breaking News in Melbourne, Australia'
+    publication_type = 'newspaper'
     __author__ = 'Matthew Briggs'
     language = 'en_AU'
 
     max_articles_per_feed = 1000
+    recursions = 0
+    remove_tags = [dict(name=['table', 'script', 'noscript', 'style']), dict(name='a', attrs={'href':'/'}), dict(name='a', attrs={'href':'/text/'})]
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
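The two anchor filters added to remove_tags are the "strip out useless links" part: the text-mode article pages repeat bare links back to the home page (href="/") and the text index (href="/text/"). A minimal sketch of what those filters match, following the recipe's Python 2 / BeautifulSoup 3 conventions; the HTML snippet is invented, and extract() stands in for calibre removing the matched tags:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    html = ('<p><a href="/">TheAge</a> <a href="/text/">Text index</a> '
            '<a href="/national/some-story.html">A story</a></p>')
    soup = BeautifulSoup(html)
    for href in ('/', '/text/'):
        for a in soup.findAll('a', attrs={'href': href}):
            a.extract()   # drop the navigation link from the tree
    print soup            # only the story link is left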
@@ -28,30 +32,81 @@ class TheAge(BasicNewsRecipe):
 
         soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/text/').read())
 
-        feeds, articles = [], []
-        feed = None
+        section = None
+        sections = {}
 
         for tag in soup.findAll(['h3', 'a']):
             if tag.name == 'h3':
-                if articles:
-                    feeds.append((feed, articles))
-                    articles = []
-                feed = self.tag_to_string(tag)
-            elif feed is not None and tag.has_key('href') and tag['href'].strip():
+                section = self.tag_to_string(tag)
+                sections[section] = []
+
+            # Make sure to skip: <a href="/">TheAge</a>
+
+            elif section and tag.has_key('href') and len(tag['href'].strip())>1:
                 url = tag['href'].strip()
                 if url.startswith('/'):
                     url = 'http://www.theage.com.au' + url
                 title = self.tag_to_string(tag)
-                articles.append({
+                sections[section].append({
                     'title': title,
                     'url' : url,
                     'date' : strftime('%a, %d %b'),
                     'description' : '',
                     'content' : '',
                     })
 
+        feeds = []
+
+        # Insert feeds in specified order, if available
+
+        feedSort = [ 'National', 'World', 'Opinion', 'Columns', 'Business', 'Sport', 'Entertainment' ]
+        for i in feedSort:
+            if i in sections:
+                feeds.append((i,sections[i]))
+
+        # Done with the sorted feeds
+
+        for i in feedSort:
+            del sections[i]
+
+        # Append what is left over...
+
+        for i in sections:
+            feeds.append((i,sections[i]))
+
         return feeds
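The explicit ordering works by first collecting every section into a dict, then emitting the preferred section names in feedSort order and appending whatever remains. One caveat: the cleanup loop above does del sections[i] unguarded, so if a preferred section is missing from the page on a given day it raises KeyError. A standalone sketch of the same pattern (section names invented) that sidesteps this with dict.pop:

    sections = {'World': ['w1'], 'Puzzles': ['p1'], 'National': ['n1']}
    feedSort = ['National', 'World', 'Opinion']

    feeds = []
    for name in feedSort:          # preferred sections, in order
        if name in sections:
            feeds.append((name, sections.pop(name)))
    for name in sections:          # append what is left over
        feeds.append((name, sections[name]))

    print feeds   # [('National', ['n1']), ('World', ['w1']), ('Puzzles', ['p1'])]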
+
+    def get_cover_url(self):
+
+        soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/todays-paper').read())
+
+        for i in soup.findAll('a'):
+            href = i['href']
+            if href and re.match('http://www.theage.com.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage.pdf',href):
+                return href
+
+        return None
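get_cover_url walks today's-paper page and returns the first link that looks like the dated front-page PDF. A quick check of the pattern against a URL of the expected shape (the date is invented); note that re.match anchors at the start of the string only, and the unescaped dots match any character, which is harmless here:

    import re

    pat = 'http://www.theage.com.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage.pdf'
    print bool(re.match(pat, 'http://www.theage.com.au/frontpage/2010/09/01/frontpage.pdf'))  # True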
+
+    def preprocess_html(self,soup):
+
+        for p in soup.findAll('p'):
+
+            # Collapse the paragraph by joining the non-tag contents
+
+            contents = [i for i in p.contents if isinstance(i,unicode)]
+            if len(contents):
+                contents = ''.join(contents)
+
+                # Filter out what's left of the text-mode navigation stuff
+
+                if re.match('((\s)|(\&nbsp\;))*\[[\|\s*]*\]((\s)|(\&nbsp\;))*$',contents):
+                    p.extract()
+                    continue
+
+                # Shrink the fine print font
+
+                if contents=='This material is subject to copyright and any unauthorised use, copying or mirroring is prohibited.':
+                    p['style'] = 'font-size:small'
+                    continue
+
+        return soup
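preprocess_html collapses each paragraph's text and drops the ones that are nothing but leftover text-mode navigation, i.e. bracketed separator runs like [ | | ]. A quick check with the same pattern (the &nbsp; alternative is kept literal, since the text pages can leave the entity unexpanded):

    import re

    nav = re.compile('((\s)|(\&nbsp\;))*\[[\|\s*]*\]((\s)|(\&nbsp\;))*$')
    print bool(nav.match(' [ | | ] '))            # True: paragraph is removed
    print bool(nav.match('An actual sentence.'))  # False: paragraph is kept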