mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add recipes for The Irish Times and The International Herald Tribune
This commit is contained in:
parent
c31e509de2
commit
6334cc850c
@ -98,6 +98,10 @@ class BasicNewsRecipe(object, LoggingInterface):
|
|||||||
#: embedded content.
|
#: embedded content.
|
||||||
use_embedded_content = None
|
use_embedded_content = None
|
||||||
|
|
||||||
|
#: Set to True and implement :method:`get_obfuscated_article` to handle
|
||||||
|
#: websites that try to make it difficult to scrape content.
|
||||||
|
articles_are_obfuscated = False
|
||||||
|
|
||||||
#: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files
|
#: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files
|
||||||
#: It will be inserted into `<style>` tags, just before the closing
|
#: It will be inserted into `<style>` tags, just before the closing
|
||||||
#: `</head>` tag thereby overriding all :term:`CSS` except that which is
|
#: `</head>` tag thereby overriding all :term:`CSS` except that which is
|
||||||
@ -360,12 +364,25 @@ class BasicNewsRecipe(object, LoggingInterface):
|
|||||||
'''
|
'''
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def get_obfuscated_article(self, url, logger):
    '''
    If you set :member:`articles_are_obfuscated` this method is called with
    every article URL. It should return the path to a file on the filesystem
    that contains the article HTML. That file is processed by the recursive
    HTML fetching engine, so it can contain links to pages/images on the web.

    This method is typically useful for sites that try to make it difficult to
    access article content automatically. See for example the
    :module:`calibre.web.feeds.recipes.iht` recipe.

    :param url: URL of the article to retrieve.
    :param logger: Logger object handed in by the caller. The base
                   implementation does not use it.
    :return: Path to a local file containing the article HTML.
    :raises NotImplementedError: Always, in this base implementation;
                                 recipes that set
                                 :member:`articles_are_obfuscated` must
                                 override this method.
    '''
    raise NotImplementedError
|
||||||
|
|
||||||
def __init__(self, options, parser, progress_reporter):
|
def __init__(self, options, parser, progress_reporter):
|
||||||
'''
|
'''
|
||||||
Initialize the recipe.
|
Initialize the recipe.
|
||||||
@param options: Parsed commandline options
|
:param options: Parsed commandline options
|
||||||
@param parser: Command line option parser. Used to intelligently merge options.
|
:param parser: Command line option parser. Used to intelligently merge options.
|
||||||
@param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
|
:param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
|
||||||
'''
|
'''
|
||||||
LoggingInterface.__init__(self, logging.getLogger('feeds2disk'))
|
LoggingInterface.__init__(self, logging.getLogger('feeds2disk'))
|
||||||
if not isinstance(self.title, unicode):
|
if not isinstance(self.title, unicode):
|
||||||
@ -564,7 +581,11 @@ class BasicNewsRecipe(object, LoggingInterface):
|
|||||||
|
|
||||||
def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
    '''
    Fetch a single article given its URL.

    Thin wrapper: every argument is forwarded unchanged to
    :meth:`_fetch_article` and its result is returned as is.
    '''
    fetched = self._fetch_article(url, dir, logger, f, a, num_of_feeds)
    return fetched
|
||||||
|
|
||||||
|
def fetch_obfuscated_article(self, url, dir, logger, f, a, num_of_feeds):
    '''
    Fetch an article whose HTML must first be obtained through
    :meth:`get_obfuscated_article`.

    The hook is called with the article URL and the logger and returns the
    path of a local file. That path is made absolute, turned into a
    ``file:`` URL and passed to :meth:`_fetch_article` together with the
    remaining arguments, which are forwarded unchanged.
    '''
    local_path = os.path.abspath(self.get_obfuscated_article(url, logger))
    # The URL prefix differs by platform: 'file:' on Windows, 'file://'
    # everywhere else.
    if iswindows:
        scheme = 'file:'
    else:
        scheme = 'file://'
    return self._fetch_article(scheme + local_path, dir, logger, f, a,
                               num_of_feeds)
|
||||||
|
|
||||||
def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
|
def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
|
||||||
pt = PersistentTemporaryFile('_feeds2disk.html')
|
pt = PersistentTemporaryFile('_feeds2disk.html')
|
||||||
@ -620,7 +641,8 @@ class BasicNewsRecipe(object, LoggingInterface):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
|
func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
|
||||||
(self.fetch_article, url)
|
((self.fetch_obfuscated_article if self.articles_are_obfuscated \
|
||||||
|
else self.fetch_article), url)
|
||||||
req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)),
|
req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)),
|
||||||
{}, (f, a), self.article_downloaded,
|
{}, (f, a), self.article_downloaded,
|
||||||
self.error_in_article_download)
|
self.error_in_article_download)
|
||||||
|
@ -8,7 +8,7 @@ recipes = [
|
|||||||
'newsweek', 'atlantic', 'economist', 'portfolio',
|
'newsweek', 'atlantic', 'economist', 'portfolio',
|
||||||
'nytimes', 'usatoday', 'outlook_india', 'bbc', 'greader', 'wsj',
|
'nytimes', 'usatoday', 'outlook_india', 'bbc', 'greader', 'wsj',
|
||||||
'wired', 'globe_and_mail', 'smh', 'espn', 'business_week',
|
'wired', 'globe_and_mail', 'smh', 'espn', 'business_week',
|
||||||
'ars_technica', 'upi', 'new_yorker',
|
'ars_technica', 'upi', 'new_yorker', 'irish_times', 'iht',
|
||||||
]
|
]
|
||||||
|
|
||||||
import re, imp, inspect, time
|
import re, imp, inspect, time
|
||||||
|
51
src/calibre/web/feeds/recipes/iht.py
Normal file
51
src/calibre/web/feeds/recipes/iht.py
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2008, Derry FitzGerald'
|
||||||
|
'''
|
||||||
|
iht.com
|
||||||
|
'''
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
from calibre.ptempfile import PersistentTemporaryFile
|
||||||
|
|
||||||
|
|
||||||
|
class InternationalHeraldTribune(BasicNewsRecipe):
    '''
    Recipe for The International Herald Tribune (iht.com).

    Sets ``articles_are_obfuscated`` so that every article is obtained
    through :meth:`get_obfuscated_article`, which submits the
    ``printFriendly`` form found on the article page and saves the response
    to a local file.
    '''

    title = u'The International Herald Tribune'
    __author__ = 'Derry FitzGerald'
    oldest_article = 1
    max_articles_per_feed = 10
    no_stylesheets = True

    remove_tags = [dict(name='div', attrs={'class':'footer'})]
    # Extra CSS added to the downloaded HTML files.
    extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'

    # (section title, RSS feed URL) pairs.
    feeds = [
        (u'Frontpage', u'http://www.iht.com/rss/frontpage.xml'),
        (u'Business', u'http://www.iht.com/rss/business.xml'),
        (u'Americas', u'http://www.iht.com/rss/america.xml'),
        (u'Europe', u'http://www.iht.com/rss/europe.xml'),
        (u'Asia', u'http://www.iht.com/rss/asia.xml'),
        (u'Africa and Middle East', u'http://www.iht.com/rss/africa.xml'),
        (u'Opinion', u'http://www.iht.com/rss/opinion.xml'),
        (u'Technology', u'http://www.iht.com/rss/technology.xml'),
        (u'Health and Science', u'http://www.iht.com/rss/healthscience.xml'),
        (u'Sports', u'http://www.iht.com/rss/sports.xml'),
        (u'Culture', u'http://www.iht.com/rss/arts.xml'),
        (u'Style and Design', u'http://www.iht.com/rss/style.xml'),
        (u'Travel', u'http://www.iht.com/rss/travel.xml'),
        (u'At Home Abroad', u'http://www.iht.com/rss/athome.xml'),
        (u'Your Money', u'http://www.iht.com/rss/yourmoney.xml'),
        (u'Properties', u'http://www.iht.com/rss/properties.xml')
    ]
    # Every temporary file created by get_obfuscated_article is appended
    # here. NOTE(review): presumably this keeps the file objects referenced
    # for the duration of the download -- confirm against
    # PersistentTemporaryFile.
    temp_files = []
    articles_are_obfuscated = True

    def get_obfuscated_article(self, url, logger):
        '''
        Save the print-friendly version of an article to a temporary file.

        Opens ``url``, submits the form named ``printFriendly`` and writes
        the body of the response to a new temporary file.

        :param url: URL of the article page.
        :param logger: Logger supplied by the caller (not used here).
        :return: Name (path) of the temporary file holding the HTML.
        '''
        br = self.get_browser()
        br.open(url)
        br.select_form(name='printFriendly')
        res = br.submit()
        html = res.read()
        # Hold the file in a local variable rather than re-reading
        # self.temp_files[-1]. The list is shared, and article fetches are
        # queued as work requests that may run concurrently, so the last
        # element may belong to a different article by the time it is read.
        pt = PersistentTemporaryFile('_iht.html')
        self.temp_files.append(pt)
        try:
            pt.write(html)
        finally:
            # Close the file even when the write fails.
            pt.close()
        return pt.name
|
36
src/calibre/web/feeds/recipes/irish_times.py
Normal file
36
src/calibre/web/feeds/recipes/irish_times.py
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2008, Derry FitzGerald'
|
||||||
|
'''
|
||||||
|
irishtimes.com
|
||||||
|
'''
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class IrishTimes(BasicNewsRecipe):
    '''
    Recipe for The Irish Times (irishtimes.com).

    Lists the newspaper's RSS feeds and rewrites each article URL through
    :meth:`print_version`.
    '''

    __author__ = 'Derry FitzGerald'
    title = u'The Irish Times'

    no_stylesheets = True
    # Extra CSS added to the downloaded HTML files.
    extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
    remove_tags = [{'name': 'div', 'attrs': {'class': 'footer'}}]

    # (section title, RSS feed URL) pairs.
    feeds = [
        ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'),
        ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
        ('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
        ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
        ('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'),
        ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
        ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
        ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
        ('Health', 'http://www.irishtimes.com/feeds/rss/newspaper/health.rss'),
        ('Education and Parenting', 'http://www.irishtimes.com/feeds/rss/newspaper/education.rss'),
        ('Science Today', 'http://www.irishtimes.com/feeds/rss/newspaper/sciencetoday.rss'),
        ('The Ticket', 'http://www.irishtimes.com/feeds/rss/newspaper/theticket.rss'),
        ('Weekend', 'http://www.irishtimes.com/feeds/rss/newspaper/weekend.rss'),
        ('News Features', 'http://www.irishtimes.com/feeds/rss/newspaper/newsfeatures.rss'),
        ('Magazine', 'http://www.irishtimes.com/feeds/rss/newspaper/magazine.rss'),
    ]

    def print_version(self, url):
        '''
        Return ``url`` with every ``.html`` replaced by ``_pf.html``.

        NOTE(review): presumably ``_pf`` is the site's printer-friendly
        page variant -- confirm against the site.
        '''
        rewritten = url.replace('.html', '_pf.html')
        return rewritten
|
Loading…
x
Reference in New Issue
Block a user