
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
The backend to parse feeds and create HTML that can then be converted
to an ebook.
'''
import logging, os, cStringIO, time, traceback, re, urlparse
from collections import defaultdict
from libprs500 import browser, __appname__, iswindows
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from libprs500.ebooks.metadata.opf import OPFCreator
from libprs500.ebooks.lrf import entity_to_unicode
from libprs500.ebooks.metadata.toc import TOC
from libprs500.ebooks.metadata import MetaInformation
from libprs500.web.feeds import feed_from_xml, templates, feeds_from_index
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
from libprs500.web.fetch.simple import RecursiveFetcher
from libprs500.threadpool import WorkRequest, ThreadPool, NoResultsPending
from libprs500.ebooks.lrf.web.profiles import FullContentProfile
from libprs500.ptempfile import PersistentTemporaryFile
class BasicNewsRecipe(object):
'''
Abstract base class that contains logic needed in all feed fetchers.
'''
#: The title to use for the ebook
#: @type: string
title = _('Unknown News Source')
#: The author of this recipe
__author__ = __appname__
#: Maximum number of articles to download from each feed
#: @type: integer
max_articles_per_feed = 100
#: Oldest article to download from this news source. In days.
#: @type: float
oldest_article = 7.0
#: Number of levels of links to follow on webpages that are linked
#: to by the feed.
#: @type: integer
recursions = 0
#: Delay between consecutive downloads in seconds
#: @type: integer
delay = 0
#: Number of simultaneous downloads. Set to 1 if the server is picky.
#: Automatically reduced to 1 if L{delay} > 0
#: @type: integer
simultaneous_downloads = 5
#: Timeout for fetching files from server in seconds
#: @type: integer
timeout = 120
#: The format string for the date shown on the first page
#: By default: Day Name Day Number Month Name Year
#: @type: string
timefmt = ' [%a, %d %b %Y]'
#: List of feeds to download
#: Can be either C{[url1, url2, ...]} or C{[('title1', url1), ('title2', url2),...]}
#: @type: List of strings or list of 2-tuples
feeds = None
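# For example (the URLs below are purely illustrative):
# feeds = ['http://example.com/rss/news.xml']
# or, with explicit titles:
# feeds = [('Front Page', 'http://example.com/rss/frontpage.xml'),
#          ('Sports', 'http://example.com/rss/sports.xml')]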
#: Max number of characters in the short description.
#: @type: integer
summary_length = 500
#: If True stylesheets are not downloaded and processed
#: Convenient flag to disable loading of stylesheets for websites
#: that have overly complex stylesheets unsuitable for conversion
#: to ebook formats
#: @type: boolean
no_stylesheets = False
#: If True the GUI will ask the user for a username and password
#: to use while downloading
#: @type: boolean
needs_subscription = False
#: Specify an override encoding for sites that have an incorrect
#: charset specification. The most common case is a site declaring latin1
#: but actually using cp1252. If None, try to detect the encoding.
encoding = None
#: Normally we try to guess if a feed has full articles embedded in it
#: based on the length of the embedded content. If C{None}, then the
#: default guessing is used. If C{True} then we always assume the feed has
#: embedded content and if C{False} we always assume the feed does not have
#: embedded content.
use_embedded_content = None
#: Specify any extra CSS that should be added to downloaded HTML files
#: It will be inserted into C{<style></style>} just before the closing
#: C{</head>} tag thereby overriding all CSS except that which is
#: declared using the style attribute on individual HTML tags.
#: @type: string
extra_css = None
#: List of regular expressions that determines which links to follow
#: If empty, it is ignored.
#: Only one of L{match_regexps} or L{filter_regexps} should be defined
#: @type: list of strings
match_regexps = []
#: List of regular expressions that determines which links to ignore
#: If empty it is ignored
#: Only one of L{match_regexps} or L{filter_regexps} should be defined
#: @type: list of strings
filter_regexps = []
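# For example, to follow only article pages (the URL pattern is illustrative):
# match_regexps = [r'example\.com/\d{4}/\d{2}/\d{2}/.+']
# or, to skip unwanted pages such as video links:
# filter_regexps = [r'example\.com/video/.+']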
#: List of options to pass to html2lrf, to customize generation of LRF ebooks.
#: @type: list of strings
html2lrf_options = []
#: List of tags to be removed. Specified tags are removed from downloaded HTML.
#: A tag is specified as a dictionary of the form::
#: {
#: name : 'tag name', #e.g. 'div'
#: attrs : a dictionary, #e.g. {'class': 'advertisement'}
#: }
#: All keys are optional. For a full explanation of the search criteria, see
#: U{http://www.crummy.com/software/BeautifulSoup/documentation.html#The basic find method: findAll(name, attrs, recursive, text, limit, **kwargs)}
#: A common example::
#: remove_tags = [dict(name='div', attrs={'class':'advert'})]
#: This will remove all <div class="advert"> tags and all their children from the downloaded HTML.
#: @type: list
remove_tags = []
#: Remove all tags that occur after the specified tag.
#: For the format for specifying a tag see L{remove_tags}.
#: For example, C{remove_tags_after = [dict(id='content')]} will remove all
#: tags after the element with id C{content}.
remove_tags_after = None
#: Remove all tags that occur before the specified tag.
#: For the format for specifying a tag see L{remove_tags}.
#: For example, C{remove_tags_before = [dict(id='content')]} will remove all
#: tags before the element with id C{content}.
remove_tags_before = None
#: Keep only the specified tags and their children.
#: For the format for specifying tags see L{remove_tags}.
#: If this list is not empty, then the <body> element will be emptied and re-filled with
#: the tags that match the entries in this list.
#: @type: list
keep_only_tags = []
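# For example, to keep only the main article container (the id is illustrative):
# keep_only_tags = [dict(name='div', attrs={'id':'article-body'})]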
#: List of regexp substitution rules to run on the downloaded HTML. Each element of the
#: list should be a two element tuple. The first element of the tuple should
#: be a compiled regular expression and the second a callable that takes
#: a single match object and returns a string to replace the match.
#: @type: list of tuples
preprocess_regexps = []
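# A minimal sketch of the expected shape (pattern and replacement are
# illustrative only); this one would strip HTML comments from the source:
# preprocess_regexps = [
#     (re.compile(r'<!--.*?-->', re.DOTALL), lambda match: ''),
# ]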
# See the built-in profiles for examples of these settings.
def get_cover_url(self):
'''
Return a URL to the cover image for this issue or None.
@rtype: string or None
'''
return getattr(self, 'cover_url', None)
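# Instead of overriding get_cover_url, a subclass can simply set a class
# attribute, e.g. (illustrative URL):
# cover_url = 'http://example.com/images/current_cover.jpg'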
def get_feeds(self):
'''
Return a list of RSS feeds to fetch for this profile. Each element of the list
must be a 2-element tuple of the form (title, url). If title is None or an
empty string, the title from the feed is used.
'''
if not self.feeds:
raise NotImplementedError
if self.test:
return self.feeds[:2]
return self.feeds
@classmethod
def print_version(cls, url):
'''
Take a URL pointing to an article and return the URL pointing to the
print version of the article.
'''
raise NotImplementedError
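# A hypothetical override of print_version in a subclass (the URL scheme is
# made up, purely to show the idea):
#
# @classmethod
# def print_version(cls, url):
#     return url.replace('/article/', '/printable/article/')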
@classmethod
def get_browser(cls):
'''
Return a browser instance used to fetch documents from the web.
If your profile requires that you login first, override this method
in your subclass. See for example the nytimes profile.
'''
return browser()
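# A sketch of a login-aware override, assuming the site uses a simple HTML
# login form and that the object returned by browser() supports the usual
# mechanize form API (the URL and field names are illustrative). It is written
# as an instance method so that self.username/self.password are available:
#
# def get_browser(self):
#     br = browser()
#     if self.username is not None and self.password is not None:
#         br.open('http://example.com/login')
#         br.select_form(nr=0)
#         br['username'] = self.username
#         br['password'] = self.password
#         br.submit()
#     return br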
def get_article_url(self, item):
'''
Override to perform extraction of URL for each article.
@param item: An article instance from L{feedparser}.
@type item: L{FeedParserDict}
'''
return item.get('link', None)
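# For example, some feeds put the canonical article URL in the guid element,
# which feedparser exposes as 'id'; a hypothetical override could prefer it:
#
# def get_article_url(self, item):
#     return item.get('id', item.get('link', None))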
def preprocess_html(self, soup):
'''
This function is called with the source of each downloaded HTML file, before
it is parsed for links and images.
It can be used to do arbitrarily powerful pre-processing on the HTML.
@param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
instance containing the downloaded HTML.
@type soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
@return: It must return soup (after having done any needed preprocessing)
@rtype: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
'''
return soup
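# A small illustration of the kind of cleanup that can be done here (removing
# script tags is just an example):
#
# def preprocess_html(self, soup):
#     for script in soup.findAll('script'):
#         script.extract()
#     return soup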
def postprocess_html(self, soup):
'''
This function is called with the source of each downloaded HTML file, after
it is parsed for links and images.
It can be used to do arbitrarily powerful post-processing on the HTML.
@param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
instance containing the downloaded HTML.
@type soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
@return: It must return soup (after having done any needed post-processing)
@rtype: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
'''
return soup
def cleanup(self):
'''
Called after all articles have been downloaded. Use it to do any cleanup like
logging out of subscription sites, etc.
'''
pass
def index_to_soup(self, url_or_raw):
'''
Convenience method that takes a URL to the index page and returns
a BeautifulSoup of it.
@param url_or_raw: Either a URL or the downloaded index page as a string
'''
if re.match(r'\w+://', url_or_raw):
raw = self.browser.open(url_or_raw).read()
else:
raw = url_or_raw
if not isinstance(raw, unicode) and self.encoding:
raw = raw.decode(self.encoding)
raw = re.sub(r'&(\S+?);',
lambda match: entity_to_unicode(match, encoding=self.encoding),
raw)
return BeautifulSoup(raw)
def sort_index_by(self, index, weights):
'''
Convenience method to sort the titles in index according to weights.
@param index: A list of titles.
@param weights: A dictionary that maps titles to weights. If any titles
in index are not in weights, they are assumed to have a weight of 0.
@return: Sorted index
'''
weights = defaultdict(lambda : 0, weights)
index.sort(cmp=lambda x, y: cmp(weights[x], weights[y]))
return index
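# For example, to pull a couple of sections (the titles are illustrative) to
# the front of the index, lower weights sorting first:
#
# index = self.sort_index_by(index, {'The Front Page': -2, 'Dispatches': -1})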
def parse_index(self):
'''
This method should be implemented in recipes that parse a website
instead of feeds to generate a list of articles. Typical uses are for
news sources that have a "Print Edition" webpage that lists all the
articles in the current print edition. If this function is implemented,
it will be used in preference to L{parse_feeds}.
@rtype: list
@return: A list of two element tuples of the form ('feed title', list of articles).
Each list of articles contains dictionaries of the form::
{
'title' : article title,
'url' : URL of print version,
'date' : The publication date of the article as a string,
'description' : A summary of the article
'content' : The full article (can be an empty string). This is used by FullContentProfile
}
'''
raise NotImplementedError
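# A minimal sketch of an implementation (the URL and the markup it assumes
# are illustrative only):
#
# def parse_index(self):
#     soup = self.index_to_soup('http://example.com/print_edition')
#     articles = []
#     for a in soup.findAll('a', attrs={'class':'headline'}):
#         articles.append({
#             'title'       : self.tag_to_string(a),
#             'url'         : a['href'],
#             'date'        : '',
#             'description' : '',
#             'content'     : '',
#         })
#     return [('Print Edition', articles)]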
def __init__(self, options, parser, progress_reporter):
'''
Initialize the recipe.
@param options: Parsed commandline options
@param parser: Command line option parser. Used to intelligently merge options.
@param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
'''
for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug', 'test'):
setattr(self, attr, getattr(options, attr))
self.output_dir = os.path.abspath(self.output_dir)
if options.test:
self.max_articles_per_feed = 2
self.simultaneous_downloads = min(4, self.simultaneous_downloads)
self.logger = logging.getLogger('feeds2disk')
if self.debug:
self.logger.setLevel(logging.DEBUG)
self.verbose = True
self.report_progress = progress_reporter
self.username = self.password = None
#: If True optimize downloading for eventual conversion to LRF
self.lrf = False
defaults = parser.get_default_values()
for opt in options.__dict__.keys():
if getattr(options, opt) != getattr(defaults, opt):
setattr(self, opt, getattr(options, opt))
if isinstance(self.feeds, basestring):
self.feeds = eval(self.feeds)
if isinstance(self.feeds, basestring):
self.feeds = [self.feeds]
if self.needs_subscription and (self.username is None or self.password is None):
raise ValueError('The %s recipe needs a username and password.'%self.title)
self.browser = self.get_browser()
self.image_map, self.image_counter = {}, 1
self.css_map = {}
web2disk_cmdline = [ 'web2disk',
'--timeout', str(self.timeout),
'--max-recursions', str(self.recursions),
'--delay', str(self.delay),
]
if self.encoding is not None:
web2disk_cmdline.extend(['--encoding', self.encoding])
if self.verbose:
web2disk_cmdline.append('--verbose')
if self.no_stylesheets:
web2disk_cmdline.append('--dont-download-stylesheets')
for reg in self.match_regexps:
web2disk_cmdline.extend(['--match-regexp', reg])
for reg in self.filter_regexps:
web2disk_cmdline.extend(['--filter-regexp', reg])
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
setattr(self.web2disk_options, extra, getattr(self, extra))
self.web2disk_options.postprocess_html = self._postprocess_html
if self.delay > 0:
self.simultaneous_downloads = 1
self.navbar = templates.NavBarTemplate()
self.html2lrf_options.extend(['--page-break-before', '$', '--use-spine', '--header'])
self.failed_downloads = []
self.partial_failures = []
def _postprocess_html(self, soup, first_fetch, job_info):
if self.extra_css is not None:
head = soup.find('head')
if head:
style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
head.insert(len(head.contents), style)
if first_fetch and job_info:
url, f, a, feed_len = job_info
body = soup.find('body')
if body is not None:
templ = self.navbar.generate(False, f, a, feed_len,
not self.has_single_feed,
url, __appname__)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(0, elem)
return self.postprocess_html(soup)
def download(self):
'''
Download and pre-process all articles from the feeds in this recipe.
This method should be called only once on a particular Recipe instance.
Calling it more than once will lead to undefined behavior.
@return: Path to index.html
@rtype: string
'''
res = self.build_index()
self.cleanup()
self.report_progress(1, _('Download finished'))
if self.failed_downloads:
self.logger.warning(_('Failed to download the following articles:'))
for feed, article, debug in self.failed_downloads:
self.logger.warning(article.title+_(' from ')+feed.title)
self.logger.debug(article.url)
self.logger.debug(debug)
if self.partial_failures:
self.logger.warning(_('Failed to download parts of the following articles:'))
for feed, atitle, aurl, debug in self.partial_failures:
self.logger.warning(atitle + _(' from ') + feed)
self.logger.debug(aurl)
self.logger.warning(_('\tFailed links:'))
for l, tb in debug:
self.logger.warning(l)
self.logger.debug(tb)
return res
def feeds2index(self, feeds):
templ = templates.IndexTemplate()
return templ.generate(self.title, self.timefmt, feeds).render(doctype='xhtml')
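# description_limiter truncates an article summary at summary_length
# characters, extending up to 50 characters further to the next ';' or '>'
# so that an HTML entity or tag is not cut in half, then appends an ellipsis.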
@classmethod
def description_limiter(cls, src):
pos = cls.summary_length
fuzz = 50
si = src.find(';', pos)
if si > 0 and si-pos > fuzz:
si = -1
gi = src.find('>', pos)
if gi > 0 and gi-pos > fuzz:
gi = -1
npos = max(si, gi)
if npos < 0:
npos = pos
return src[:npos+1]+u'\u2026'
def feed2index(self, feed):
if feed.image_url is not None: # Download feed image
imgdir = os.path.join(self.output_dir, 'images')
if not os.path.isdir(imgdir):
os.makedirs(imgdir)
if self.image_map.has_key(feed.image_url):
feed.image_url = self.image_map[feed.image_url]
else:
bn = urlparse.urlsplit(feed.image_url).path
if bn:
bn = bn.rpartition('/')[-1]
if bn:
img = os.path.join(imgdir, 'feed_image_%d%s'%(self.image_counter, os.path.splitext(bn)[1]))
open(img, 'wb').write(self.browser.open(feed.image_url).read())
self.image_counter += 1
self.image_map[feed.image_url] = img # cache under the original URL so it is not re-fetched
feed.image_url = img
templ = templates.FeedTemplate()
return templ.generate(feed, self.description_limiter).render(doctype='xhtml')
def create_logger(self, feed_number, article_number):
logger = logging.getLogger('feeds2disk.article_%d_%d'%(feed_number, article_number))
out = cStringIO.StringIO()
handler = logging.StreamHandler(out)
handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
handler.setLevel(logging.INFO if self.verbose else logging.WARNING)
if self.debug:
handler.setLevel(logging.DEBUG)
logger.addHandler(handler)
return logger, out
def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
fetcher.base_dir = dir
fetcher.current_dir = dir
fetcher.show_progress = False
res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
if not res or not os.path.exists(res):
raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
return res, path, failures
def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
pt = PersistentTemporaryFile('_feeds2disk.html')
templ = templates.EmbeddedContent()
raw = templ.generate(article).render('html')
open(pt.name, 'wb').write(raw)
pt.close()
url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
def build_index(self):
self.report_progress(0, _('Fetching feeds...'))
try:
feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed)
self.report_progress(0, _('Got feeds from index page'))
except NotImplementedError:
feeds = self.parse_feeds()
self.report_progress(0, _('Trying to download cover...'))
self.download_cover()
if self.test:
feeds = feeds[:2]
self.has_single_feed = len(feeds) == 1
if self.use_embedded_content is None:
self.use_embedded_content = feeds[0].has_embedded_content()
index = os.path.join(self.output_dir, 'index.html')
html = self.feeds2index(feeds)
open(index, 'wb').write(html)
self.jobs = []
for f, feed in enumerate(feeds):
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
if not os.path.isdir(feed_dir):
os.makedirs(feed_dir)
for a, article in enumerate(feed):
if a >= self.max_articles_per_feed:
break
art_dir = os.path.join(feed_dir, 'article_%d'%a)
if not os.path.isdir(art_dir):
os.makedirs(art_dir)
logger, stream = self.create_logger(f, a)
try:
url = self.print_version(article.url)
except NotImplementedError:
url = article.url
func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
(self.fetch_article, url)
req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)),
{}, (f, a), self.article_downloaded,
self.error_in_article_download)
req.stream = stream
req.feed = feed
req.article = article
req.feed_dir = feed_dir
self.jobs.append(req)
self.jobs_done = 0
tp = ThreadPool(self.simultaneous_downloads)
for req in self.jobs:
tp.putRequest(req, block=True, timeout=0)
self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
while True:
try:
tp.poll()
time.sleep(0.1)
except NoResultsPending:
break
for f, feed in enumerate(feeds):
html = self.feed2index(feed)
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
open(os.path.join(feed_dir, 'index.html'), 'wb').write(html)
self.create_opf(feeds)
self.report_progress(1, _('Feeds downloaded to %s')%index)
return index
def download_cover(self):
self.cover_path = None
try:
cu = self.get_cover_url()
except Exception, err:
cu = None
self.logger.error(_('Could not download cover: %s')%str(err))
self.logger.debug(traceback.format_exc())
if cu is not None:
ext = cu.rpartition('.')[-1]
ext = ext.lower() if ext else 'jpg'
self.report_progress(1, _('Downloading cover from %s')%cu)
cpath = os.path.join(self.output_dir, 'cover.'+ext)
cfile = open(cpath, 'wb')
cfile.write(self.browser.open(cu).read())
self.cover_path = cpath
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
opf = OPFCreator(dir, mi)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
cpath = getattr(self, 'cover_path', None)
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
opf.create_manifest_from_files_in(manifest)
entries = ['index.html']
toc = TOC(base_path=dir)
def feed_index(num, parent):
f = feeds[num]
for j, a in enumerate(f):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(num, j)
entries.append('%sindex.html'%adir)
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
relp = sp[len(prefix):]
entries.append(relp.replace(os.sep, '/'))
last = sp
src = open(last, 'rb').read()
soup = BeautifulSoup(src)
body = soup.find('body')
if body is not None:
prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, __appname__, prefix=prefix)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
open(last, 'wb').write(unicode(soup).encode('utf-8'))
if len(feeds) > 1:
for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i)
feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title))
else:
entries.append('feed_%d/index.html'%0)
feed_index(0, toc)
opf.create_spine(entries)
opf.set_toc(toc)
opf.render(open(opf_path, 'wb'), open(ncx_path, 'wb'))
def article_downloaded(self, request, result):
index = os.path.join(os.path.dirname(result[0]), 'index.html')
if index != result[0]:
os.rename(result[0], index)
a = request.requestID[1]
article = request.article
self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
article.orig_url = article.url
article.url = 'article_%d/index.html'%a
article.downloaded = True
article.sub_pages = result[1][1:]
self.jobs_done += 1
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article downloaded: %s')%article.title)
if result[2]:
self.partial_failures.append((request.feed.title, article.title, article.url, result[2]))
def error_in_article_download(self, request, traceback):
self.jobs_done += 1
self.logger.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
debug = request.stream.getvalue().decode('utf-8', 'ignore')
self.logger.debug(debug)
self.logger.debug(traceback)
self.logger.debug('\n')
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
self.failed_downloads.append((request.feed, request.article, debug))
def parse_feeds(self):
'''
Create a list of articles from a list of feeds.
@rtype: list
@return: A list of L{Feed}s.
'''
feeds = self.get_feeds()
parsed_feeds = []
for obj in feeds:
if isinstance(obj, basestring):
title, url = None, obj
else:
title, url = obj
self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url))
parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
title=title,
oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
get_article_url=self.get_article_url))
return parsed_feeds
@classmethod
def tag_to_string(cls, tag, use_alt=True):
'''
Convenience method to take a BeautifulSoup Tag and extract the text from it
recursively, including any CDATA sections and alt tag attributes.
@param use_alt: If True try to use the alt attribute for tags that don't have any textual content
@type use_alt: boolean
@return: A unicode (possibly empty) object
@rtype: unicode string
'''
if not tag:
return ''
if isinstance(tag, basestring):
return tag
strings = []
for item in tag.contents:
if isinstance(item, (NavigableString, CData)):
strings.append(item.string)
elif isinstance(item, Tag):
res = cls.tag_to_string(item)
if res:
strings.append(res)
elif use_alt and item.has_key('alt'):
strings.append(item['alt'])
return u''.join(strings)
class Profile2Recipe(BasicNewsRecipe):
'''
Used to migrate the old news Profiles to the new Recipes. Uses the settings
from the old Profile to populate the settings in the Recipe. Also uses the
Profile's get_browser and parse_feeds.
'''
def __init__(self, profile_class, options, parser, progress_reporter):
self.old_profile = profile_class(logging.getLogger('feeds2disk'),
username=options.username,
password=options.password,
lrf=options.lrf)
for attr in ('preprocess_regexps', 'oldest_article', 'delay', 'timeout',
'match_regexps', 'filter_regexps', 'html2lrf_options',
'timefmt', 'needs_subscription', 'summary_length',
'max_articles_per_feed', 'title','no_stylesheets', 'encoding'):
setattr(self, attr, getattr(self.old_profile, attr))
self.simultaneous_downloads = 1
BasicNewsRecipe.__init__(self, options, parser, progress_reporter)
self.browser = self.old_profile.browser
self.use_embedded_content = isinstance(self.old_profile, FullContentProfile)
def parse_index(self):
feeds = []
for key, val in self.old_profile.parse_feeds().items():
feeds.append((key, val))
return feeds
class CustomIndexRecipe(BasicNewsRecipe):
def custom_index(self):
'''
Return the path to a custom HTML document that will serve as the index for
this recipe.
@rtype: string
'''
raise NotImplementedError
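# A minimal sketch of an implementation, writing a hand-built index page to a
# persistent temporary file, mirroring how fetch_embedded_article uses
# PersistentTemporaryFile (the HTML content is illustrative):
#
# def custom_index(self):
#     pt = PersistentTemporaryFile(suffix='_custom_index.html')
#     open(pt.name, 'wb').write('<html><body><h1>Custom index</h1></body></html>')
#     pt.close()
#     return pt.name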
def create_opf(self):
mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
mi = OPFCreator(self.output_dir, mi)
mi.create_manifest_from_files_in([self.output_dir])
mi.create_spine(['index.html'])
mi.render(open(os.path.join(self.output_dir, 'index.opf'), 'wb'))
def download(self):
index = os.path.abspath(self.custom_index())
url = 'file:'+index if iswindows else 'file://'+index
fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
fetcher.base_dir = self.output_dir
fetcher.current_dir = self.output_dir
fetcher.show_progress = False
res = fetcher.start_fetch(url)
self.create_opf()
return res