#!/usr/bin/env python
##    Copyright (C) 2008 Kovid Goyal <kovid@kovidgoyal.net>
##    This program is free software; you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation; either version 2 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License along
##    with this program; if not, write to the Free Software Foundation, Inc.,
##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
The backend to parse feeds and create HTML that can then be converted
to an ebook.
'''
import logging, os, cStringIO, time, traceback, re, urlparse
from collections import defaultdict

from libprs500 import browser, __appname__, iswindows
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from libprs500.ebooks.metadata.opf import OPFCreator
from libprs500.ebooks.lrf import entity_to_unicode
from libprs500.ebooks.metadata.toc import TOC
from libprs500.ebooks.metadata import MetaInformation
from libprs500.web.feeds import feed_from_xml, templates, feeds_from_index
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
from libprs500.web.fetch.simple import RecursiveFetcher
from libprs500.threadpool import WorkRequest, ThreadPool, NoResultsPending
from libprs500.ebooks.lrf.web.profiles import FullContentProfile
from libprs500.ptempfile import PersistentTemporaryFile


class BasicNewsRecipe(object):
    '''
    Abstract base class that contains logic needed in all feed fetchers.
    '''

    #: The title to use for the ebook
    #: @type: string
    title = _('Unknown News Source')

    #: The author of this recipe
    __author__ = __appname__

    #: Maximum number of articles to download from each feed
    #: @type: integer
    max_articles_per_feed = 100

    #: Oldest article to download from this news source. In days.
    #: @type: float
    oldest_article = 7.0

    #: Number of levels of links to follow on webpages that are linked
    #: to by the feed.
    #: @type: integer
    recursions = 0

    #: Delay between consecutive downloads in seconds
    #: @type: integer
    delay = 0

    #: Number of simultaneous downloads. Set to 1 if the server is picky.
    #: Automatically reduced to 1 if L{delay} > 0
    #: @type: integer
    simultaneous_downloads = 5

    #: Timeout for fetching files from server in seconds
    #: @type: integer
    timeout = 120

    #: The format string for the date shown on the first page
    #: By default: Day Name Day Number Month Name Year
    #: @type: string
    timefmt = ' [%a, %d %b %Y]'

    #: List of feeds to download
    #: Can be either C{[url1, url2, ...]} or C{[('title1', url1), ('title2', url2),...]}
    #: @type: List of strings or list of 2-tuples
    feeds = None
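
    # Illustrative only (the URLs below are hypothetical placeholders): the two
    # accepted forms of the C{feeds} setting look like::
    #
    #   feeds = ['http://example.com/front.rss',
    #            'http://example.com/sports.rss']
    #
    #   feeds = [('Front Page', 'http://example.com/front.rss'),
    #            ('Sports',     'http://example.com/sports.rss')]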

    #: Max number of characters in the short description.
    #: @type: integer
    summary_length = 500

    #: If True stylesheets are not downloaded and processed
    #: Convenient flag to disable loading of stylesheets for websites
    #: that have overly complex stylesheets unsuitable for conversion
    #: to ebook formats
    #: @type: boolean
    no_stylesheets = False

    #: If True the GUI will ask the user for a username and password
    #: to use while downloading
    #: @type: boolean
    needs_subscription = False

    #: Specify an override encoding for sites that have an incorrect
    #: charset specification. The most common case is a site that declares
    #: latin1 but actually uses cp1252. If None, try to detect the encoding.
    encoding = None

    #: Normally we try to guess if a feed has full articles embedded in it
    #: based on the length of the embedded content. If C{None}, then the
    #: default guessing is used. If C{True} then we always assume the feed has
    #: embedded content and if C{False} we always assume the feed does not have
    #: embedded content.
    use_embedded_content = None

    #: Specify any extra CSS that should be added to downloaded HTML files
    #: It will be inserted into C{<style></style>} just before the closing
    #: C{</head>} tag thereby overriding all CSS except that which is
    #: declared using the style attribute on individual HTML tags.
    #: @type: string
    extra_css = None

    #: List of regular expressions that determines which links to follow
    #: If empty, it is ignored.
    #: Only one of L{match_regexps} or L{filter_regexps} should be defined
    #: @type: list of strings
    match_regexps = []

    #: List of regular expressions that determines which links to ignore
    #: If empty it is ignored
    #: Only one of L{match_regexps} or L{filter_regexps} should be defined
    #: @type: list of strings
    filter_regexps = []
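
    # Illustrative only (the pattern is hypothetical): to restrict recursive
    # fetching to multi-page article links such as
    # http://example.com/article?page=2 a recipe could use::
    #
    #   match_regexps = [r'page=[0-9]+']
    #
    # filter_regexps works the same way but excludes matching links instead.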

    #: List of options to pass to html2lrf, to customize generation of LRF ebooks.
    #: @type: list of strings
    html2lrf_options = []

    #: List of tags to be removed. Specified tags are removed from downloaded HTML.
    #: A tag is specified as a dictionary of the form::
    #:   {
    #:    name  : 'tag name',     #e.g. 'div'
    #:    attrs : a dictionary,   #e.g. {'class': 'advertisement'}
    #:   }
    #: All keys are optional. For a full explanation of the search criteria, see
    #: U{http://www.crummy.com/software/BeautifulSoup/documentation.html#The basic find method: findAll(name, attrs, recursive, text, limit, **kwargs)}
    #: A common example::
    #:   remove_tags = [dict(name='div', attrs={'class':'advert'})]
    #: This will remove all <div class="advert"> tags and all their children from the downloaded HTML.
    #: @type: list
    remove_tags = []

    #: Remove all tags that occur after the specified tag.
    #: For the format for specifying a tag see L{remove_tags}.
    #: For example, C{remove_tags_after = [dict(id='content')]} will remove all
    #: tags after the element with id C{content}.
    remove_tags_after = None

    #: Remove all tags that occur before the specified tag.
    #: For the format for specifying a tag see L{remove_tags}.
    #: For example, C{remove_tags_before = [dict(id='content')]} will remove all
    #: tags before the element with id C{content}.
    remove_tags_before = None

    #: Keep only the specified tags and their children.
    #: For the format for specifying tags see L{remove_tags}.
    #: If this list is not empty, then the <body> element will be emptied and re-filled with
    #: the tags that match the entries in this list.
    #: @type: list
    keep_only_tags = []
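
    # Illustrative only (the 'article-body' class name is hypothetical): keeping
    # just the main article container and discarding everything else in <body>
    # could look like::
    #
    #   keep_only_tags = [dict(name='div', attrs={'class':'article-body'})]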

    #: List of regexp substitution rules to run on the downloaded HTML. Each element of the
    #: list should be a two element tuple. The first element of the tuple should
    #: be a compiled regular expression and the second a callable that takes
    #: a single match object and returns a string to replace the match.
    #: @type: list of tuples
    preprocess_regexps = []
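
    # A minimal illustrative rule (the HTML comment marker is hypothetical):
    # discard everything from a known end-of-article marker to the end of
    # <body>::
    #
    #   preprocess_regexps = [
    #       (re.compile(r'<!--Article ends here-->.*</body>', re.DOTALL),
    #        lambda match: '</body>'),
    #   ]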

    # See the built-in profiles for examples of these settings.

    def get_cover_url(self):
        '''
        Return a URL to the cover image for this issue or None.
        @rtype: string or None
        '''
        return getattr(self, 'cover_url', None)
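
    # A subclass can either set a C{cover_url} attribute directly or override
    # this method. Illustrative only (the URL is a hypothetical placeholder)::
    #
    #   cover_url = 'http://example.com/covers/current.jpg'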

    def get_feeds(self):
        '''
        Return a list of RSS feeds to fetch for this profile. Each element of the list
        must be a 2-element tuple of the form (title, url). If title is None or an
        empty string, the title from the feed is used.
        '''
        if not self.feeds:
            raise NotImplementedError
        if self.test:
            return self.feeds[:2]
        return self.feeds

    @classmethod
    def print_version(cls, url):
        '''
        Take a URL pointing to an article and return the URL pointing to the
        print version of the article.
        '''
        raise NotImplementedError
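
    # Illustrative only (the URL scheme is hypothetical): a typical override
    # rewrites the article URL into the site's printer-friendly form::
    #
    #   @classmethod
    #   def print_version(cls, url):
    #       return url.replace('/article/', '/article/print/')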

    @classmethod
    def get_browser(cls):
        '''
        Return a browser instance used to fetch documents from the web.

        If your profile requires that you login first, override this method
        in your subclass. See for example the nytimes profile.
        '''
        return browser()
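
    # A hedged sketch of a login override. It assumes, as the nytimes profile
    # does, that browser() returns a mechanize-style browser; the login URL and
    # form/field names below are hypothetical placeholders::
    #
    #   def get_browser(self):
    #       br = BasicNewsRecipe.get_browser()
    #       if self.username is not None and self.password is not None:
    #           br.open('http://example.com/login')
    #           br.select_form(name='login')
    #           br['username'] = self.username
    #           br['password'] = self.password
    #           br.submit()
    #       return br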

    def get_article_url(self, item):
        '''
        Override to perform extraction of URL for each article.
        @param item: An article instance from L{feedparser}.
        @type item: L{FeedParserDict}
        '''
        return item.get('link', None)
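
    # Illustrative only: feeds served through FeedBurner often carry the real
    # article URL in a separate entry key, so an override might prefer it when
    # present (key name as exposed by feedparser; verify for your feed)::
    #
    #   def get_article_url(self, item):
    #       return item.get('feedburner_origlink', item.get('link', None))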

    def preprocess_html(self, soup):
        '''
        This function is called with the source of each downloaded HTML file, before
        it is parsed for links and images.
        It can be used to do arbitrarily powerful pre-processing on the HTML.
        @param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
                     instance containing the downloaded HTML.
        @type soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
        @return: It must return soup (after having done any needed preprocessing)
        @rtype: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
        '''
        return soup
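
    # A minimal illustrative override (only BeautifulSoup's findAll/extract are
    # assumed): strip inline <script> elements before link parsing::
    #
    #   def preprocess_html(self, soup):
    #       for script in soup.findAll('script'):
    #           script.extract()
    #       return soup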

    def postprocess_html(self, soup):
        '''
        This function is called with the source of each downloaded HTML file, after
        it is parsed for links and images.
        It can be used to do arbitrarily powerful post-processing on the HTML.
        @param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
                     instance containing the downloaded HTML.
        @type soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
        @return: It must return soup (after having done any needed post-processing)
        @rtype: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
        '''
        return soup

    def cleanup(self):
        '''
        Called after all articles have been downloaded. Use it to do any cleanup like
        logging out of subscription sites, etc.
        '''
        pass

    def index_to_soup(self, url_or_raw):
        '''
        Convenience method that takes a URL to the index page and returns
        a BeautifulSoup of it.
        @param url_or_raw: Either a URL or the downloaded index page as a string
        '''
        if re.match(r'\w+://', url_or_raw):
            raw = self.browser.open(url_or_raw).read()
        else:
            raw = url_or_raw
        if not isinstance(raw, unicode) and self.encoding:
            raw = raw.decode(self.encoding)
        raw = re.sub(r'&(\S+?);',
                     lambda match: entity_to_unicode(match, encoding=self.encoding),
                     raw)
        return BeautifulSoup(raw)

    def sort_index_by(self, index, weights):
        '''
        Convenience method to sort the titles in index according to weights.
        @param index: A list of titles.
        @param weights: A dictionary that maps titles to weights. If any titles
                        in index are not in weights, they are assumed to have a weight of 0.
        @return: Sorted index
        '''
        weights = defaultdict(lambda : 0, weights)
        index.sort(cmp=lambda x, y: cmp(weights[x], weights[y]))
        return index
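
    # Illustrative usage (section names are hypothetical): pull 'World' and
    # 'Sports' to the front of the index by giving them the lowest weights::
    #
    #   index = self.sort_index_by(index, {'World': -2, 'Sports': -1})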

    def parse_index(self):
        '''
        This method should be implemented in recipes that parse a website
        instead of feeds to generate a list of articles. Typical uses are for
        news sources that have a "Print Edition" webpage that lists all the
        articles in the current print edition. If this function is implemented,
        it will be used in preference to L{parse_feeds}.
        @rtype: list
        @return: A list of two element tuples of the form ('feed title', list of articles).
                 Each list of articles contains dictionaries of the form::
                    {
                     'title'       : article title,
                     'url'         : URL of print version,
                     'date'        : The publication date of the article as a string,
                     'description' : A summary of the article,
                     'content'     : The full article (can be an empty string). This is used by FullContentProfile
                    }
        '''
        raise NotImplementedError
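
    # A hedged skeleton of a parse_index implementation. Everything below is
    # hypothetical (the index URL, the 'headline' class and the page layout);
    # it only shows the documented return structure being built with
    # index_to_soup and tag_to_string::
    #
    #   def parse_index(self):
    #       soup = self.index_to_soup('http://example.com/print_edition')
    #       articles = []
    #       for a in soup.findAll('a', attrs={'class': 'headline'}):
    #           articles.append({
    #               'title'       : self.tag_to_string(a),
    #               'url'         : a['href'],
    #               'date'        : '',
    #               'description' : '',
    #               'content'     : '',
    #           })
    #       return [('Print Edition', articles)]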

    def __init__(self, options, parser, progress_reporter):
        '''
        Initialize the recipe.
        @param options: Parsed commandline options
        @param parser: Command line option parser. Used to intelligently merge options.
        @param progress_reporter: A callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
        '''
        for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug', 'test'):
            setattr(self, attr, getattr(options, attr))
        self.output_dir = os.path.abspath(self.output_dir)
        if options.test:
            self.max_articles_per_feed = 2
            self.simultaneous_downloads = min(4, self.simultaneous_downloads)

        self.logger = logging.getLogger('feeds2disk')

        if self.debug:
            self.logger.setLevel(logging.DEBUG)
            self.verbose = True
        self.report_progress = progress_reporter

        self.username = self.password = None
        #: If True optimize downloading for eventual conversion to LRF
        self.lrf = False
        defaults = parser.get_default_values()

        for opt in options.__dict__.keys():
            if getattr(options, opt) != getattr(defaults, opt):
                setattr(self, opt, getattr(options, opt))

        if isinstance(self.feeds, basestring):
            self.feeds = eval(self.feeds)
            if isinstance(self.feeds, basestring):
                self.feeds = [self.feeds]

        if self.needs_subscription and (self.username is None or self.password is None):
            raise ValueError('The %s recipe needs a username and password.'%self.title)

        self.browser = self.get_browser()
        self.image_map, self.image_counter = {}, 1
        self.css_map = {}

        web2disk_cmdline = ['web2disk',
                            '--timeout', str(self.timeout),
                            '--max-recursions', str(self.recursions),
                            '--delay', str(self.delay),
                           ]

        if self.encoding is not None:
            web2disk_cmdline.extend(['--encoding', self.encoding])

        if self.verbose:
            web2disk_cmdline.append('--verbose')

        if self.no_stylesheets:
            web2disk_cmdline.append('--dont-download-stylesheets')

        for reg in self.match_regexps:
            web2disk_cmdline.extend(['--match-regexp', reg])

        for reg in self.filter_regexps:
            web2disk_cmdline.extend(['--filter-regexp', reg])

        self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
        for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
                      'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
            setattr(self.web2disk_options, extra, getattr(self, extra))
        self.web2disk_options.postprocess_html = self._postprocess_html

        if self.delay > 0:
            self.simultaneous_downloads = 1

        self.navbar = templates.NavBarTemplate()
        self.html2lrf_options.extend(['--page-break-before', '$', '--use-spine', '--header'])
        self.failed_downloads = []
        self.partial_failures = []

    def _postprocess_html(self, soup, first_fetch, job_info):
        if self.extra_css is not None:
            head = soup.find('head')
            if head:
                style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
                head.insert(len(head.contents), style)
        if first_fetch and job_info:
            # Prepend the navigation bar to the first page of each article
            url, f, a, feed_len = job_info
            body = soup.find('body')
            if body is not None:
                templ = self.navbar.generate(False, f, a, feed_len,
                                             not self.has_single_feed,
                                             url, __appname__)
                elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                body.insert(0, elem)

        return self.postprocess_html(soup)

    def download(self):
        '''
        Download and pre-process all articles from the feeds in this recipe.
        This method should be called only once on a particular Recipe instance.
        Calling it more than once will lead to undefined behavior.
        @return: Path to index.html
        @rtype: string
        '''
        res = self.build_index()
        self.cleanup()
        self.report_progress(1, _('Download finished'))
        if self.failed_downloads:
            self.logger.warning(_('Failed to download the following articles:'))
            for feed, article, debug in self.failed_downloads:
                self.logger.warning(article.title+_(' from ')+feed.title)
                self.logger.debug(article.url)
                self.logger.debug(debug)
        if self.partial_failures:
            self.logger.warning(_('Failed to download parts of the following articles:'))
            for feed, atitle, aurl, debug in self.partial_failures:
                self.logger.warning(atitle + _(' from ') + feed)
                self.logger.debug(aurl)
                self.logger.warning(_('\tFailed links:'))
                for l, tb in debug:
                    self.logger.warning(l)
                    self.logger.debug(tb)
        return res

    def feeds2index(self, feeds):
        templ = templates.IndexTemplate()
        return templ.generate(self.title, self.timefmt, feeds).render(doctype='xhtml')

    @classmethod
    def description_limiter(cls, src):
        # Truncate the description near summary_length, preferring to cut just
        # after an entity (';') or tag ('>') boundary that lies within fuzz
        # characters of the limit, and append an ellipsis.
        pos = cls.summary_length
        fuzz = 50
        si = src.find(';', pos)
        if si > 0 and si-pos > fuzz:
            si = -1
        gi = src.find('>', pos)
        if gi > 0 and gi-pos > fuzz:
            gi = -1
        npos = max(si, gi)
        if npos < 0:
            npos = pos
        return src[:npos+1]+u'\u2026'

    def feed2index(self, feed):
        if feed.image_url is not None: # Download feed image
            imgdir = os.path.join(self.output_dir, 'images')
            if not os.path.isdir(imgdir):
                os.makedirs(imgdir)

            if self.image_map.has_key(feed.image_url):
                feed.image_url = self.image_map[feed.image_url]
            else:
                bn = urlparse.urlsplit(feed.image_url).path
                if bn:
                    bn = bn.rpartition('/')[-1]
                    if bn:
                        img = os.path.join(imgdir, 'feed_image_%d%s'%(self.image_counter, os.path.splitext(bn)[-1]))
                        open(img, 'wb').write(self.browser.open(feed.image_url).read())
                        self.image_counter += 1
                        # Remember the original URL so the image is downloaded only once
                        self.image_map[feed.image_url] = img
                        feed.image_url = img

        templ = templates.FeedTemplate()
        return templ.generate(feed, self.description_limiter).render(doctype='xhtml')

    def create_logger(self, feed_number, article_number):
        logger = logging.getLogger('feeds2disk.article_%d_%d'%(feed_number, article_number))
        out = cStringIO.StringIO()
        handler = logging.StreamHandler(out)
        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
        handler.setLevel(logging.INFO if self.verbose else logging.WARNING)
        if self.debug:
            handler.setLevel(logging.DEBUG)
        logger.addHandler(handler)
        return logger, out

    def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
        fetcher.base_dir = dir
        fetcher.current_dir = dir
        fetcher.show_progress = False
        res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
        if not res or not os.path.exists(res):
            raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
        return res, path, failures

    def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)

    def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
        pt = PersistentTemporaryFile('_feeds2disk.html')
        templ = templates.EmbeddedContent()
        raw = templ.generate(article).render('html')
        open(pt.name, 'wb').write(raw)
        pt.close()
        url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)

    def build_index(self):
        self.report_progress(0, _('Fetching feeds...'))
        try:
            feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
                                     max_articles_per_feed=self.max_articles_per_feed)
            self.report_progress(0, _('Got feeds from index page'))
        except NotImplementedError:
            feeds = self.parse_feeds()

        self.report_progress(0, _('Trying to download cover...'))
        self.download_cover()
        if self.test:
            feeds = feeds[:2]
        self.has_single_feed = len(feeds) == 1

        if self.use_embedded_content is None:
            self.use_embedded_content = feeds[0].has_embedded_content()

        index = os.path.join(self.output_dir, 'index.html')

        html = self.feeds2index(feeds)
        open(index, 'wb').write(html)

        self.jobs = []
        for f, feed in enumerate(feeds):
            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
            if not os.path.isdir(feed_dir):
                os.makedirs(feed_dir)

            for a, article in enumerate(feed):
                if a >= self.max_articles_per_feed:
                    break
                art_dir = os.path.join(feed_dir, 'article_%d'%a)
                if not os.path.isdir(art_dir):
                    os.makedirs(art_dir)
                logger, stream = self.create_logger(f, a)
                try:
                    url = self.print_version(article.url)
                except NotImplementedError:
                    url = article.url

                func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
                            (self.fetch_article, url)
                req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)),
                                  {}, (f, a), self.article_downloaded,
                                  self.error_in_article_download)
                req.stream = stream
                req.feed = feed
                req.article = article
                req.feed_dir = feed_dir
                self.jobs.append(req)

        self.jobs_done = 0
        tp = ThreadPool(self.simultaneous_downloads)
        for req in self.jobs:
            tp.putRequest(req, block=True, timeout=0)

        self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
        while True:
            try:
                tp.poll()
                time.sleep(0.1)
            except NoResultsPending:
                break

        for f, feed in enumerate(feeds):
            html = self.feed2index(feed)
            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
            open(os.path.join(feed_dir, 'index.html'), 'wb').write(html)

        self.create_opf(feeds)
        self.report_progress(1, _('Feeds downloaded to %s')%index)
        return index

    def download_cover(self):
        self.cover_path = None
        try:
            cu = self.get_cover_url()
        except Exception, err:
            cu = None
            self.logger.error(_('Could not download cover: %s')%str(err))
            self.logger.debug(traceback.format_exc())
        if cu is not None:
            ext = cu.rpartition('.')[-1]
            ext = ext.lower() if ext else 'jpg'
            self.report_progress(1, _('Downloading cover from %s')%cu)
            cpath = os.path.join(self.output_dir, 'cover.'+ext)
            cfile = open(cpath, 'wb')
            cfile.write(self.browser.open(cu).read())
            self.cover_path = cpath

    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')
        opf = OPFCreator(dir, mi)

        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        cpath = getattr(self, 'cover_path', None)
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)
        opf.create_manifest_from_files_in(manifest)

        entries = ['index.html']
        toc = TOC(base_path=dir)

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/'%(num, j)
                    entries.append('%sindex.html'%adir)
                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    src = open(last, 'rb').read()
                    soup = BeautifulSoup(src)
                    body = soup.find('body')
                    if body is not None:
                        prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
                        templ = self.navbar.generate(True, num, j, len(f),
                                                     not self.has_single_feed,
                                                     a.orig_url, __appname__, prefix=prefix)
                        elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                        body.insert(len(body.contents), elem)
                        open(last, 'wb').write(unicode(soup).encode('utf-8'))

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html'%i)
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title))
        else:
            entries.append('feed_%d/index.html'%0)
            feed_index(0, toc)

        opf.create_spine(entries)
        opf.set_toc(toc)

        opf.render(open(opf_path, 'wb'), open(ncx_path, 'wb'))

    def article_downloaded(self, request, result):
        index = os.path.join(os.path.dirname(result[0]), 'index.html')
        if index != result[0]:
            os.rename(result[0], index)
        a = request.requestID[1]

        article = request.article
        self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
        article.orig_url = article.url
        article.url = 'article_%d/index.html'%a
        article.downloaded = True
        article.sub_pages = result[1][1:]
        self.jobs_done += 1
        self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article downloaded: %s')%article.title)
        if result[2]:
            self.partial_failures.append((request.feed.title, article.title, article.url, result[2]))

    def error_in_article_download(self, request, traceback):
        self.jobs_done += 1
        self.logger.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
        debug = request.stream.getvalue().decode('utf-8', 'ignore')
        self.logger.debug(debug)
        self.logger.debug(traceback)
        self.logger.debug('\n')
        self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
        self.failed_downloads.append((request.feed, request.article, debug))

    def parse_feeds(self):
        '''
        Create a list of articles from a list of feeds.
        @rtype: list
        @return: A list of L{Feed}s.
        '''
        feeds = self.get_feeds()
        parsed_feeds = []
        for obj in feeds:
            if isinstance(obj, basestring):
                title, url = None, obj
            else:
                title, url = obj
            self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url))
            parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
                                              title=title,
                                              oldest_article=self.oldest_article,
                                              max_articles_per_feed=self.max_articles_per_feed,
                                              get_article_url=self.get_article_url))

        return parsed_feeds

    @classmethod
    def tag_to_string(cls, tag, use_alt=True):
        '''
        Convenience method to take a BeautifulSoup Tag and extract the text from it
        recursively, including any CDATA sections and alt tag attributes.
        @param use_alt: If True try to use the alt attribute for tags that don't have any textual content
        @type use_alt: boolean
        @return: A unicode (possibly empty) object
        @rtype: unicode string
        '''
        if not tag:
            return ''
        if isinstance(tag, basestring):
            return tag
        strings = []
        for item in tag.contents:
            if isinstance(item, (NavigableString, CData)):
                strings.append(item.string)
            elif isinstance(item, Tag):
                res = cls.tag_to_string(item)
                if res:
                    strings.append(res)
                elif use_alt and item.has_key('alt'):
                    strings.append(item['alt'])
        return u''.join(strings)
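
    # Illustrative usage (the <h1 class="headline"> markup is hypothetical)::
    #
    #   title = self.tag_to_string(soup.find('h1', attrs={'class': 'headline'}))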


class Profile2Recipe(BasicNewsRecipe):
    '''
    Used to migrate the old news Profiles to the new Recipes. Uses the settings
    from the old Profile to populate the settings in the Recipe. Also uses the
    Profile's get_browser and parse_feeds.
    '''
    def __init__(self, profile_class, options, parser, progress_reporter):
        self.old_profile = profile_class(logging.getLogger('feeds2disk'),
                                         username=options.username,
                                         password=options.password,
                                         lrf=options.lrf)
        for attr in ('preprocess_regexps', 'oldest_article', 'delay', 'timeout',
                     'match_regexps', 'filter_regexps', 'html2lrf_options',
                     'timefmt', 'needs_subscription', 'summary_length',
                     'max_articles_per_feed', 'title', 'no_stylesheets', 'encoding'):
            setattr(self, attr, getattr(self.old_profile, attr))

        self.simultaneous_downloads = 1
        BasicNewsRecipe.__init__(self, options, parser, progress_reporter)
        self.browser = self.old_profile.browser
        self.use_embedded_content = isinstance(self.old_profile, FullContentProfile)

    def parse_index(self):
        # The old Profile returns a dict mapping feed titles to article lists;
        # convert it into the list of (title, articles) tuples that
        # L{BasicNewsRecipe.parse_index} is documented to return.
        feeds = []
        for key, val in self.old_profile.parse_feeds().items():
            feeds.append((key, val))
        return feeds


class CustomIndexRecipe(BasicNewsRecipe):

    def custom_index(self):
        '''
        Return the path to a custom HTML document that will serve as the index for
        this recipe.
        @rtype: string
        '''
        raise NotImplementedError

    def create_opf(self):
        mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        mi = OPFCreator(self.output_dir, mi)
        mi.create_manifest_from_files_in([self.output_dir])
        mi.create_spine(['index.html'])
        mi.render(open(os.path.join(self.output_dir, 'index.opf'), 'wb'))

    def download(self):
        index = os.path.abspath(self.custom_index())
        url = 'file:'+index if iswindows else 'file://'+index
        fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
        fetcher.base_dir = self.output_dir
        fetcher.current_dir = self.output_dir
        fetcher.show_progress = False
        res = fetcher.start_fetch(url)
        self.create_opf()
        return res