#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
The backend to parse feeds and create HTML that can then be converted
to an ebook.
'''
import logging, os, cStringIO, time, traceback, re, urlparse
from collections import defaultdict

from libprs500 import browser, __appname__, iswindows
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from libprs500.ebooks.metadata.opf import OPFCreator
from libprs500.ebooks.lrf import entity_to_unicode
from libprs500.ebooks.metadata.toc import TOC
from libprs500.ebooks.metadata import MetaInformation
from libprs500.web.feeds import feed_from_xml, templates, feeds_from_index
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
from libprs500.web.fetch.simple import RecursiveFetcher
from libprs500.threadpool import WorkRequest, ThreadPool, NoResultsPending
from libprs500.ebooks.lrf.web.profiles import FullContentProfile
from libprs500.ptempfile import PersistentTemporaryFile


class BasicNewsRecipe(object):
    '''
    Abstract base class that contains logic needed in all feed fetchers.
    '''

    #: The title to use for the ebook
    #: @type: string
    title = _('Unknown News Source')

    #: The author of this recipe
    __author__ = __appname__

    #: Maximum number of articles to download from each feed
    #: @type: integer
    max_articles_per_feed = 100

    #: Oldest article to download from this news source. In days.
    #: @type: float
    oldest_article = 7.0

    #: Number of levels of links to follow on webpages that are linked
    #: to by the feed.
    #: @type: integer
    recursions = 0

    #: Delay between consecutive downloads in seconds
    #: @type: integer
    delay = 0

    #: Number of simultaneous downloads. Set to 1 if the server is picky.
    #: Automatically reduced to 1 if L{delay} > 0
    #: @type: integer
    simultaneous_downloads = 5

    #: Timeout for fetching files from server in seconds
    #: @type: integer
    timeout = 120

    #: The format string for the date shown on the first page
    #: By default: Day Name Day Number Month Name Year
    #: @type: string
    timefmt = ' [%a, %d %b %Y]'

    #: List of feeds to download
    #: Can be either C{[url1, url2, ...]} or C{[('title1', url1), ('title2', url2), ...]}
    #: @type: list of strings or list of 2-tuples
    feeds = None
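
    # For illustration, a hypothetical value for L{feeds} showing both
    # supported forms (the URLs here are made up):
    #
    #   feeds = [
    #       'http://example.com/rss/world.xml',                 # bare URL
    #       ('Top Stories', 'http://example.com/rss/top.xml'),  # ('title', url)
    #   ]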

    #: Max number of characters in the short description.
    #: @type: integer
    summary_length = 500

    #: If True, stylesheets are not downloaded and processed.
    #: Convenient flag to disable loading of stylesheets for websites
    #: that have overly complex stylesheets unsuitable for conversion
    #: to ebook formats.
    #: @type: boolean
    no_stylesheets = False

    #: If True, the GUI will ask the user for a username and password
    #: to use while downloading.
    #: @type: boolean
    needs_subscription = False

    #: Specify an override encoding for sites that have an incorrect
    #: charset specification. The most common case is a site declaring
    #: latin1 while actually using cp1252. If None, try to detect the
    #: encoding.
    encoding = None

    #: Normally we try to guess if a feed has full articles embedded in it,
    #: based on the length of the embedded content. If C{None}, the default
    #: guessing is used. If C{True}, we always assume the feed has embedded
    #: content and if C{False}, we always assume the feed does not have
    #: embedded content.
    use_embedded_content = None

    #: Specify any extra CSS that should be added to downloaded HTML files.
    #: It will be inserted into C{<style>} tags, just before the closing
    #: C{</head>} tag, thereby overriding all CSS except that which is
    #: declared using the style attribute on individual HTML tags.
    #: @type: string
    extra_css = None

    #: List of regular expressions that determines which links to follow.
    #: If empty, it is ignored.
    #: Only one of L{match_regexps} or L{filter_regexps} should be defined.
    #: @type: list of strings
    match_regexps = []

    #: List of regular expressions that determines which links to ignore.
    #: If empty, it is ignored.
    #: Only one of L{match_regexps} or L{filter_regexps} should be defined.
    #: @type: list of strings
    filter_regexps = []

    #: List of options to pass to html2lrf, to customize generation of LRF ebooks.
    #: @type: list of strings
    html2lrf_options = []

    #: List of tags to be removed. Specified tags are removed from downloaded HTML.
    #: A tag is specified as a dictionary of the form::
    #:   {
    #:    name  : 'tag name',    # e.g. 'div'
    #:    attrs : a dictionary,  # e.g. {'class': 'advertisement'}
    #:   }
    #: All keys are optional. For a full explanation of the search criteria, see
    #: U{http://www.crummy.com/software/BeautifulSoup/documentation.html#The basic find method: findAll(name, attrs, recursive, text, limit, **kwargs)}
    #: A common example::
    #:   remove_tags = [dict(name='div', attrs={'class':'advert'})]
    #: This will remove all