This commit is contained in:
Kovid Goyal 2008-03-06 07:19:55 +00:00
parent e7e657d27f
commit c9be221404
9 changed files with 3270 additions and 7 deletions

View File

@ -13,7 +13,7 @@
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
''' E-book management software'''
__version__ = "0.4.40"
__version__ = "0.4.41"
__docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
__appname__ = 'libprs500'
@ -109,17 +109,53 @@ class OptionParser(_OptionParser):
version='%%prog (%s %s)'%(__appname__, __version__),
epilog=_('Created by ')+terminal_controller.RED+__author__+terminal_controller.NORMAL,
gui_mode=False,
conflict_handler='resolve',
**kwds):
usage += '''\n\nWhenever you pass arguments to %prog that have spaces in them, '''\
'''enclose the arguments in quotation marks.'''
_OptionParser.__init__(self, usage=usage, version=version, epilog=epilog,
formatter=CustomHelpFormatter(), **kwds)
formatter=CustomHelpFormatter(),
conflict_handler=conflict_handler, **kwds)
self.gui_mode = gui_mode
def error(self, msg):
    '''
    Report a command line parsing error.

    In GUI mode the error is raised as an C{Exception} carrying C{msg};
    otherwise it is handed to C{_OptionParser.error}.
    '''
    if not self.gui_mode:
        _OptionParser.error(self, msg)
        return
    raise Exception(msg)
def subsume(self, group_name, msg=''):
    '''
    Move all existing options into a subgroup named
    C{group_name} with description C{msg}.

    The C{--help} and C{--version} options stay at the top level. All
    previously existing option groups are discarded once their options
    have been moved into the new group.

    @param group_name: Title of the new option group
    @param msg: Description of the new option group
    '''
    moved = []
    # Iterate over copies: remove_option() modifies the underlying lists.
    for opt in list(self.option_list):
        opt_string = opt.get_opt_string()
        if opt_string in ('--help', '--version'):
            continue
        self.remove_option(opt_string)
        moved.append(opt)
    for group in list(self.option_groups):
        for opt in list(group.option_list):
            # remove_option() looks options up by option string, passing
            # the Option instance itself raises ValueError.
            group.remove_option(opt.get_opt_string())
            moved.append(opt)
    self.option_groups = []
    subgroup = self.add_option_group(group_name, msg)
    for opt in moved:
        subgroup.add_option(opt)
def load_library(name, cdll):
if iswindows:
@ -234,7 +270,6 @@ def get_font_families(cached=None):
continue
zlist.append((family, ff))
font_families = dict(zlist)
return font_families

View File

@ -104,7 +104,7 @@
<p> A
<span style='font-style:italic'>similar</span>
paragraph, but now <span style="text-decoration:overline">using</span>
<span style='font-weight:bold'>CSS</span>
<span style='font-weight:bold'><i>CSS</i></span>
to perform the text formatting.</p>
<hr/>
<center>A centered phrase</center>

View File

@ -43,6 +43,7 @@ entry_points = {
'epub2lrf = libprs500.ebooks.lrf.epub.convert_from:main',
'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main',
'web2disk = libprs500.web.fetch.simple:main',
'feeds2disk = libprs500.web.feeds.main:main',
'web2lrf = libprs500.ebooks.lrf.web.convert_from:main',
'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main',
'mobi2lrf = libprs500.ebooks.lrf.mobi.convert_from:main',

View File

@ -188,9 +188,8 @@ class ProgressBar:
self.bar = term.render(self.BAR)
self.header = self.term.render(self.HEADER % header.center(self.width))
self.cleared = 1 #: true if we haven't drawn the bar yet.
self.update(0, '')
def update(self, percent, message):
def update(self, percent, message=''):
if self.cleared:
sys.stdout.write(self.header)
self.cleared = 0

View File

@ -0,0 +1,20 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Contains the logic for parsing feeds.
'''

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,107 @@
#!/usr/bin/env python
from libprs500.web.feeds.news import BasicNewsRecipe
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
''''''
import sys, os
from libprs500.web.recipes import get_feed, compile_recipe
from libprs500.web.fetch.simple import option_parser as _option_parser
def option_parser(usage='''\
%prog [options] ARG
%prog parses an online source of articles, like an RSS or ATOM feed and
fetches the article contents organized in a nice hierarchy.
ARG can be one of:
file name - %prog will try to load a recipe from the file
builtin recipe title - %prog will load the builtin recipe and use it to fetch the feed. For e.g. Newsweek or "The BBC" or "The New York Times"
recipe as a string - %prog will load the recipe directly from the string arg.
'''):
    '''
    Create the command line option parser for feeds2disk.

    Starts from the web2disk option parser, removes its C{--max-recursions},
    C{--base-dir}, C{--verbose} and C{--max-files} options, moves the
    remaining web2disk options into their own option group and then adds
    the feeds2disk specific options.

    @param usage: Usage message shown in the help output
    @return: The configured option parser
    '''
    p = _option_parser(usage=usage)
    for unwanted in ('--max-recursions', '--base-dir', '--verbose', '--max-files'):
        p.remove_option(unwanted)
    p.subsume('WEB2DISK OPTIONS', 'Options to control web2disk (used to fetch websites linked from feeds)')
    p.add_option('--feeds', default=None,
                 help=_('''Specify a list of feeds to download. For example:
"['http://feeds.newsweek.com/newsweek/TopNews', 'http://feeds.newsweek.com/headlines/politics']"
If you specify this option, any argument to %prog is ignored and a default recipe is used to download the feeds.'''))
    p.add_option('--verbose', default=False, action='store_true',
                 help=_('''Be more verbose while processing.'''))
    p.add_option('--username', default=None,
                 help=_('Username for sites that require a login to access content.'))
    p.add_option('--password', default=None,
                 help=_('Password for sites that require a login to access content.'))
    p.add_option('--lrf', default=False, action='store_true',
                 help=_('Optimize fetching for subsequent conversion to LRF.'))
    p.add_option('--recursions', default=0, type='int',
                 help=_('Number of levels of links to follow on webpages that are linked to from feeds. Default %default'))
    return p
def simple_progress_bar(*args):
print '%d%%'%(args[0]*100),
sys.stdout.flush()
def _recipe_from_file(path):
    '''
    Try to compile a recipe from the file at C{path}.

    @return: The recipe class, or None if the file is not readable or
    does not yield a recipe.
    '''
    if not os.access(path, os.R_OK):
        return None
    try:
        stream = open(path)
        try:
            return compile_recipe(stream.read())
        finally:
            # Always release the file handle, even if compilation fails
            stream.close()
    except Exception:
        # Not a usable recipe file; the caller falls back to the other
        # interpretations of the argument.
        return None


def main(args=sys.argv, notification=None):
    '''
    Command line entry point of feeds2disk.

    The single positional argument is interpreted, in this order, as the
    name of a file containing a recipe, the title of a builtin recipe or
    the source code of a recipe. If C{--feeds} is specified the positional
    argument is not needed and L{BasicNewsRecipe} is used.

    @param args: Command line arguments, including the program name
    @param notification: Callable used to report progress. If None, a
    terminal progress bar is used, falling back to L{simple_progress_bar}.
    @return: 0 on success, 1 if the arguments or the recipe are invalid
    '''
    p = option_parser()
    opts, args = p.parse_args(args)

    # The recipe argument is only needed when no explicit list of feeds
    # was given (see the help text of --feeds).
    if opts.feeds is None and len(args) != 2:
        p.print_help()
        return 1

    if notification is None:
        from libprs500.terminfo import TerminalController, ProgressBar
        term = TerminalController(sys.stdout)
        try:
            pb = ProgressBar(term, _('Fetching feeds...'))
            notification = pb.update
        except ValueError:
            # No progress bar available, use plain percentage output
            notification = simple_progress_bar
            print(_('Fetching feeds...'))

    if opts.feeds is not None:
        recipe = BasicNewsRecipe
    else:
        source = args[1]
        recipe = _recipe_from_file(source)
        if recipe is None:
            recipe = get_feed(source)
        if recipe is None:
            try:
                recipe = compile_recipe(source)
            except Exception:
                # The argument is not valid recipe source code either
                recipe = None
        if recipe is None:
            p.print_help()
            print('')
            print('%s is an invalid recipe' % source)
            return 1

    recipe = recipe(opts, p, notification)
    recipe.download()
    return 0
# When run as a script, use the return value of main() as the exit status.
if __name__ == '__main__':
    sys.exit(main())

View File

@ -0,0 +1,183 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
The backend to parse feeds and create HTML that can then be converted
to an ebook.
'''
import logging
from libprs500 import browser
class BasicNewsRecipe(object):
    '''
    Abstract base class that contains logic needed in all feed fetchers.
    '''

    #: The title to use for the ebook
    #: @type: string
    title = 'Unknown News Source'

    #: The feeds to fetch, see L{get_feeds}. None if no feeds have been
    #: specified. Replaced by the C{feeds} command line option, if given.
    feeds = None

    #: Maximum number of articles to download from each feed
    #: @type: integer
    max_articles_per_feed = 100

    #: Oldest article to download from this news source. In days.
    #: @type: float
    oldest_article = 7.0

    #: Number of levels of links to follow on webpages that are linked
    #: to by the feed.
    #: @type: integer
    recursions = 0

    #: Delay between consecutive downloads in seconds
    #: @type: integer
    delay = 0

    #: Timeout for fetching files from server in seconds
    #: @type: integer
    timeout = 10

    #: The format string for the date shown on the first page
    #: By default: Day Name Day Number Month Name Year
    #: @type: string
    timefmt = ' [%a %d %b %Y]'

    #: Max number of characters in the short description.
    #: @type: integer
    summary_length = 500

    #: If True stylesheets are not downloaded and processed
    #: Convenient flag to disable loading of stylesheets for websites
    #: that have overly complex stylesheets unsuitable for conversion
    #: to ebooks formats
    #: @type: boolean
    no_stylesheets = False

    #: If True the GUI will ask the user for a username and password
    #: to use while downloading
    #: @type: boolean
    needs_subscription = False

    #: Specify an override encoding for sites that have an incorrect
    #: charset specification. The most common being specifying latin1 and
    #: using cp1252. If None, try to detect the encoding.
    encoding = None

    #: List of regular expressions that determines which links to follow
    #: If empty, it is ignored.
    #: Only one of L{match_regexps} or L{filter_regexps} should be defined
    #: @type: list of strings
    match_regexps = []

    #: List of regular expressions that determines which links to ignore
    #: If empty it is ignored
    #: Only one of L{match_regexps} or L{filter_regexps} should be defined
    #: @type: list of strings
    filter_regexps = []

    #: List of options to pass to html2lrf, to customize generation of LRF ebooks.
    #: @type: list of strings
    html2lrf_options = []

    #: List of regexp substitution rules to run on the downloaded HTML. Each element of the
    #: list should be a two element tuple. The first element of the tuple should
    #: be a compiled regular expression and the second a callable that takes
    #: a single match object and returns a string to replace the match.
    #: @type: list of tuples
    preprocess_regexps = []

    # See the built-in profiles for examples of these settings.

    def get_feeds(self):
        '''
        Return a list of RSS feeds to fetch for this profile. Each element of the list
        must be a 2-element tuple of the form (title, url). If title is None or an
        empty string, the title from the feed is used.

        @raise NotImplementedError: If no feeds have been specified
        '''
        if not self.feeds:
            raise NotImplementedError
        return self.feeds

    @classmethod
    def print_version(cls, url):
        '''
        Take a URL pointing to an article and returns the URL pointing to the
        print version of the article.
        '''
        raise NotImplementedError

    @classmethod
    def get_browser(cls):
        '''
        Return a browser instance used to fetch documents from the web.

        If your profile requires that you login first, override this method
        in your subclass. See for example the nytimes profile.
        '''
        return browser()

    def __init__(self, options, parser, progress_reporter):
        '''
        Initialize the recipe.

        @param options: Parsed commandline options
        @param parser: Command line option parser. Used to intelligently merge options.
        @param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
        @raise ValueError: If the recipe needs a subscription but no username
        and password were supplied
        '''
        self.logger = logging.getLogger('feeds2disk')
        self.report_progress = progress_reporter

        self.username = self.password = None
        #: If True optimize downloading for eventual conversion to LRF
        self.lrf = False

        # Only options that were explicitly changed from their defaults
        # override the settings of the recipe.
        defaults = parser.get_default_values()
        for opt in options.__dict__.keys():
            value = getattr(options, opt)
            if value != getattr(defaults, opt):
                setattr(self, opt, value)

        if isinstance(self.feeds, basestring):
            # NOTE: eval() executes arbitrary code. This is only acceptable
            # because the string is supplied by the user running the program;
            # never pass untrusted input here.
            self.feeds = eval(self.feeds)
            if isinstance(self.feeds, basestring):
                # A single feed was specified instead of a list
                self.feeds = [self.feeds]

        if self.needs_subscription and (self.username is None or self.password is None):
            raise ValueError('The %s recipe needs a username and password.'%self.title)

    def download(self):
        '''
        Report the start of the download and build the index.

        @return: The result of L{build_index}
        '''
        self.report_progress(0, 'Starting download...')
        return self.build_index()

    def build_index(self):
        '''
        Build the index. Currently this only calls L{parse_feeds}.
        '''
        self.parse_feeds()

    def parse_feeds(self):
        '''
        Create list of articles from a list of feeds.

        @rtype: list
        @return: A list whose items are 2-tuples C{('feed title', articles)},
        where C{articles} is a list of dictionaries each of the form::
            {
            'title'       : article title,
            'url'         : URL of print version,
            'date'        : The publication date of the article as a string,
            'description' : A summary of the article
            'content'     : The full article (can be an empty string). This is used by FullContentProfile
            }
        '''

View File

@ -0,0 +1,63 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Contains recipes for various common news sources and websites.
'''
import re
from libprs500.web.feeds.news import BasicNewsRecipe
_basic_recipes = (BasicNewsRecipe,)
# A tuple, not a generator expression: the names are used for repeated
# membership tests and a generator would be exhausted by the first one.
_basic_recipe_names = tuple(i.__name__ for i in _basic_recipes)
def compile_recipe(src):
    '''
    Compile the code in src and return the first object that is a class
    directly derived from one of the basic recipe classes.

    @param src: Python source code defining a recipe class
    @type src: string
    @return: The recipe class or None if C{src} defines no such class
    '''
    namespace = {}
    # NOTE: this executes arbitrary code, src must come from a trusted source.
    exec(src, globals(), namespace)
    for obj in namespace.values():
        if not isinstance(obj, type) or obj.__name__ in _basic_recipe_names:
            continue
        for base in obj.__bases__:
            if base in _basic_recipes:
                return obj
    return None
def get_feed(title):
    '''
    Return a builtin recipe class whose title == C{title} or None if no such
    recipe exists.

    Builtin recipes that are still stored as source code are compiled on
    first use and cached in place.

    @type title: string
    @rtype: class or None
    @raise RuntimeError: If the source of a builtin recipe does not yield a
    recipe class
    '''
    for i, val in enumerate(_feeds):
        if isinstance(val, basestring):
            recipe = compile_recipe(val)
            if recipe is None:
                raise RuntimeError('The builtin Recipe #%d is invalid.'%i)
            _feeds[i] = recipe
    for recipe in _feeds:
        if recipe.title == title:
            return recipe
    return None
#: Recipes to be used with feeds2disk. Entries start out as strings of recipe
#: source code; L{get_feed} replaces them in place with the compiled classes.
_feeds = ['class Temp(BasicNewsRecipe):\n\ttitle="temp"']