From 8f38a29165be6c84b2c3b98173af1d4f61c44396 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 13 Jul 2007 23:20:41 +0000
Subject: [PATCH] Initial implementation of web2lrf

---
 src/libprs500/__init__.py                    |   2 +-
 src/libprs500/ebooks/lrf/__init__.py         |   2 +-
 src/libprs500/ebooks/lrf/web/__init__.py     |  14 +++
 src/libprs500/ebooks/lrf/web/convert_from.py | 119 +++++++++++++++++++
 src/libprs500/ebooks/lrf/web/profiles.py     |  99 +++++++++++++++
 src/libprs500/web/fetch/simple.py            |  51 +++++---
 6 files changed, 271 insertions(+), 16 deletions(-)
 create mode 100644 src/libprs500/ebooks/lrf/web/__init__.py
 create mode 100644 src/libprs500/ebooks/lrf/web/convert_from.py
 create mode 100644 src/libprs500/ebooks/lrf/web/profiles.py

diff --git a/src/libprs500/__init__.py b/src/libprs500/__init__.py
index 745d1aec19..1b2963a5fa 100644
--- a/src/libprs500/__init__.py
+++ b/src/libprs500/__init__.py
@@ -13,7 +13,7 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ''' E-book management software'''
-__version__ = "0.3.69"
+__version__ = "0.3.70"
 __docformat__ = "epytext"
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__ = 'libprs500'
diff --git a/src/libprs500/ebooks/lrf/__init__.py b/src/libprs500/ebooks/lrf/__init__.py
index ccd7212f65..1022dcf9ba 100644
--- a/src/libprs500/ebooks/lrf/__init__.py
+++ b/src/libprs500/ebooks/lrf/__init__.py
@@ -74,7 +74,7 @@ def option_parser(usage):
     metadata = parser.add_option_group('METADATA OPTIONS')
     metadata.add_option('--header', action='store_true', default=False, dest='header',
                         help='Add a header to all the pages with title and author.')
-    metadata.add_option("-t", "--title", action="store", type="string", \
+    metadata.add_option("-t", "--title", action="store", type="string", default=None,\
                         dest="title", help="Set the title. Default: filename.")
     metadata.add_option("-a", "--author", action="store", type="string", \
                         dest="author", help="Set the author. Default: %default", default='Unknown')
diff --git a/src/libprs500/ebooks/lrf/web/__init__.py b/src/libprs500/ebooks/lrf/web/__init__.py
new file mode 100644
index 0000000000..aaf49de99e
--- /dev/null
+++ b/src/libprs500/ebooks/lrf/web/__init__.py
@@ -0,0 +1,14 @@
+## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
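
The default=None added to --title above is what lets web2lrf tell whether the user actually supplied a title. A rough sketch of the fallback this enables, mirroring the logic in main() of convert_from.py below; default_title is a hypothetical helper, not part of the patch:

    # Illustrative only: an unset --title is built from the profile title,
    # or the site's host name, plus the current date.
    import time
    from urlparse import urlsplit

    def default_title(options, profile):
        title = profile['title'] or urlsplit(options.url).netloc
        return title + time.strftime(' [%a %d %b %Y]', time.localtime())
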
diff --git a/src/libprs500/ebooks/lrf/web/convert_from.py b/src/libprs500/ebooks/lrf/web/convert_from.py
new file mode 100644
index 0000000000..23a767599e
--- /dev/null
+++ b/src/libprs500/ebooks/lrf/web/convert_from.py
@@ -0,0 +1,119 @@
+## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''Convert known websites into LRF files.'''
+
+import sys, time, tempfile, shutil, os
+from urlparse import urlsplit
+
+from libprs500 import __appname__
+from libprs500.ebooks.lrf.html.convert_from import option_parser as html_option_parser
+from libprs500.ebooks.lrf.html.convert_from import process_file
+from libprs500.ebooks.lrf.web.profiles import profiles
+from libprs500.web.fetch.simple import setup_logger as web2disk_setup_logger
+from libprs500.web.fetch.simple import create_fetcher
+
+available_profiles = profiles.keys()
+available_profiles.remove('default')
+available_profiles = ' '.join(available_profiles)
+
+def option_parser():
+    parser = html_option_parser(usage='''%prog [options] website_profile\n\n'''
+                '''%prog downloads a site from the web and converts it '''
+                '''into a LRF file for use with the SONY Reader. '''
+                '''website_profile is one of '''+available_profiles+\
+                '''. If you specify a website_profile of default or do not specify '''
+                '''it, you must specify the --url option.'''
+                )
+
+    parser.remove_option('-t')
+
+    parser.add_option('-u', '--url', dest='url', default=None,
+                      help='The URL to download. You only need to specify this if you are not specifying a website_profile.')
+
+    parser.add_option('-t', '--timeout', help='Timeout in seconds to wait for a response from the server. Default: %default s',
+                      default=None, type='int', dest='timeout')
+    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %default',
+                      default=None, type='int', dest='max_recursions')
+    parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files',
+                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %default')
+    parser.add_option('--delay', default=None, dest='delay', type='int',
+                      help='Minimum interval in seconds between consecutive fetches. Default is %default s')
+    parser.add_option('--dont-download-stylesheets', action='store_true', default=None,
+                      help='Do not download CSS stylesheets.', dest='no_stylesheets')
+
+    parser.add_option('--match-regexp', dest='match_regexps', default=[], action='append',
+                      help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
+    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
+                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
+    return parser
+
+def fetch_website(options):
+    tdir = tempfile.mkdtemp(prefix=__appname__+'_' )
+    options.dir = tdir
+    web2disk_setup_logger(options)
+    fetcher = create_fetcher(options)
+    fetcher.preprocess_regexps = options.preprocess_regexps
+    return fetcher.start_fetch(options.url), tdir
+
+def create_lrf(htmlfile, options):
+    options.author = __appname__
+    options.header = True
+    if not options.output:
+        options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf')))
+    process_file(htmlfile, options)
+
+def main(args=sys.argv):
+    parser = option_parser()
+    options, args = parser.parse_args(args)
+    if len(args) > 2:
+        parser.print_help()
+        return 1
+    if len(args) == 2:
+        if not profiles.has_key(args[1]):
+            print >>sys.stderr, 'Unknown profile', args[1]
+            print >>sys.stderr, 'Valid profiles:', profiles.keys()
+            return 1
+    profile = profiles[args[1]] if len(args) == 2 else profiles['default']
+
+    for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
+        val = getattr(options, opt)
+        if val is None:
+            setattr(options, opt, profile[opt])
+
+    if not options.url:
+        parser.print_help()
+        print >>sys.stderr
+        print >>sys.stderr, 'You must specify the --url option or a profile from one of:',
+        print >>sys.stderr, available_profiles
+        return 1
+
+    if not options.title:
+        title = profile['title']
+        if not title:
+            title = urlsplit(options.url).netloc
+        options.title = title + time.strftime(' [%a %d %b %Y]', time.localtime())
+
+    options.match_regexps += profile['match_regexps']
+    options.preprocess_regexps = profile['preprocess_regexps']
+    options.filter_regexps += profile['filter_regexps']
+
+    htmlfile, tdir = fetch_website(options)
+    create_lrf(htmlfile, options)
+    shutil.rmtree(tdir)
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
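
convert_from.py is the whole web2lrf driver: it merges the chosen profile into the command-line options, fetches the site into a temporary directory with the web2disk fetcher, and hands the downloaded HTML to html2lrf's process_file. A minimal sketch of driving it programmatically, assuming the package is importable and using the bbc profile defined in profiles.py below; the option value is only an example:

    # Sketch only: main() expects a sys.argv-style list, so element 0 is the
    # program name and element 1 is the website_profile.
    from libprs500.ebooks.lrf.web.convert_from import main
    main(['web2lrf', '--max-recursions', '2', 'bbc'])  # like running the web2lrf script with: --max-recursions 2 bbc
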
diff --git a/src/libprs500/ebooks/lrf/web/profiles.py b/src/libprs500/ebooks/lrf/web/profiles.py
new file mode 100644
index 0000000000..4b363857cc
--- /dev/null
+++ b/src/libprs500/ebooks/lrf/web/profiles.py
@@ -0,0 +1,99 @@
+## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''Profiles for known websites.'''
+
+import time, re
+
+profiles = {
+    'default' : {
+        'url'                : '',     # The URL of the website
+        'title'              : '',     # The title to use for the LRF file
+        'max_recursions'     : 1,      # Number of levels of links to follow
+        'max_files'          : 1000,   # Maximum number of files to download
+        'delay'              : 0,      # Delay between consecutive downloads
+        'timeout'            : 10,     # Timeout for fetching files from server
+        'no_stylesheets'     : False,  # Download stylesheets
+        'match_regexps'      : [],     # List of regular expressions that determines which links to follow
+        'filter_regexps'     : [],     # List of regular expressions that determines which links to ignore
+                                       # Only one of match_regexps or filter_regexps should be defined
+        'html2lrf_options'   : [],     # List of options to pass to html2lrf
+        'preprocess_regexps' : [],     # List of regexp substitution rules to run on the downloaded HTML before running html2lrf
+                                       # See the profiles below for examples of these settings.
+    },
+
+    'nytimes' : {
+        'url'                : 'http://nytimesriver.com',
+        'title'              : 'The New York Times',
+        'match_regexps'      : ['nytimes.com/'+time.strftime('%Y', time.localtime())],
+        'preprocess_regexps' :
+            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+              [
+                # Remove help link and replace by title
+                (r'<a .*?alt=.Click here for information about this service.*?</a>',
+                 lambda match: '<h1>The New York Times</h1>\n<p align="right"><b>%s</b></p>'%(time.strftime('%a %d %b %Y', time.localtime()),)),
+                # Blank line before categories
+                (r'<b>\s*NYT', lambda match: '<p></p><b>NYT'),
+                # Blank line between articles
+                (r'<hr>', '<p></p>\n<hr>'),
+                # Remove TimesSelect garbage
+                (r'.*?TimesSelect', lambda match : 'Downloading of TimesSelect stories is not supported.<!--'),
+              ]
+            ],
+    },
+
+    'bbc' : {
+        'url'                : 'http://bbcriver.com',
+        'title'              : 'The BBC',
+        'no_stylesheets'     : True,
+        'preprocess_regexps' :
+            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+              [
+                # Remove help link and replace by title
+                (r'<a .*?alt=.Click here for information about this service.*?</a>',
+                 lambda match: '<h1>The BBC</h1>\n<p align="right"><b>%s</b></p>'%(time.strftime('%a %d %b %Y', time.localtime()),)),
+                # Blank line before categories
+                (r'<b>\s*BBC', lambda match: '<p></p><b>BBC'),
+                # Remove footer from individual stories
+                (r'<div class=.footer.>.*?Published',
+                 lambda match : '<p></p><div class="footer">Published'),
+                # Add some style info in place of disabled stylesheet
+                (r'<link.*?type=.text/css.*?>',
+                 '<style type="text/css">.headline {font-size: x-large;}</style>'),
+              ]
+            ],
+    },
+}
+
+for key in profiles.keys():
+    if key == 'default':
+        continue
+    newd = profiles['default'].copy()
+    newd.update(profiles[key])
+    profiles[key] = newd
+
+def profile_to_command_line_options(profile):
+    args = []
+    args.append('--max-recursions='+str(profile['max_recursions']))
+    args.append('--delay='+str(profile['delay']))
+    for i in profile['match_regexps']:
+        args.append('--match-regexp="'+i+'"')
+    for i in profile['filter_regexps']:
+        args.append('--filter-regexp="'+i+'"')
+    return args
\ No newline at end of file
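
profiles.py is the extension point for new sites: every profile other than 'default' inherits the default settings through the update loop above, so a new site only needs to override what differs. A hypothetical entry (not part of the patch; the site and regexp are made up) would look like:

    'example_news' : {
        'url'            : 'http://news.example.com',
        'title'          : 'Example News',
        'max_recursions' : 2,
        'filter_regexps' : [r'/advertising/'],
    },

After the merge loop, profile_to_command_line_options(profiles['example_news']) returns ['--max-recursions=2', '--delay=0', '--filter-regexp="/advertising/"']; keys the entry does not set, such as max_files and timeout, come from 'default'.
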
diff --git a/src/libprs500/web/fetch/simple.py b/src/libprs500/web/fetch/simple.py
index 9199fb198a..b1e6e37867 100644
--- a/src/libprs500/web/fetch/simple.py
+++ b/src/libprs500/web/fetch/simple.py
@@ -15,7 +15,7 @@
 '''
 Fetch a webpage and its links recursively.
 '''
-import sys, socket, urllib2, os, urlparse, codecs, logging, re, time
+import sys, socket, urllib2, os, urlparse, codecs, logging, re, time, copy
 from urllib import url2pathname
 from httplib import responses
 from optparse import OptionParser
@@ -45,6 +45,11 @@ def save_soup(soup, target):
 
 class RecursiveFetcher(object):
     LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
+    #ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
+    #                  (
+    #
+    #                  )
+    #                 )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
 
     def __init__(self, options):
@@ -64,6 +69,14 @@ class RecursiveFetcher(object):
         self.stylemap = {}
         self.current_dir = self.base_dir
         self.files = 0
+        self.preprocess_regexps = []
+        self.download_stylesheets = not options.no_stylesheets
+
+
+    def get_soup(self, src):
+        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
+        nmassage.extend(self.preprocess_regexps)
+        return BeautifulSoup(src, markupMassage=nmassage)
 
     def fetch_url(self, url):
         f = None
@@ -84,7 +97,7 @@ class RecursiveFetcher(object):
 
     def start_fetch(self, url):
         soup = BeautifulSoup('<a href="'+url+'" />')
-        print 'Working',
+        print 'Downloading',
         res = self.process_links(soup, url, 0, into_dir='')
         print '%s saved to %s'%(url, res)
         return res
@@ -99,9 +112,8 @@ class RecursiveFetcher(object):
         if self.filter_regexps:
             for f in self.filter_regexps:
                 if f.search(url):
-                    return False
-            return True
-        elif self.match_regexps:
+                    return False
+        if self.match_regexps:
             for m in self.match_regexps:
                 if m.search(url):
                     return True
@@ -243,10 +255,11 @@ class RecursiveFetcher(object):
                 try:
                     self.current_dir = linkdiskpath
                     f = self.fetch_url(iurl)
-                    soup = BeautifulSoup(f.read())
+                    soup = self.get_soup(f.read())
                     logger.info('Processing images...')
                     self.process_images(soup, f.geturl())
-                    self.process_stylesheets(soup, f.geturl())
+                    if self.download_stylesheets:
+                        self.process_stylesheets(soup, f.geturl())
                     res = os.path.join(linkdiskpath, basename(iurl))
                     self.filemap[nurl] = res
 
@@ -284,26 +297,36 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
                       default=1, type='int', dest='max_recursions')
     parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files',
                       help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %default')
+    parser.add_option('--delay', default=0, dest='delay', type='int',
+                      help='Minimum interval in seconds between consecutive fetches. Default is %default s')
     parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
                       help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
     parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
-                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --match-regexp is ignored.')
-    parser.add_option('--delay', default=0, dest='delay', type='int',
-                      help='Minimum interval in seconds between consecutive fetches. Default is %default s')
+                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
+    parser.add_option('--dont-download-stylesheets', action='store_true', default=False,
+                      help='Do not download CSS stylesheets.', dest='no_stylesheets')
+
     parser.add_option('--verbose', help='Show detailed output information. Useful for debugging', default=False, action='store_true', dest='verbose')
     return parser
+
+def create_fetcher(options):
+    return RecursiveFetcher(options)
+
+def setup_logger(options):
+    level = logging.DEBUG if options.verbose else logging.WARNING
+    setup_cli_handlers(logger, level)
+
 
 def main(args=sys.argv):
     parser = option_parser()
     options, args = parser.parse_args(args)
     if len(args) != 2:
         parser.print_help()
         return 1
-    level = logging.DEBUG if options.verbose else logging.WARNING
-    setup_cli_handlers(logger, level)
-
-    fetcher = RecursiveFetcher(options)
+
+    setup_logger(options)
+    fetcher = create_fetcher(options)
     fetcher.start_fetch(args[1])
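
The simple.py changes are what make the fetcher embeddable by web2lrf: create_fetcher() and setup_logger() expose what main() used to do inline, get_soup() runs each downloaded page through preprocess_regexps before BeautifulSoup parses it, and download_stylesheets lets a profile skip CSS. A rough sketch of using these hooks directly, mirroring fetch_website() in convert_from.py above; the directory and regexp are illustrative only:

    import re
    from libprs500.web.fetch.simple import option_parser, setup_logger, create_fetcher

    options, args = option_parser().parse_args(['web2disk', 'http://bbcriver.com'])
    options.dir = '/tmp/bbc'   # illustrative; convert_from.py sets options.dir to a temporary directory
    setup_logger(options)
    fetcher = create_fetcher(options)
    # Substitutions applied to the raw HTML of each page before parsing
    fetcher.preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda match: '')]
    index = fetcher.start_fetch(args[1])   # returns the path of the saved start page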