Initial implementation of web2lrf

parent: a601c9c11e
commit: 8f38a29165
@@ -13,7 +13,7 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ''' E-book management software'''
-__version__ = "0.3.69"
+__version__ = "0.3.70"
 __docformat__ = "epytext"
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__ = 'libprs500'
@@ -74,7 +74,7 @@ def option_parser(usage):
     metadata = parser.add_option_group('METADATA OPTIONS')
     metadata.add_option('--header', action='store_true', default=False, dest='header',
                         help='Add a header to all the pages with title and author.')
-    metadata.add_option("-t", "--title", action="store", type="string", \
+    metadata.add_option("-t", "--title", action="store", type="string", default=None, \
                         dest="title", help="Set the title. Default: filename.")
    metadata.add_option("-a", "--author", action="store", type="string", \
                        dest="author", help="Set the author. Default: %default", default='Unknown')
src/libprs500/ebooks/lrf/web/__init__.py (new file, 14 lines)

## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
src/libprs500/ebooks/lrf/web/convert_from.py (new file, 119 lines)

## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Convert known websites into LRF files.'''

import sys, time, tempfile, shutil, os
from urlparse import urlsplit

from libprs500 import __appname__
from libprs500.ebooks.lrf.html.convert_from import option_parser as html_option_parser
from libprs500.ebooks.lrf.html.convert_from import process_file
from libprs500.ebooks.lrf.web.profiles import profiles
from libprs500.web.fetch.simple import setup_logger as web2disk_setup_logger
from libprs500.web.fetch.simple import create_fetcher

available_profiles = profiles.keys()
available_profiles.remove('default')
available_profiles = ' '.join(available_profiles)

def option_parser():
    parser = html_option_parser(usage='''%prog [options] website_profile\n\n'''
                '''%prog downloads a site from the web and converts it '''
                '''into an LRF file for use with the SONY Reader. '''
                '''website_profile is one of '''+available_profiles+\
                '''. If you specify a website_profile of default or do not specify '''
                '''it, you must specify the --url option.'''
                )

    parser.remove_option('-t')

    parser.add_option('-u', '--url', dest='url', default=None,
                      help='The URL to download. You only need to specify this if you are not specifying a website_profile.')

    parser.add_option('-t', '--timeout', help='Timeout in seconds to wait for a response from the server. Default: %default s',
                      default=None, type='int', dest='timeout')
    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse, i.e. the depth of links to follow. Default: %default',
                      default=None, type='int', dest='max_recursions')
    parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files',
                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %default')
    parser.add_option('--delay', default=None, dest='delay', type='int',
                      help='Minimum interval in seconds between consecutive fetches. Default is %default s')
    parser.add_option('--dont-download-stylesheets', action='store_true', default=None,
                      help='Do not download CSS stylesheets.', dest='no_stylesheets')

    parser.add_option('--match-regexp', dest='match_regexps', default=[], action='append',
                      help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
    return parser

def fetch_website(options):
    tdir = tempfile.mkdtemp(prefix=__appname__+'_')
    options.dir = tdir
    web2disk_setup_logger(options)
    fetcher = create_fetcher(options)
    fetcher.preprocess_regexps = options.preprocess_regexps
    return fetcher.start_fetch(options.url), tdir

def create_lrf(htmlfile, options):
    options.author = __appname__
    options.header = True
    if not options.output:
        options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf')))
    process_file(htmlfile, options)

def main(args=sys.argv):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) > 2:
        parser.print_help()
        return 1
    if len(args) == 2:
        if not profiles.has_key(args[1]):
            print >>sys.stderr, 'Unknown profile', args[1]
            print >>sys.stderr, 'Valid profiles:', profiles.keys()
            return 1
    profile = profiles[args[1]] if len(args) == 2 else profiles['default']

    for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
        val = getattr(options, opt)
        if val is None:
            setattr(options, opt, profile[opt])

    if not options.url:
        parser.print_help()
        print >>sys.stderr
        print >>sys.stderr, 'You must specify the --url option or a profile from one of:',
        print >>sys.stderr, available_profiles
        return 1

    if not options.title:
        title = profile['title']
        if not title:
            title = urlsplit(options.url).netloc
        options.title = title + time.strftime(' [%a %d %b %Y]', time.localtime())

    options.match_regexps += profile['match_regexps']
    options.preprocess_regexps = profile['preprocess_regexps']
    options.filter_regexps += profile['filter_regexps']

    htmlfile, tdir = fetch_website(options)
    create_lrf(htmlfile, options)
    shutil.rmtree(tdir)

    return 0

if __name__ == '__main__':
    sys.exit(main())
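Worth noting how main() resolves settings: every option in the loop above defaults to None on the command line, so a value the user supplies always wins and anything left unset falls back to the profile. A minimal, self-contained sketch of that merge (the Options class below is an illustrative stand-in for optparse's Values object, not part of the commit):

# Illustrative stand-in for the optparse Values object main() receives.
class Options(object):
    url = None      # left unset on the command line
    timeout = 30    # explicitly given on the command line

profile = {'url': 'http://bbcriver.com', 'timeout': 10}  # subset of the bbc profile

options = Options()
for opt in ('url', 'timeout'):
    if getattr(options, opt) is None:
        # unset options inherit the profile value
        setattr(options, opt, profile[opt])

print options.url      # http://bbcriver.com (from the profile)
print options.timeout  # 30 (the command line wins)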
src/libprs500/ebooks/lrf/web/profiles.py (new file, 99 lines)

## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Profiles for known websites.'''

import time, re

profiles = {
    'default' : {
        'url'                : '',    # The URL of the website
        'title'              : '',    # The title to use for the LRF file
        'max_recursions'     : 1,     # Number of levels of links to follow
        'max_files'          : 1000,  # Maximum number of files to download
        'delay'              : 0,     # Delay between consecutive downloads
        'timeout'            : 10,    # Timeout for fetching files from the server
        'no_stylesheets'     : False, # Download stylesheets
        'match_regexps'      : [],    # List of regular expressions that determine which links to follow
        'filter_regexps'     : [],    # List of regular expressions that determine which links to ignore
                                      # Only one of match_regexps or filter_regexps should be defined
        'html2lrf_options'   : [],    # List of options to pass to html2lrf
        'preprocess_regexps' : [],    # List of regexp substitution rules to run on the downloaded HTML before running html2lrf
        # See the profiles below for examples of these settings.
    },

    'nytimes' : {
        'url' : 'http://nytimesriver.com',
        'title' : 'The New York Times',
        'match_regexps' : ['nytimes.com/'+time.strftime('%Y', time.localtime())],
        'preprocess_regexps' :
            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
              [
                # Remove help link and replace it with the title
                (r'<a .*?alt=.Click here for information about this service.*?</a>',
                 lambda match: '<h1>The New York Times</h1>\n<p align="right"><b>%s</b></p>'%(time.strftime('%a %d %b %Y', time.localtime()),)),
                # Blank line before categories
                (r'<b>\s*NYT', lambda match: '<p></p><b>NYT'),
                # Blank line between articles
                (r'<p><a href', lambda match: '<br /><p><a href'),
                # Remove header from individual articles
                (r'<body class=.printerversion..*?<h1><nyt_headline',
                 lambda match: '<body class="printerversion">\n<h1><nyt_headline'),
                # Remove footer from individual articles
                (r'<nyt_update_bottom.*', lambda match: '</body></html>'),
                # Remove TimesSelect garbage
                (r'<title>.*?TimesSelect', lambda match: 'Downloading of TimesSelect stories is not supported.<!--'),
              ]
            ],
    },

    'bbc' : {
        'url' : 'http://bbcriver.com',
        'title' : 'The BBC',
        'no_stylesheets' : True,
        'preprocess_regexps' :
            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
              [
                # Remove help link and replace it with the title
                (r'<a .*?alt=.Click here for information about this service.*?</a>',
                 lambda match: '<h1>The BBC</h1>\n<p align="right"><b>%s</b></p>'%(time.strftime('%a %d %b %Y', time.localtime()),)),
                # Blank line before categories
                (r'<b>\s*BBC', lambda match: '<p></p><b>BBC'),
                # Remove footer from individual stories
                (r'<div class=.footer.>.*?Published',
                 lambda match: '<p></p><div class="footer">Published'),
                # Add some style info in place of the disabled stylesheet
                (r'<link.*?type=.text/css.*?>',
                 '<style type="text/css">.headline {font-size: x-large;}</style>'),
              ]
            ],
    },
}

for key in profiles.keys():
    if key == 'default':
        continue
    newd = profiles['default'].copy()
    newd.update(profiles[key])
    profiles[key] = newd

def profile_to_command_line_options(profile):
    args = []
    args.append('--max-recursions='+str(profile['max_recursions']))
    args.append('--delay='+str(profile['delay']))
    for i in profile['match_regexps']:
        args.append('--match-regexp="'+i+'"')
    for i in profile['filter_regexps']:
        args.append('--filter-regexp="'+i+'"')
    return args
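Since every profile is merged over 'default' by the loop above, a new site profile only needs the keys that differ. A hypothetical entry, sketched under that assumption (the URL and regexp are illustrative, not a real profile):

import re

# The 'default' profile from above, inlined so this sketch runs standalone.
default = {
    'url' : '', 'title' : '', 'max_recursions' : 1, 'max_files' : 1000,
    'delay' : 0, 'timeout' : 10, 'no_stylesheets' : False,
    'match_regexps' : [], 'filter_regexps' : [],
    'html2lrf_options' : [], 'preprocess_regexps' : [],
}

example = {
    'url'   : 'http://example.com/news',   # illustrative URL
    'title' : 'Example News',
    'preprocess_regexps' : [
        # Drop everything after the footer before html2lrf sees the page
        (re.compile(r'<div class=.footer.>.*', re.IGNORECASE | re.DOTALL),
         lambda match: '</body></html>'),
    ],
}

merged = default.copy()
merged.update(example)     # the same merge the loop above performs
print merged['timeout']    # 10, inherited from the default profile
print merged['title']      # Example News, from the new entry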
@@ -15,7 +15,7 @@
 '''
 Fetch a webpage and its links recursively.
 '''
-import sys, socket, urllib2, os, urlparse, codecs, logging, re, time
+import sys, socket, urllib2, os, urlparse, codecs, logging, re, time, copy
 from urllib import url2pathname
 from httplib import responses
 from optparse import OptionParser
@@ -45,6 +45,11 @@ def save_soup(soup, target):
 class RecursiveFetcher(object):
     LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                 ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
+    #ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
+    #                 (
+    #
+    #                 )
+    #                 )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
 
     def __init__(self, options):
@@ -64,6 +69,14 @@ class RecursiveFetcher(object):
         self.stylemap = {}
         self.current_dir = self.base_dir
         self.files = 0
+        self.preprocess_regexps = []
+        self.download_stylesheets = not options.no_stylesheets
+
+
+    def get_soup(self, src):
+        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
+        nmassage.extend(self.preprocess_regexps)
+        return BeautifulSoup(src, markupMassage=nmassage)
 
     def fetch_url(self, url):
         f = None
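The new get_soup() hook is how a profile's preprocess_regexps reach the raw HTML: BeautifulSoup 3.x runs its markupMassage pairs as re.sub() passes over the source before parsing, so extending MARKUP_MASSAGE makes the profile substitutions run first. A small standalone sketch of the same mechanism (the comment-stripping rule is illustrative, not from the commit):

import re, copy
from BeautifulSoup import BeautifulSoup

# Substitution pairs in the same shape as a profile's preprocess_regexps.
preprocess_regexps = [
    (re.compile(r'<!--.*?-->', re.DOTALL), lambda match: ''),  # illustrative rule
]

nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(preprocess_regexps)

src = '<p>kept</p><!-- stripped before the parser ever sees it -->'
soup = BeautifulSoup(src, markupMassage=nmassage)
print soup   # <p>kept</p>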
@@ -84,7 +97,7 @@
 
     def start_fetch(self, url):
         soup = BeautifulSoup('<a href="'+url+'" />')
-        print 'Working',
+        print 'Downloading',
         res = self.process_links(soup, url, 0, into_dir='')
         print '%s saved to %s'%(url, res)
         return res
@@ -99,9 +112,8 @@
         if self.filter_regexps:
             for f in self.filter_regexps:
                 if f.search(url):
                     return False
-            return True
-        if self.match_regexps:
+        elif self.match_regexps:
             for m in self.match_regexps:
                 if m.search(url):
                     return True
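This hunk changes link-filtering precedence: previously a URL that survived the filter regexps was accepted outright, so --match-regexp was ignored whenever --filter-regexp was given; now the filters are applied first and the match regexps are still consulted, matching the updated help text below. A sketch of the new precedence as a standalone function (the name and the fall-through return values are assumptions, since the tail of the real method lies outside this hunk):

import re

def link_ok(url, filter_regexps, match_regexps):
    # --filter-regexp is applied first: any hit rejects the link outright
    for f in filter_regexps:
        if f.search(url):
            return False
    # otherwise, if match regexps were given, at least one must hit
    if match_regexps:
        for m in match_regexps:
            if m.search(url):
                return True
        return False  # assumed fall-through, not visible in the hunk
    return True       # assumed default when neither kind fires

filters = [re.compile(r'\.mp3\s*$', re.IGNORECASE)]
matches = [re.compile(r'nytimes\.com/2007')]
print link_ok('http://nytimes.com/2007/a.html', filters, matches)  # True
print link_ok('http://nytimes.com/song.mp3', filters, matches)     # False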
@@ -243,10 +255,11 @@
                 try:
                     self.current_dir = linkdiskpath
                     f = self.fetch_url(iurl)
-                    soup = BeautifulSoup(f.read())
+                    soup = self.get_soup(f.read())
                     logger.info('Processing images...')
                     self.process_images(soup, f.geturl())
-                    self.process_stylesheets(soup, f.geturl())
+                    if self.download_stylesheets:
+                        self.process_stylesheets(soup, f.geturl())
+
                     res = os.path.join(linkdiskpath, basename(iurl))
                     self.filemap[nurl] = res
@@ -284,26 +297,36 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
                       default=1, type='int', dest='max_recursions')
     parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files',
                       help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %default')
+    parser.add_option('--delay', default=0, dest='delay', type='int',
+                      help='Minimum interval in seconds between consecutive fetches. Default is %default s')
     parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
                       help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
     parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
-                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --match-regexp is ignored.')
+                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
-    parser.add_option('--delay', default=0, dest='delay', type='int',
-                      help='Minimum interval in seconds between consecutive fetches. Default is %default s')
+    parser.add_option('--dont-download-stylesheets', action='store_true', default=False,
+                      help='Do not download CSS stylesheets.', dest='no_stylesheets')
     parser.add_option('--verbose', help='Show detailed output information. Useful for debugging',
                       default=False, action='store_true', dest='verbose')
     return parser
+
+
+def create_fetcher(options):
+    return RecursiveFetcher(options)
+
+def setup_logger(options):
+    level = logging.DEBUG if options.verbose else logging.WARNING
+    setup_cli_handlers(logger, level)
+
 def main(args=sys.argv):
     parser = option_parser()
     options, args = parser.parse_args(args)
     if len(args) != 2:
         parser.print_help()
         return 1
-    level = logging.DEBUG if options.verbose else logging.WARNING
-    setup_cli_handlers(logger, level)
-    fetcher = RecursiveFetcher(options)
+    setup_logger(options)
+    fetcher = create_fetcher(options)
     fetcher.start_fetch(args[1])
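The two new module-level helpers, create_fetcher() and setup_logger(), are the hooks web2lrf's fetch_website() imports from libprs500.web.fetch.simple. Something like the following drives the fetcher programmatically the same way (a sketch under those assumptions: the URL is illustrative, and options.dir is set manually just as fetch_website() does):

import tempfile
from libprs500.web.fetch.simple import option_parser, setup_logger, create_fetcher

# parse_args() receives the full argv-style list; args[1] is the URL
options, args = option_parser().parse_args(['web2disk', 'http://bbcriver.com'])
options.dir = tempfile.mkdtemp(prefix='web2lrf_')  # download target directory
setup_logger(options)

fetcher = create_fetcher(options)
fetcher.preprocess_regexps = []   # or a profile's preprocess_regexps
index = fetcher.start_fetch(args[1])
print 'Index file:', index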