From 8f38a29165be6c84b2c3b98173af1d4f61c44396 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 13 Jul 2007 23:20:41 +0000
Subject: [PATCH] Initial implementation of web2lrf

---
 src/libprs500/__init__.py                    |   2 +-
 src/libprs500/ebooks/lrf/__init__.py         |   2 +-
 src/libprs500/ebooks/lrf/web/__init__.py     |  14 +++
 src/libprs500/ebooks/lrf/web/convert_from.py | 119 +++++++++++++++++++
 src/libprs500/ebooks/lrf/web/profiles.py     |  99 +++++++++++++++
 src/libprs500/web/fetch/simple.py            |  51 +++++---
 6 files changed, 271 insertions(+), 16 deletions(-)
 create mode 100644 src/libprs500/ebooks/lrf/web/__init__.py
 create mode 100644 src/libprs500/ebooks/lrf/web/convert_from.py
 create mode 100644 src/libprs500/ebooks/lrf/web/profiles.py

diff --git a/src/libprs500/__init__.py b/src/libprs500/__init__.py
index 745d1aec19..1b2963a5fa 100644
--- a/src/libprs500/__init__.py
+++ b/src/libprs500/__init__.py
@@ -13,7 +13,7 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ''' E-book management software'''
-__version__ = "0.3.69"
+__version__ = "0.3.70"
 __docformat__ = "epytext"
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__ = 'libprs500'
diff --git a/src/libprs500/ebooks/lrf/__init__.py b/src/libprs500/ebooks/lrf/__init__.py
index ccd7212f65..1022dcf9ba 100644
--- a/src/libprs500/ebooks/lrf/__init__.py
+++ b/src/libprs500/ebooks/lrf/__init__.py
@@ -74,7 +74,7 @@ def option_parser(usage):
     metadata = parser.add_option_group('METADATA OPTIONS')
     metadata.add_option('--header', action='store_true', default=False, dest='header',
                         help='Add a header to all the pages with title and author.')
-    metadata.add_option("-t", "--title", action="store", type="string", \
+    metadata.add_option("-t", "--title", action="store", type="string", default=None,\
                         dest="title", help="Set the title. Default: filename.")
     metadata.add_option("-a", "--author", action="store", type="string", \
                         dest="author", help="Set the author. Default: %default", default='Unknown')
diff --git a/src/libprs500/ebooks/lrf/web/__init__.py b/src/libprs500/ebooks/lrf/web/__init__.py
new file mode 100644
index 0000000000..aaf49de99e
--- /dev/null
+++ b/src/libprs500/ebooks/lrf/web/__init__.py
@@ -0,0 +1,14 @@
+## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
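
The default=None added to --title above is what lets web2lrf tell whether the user actually supplied a title. A rough sketch of the fallback this enables, mirroring the logic in main() of convert_from.py below; default_title is a hypothetical helper, not part of the patch:

    # Illustrative only: an unset --title is built from the profile title,
    # or the site's host name, plus the current date.
    import time
    from urlparse import urlsplit

    def default_title(options, profile):
        title = profile['title'] or urlsplit(options.url).netloc
        return title + time.strftime(' [%a %d %b %Y]', time.localtime())
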
diff --git a/src/libprs500/ebooks/lrf/web/convert_from.py b/src/libprs500/ebooks/lrf/web/convert_from.py
new file mode 100644
index 0000000000..23a767599e
--- /dev/null
+++ b/src/libprs500/ebooks/lrf/web/convert_from.py
@@ -0,0 +1,119 @@
+## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''Convert known websites into LRF files.'''
+
+import sys, time, tempfile, shutil, os
+from urlparse import urlsplit
+
+from libprs500 import __appname__
+from libprs500.ebooks.lrf.html.convert_from import option_parser as html_option_parser
+from libprs500.ebooks.lrf.html.convert_from import process_file
+from libprs500.ebooks.lrf.web.profiles import profiles
+from libprs500.web.fetch.simple import setup_logger as web2disk_setup_logger
+from libprs500.web.fetch.simple import create_fetcher
+
+available_profiles = profiles.keys()
+available_profiles.remove('default')
+available_profiles = ' '.join(available_profiles)
+
+def option_parser():
+    parser = html_option_parser(usage='''%prog [options] website_profile\n\n'''
+                '''%prog downloads a site from the web and converts it '''
+                '''into a LRF file for use with the SONY Reader. '''
+                '''website_profile is one of '''+available_profiles+\
+                '''. If you specify a website_profile of default or do not specify '''
+                '''it, you must specify the --url option.'''
+                )
+
+    parser.remove_option('-t')
+
+    parser.add_option('-u', '--url', dest='url', default=None,
+                      help='The URL to download. You only need to specify this if you are not specifying a website_profile.')
+
+    parser.add_option('-t', '--timeout', help='Timeout in seconds to wait for a response from the server. Default: %default s',
+                      default=None, type='int', dest='timeout')
+    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %default',
+                      default=None, type='int', dest='max_recursions')
+    parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files',
+                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %default')
+    parser.add_option('--delay', default=None, dest='delay', type='int',
+                      help='Minimum interval in seconds between consecutive fetches. Default is %default s')
+    parser.add_option('--dont-download-stylesheets', action='store_true', default=None,
+                      help='Do not download CSS stylesheets.', dest='no_stylesheets')
+
+    parser.add_option('--match-regexp', dest='match_regexps', default=[], action='append',
+                      help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
+    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
+                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
+    return parser
+
+def fetch_website(options):
+    tdir = tempfile.mkdtemp(prefix=__appname__+'_' )
+    options.dir = tdir
+    web2disk_setup_logger(options)
+    fetcher = create_fetcher(options)
+    fetcher.preprocess_regexps = options.preprocess_regexps
+    return fetcher.start_fetch(options.url), tdir
+
+def create_lrf(htmlfile, options):
+    options.author = __appname__
+    options.header = True
+    if not options.output:
+        options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf')))
+    process_file(htmlfile, options)
+
+def main(args=sys.argv):
+    parser = option_parser()
+    options, args = parser.parse_args(args)
+    if len(args) > 2:
+        parser.print_help()
+        return 1
+    if len(args) == 2:
+        if not profiles.has_key(args[1]):
+            print >>sys.stderr, 'Unknown profile', args[1]
+            print >>sys.stderr, 'Valid profiles:', profiles.keys()
+            return 1
+    profile = profiles[args[1]] if len(args) == 2 else profiles['default']
+
+    for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
+        val = getattr(options, opt)
+        if val is None:
+            setattr(options, opt, profile[opt])
+
+    if not options.url:
+        parser.print_help()
+        print >>sys.stderr
+        print >>sys.stderr, 'You must specify the --url option or a profile from one of:',
+        print >>sys.stderr, available_profiles
+        return 1
+
+    if not options.title:
+        title = profile['title']
+        if not title:
+            title = urlsplit(options.url).netloc
+        options.title = title + time.strftime(' [%a %d %b %Y]', time.localtime())
+
+    options.match_regexps += profile['match_regexps']
+    options.preprocess_regexps = profile['preprocess_regexps']
+    options.filter_regexps += profile['filter_regexps']
+
+    htmlfile, tdir = fetch_website(options)
+    create_lrf(htmlfile, options)
+    shutil.rmtree(tdir)
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
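
convert_from.py is the whole web2lrf driver: it merges the chosen profile into the command-line options, fetches the site into a temporary directory with the web2disk fetcher, and hands the downloaded HTML to html2lrf's process_file. A minimal sketch of driving it programmatically, assuming the package is importable and using the bbc profile defined in profiles.py below; the option value is only an example:

    # Sketch only: main() expects a sys.argv-style list, so element 0 is the
    # program name and element 1 is the website_profile.
    from libprs500.ebooks.lrf.web.convert_from import main
    main(['web2lrf', '--max-recursions', '2', 'bbc'])  # like running the web2lrf script with: --max-recursions 2 bbc
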
diff --git a/src/libprs500/ebooks/lrf/web/profiles.py b/src/libprs500/ebooks/lrf/web/profiles.py
new file mode 100644
index 0000000000..4b363857cc
--- /dev/null
+++ b/src/libprs500/ebooks/lrf/web/profiles.py
@@ -0,0 +1,99 @@
+## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''Profiles for known websites.'''
+
+import time, re
+
+profiles = {
+    'default' : {
+        'url'                : '',     # The URL of the website
+        'title'              : '',     # The title to use for the LRF file
+        'max_recursions'     : 1,      # Number of levels of links to follow
+        'max_files'          : 1000,   # Maximum number of files to download
+        'delay'              : 0,      # Delay between consecutive downloads
+        'timeout'            : 10,     # Timeout for fetching files from server
+        'no_stylesheets'     : False,  # Download stylesheets
+        'match_regexps'      : [],     # List of regular expressions that determines which links to follow
+        'filter_regexps'     : [],     # List of regular expressions that determines which links to ignore
+                                       # Only one of match_regexps or filter_regexps should be defined
+        'html2lrf_options'   : [],     # List of options to pass to html2lrf
+        'preprocess_regexps' : [],     # List of regexp substitution rules to run on the downloaded HTML before running html2lrf
+                                       # See the profiles below for examples of these settings.
+    },
+
+    'nytimes' : {
+        'url'                : 'http://nytimesriver.com',
+        'title'              : 'The New York Times',
+        'match_regexps'      : ['nytimes.com/'+time.strftime('%Y', time.localtime())],
+        'preprocess_regexps' :
+            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+              [
+                # Remove help link and replace by title
+                (r'<a .*?alt=.Click here for information about this service.*?</a>',
+                 lambda match: '<h1>The New York Times</h1>\n<p align="right"><b>%s</b></p>'%(time.strftime('%a %d %b %Y', time.localtime()),)),
+                # Blank line before categories
+                (r'<b>\s*NYT', lambda match: '<p></p><b>NYT'),
+                # Blank line between articles
+                (r'<hr>', '<p></p>\n<hr>'),
+                # Remove TimesSelect garbage
+                (r'.*?TimesSelect', lambda match : 'Downloading of TimesSelect stories is not supported.<!--'),
+              ]
+            ],
+    },
+
+    'bbc' : {
+        'url'                : 'http://bbcriver.com',
+        'title'              : 'The BBC',
+        'no_stylesheets'     : True,
+        'preprocess_regexps' :
+            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+              [
+                # Remove help link and replace by title
+                (r'<a .*?alt=.Click here for information about this service.*?</a>',
+                 lambda match: '<h1>The BBC</h1>\n<p align="right"><b>%s</b></p>'%(time.strftime('%a %d %b %Y', time.localtime()),)),
+                # Blank line before categories
+                (r'<b>\s*BBC', lambda match: '<p></p><b>BBC'),
+                # Remove footer from individual stories
+                (r'<div class=.footer.>.*?Published',
+                 lambda match : '<p></p><div class="footer">Published'),
+                # Add some style info in place of disabled stylesheet
+                (r'<link.*?type=.text/css.*?>',
+                 '<style type="text/css">.headline {font-size: x-large;}</style>'),
+              ]
+            ],
+    },
+}
+
+for key in profiles.keys():
+    if key == 'default':
+        continue
+    newd = profiles['default'].copy()
+    newd.update(profiles[key])
+    profiles[key] = newd
+
+def profile_to_command_line_options(profile):
+    args = []
+    args.append('--max-recursions='+str(profile['max_recursions']))
+    args.append('--delay='+str(profile['delay']))
+    for i in profile['match_regexps']:
+        args.append('--match-regexp="'+i+'"')
+    for i in profile['filter_regexps']:
+        args.append('--filter-regexp="'+i+'"')
+    return args
\ No newline at end of file
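
profiles.py is the extension point for new sites: every profile other than 'default' inherits the default settings through the update loop above, so a new site only needs to override what differs. A hypothetical entry (not part of the patch; the site and regexp are made up) would look like:

    'example_news' : {
        'url'            : 'http://news.example.com',
        'title'          : 'Example News',
        'max_recursions' : 2,
        'filter_regexps' : [r'/advertising/'],
    },

After the merge loop, profile_to_command_line_options(profiles['example_news']) returns ['--max-recursions=2', '--delay=0', '--filter-regexp="/advertising/"']; keys the entry does not set, such as max_files and timeout, come from 'default'.
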
diff --git a/src/libprs500/web/fetch/simple.py b/src/libprs500/web/fetch/simple.py
index 9199fb198a..b1e6e37867 100644
--- a/src/libprs500/web/fetch/simple.py
+++ b/src/libprs500/web/fetch/simple.py
@@ -15,7 +15,7 @@
 '''
 Fetch a webpage and its links recursively.
 '''
-import sys, socket, urllib2, os, urlparse, codecs, logging, re, time
+import sys, socket, urllib2, os, urlparse, codecs, logging, re, time, copy
 from urllib import url2pathname
 from httplib import responses
 from optparse import OptionParser
@@ -45,6 +45,11 @@ def save_soup(soup, target):
 
 class RecursiveFetcher(object):
     LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
+    #ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
+    #                  (
+    #
+    #                  )
+    #                 )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
 
     def __init__(self, options):
@@ -64,6 +69,14 @@ class RecursiveFetcher(object):
         self.stylemap = {}
         self.current_dir = self.base_dir
         self.files = 0
+        self.preprocess_regexps = []
+        self.download_stylesheets = not options.no_stylesheets
+
+
+    def get_soup(self, src):
+        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
+        nmassage.extend(self.preprocess_regexps)
+        return BeautifulSoup(src, markupMassage=nmassage)
 
     def fetch_url(self, url):
         f = None
@@ -84,7 +97,7 @@ class RecursiveFetcher(object):
 
     def start_fetch(self, url):
         soup = BeautifulSoup('<a href="'+url+'" />')
-        print 'Working',
+        print 'Downloading',
         res = self.process_links(soup, url, 0, into_dir='')
         print '%s saved to %s'%(url, res)
         return res
@@ -99,9 +112,8 @@ class RecursiveFetcher(object):
         if self.filter_regexps:
             for f in self.filter_regexps:
                 if f.search(url):
-                    return False
-            return True
-        elif self.match_regexps:
+                    return False
+        if self.match_regexps:
             for m in self.match_regexps:
                 if m.search(url):
                     return True
@@ -243,10 +255,11 @@ class RecursiveFetcher(object):
                 try:
                     self.current_dir = linkdiskpath
                     f = self.fetch_url(iurl)
-                    soup = BeautifulSoup(f.read())
+                    soup = self.get_soup(f.read())
                     logger.info('Processing images...')
                     self.process_images(soup, f.geturl())
-                    self.process_stylesheets(soup, f.geturl())
+                    if self.download_stylesheets:
+                        self.process_stylesheets(soup, f.geturl())
                     res = os.path.join(linkdiskpath, basename(iurl))
                     self.filemap[nurl] = res
 
@@ -284,26 +297,36 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
                       default=1, type='int', dest='max_recursions')
     parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files',
                       help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %default')
+    parser.add_option('--delay', default=0, dest='delay', type='int',
+                      help='Minimum interval in seconds between consecutive fetches. Default is %default s')
     parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
                       help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
     parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
-                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --match-regexp is ignored.')
-    parser.add_option('--delay', default=0, dest='delay', type='int',
-                      help='Minimum interval in seconds between consecutive fetches. Default is %default s')
+                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
+    parser.add_option('--dont-download-stylesheets', action='store_true', default=False,
+                      help='Do not download CSS stylesheets.', dest='no_stylesheets')
+
     parser.add_option('--verbose', help='Show detailed output information. Useful for debugging', default=False, action='store_true', dest='verbose')
     return parser
+
+def create_fetcher(options):
+    return RecursiveFetcher(options)
+
+def setup_logger(options):
+    level = logging.DEBUG if options.verbose else logging.WARNING
+    setup_cli_handlers(logger, level)
+
 
 def main(args=sys.argv):
     parser = option_parser()
     options, args = parser.parse_args(args)
     if len(args) != 2:
         parser.print_help()
         return 1
-    level = logging.DEBUG if options.verbose else logging.WARNING
-    setup_cli_handlers(logger, level)
-
-    fetcher = RecursiveFetcher(options)
+
+    setup_logger(options)
+    fetcher = create_fetcher(options)
     fetcher.start_fetch(args[1])
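
The simple.py changes are what make the fetcher embeddable by web2lrf: create_fetcher() and setup_logger() expose what main() used to do inline, get_soup() runs each downloaded page through preprocess_regexps before BeautifulSoup parses it, and download_stylesheets lets a profile skip CSS. A rough sketch of using these hooks directly, mirroring fetch_website() in convert_from.py above; the directory and regexp are illustrative only:

    import re
    from libprs500.web.fetch.simple import option_parser, setup_logger, create_fetcher

    options, args = option_parser().parse_args(['web2disk', 'http://bbcriver.com'])
    options.dir = '/tmp/bbc'   # illustrative; convert_from.py sets options.dir to a temporary directory
    setup_logger(options)
    fetcher = create_fetcher(options)
    # Substitutions applied to the raw HTML of each page before parsing
    fetcher.preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda match: '')]
    index = fetcher.start_fetch(args[1])   # returns the path of the saved start page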