Initial implementation of web2lrf

Kovid Goyal 2007-07-13 23:20:41 +00:00
parent a601c9c11e
commit 8f38a29165
6 changed files with 271 additions and 16 deletions

View File

@@ -13,7 +13,7 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ''' E-book management software'''
-__version__ = "0.3.69"
+__version__ = "0.3.70"
 __docformat__ = "epytext"
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__ = 'libprs500'

View File

@@ -74,7 +74,7 @@ def option_parser(usage):
     metadata = parser.add_option_group('METADATA OPTIONS')
     metadata.add_option('--header', action='store_true', default=False, dest='header',
                         help='Add a header to all the pages with title and author.')
-    metadata.add_option("-t", "--title", action="store", type="string", \
+    metadata.add_option("-t", "--title", action="store", type="string", default=None, \
                         dest="title", help="Set the title. Default: filename.")
     metadata.add_option("-a", "--author", action="store", type="string", \
                         dest="author", help="Set the author. Default: %default", default='Unknown')

View File

@@ -0,0 +1,14 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

View File

@@ -0,0 +1,119 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Convert known websites into LRF files.'''
import sys, time, tempfile, shutil, os
from urlparse import urlsplit
from libprs500 import __appname__
from libprs500.ebooks.lrf.html.convert_from import option_parser as html_option_parser
from libprs500.ebooks.lrf.html.convert_from import process_file
from libprs500.ebooks.lrf.web.profiles import profiles
from libprs500.web.fetch.simple import setup_logger as web2disk_setup_logger
from libprs500.web.fetch.simple import create_fetcher
available_profiles = profiles.keys()
available_profiles.remove('default')
available_profiles = ' '.join(available_profiles)

def option_parser():
    parser = html_option_parser(usage='''%prog [options] website_profile\n\n'''
                '''%prog downloads a site from the web and converts it '''
                '''into a LRF file for use with the SONY Reader. '''
                '''website_profile is one of '''+available_profiles+\
                ''' If you specify a website_profile of default or do not specify '''
                '''it, you must specify the --url option.'''
                )
    parser.remove_option('-t')
    parser.add_option('-u', '--url', dest='url', default=None,
                      help='The URL to download. You only need to specify this if you are not specifying a website_profile.')
    parser.add_option('-t', '--timeout', help='Timeout in seconds to wait for a response from the server. Default: %default s',
                      default=None, type='int', dest='timeout')
    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %default',
                      default=None, type='int', dest='max_recursions')
    parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files',
                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %default')
    parser.add_option('--delay', default=None, dest='delay', type='int',
                      help='Minimum interval in seconds between consecutive fetches. Default is %default s')
    parser.add_option('--dont-download-stylesheets', action='store_true', default=None,
                      help='Do not download CSS stylesheets.', dest='no_stylesheets')
    parser.add_option('--match-regexp', dest='match_regexps', default=[], action='append',
                      help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
    return parser

def fetch_website(options):
    tdir = tempfile.mkdtemp(prefix=__appname__+'_')
    options.dir = tdir
    web2disk_setup_logger(options)
    fetcher = create_fetcher(options)
    fetcher.preprocess_regexps = options.preprocess_regexps
    return fetcher.start_fetch(options.url), tdir

def create_lrf(htmlfile, options):
    options.author = __appname__
    options.header = True
    if not options.output:
        options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf')))
    process_file(htmlfile, options)

def main(args=sys.argv):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) > 2:
        parser.print_help()
        return 1
    if len(args) == 2:
        if not profiles.has_key(args[1]):
            print >>sys.stderr, 'Unknown profile', args[1]
            print >>sys.stderr, 'Valid profiles:', profiles.keys()
            return 1
    profile = profiles[args[1]] if len(args) == 2 else profiles['default']
    for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
        val = getattr(options, opt)
        if val is None:
            setattr(options, opt, profile[opt])
    if not options.url:
        parser.print_help()
        print >>sys.stderr
        print >>sys.stderr, 'You must specify the --url option or a profile from one of:',
        print >>sys.stderr, available_profiles
        return 1
    if not options.title:
        title = profile['title']
        if not title:
            title = urlsplit(options.url).netloc
        options.title = title + time.strftime(' [%a %d %b %Y]', time.localtime())
    options.match_regexps += profile['match_regexps']
    options.preprocess_regexps = profile['preprocess_regexps']
    options.filter_regexps += profile['filter_regexps']
    htmlfile, tdir = fetch_website(options)
    create_lrf(htmlfile, options)
    shutil.rmtree(tdir)
    return 0

if __name__ == '__main__':
    sys.exit(main())
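
For orientation, a minimal sketch of driving this entry point from Python rather than the shell. The import path is an assumption (the diff does not show file names); the 'nytimes' profile and the argument convention (argv[0] is the program name) are taken from the code above.

# Hypothetical driver for the new tool; the module path below is assumed.
import sys
from libprs500.ebooks.lrf.web.convert_from import main

# Equivalent to running "web2lrf nytimes": the 'nytimes' profile supplies
# url, title and preprocess_regexps, and any option left at None is filled
# in from the profile by main().
sys.exit(main(['web2lrf', 'nytimes']))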

View File

@@ -0,0 +1,99 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Profiles for known websites.'''
import time, re

profiles = {
    'default' : {
        'url'                : '',     # The URL of the website
        'title'              : '',     # The title to use for the LRF file
        'max_recursions'     : 1,      # Number of levels of links to follow
        'max_files'          : 1000,   # Maximum number of files to download
        'delay'              : 0,      # Delay between consecutive downloads
        'timeout'            : 10,     # Timeout for fetching files from server
        'no_stylesheets'     : False,  # Download stylesheets
        'match_regexps'      : [],     # List of regular expressions that determine which links to follow
        'filter_regexps'     : [],     # List of regular expressions that determine which links to ignore
                                       # Only one of match_regexps or filter_regexps should be defined
        'html2lrf_options'   : [],     # List of options to pass to html2lrf
        'preprocess_regexps' : [],     # List of regexp substitution rules to run on the downloaded HTML before running html2lrf
                                       # See the profiles below for examples of these settings.
    },

    'nytimes' : {
        'url'           : 'http://nytimesriver.com',
        'title'         : 'The New York Times',
        'match_regexps' : 'nytimes.com/'+time.strftime('%Y', time.localtime()),
        'preprocess_regexps' :
            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
              [
                # Remove help link and replace by title
                (r'<a .*?alt=.Click here for information about this service.*?</a>',
                 lambda match: '<h1>The New York Times</h1>\n<p align="right"><b>%s</b></p>'%(time.strftime('%a %d %b %Y', time.localtime()),)),
                # Blank line before categories
                (r'<b>\s*NYT', lambda match: '<p></p><b>NYT'),
                # Blank line between articles
                (r'<p><a href', lambda match : '<br /><p><a href'),
                # Remove header on individual articles
                (r'<body class=.printerversion..*?<h1><nyt_headline',
                 lambda match : '<body class="printerversion">\n<h1><nyt_headline'),
                # Remove footer from individual articles
                (r'<nyt_update_bottom.*', lambda match : '</body></html>'),
                # Remove TimesSelect garbage
                (r'<title>.*?TimesSelect', lambda match : 'Downloading of TimesSelect stories is not supported.<!--'),
              ]
            ],
    },

    'bbc' : {
        'url'            : 'http://bbcriver.com',
        'title'          : 'The BBC',
        'no_stylesheets' : True,
        'preprocess_regexps' :
            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
              [
                # Remove help link and replace by title
                (r'<a .*?alt=.Click here for information about this service.*?</a>',
                 lambda match: '<h1>The BBC</h1>\n<p align="right"><b>%s</b></p>'%(time.strftime('%a %d %b %Y', time.localtime()),)),
                # Blank line before categories
                (r'<b>\s*BBC', lambda match: '<p></p><b>BBC'),
                # Remove footer from individual stories
                (r'<div class=.footer.>.*?Published',
                 lambda match : '<p></p><div class="footer">Published'),
                # Add some style info in place of disabled stylesheet
                (r'<link.*?type=.text/css.*?>',
                 '<style type="text/css">.headline {font-size: x-large;}</style>'),
              ]
            ],
    },
}

for key in profiles.keys():
    if key == 'default':
        continue
    newd = profiles['default'].copy()
    newd.update(profiles[key])
    profiles[key] = newd

def profile_to_command_line_options(profile):
    args = []
    args.append('--max-recursions='+str(profile['max_recursions']))
    args.append('--delay='+str(profile['delay']))
    for i in profile['match_regexps']:
        args.append('--match-regexp="'+i+'"')
    for i in profile['filter_regexps']:
        args.append('--filter-regexp="'+i+'"')
    return args
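
To make the default-merging above concrete, here is a small illustration (not part of this commit) of what profile_to_command_line_options() yields for the 'bbc' profile once it has inherited the 'default' entries:

from libprs500.ebooks.lrf.web.profiles import profiles, profile_to_command_line_options

# 'bbc' defines no max_recursions or delay of its own, so it picks up the
# defaults (1 and 0); it also defines no match/filter regexps, so no
# --match-regexp/--filter-regexp flags are emitted.
print profile_to_command_line_options(profiles['bbc'])
# ['--max-recursions=1', '--delay=0']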

View File

@@ -15,7 +15,7 @@
 '''
 Fetch a webpage and its links recursively.
 '''
-import sys, socket, urllib2, os, urlparse, codecs, logging, re, time
+import sys, socket, urllib2, os, urlparse, codecs, logging, re, time, copy
 from urllib import url2pathname
 from httplib import responses
 from optparse import OptionParser
@@ -45,6 +45,11 @@ def save_soup(soup, target):
 class RecursiveFetcher(object):
     LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                 ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
+    #ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
+    #                (
+    #
+    #                )
+    #                )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)

     def __init__(self, options):
@@ -64,6 +69,14 @@ class RecursiveFetcher(object):
         self.stylemap = {}
         self.current_dir = self.base_dir
         self.files = 0
+        self.preprocess_regexps = []
+        self.download_stylesheets = not options.no_stylesheets
+
+    def get_soup(self, src):
+        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
+        nmassage.extend(self.preprocess_regexps)
+        return BeautifulSoup(src, markupMassage=nmassage)
+

     def fetch_url(self, url):
         f = None
@@ -84,7 +97,7 @@ class RecursiveFetcher(object):

     def start_fetch(self, url):
         soup = BeautifulSoup('<a href="'+url+'" />')
-        print 'Working',
+        print 'Downloading',
         res = self.process_links(soup, url, 0, into_dir='')
         print '%s saved to %s'%(url, res)
         return res
@@ -100,8 +113,7 @@ class RecursiveFetcher(object):
             for f in self.filter_regexps:
                 if f.search(url):
                     return False
-            return True
-        elif self.match_regexps:
+        if self.match_regexps:
             for m in self.match_regexps:
                 if m.search(url):
                     return True
@@ -243,10 +255,11 @@ class RecursiveFetcher(object):
                 try:
                     self.current_dir = linkdiskpath
                     f = self.fetch_url(iurl)
-                    soup = BeautifulSoup(f.read())
+                    soup = self.get_soup(f.read())
                     logger.info('Processing images...')
                     self.process_images(soup, f.geturl())
-                    self.process_stylesheets(soup, f.geturl())
+                    if self.download_stylesheets:
+                        self.process_stylesheets(soup, f.geturl())

                     res = os.path.join(linkdiskpath, basename(iurl))
                     self.filemap[nurl] = res
@@ -284,26 +297,36 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
                       default=1, type='int', dest='max_recursions')
     parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files',
                       help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %default')
+    parser.add_option('--delay', default=0, dest='delay', type='int',
+                      help='Minimum interval in seconds between consecutive fetches. Default is %default s')
     parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
                       help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
     parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
-                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --match-regexp is ignored.')
-    parser.add_option('--delay', default=0, dest='delay', type='int',
-                      help='Minimum interval in seconds between consecutive fetches. Default is %default s')
+                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
+    parser.add_option('--dont-download-stylesheets', action='store_true', default=False,
+                      help='Do not download CSS stylesheets.', dest='no_stylesheets')
     parser.add_option('--verbose', help='Show detailed output information. Useful for debugging',
                       default=False, action='store_true', dest='verbose')
     return parser

+def create_fetcher(options):
+    return RecursiveFetcher(options)
+
+def setup_logger(options):
+    level = logging.DEBUG if options.verbose else logging.WARNING
+    setup_cli_handlers(logger, level)
+
 def main(args=sys.argv):
     parser = option_parser()
     options, args = parser.parse_args(args)
     if len(args) != 2:
         parser.print_help()
         return 1
-    level = logging.DEBUG if options.verbose else logging.WARNING
-    setup_cli_handlers(logger, level)
-    fetcher = RecursiveFetcher(options)
+    setup_logger(options)
+    fetcher = create_fetcher(options)
     fetcher.start_fetch(args[1])
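
The two new helpers make the fetcher usable as a library as well as a command-line script. A rough sketch (option values assumed) of driving it the same way web2lrf's fetch_website() above does:

from libprs500.web.fetch.simple import option_parser, setup_logger, create_fetcher

# Parse web2disk-style arguments; element 0 is the program name, element 1 the URL.
options, args = option_parser().parse_args(['web2disk', 'http://example.com'])
options.dir = '/tmp/fetch'            # output directory (assumed path), set the way web2lrf sets it
setup_logger(options)                 # DEBUG when --verbose is given, WARNING otherwise
fetcher = create_fetcher(options)     # thin wrapper around RecursiveFetcher(options)
fetcher.preprocess_regexps = []       # optional regexp substitutions, applied in get_soup()
index = fetcher.start_fetch(args[1])  # path of the saved start page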