Initial implementation of web2disk
This commit is contained in:
parent aa06fcfb1a
commit 00a50740fa

setup.py (1 line changed)
@@ -28,6 +28,7 @@ entry_points = {
        'markdown = libprs500.ebooks.markdown.markdown:main',\
        'lit2lrf = libprs500.ebooks.lrf.lit.convert_from:main',\
        'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main',\
+       'web2disk = libprs500.web.fetch.simple:main',\
    ],
    'gui_scripts' : [ APPNAME+' = libprs500.gui.main:main']
}
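With this entry point, installing the package makes setuptools generate a web2disk console script. Roughly, such a wrapper amounts to the following sketch (the exact generated stub depends on the setuptools version):

# hypothetical sketch of the generated 'web2disk' console script
import sys
from libprs500.web.fetch.simple import main

if __name__ == '__main__':
    sys.exit(main())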
src/libprs500/__init__.py (the module that simple.py imports setup_cli_handlers from):

@@ -13,12 +13,12 @@
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
''' E-book management software'''
-__version__ = "0.3.68"
+__version__ = "0.3.69"
__docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
__appname__ = 'libprs500'

-import sys, os
+import sys, os, logging
iswindows = 'win32' in sys.platform.lower()
isosx = 'darwin' in sys.platform.lower()
@@ -28,6 +28,23 @@ if iswindows:
    except:
        pass

+def setup_cli_handlers(logger, level):
+    logger.setLevel(level)
+    if level == logging.WARNING:
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
+        handler.setLevel(logging.WARNING)
+    elif level == logging.INFO:
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setFormatter(logging.Formatter())
+        handler.setLevel(logging.INFO)
+    elif level == logging.DEBUG:
+        handler = logging.StreamHandler(sys.stderr)
+        handler.setLevel(logging.DEBUG)
+        handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)s: %(message)s'))
+    logger.addHandler(handler)
+
+
def load_library(name, cdll):
    if iswindows:
        return cdll.LoadLibrary(name)
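A minimal sketch (hypothetical, Python 2 as in this codebase) of how a command-line tool would use the new setup_cli_handlers helper, mirroring main() in simple.py below:

import logging
from libprs500 import setup_cli_handlers

logger = logging.getLogger('libprs500.web.fetch.simple')
verbose = False  # hypothetical flag, normally taken from an option parser
level = logging.DEBUG if verbose else logging.WARNING
setup_cli_handlers(logger, level)
# WARNING gives terse 'LEVEL: message' lines on stdout;
# DEBUG routes '[LEVEL] file:lineno: message' lines to stderr
logger.warning('something looks wrong')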
@@ -26,11 +26,11 @@ from libprs500.ebooks.lrf.pylrs.pylrs import TextBlock, Header, PutObj, \
                        Paragraph, TextStyle, BlockStyle
from libprs500.ebooks.lrf.fonts import FONT_FILE_MAP
from libprs500.ebooks import ConversionError
-from libprs500 import __version__ as VERSION
+from libprs500 import __appname__, __version__, __author__
from libprs500 import iswindows

__docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"


class PRS500_PROFILE(object):
    screen_width = 600
@@ -69,8 +69,8 @@ def font_family(option, opt_str, value, parser):


def option_parser(usage):
-    parser = OptionParser(usage=usage, version='libprs500 '+VERSION,
-                          epilog='Created by Kovid Goyal')
+    parser = OptionParser(usage=usage, version=__appname__+' '+__version__,
+                          epilog='Created by '+__author__)
    metadata = parser.add_option_group('METADATA OPTIONS')
    metadata.add_option('--header', action='store_true', default=False, dest='header',
                        help='Add a header to all the pages with title and author.')
@@ -36,7 +36,8 @@ from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, \
                     Comment, Tag, NavigableString, Declaration, ProcessingInstruction
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
                     TextBlock, ImageBlock, JumpButton, CharButton, Bold, Space, \
-                    Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps
+                    Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
+                    LrsError
from libprs500.ebooks.lrf.pylrs.pylrs import Span as _Span
from libprs500.ebooks.lrf import option_parser, Book, PRS500_PROFILE
from libprs500.ebooks import ConversionError
@@ -584,7 +585,7 @@ class HTMLConverter(object):
            elif self.link_level < self.max_link_levels:
                try: # os.access raises Exceptions in path has null bytes
                    if not os.access(path.encode('utf8', 'replace'), os.R_OK):
-                        raise Exception()
+                        continue
                except Exception:
                    if self.verbose:
                        print "Skipping", link
@@ -859,7 +860,10 @@ class HTMLConverter(object):
            return

        if not self.images.has_key(path):
-            self.images[path] = ImageStream(path)
+            try:
+                self.images[path] = ImageStream(path)
+            except LrsError:
+                return

        im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
                   xsize=width, ysize=height)
@@ -47,6 +47,9 @@ class MetaInformation(object):
        self.category = None
        self.classification = None
        self.publisher = None
+        self.series = None
+        self.series_index = None
+        self.rating = None

    def __str__(self):
        ans = ''
@@ -17,7 +17,7 @@ Backend that implements storage of ebooks in an sqlite database.
"""
import sqlite3 as sqlite
import os, datetime, re
-from zlib import compress, decompress
+from zlib import compressobj, decompress
from stat import ST_SIZE

class Concatenate(object):
@@ -690,6 +690,17 @@ class LibraryDatabase(object):
            self.conn.execute('INSERT INTO books_publishers_link(book, publisher) VALUES (?,?)', (id, aid))
        self.conn.commit()

+    def set_series(self, id, series):
+        self.conn.execute('DELETE FROM books_series_link WHERE book=?',(id,))
+        if series:
+            s = self.conn.execute('SELECT id from series WHERE name=?', (series,)).fetchone()
+            if s:
+                aid = s[0]
+            else:
+                aid = self.conn.execute('INSERT INTO series(name) VALUES (?)', (series,)).lastrowid
+            self.conn.execute('INSERT INTO books_series_link(book, series) VALUES (?,?)', (id, aid))
+        self.conn.commit()
+
    def set_rating(self, id, rating):
        rating = int(rating)
        self.conn.execute('DELETE FROM books_ratings_link WHERE book=?',(id,))
@@ -698,3 +709,28 @@ class LibraryDatabase(object):
            self.conn.execute('INSERT INTO books_ratings_link(book, rating) VALUES (?,?)', (id, rat))
        self.conn.commit()

+    def add_book(self, stream, format, mi, uri=None):
+        if not mi.author:
+            mi.author = 'Unknown'
+        obj = self.conn.execute('INSERT INTO books(title, uri, series_index) VALUES (?, ?, ?)',
+                                (mi.title, uri, mi.series_index))
+        id = obj.lastrowid
+        self.conn.commit()
+        temp = mi.author.split(',')
+        authors = []
+        for a in temp:
+            authors += a.split('&')
+        self.set_authors(id, authors)
+        if mi.publisher:
+            self.set_publisher(id, mi.publisher)
+        if mi.rating:
+            self.set_rating(id, mi.rating)
+        if mi.series:
+            self.set_series(id, mi.series)
+        stream.seek(0, 2)
+        usize = stream.tell()
+        stream.seek(0)
+        self.conn.execute('INSERT INTO data(book, format, uncompressed_size, data) VALUES (?,?,?,?)',
+                          (id, format, usize, compressobj().compress(stream)))
+        self.conn.commit()
src/libprs500/web/__init__.py (new file, 18 lines)

@@ -0,0 +1,18 @@
## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
src/libprs500/web/fetch/__init__.py (new file, 14 lines)

@@ -0,0 +1,14 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
src/libprs500/web/fetch/simple.py (new file, 301 lines)

@@ -0,0 +1,301 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Fetch a webpage and its links recursively.
'''
import sys, socket, urllib2, os, urlparse, codecs, logging, re
from urllib import url2pathname
from httplib import responses
from optparse import OptionParser

from libprs500 import __version__, __appname__, __author__, setup_cli_handlers
from libprs500.ebooks.BeautifulSoup import BeautifulSoup

logger = logging.getLogger('libprs500.web.fetch.simple')

class FetchError(Exception):
    pass

def fetch_url(url):
    f = None
    logger.info('Fetching %s', url)
    try:
        f = urllib2.urlopen(url)
    except urllib2.URLError, err:
        if hasattr(err, 'code') and responses.has_key(err.code):
            raise FetchError, responses[err.code]
        raise err
    return f

def basename(url):
    parts = urlparse.urlsplit(url)
    path = url2pathname(parts.path)
    res = os.path.basename(path)
    if not os.path.splitext(res)[1]:
        return 'index.html'
    return res

def save_soup(soup, target):
    f = codecs.open(target, 'w', 'utf8')
    f.write(unicode(soup))
    f.close()

class RecursiveFetcher(object):
    LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                        ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
    CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)

    def __init__(self, options):
        self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
        if not os.path.exists(self.base_dir):
            os.makedirs(self.base_dir)
        self.default_timeout = socket.getdefaulttimeout()
        socket.setdefaulttimeout(options.timeout)
        self.max_recursions = options.max_recursions
        self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
        self.filter_regexps = [re.compile(i, re.IGNORECASE) for i in options.filter_regexps]
        self.max_files = options.max_files
        self.filemap = {}
        self.imagemap = {}
        self.stylemap = {}
        self.current_dir = self.base_dir
        self.files = 0

    def start_fetch(self, url):
        soup = BeautifulSoup('<a href="'+url+'" />')
        print 'Working',
        res = self.process_links(soup, url, 0, into_dir='')
        print '%s saved to %s'%(url, res)
        return res

    def is_link_ok(self, url):
        for i in self.__class__.LINK_FILTER:
            if i.search(url):
                return False
        return True

    def is_link_wanted(self, url):
        if self.filter_regexps:
            for f in self.filter_regexps:
                if f.search(url):
                    return False
            return True
        elif self.match_regexps:
            for m in self.match_regexps:
                if m.search(url):
                    return True
            return False
        return True

    def process_stylesheets(self, soup, baseurl):
        diskpath = os.path.join(self.current_dir, 'stylesheets')
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        c = 0
        for tag in soup.findAll(lambda tag: tag.name.lower()in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css'):
            if tag.has_key('href'):
                iurl = tag['href']
                if not urlparse.urlsplit(iurl).scheme:
                    iurl = urlparse.urljoin(baseurl, iurl, False)
                if self.stylemap.has_key(iurl):
                    tag['href'] = self.stylemap[iurl]
                    continue
                try:
                    f = fetch_url(iurl)
                except Exception, err:
                    logger.warning('Could not fetch stylesheet %s', iurl)
                    logger.debug('Error: %s', str(err), exc_info=True)
                    continue
                c += 1
                stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                self.stylemap[iurl] = stylepath
                open(stylepath, 'wb').write(f.read())
                tag['href'] = stylepath
            else:
                for ns in tag.findAll(text=True):
                    src = str(ns)
                    m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                    if m:
                        iurl = m.group(1)
                        if not urlparse.urlsplit(iurl).scheme:
                            iurl = urlparse.urljoin(baseurl, iurl, False)
                        if self.stylemap.has_key(iurl):
                            ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                            continue
                        try:
                            f = fetch_url(iurl)
                        except Exception, err:
                            logger.warning('Could not fetch stylesheet %s', iurl)
                            logger.debug('Error: %s', str(err), exc_info=True)
                            continue
                        c += 1
                        stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                        self.stylemap[iurl] = stylepath
                        open(stylepath, 'wb').write(f.read())
                        ns.replaceWith(src.replace(m.group(1), stylepath))

    def process_images(self, soup, baseurl):
        diskpath = os.path.join(self.current_dir, 'images')
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        c = 0
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            iurl, ext = tag['src'], os.path.splitext(tag['src'])[1]
            if not ext:
                logger.info('Skipping extensionless image %s', iurl)
                continue
            if not urlparse.urlsplit(iurl).scheme:
                iurl = urlparse.urljoin(baseurl, iurl, False)
            if self.imagemap.has_key(iurl):
                tag['src'] = self.imagemap[iurl]
                continue
            try:
                f = fetch_url(iurl)
            except Exception, err:
                logger.warning('Could not fetch image %s', iurl)
                logger.debug('Error: %s', str(err), exc_info=True)
                continue
            c += 1
            imgpath = os.path.join(diskpath, 'img'+str(c)+ext)
            self.imagemap[iurl] = imgpath
            open(imgpath, 'wb').write(f.read())
            tag['src'] = imgpath

    def absurl(self, baseurl, tag, key):
        iurl = tag[key]
        parts = urlparse.urlsplit(iurl)
        if not parts.netloc and not parts.path:
            return None
        if not parts.scheme:
            iurl = urlparse.urljoin(baseurl, iurl, False)
        if not self.is_link_ok(iurl):
            logger.info('Skipping invalid link: %s', iurl)
            return None
        return iurl

    def normurl(self, url):
        parts = list(urlparse.urlsplit(url))
        parts[4] = ''
        return urlparse.urlunsplit(parts)

    def localize_link(self, tag, key, path):
        parts = urlparse.urlsplit(tag[key])
        suffix = '#'+parts.fragment if parts.fragment else ''
        tag[key] = path+suffix

    def process_return_links(self, soup, baseurl):
        for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
            iurl = self.absurl(baseurl, tag, 'href')
            if not iurl:
                continue
            nurl = self.normurl(iurl)
            if self.filemap.has_key(nurl):
                self.localize_link(tag, 'href', self.filemap[nurl])

    def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
        c, res = 0, ''
        diskpath = os.path.join(self.current_dir, into_dir)
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        prev_dir = self.current_dir
        try:
            self.current_dir = diskpath
            for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
                print '.',
                sys.stdout.flush()
                iurl = self.absurl(baseurl, tag, 'href')
                if not iurl:
                    continue
                nurl = self.normurl(iurl)
                if self.filemap.has_key(nurl):
                    self.localize_link(tag, 'href', self.filemap[nurl])
                    continue
                if self.files > self.max_files:
                    return res
                c += 1
                linkdir = 'link'+str(c) if into_dir else ''
                linkdiskpath = os.path.join(diskpath, linkdir)
                if not os.path.exists(linkdiskpath):
                    os.mkdir(linkdiskpath)
                try:
                    self.current_dir = linkdiskpath
                    f = fetch_url(iurl)
                    soup = BeautifulSoup(f.read())
                    logger.info('Processing images...')
                    self.process_images(soup, f.geturl())
                    self.process_stylesheets(soup, f.geturl())

                    res = os.path.join(linkdiskpath, basename(iurl))
                    self.filemap[nurl] = res
                    if recursion_level < self.max_recursions:
                        logger.info('Processing links...')
                        self.process_links(soup, iurl, recursion_level+1)
                    else:
                        self.process_return_links(soup, iurl)
                        logger.info('Recursion limit reached. Skipping %s', iurl)

                    save_soup(soup, res)
                    self.localize_link(tag, 'href', res)
                except Exception, err:
                    logger.warning('Could not fetch link %s', iurl)
                    logger.debug('Error: %s', str(err), exc_info=True)
                finally:
                    self.current_dir = diskpath
                    self.files += 1
        finally:
            self.current_dir = prev_dir
            print
        return res

    def __del__(self):
        socket.setdefaulttimeout(self.default_timeout)

def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com'):
    parser = OptionParser(usage=usage, version=__appname__+' '+__version__,
                          epilog='Created by ' + __author__)
    parser.add_option('-d', '--base-dir', help='Base directory into which URL is saved. Default is %default',
                      default='.', type='string', dest='dir')
    parser.add_option('-t', '--timeout', help='Timeout in seconds to wait for a response from the server. Default: %default s',
                      default=10, type='int', dest='timeout')
    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %default',
                      default=1, type='int', dest='max_recursions')
    parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files',
                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %default')
    parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
                      help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --match-regexp is ignored.')
    parser.add_option('--verbose', help='Show detailed output information. Useful for debugging',
                      default=False, action='store_true', dest='verbose')
    return parser

def main(args=sys.argv):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        return 1
    level = logging.DEBUG if options.verbose else logging.WARNING
    setup_cli_handlers(logger, level)

    fetcher = RecursiveFetcher(options)
    fetcher.start_fetch(args[1])


if __name__ == '__main__':
    sys.exit(main())
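Once installed, the script above is run as web2disk URL (for example, web2disk -r 2 -d ~/pages http://example.com, with an illustrative URL and path). A minimal sketch of driving the fetcher programmatically with the same defaults, assuming only the names defined in this new file:

from libprs500.web.fetch.simple import option_parser, RecursiveFetcher

# mimic main(): the first list element plays the role of the program name
options, args = option_parser().parse_args(['web2disk', 'http://example.com'])
fetcher = RecursiveFetcher(options)
saved = fetcher.start_fetch(args[1])   # path of the saved top-level page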
@@ -237,7 +237,8 @@ SectionEnd
class WixInstaller(object):
    '''
    Make a .msi installer. Can't get the driver installation to play well with
-    an existing installation of the connect USB driver.
+    an existing installation of the connect USB driver. Pick this up again when
+    libusb1.dll is released based on winusb.
    '''
    TEMPLATE=\
r'''<?xml version='1.0' encoding='windows-1252'?>