From 00a50740facc74d914793bca0a6e55b9420e911f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 13 Jul 2007 03:09:33 +0000
Subject: [PATCH] Initial implementation of web2disk

---
 setup.py                                      |   1 +
 src/libprs500/__init__.py                     |  21 +-
 src/libprs500/ebooks/lrf/__init__.py          |   8 +-
 src/libprs500/ebooks/lrf/html/convert_from.py |  10 +-
 src/libprs500/ebooks/metadata/__init__.py     |   3 +
 src/libprs500/library/database.py             |  38 ++-
 src/libprs500/web/__init__.py                 |  18 ++
 src/libprs500/web/fetch/__init__.py           |  14 +
 src/libprs500/web/fetch/simple.py             | 301 ++++++++++++++++++
 windows_installer.py                          |   3 +-
 10 files changed, 406 insertions(+), 11 deletions(-)
 create mode 100644 src/libprs500/web/__init__.py
 create mode 100644 src/libprs500/web/fetch/__init__.py
 create mode 100644 src/libprs500/web/fetch/simple.py

diff --git a/setup.py b/setup.py
index 0e077a8c91..ade0028ee0 100644
--- a/setup.py
+++ b/setup.py
@@ -28,6 +28,7 @@ entry_points = {
                 'markdown = libprs500.ebooks.markdown.markdown:main',\
                 'lit2lrf = libprs500.ebooks.lrf.lit.convert_from:main',\
                 'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main',\
+                'web2disk = libprs500.web.fetch.simple:main',\
              ],
        'gui_scripts' : [ APPNAME+' = libprs500.gui.main:main']
      }
diff --git a/src/libprs500/__init__.py b/src/libprs500/__init__.py
index 8c8336bc2c..745d1aec19 100644
--- a/src/libprs500/__init__.py
+++ b/src/libprs500/__init__.py
@@ -13,12 +13,12 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ''' E-book management software'''
-__version__ = "0.3.68"
+__version__ = "0.3.69"
 __docformat__ = "epytext"
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__ = 'libprs500'
 
-import sys, os
+import sys, os, logging
 
 iswindows = 'win32' in sys.platform.lower()
 isosx = 'darwin' in sys.platform.lower()
@@ -28,6 +28,23 @@ if iswindows:
     except:
         pass
 
+def setup_cli_handlers(logger, level):
+    logger.setLevel(level)
+    if level == logging.WARNING:
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
+        handler.setLevel(logging.WARNING)
+    elif level == logging.INFO:
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setFormatter(logging.Formatter())
+        handler.setLevel(logging.INFO)
+    elif level == logging.DEBUG:
+        handler = logging.StreamHandler(sys.stderr)
+        handler.setLevel(logging.DEBUG)
+        handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)s: %(message)s'))
+    logger.addHandler(handler)
+
+
 def load_library(name, cdll):
     if iswindows:
         return cdll.LoadLibrary(name)
diff --git a/src/libprs500/ebooks/lrf/__init__.py b/src/libprs500/ebooks/lrf/__init__.py
index c5c33b8355..ccd7212f65 100644
--- a/src/libprs500/ebooks/lrf/__init__.py
+++ b/src/libprs500/ebooks/lrf/__init__.py
@@ -26,11 +26,11 @@
 from libprs500.ebooks.lrf.pylrs.pylrs import TextBlock, Header, PutObj, \
                                              Paragraph, TextStyle, BlockStyle
 from libprs500.ebooks.lrf.fonts import FONT_FILE_MAP
 from libprs500.ebooks import ConversionError
-from libprs500 import __version__ as VERSION
+from libprs500 import __appname__, __version__, __author__
 from libprs500 import iswindows
 __docformat__ = "epytext"
-__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
+
 
 class PRS500_PROFILE(object):
     screen_width = 600
@@ -69,8 +69,8 @@ def font_family(option, opt_str, value, parser):
 
 
 def option_parser(usage):
-    parser = OptionParser(usage=usage, version='libprs500 '+VERSION,
-                          epilog='Created by Kovid Goyal')
+    parser = OptionParser(usage=usage, version=__appname__+' '+__version__,
+                          epilog='Created by '+__author__)
     metadata = parser.add_option_group('METADATA OPTIONS')
     metadata.add_option('--header', action='store_true', default=False, dest='header',
                         help='Add a header to all the pages with title and author.')
diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py
index 5a8c4df66f..a6cbe07715 100644
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -36,7 +36,8 @@ from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, \
                      Comment, Tag, NavigableString, Declaration, ProcessingInstruction
 from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
                      TextBlock, ImageBlock, JumpButton, CharButton, Bold, Space, \
-                     Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps
+                     Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
+                     LrsError
 from libprs500.ebooks.lrf.pylrs.pylrs import Span as _Span
 from libprs500.ebooks.lrf import option_parser, Book, PRS500_PROFILE
 from libprs500.ebooks import ConversionError
@@ -584,7 +585,7 @@ class HTMLConverter(object):
             elif self.link_level < self.max_link_levels:
                 try: # os.access raises Exceptions in path has null bytes
                     if not os.access(path.encode('utf8', 'replace'), os.R_OK):
-                        raise Exception()
+                        continue
                 except Exception:
                     if self.verbose:
                         print "Skipping", link
@@ -859,7 +860,10 @@ class HTMLConverter(object):
                 return
 
         if not self.images.has_key(path):
-            self.images[path] = ImageStream(path)
+            try:
+                self.images[path] = ImageStream(path)
+            except LrsError:
+                return
         im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
                    xsize=width, ysize=height)
 
diff --git a/src/libprs500/ebooks/metadata/__init__.py b/src/libprs500/ebooks/metadata/__init__.py
index 5f31a481e5..009b263455 100644
--- a/src/libprs500/ebooks/metadata/__init__.py
+++ b/src/libprs500/ebooks/metadata/__init__.py
@@ -47,6 +47,9 @@ class MetaInformation(object):
         self.category = None
         self.classification = None
         self.publisher = None
+        self.series = None
+        self.series_index = None
+        self.rating = None
 
     def __str__(self):
         ans = ''
diff --git a/src/libprs500/library/database.py b/src/libprs500/library/database.py
index 358ca6d895..b3d5317c8a 100644
--- a/src/libprs500/library/database.py
+++ b/src/libprs500/library/database.py
@@ -17,7 +17,7 @@ Backend that implements storage of ebooks in an sqlite database.
 """
 import sqlite3 as sqlite
 import os, datetime, re
-from zlib import compress, decompress
+from zlib import compressobj, decompress
 from stat import ST_SIZE
 
 class Concatenate(object):
@@ -689,6 +689,17 @@ class LibraryDatabase(object):
                 aid = self.conn.execute('INSERT INTO publishers(name) VALUES (?)', (publisher,)).lastrowid
             self.conn.execute('INSERT INTO books_publishers_link(book, publisher) VALUES (?,?)', (id, aid))
         self.conn.commit()
+
+    def set_series(self, id, series):
+        self.conn.execute('DELETE FROM books_series_link WHERE book=?',(id,))
+        if series:
+            s = self.conn.execute('SELECT id from series WHERE name=?', (series,)).fetchone()
+            if s:
+                aid = s[0]
+            else:
+                aid = self.conn.execute('INSERT INTO series(name) VALUES (?)', (series,)).lastrowid
+            self.conn.execute('INSERT INTO books_series_link(book, series) VALUES (?,?)', (id, aid))
+        self.conn.commit()
 
     def set_rating(self, id, rating):
         rating = int(rating)
@@ -697,4 +708,29 @@ class LibraryDatabase(object):
         rat = rat[0] if rat else self.conn.execute('INSERT INTO ratings(rating) VALUES (?)', (rating,)).lastrowid
         self.conn.execute('INSERT INTO books_ratings_link(book, rating) VALUES (?,?)', (id, rat))
         self.conn.commit()
+
+    def add_book(self, stream, format, mi, uri=None):
+        if not mi.author:
+            mi.author = 'Unknown'
+        obj = self.conn.execute('INSERT INTO books(title, uri, series_index) VALUES (?, ?, ?)',
+                                (mi.title, uri, mi.series_index))
+        id = obj.lastrowid
+        self.conn.commit()
+        temp = mi.author.split(',')
+        authors = []
+        for a in temp:
+            authors += a.split('&')
+        self.set_authors(id, authors)
+        if mi.publisher:
+            self.set_publisher(id, mi.publisher)
+        if mi.rating:
+            self.set_rating(id, mi.rating)
+        if mi.series:
+            self.set_series(id, mi.series)
+        stream.seek(0, 2)
+        usize = stream.tell()
+        stream.seek(0)
+        self.conn.execute('INSERT INTO data(book, format, uncompressed_size, data) VALUES (?,?,?,?)',
+                          (id, format, usize, compressobj().compress(stream)))
+        self.conn.commit()
 
diff --git a/src/libprs500/web/__init__.py b/src/libprs500/web/__init__.py
new file mode 100644
index 0000000000..2d46011db7
--- /dev/null
+++ b/src/libprs500/web/__init__.py
@@ -0,0 +1,18 @@
+## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+
+
+    
\ No newline at end of file
diff --git a/src/libprs500/web/fetch/__init__.py b/src/libprs500/web/fetch/__init__.py
new file mode 100644
index 0000000000..aaf49de99e
--- /dev/null
+++ b/src/libprs500/web/fetch/__init__.py
@@ -0,0 +1,14 @@
+## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
diff --git a/src/libprs500/web/fetch/simple.py b/src/libprs500/web/fetch/simple.py
new file mode 100644
index 0000000000..b1a0912f16
--- /dev/null
+++ b/src/libprs500/web/fetch/simple.py
@@ -0,0 +1,301 @@
+## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Fetch a webpage and its links recursively.
+'''
+import sys, socket, urllib2, os, urlparse, codecs, logging, re
+from urllib import url2pathname
+from httplib import responses
+from optparse import OptionParser
+
+from libprs500 import __version__, __appname__, __author__, setup_cli_handlers
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+
+logger = logging.getLogger('libprs500.web.fetch.simple')
+
+class FetchError(Exception):
+    pass
+
+def fetch_url(url):
+    f = None
+    logger.info('Fetching %s', url)
+    try:
+        f = urllib2.urlopen(url)
+    except urllib2.URLError, err:
+        if hasattr(err, 'code') and responses.has_key(err.code):
+            raise FetchError, responses[err.code]
+        raise err
+    return f
+
+def basename(url):
+    parts = urlparse.urlsplit(url)
+    path = url2pathname(parts.path)
+    res = os.path.basename(path)
+    if not os.path.splitext(res)[1]:
+        return 'index.html'
+    return res
+
+def save_soup(soup, target):
+    f = codecs.open(target, 'w', 'utf8')
+    f.write(unicode(soup))
+    f.close()
+
+
+class RecursiveFetcher(object):
+    LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
+                ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
+    CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
+
+    def __init__(self, options):
+        self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
+        if not os.path.exists(self.base_dir):
+            os.makedirs(self.base_dir)
+        self.default_timeout = socket.getdefaulttimeout()
+        socket.setdefaulttimeout(options.timeout)
+        self.max_recursions = options.max_recursions
+        self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
+        self.filter_regexps = [re.compile(i, re.IGNORECASE) for i in options.filter_regexps]
+        self.max_files = options.max_files
+        self.filemap = {}
+        self.imagemap = {}
+        self.stylemap = {}
+        self.current_dir = self.base_dir
+        self.files = 0
+
+    def start_fetch(self, url):
+        soup = BeautifulSoup('<a href="'+url+'" />')
+        print 'Working',
+        res = self.process_links(soup, url, 0, into_dir='')
+        print '%s saved to %s'%(url, res)
+        return res
+
+    def is_link_ok(self, url):
+        for i in self.__class__.LINK_FILTER:
+            if i.search(url):
+                return False
+        return True
+
+    def is_link_wanted(self, url):
+        if self.filter_regexps:
+            for f in self.filter_regexps:
+                if f.search(url):
+                    return False
+            return True
+        elif self.match_regexps:
+            for m in self.match_regexps:
+                if m.search(url):
+                    return True
+            return False
+        return True
+
+    def process_stylesheets(self, soup, baseurl):
+        diskpath = os.path.join(self.current_dir, 'stylesheets')
+        if not os.path.exists(diskpath):
+            os.mkdir(diskpath)
+        c = 0
+        for tag in soup.findAll(lambda tag: tag.name.lower() in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css'):
+            if tag.has_key('href'):
+                iurl = tag['href']
+                if not urlparse.urlsplit(iurl).scheme:
+                    iurl = urlparse.urljoin(baseurl, iurl, False)
+                if self.stylemap.has_key(iurl):
+                    tag['href'] = self.stylemap[iurl]
+                    continue
+                try:
+                    f = fetch_url(iurl)
+                except Exception, err:
+                    logger.warning('Could not fetch stylesheet %s', iurl)
+                    logger.debug('Error: %s', str(err), exc_info=True)
+                    continue
+                c += 1
+                stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
+                self.stylemap[iurl] = stylepath
+                open(stylepath, 'wb').write(f.read())
+                tag['href'] = stylepath
+            else:
+                for ns in tag.findAll(text=True):
+                    src = str(ns)
+                    m = self.__class__.CSS_IMPORT_PATTERN.search(src)
+                    if m:
+                        iurl = m.group(1)
+                        if not urlparse.urlsplit(iurl).scheme:
+                            iurl = urlparse.urljoin(baseurl, iurl, False)
+                        if self.stylemap.has_key(iurl):
+                            ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
+                            continue
+                        try:
+                            f = fetch_url(iurl)
+                        except Exception, err:
+                            logger.warning('Could not fetch stylesheet %s', iurl)
+                            logger.debug('Error: %s', str(err), exc_info=True)
+                            continue
+                        c += 1
+                        stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
+                        self.stylemap[iurl] = stylepath
+                        open(stylepath, 'wb').write(f.read())
+                        ns.replaceWith(src.replace(m.group(1), stylepath))
+
+
+
+    def process_images(self, soup, baseurl):
+        diskpath = os.path.join(self.current_dir, 'images')
+        if not os.path.exists(diskpath):
+            os.mkdir(diskpath)
+        c = 0
+        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
+            iurl, ext = tag['src'], os.path.splitext(tag['src'])[1]
+            if not ext:
+                logger.info('Skipping extensionless image %s', iurl)
+                continue
+            if not urlparse.urlsplit(iurl).scheme:
+                iurl = urlparse.urljoin(baseurl, iurl, False)
+            if self.imagemap.has_key(iurl):
+                tag['src'] = self.imagemap[iurl]
+                continue
+            try:
+                f = fetch_url(iurl)
+            except Exception, err:
+                logger.warning('Could not fetch image %s', iurl)
+                logger.debug('Error: %s', str(err), exc_info=True)
+                continue
+            c += 1
+            imgpath = os.path.join(diskpath, 'img'+str(c)+ext)
+            self.imagemap[iurl] = imgpath
+            open(imgpath, 'wb').write(f.read())
+            tag['src'] = imgpath
+
+    def absurl(self, baseurl, tag, key):
+        iurl = tag[key]
+        parts = urlparse.urlsplit(iurl)
+        if not parts.netloc and not parts.path:
+            return None
+        if not parts.scheme:
+            iurl = urlparse.urljoin(baseurl, iurl, False)
+        if not self.is_link_ok(iurl):
+            logger.info('Skipping invalid link: %s', iurl)
+            return None
+        return iurl
+
+    def normurl(self, url):
+        parts = list(urlparse.urlsplit(url))
+        parts[4] = ''
+        return urlparse.urlunsplit(parts)
+
+    def localize_link(self, tag, key, path):
+        parts = urlparse.urlsplit(tag[key])
+        suffix = '#'+parts.fragment if parts.fragment else ''
+        tag[key] = path+suffix
+
+    def process_return_links(self, soup, baseurl):
+        for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
+            iurl = self.absurl(baseurl, tag, 'href')
+            if not iurl:
+                continue
+            nurl = self.normurl(iurl)
+            if self.filemap.has_key(nurl):
+                self.localize_link(tag, 'href', self.filemap[nurl])
+
+    def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
+        c, res = 0, ''
+        diskpath = os.path.join(self.current_dir, into_dir)
+        if not os.path.exists(diskpath):
+            os.mkdir(diskpath)
+        prev_dir = self.current_dir
+        try:
+            self.current_dir = diskpath
+            for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
+                print '.',
+                sys.stdout.flush()
+                iurl = self.absurl(baseurl, tag, 'href')
+                if not iurl:
+                    continue
+                nurl = self.normurl(iurl)
+                if self.filemap.has_key(nurl):
+                    self.localize_link(tag, 'href', self.filemap[nurl])
+                    continue
+                if self.files > self.max_files:
+                    return res
+                c += 1
+                linkdir = 'link'+str(c) if into_dir else ''
+                linkdiskpath = os.path.join(diskpath, linkdir)
+                if not os.path.exists(linkdiskpath):
+                    os.mkdir(linkdiskpath)
+                try:
+                    self.current_dir = linkdiskpath
+                    f = fetch_url(iurl)
+                    soup = BeautifulSoup(f.read())
+                    logger.info('Processing images...')
+                    self.process_images(soup, f.geturl())
+                    self.process_stylesheets(soup, f.geturl())
+
+                    res = os.path.join(linkdiskpath, basename(iurl))
+                    self.filemap[nurl] = res
+                    if recursion_level < self.max_recursions:
+                        logger.info('Processing links...')
+                        self.process_links(soup, iurl, recursion_level+1)
+                    else:
+                        self.process_return_links(soup, iurl)
+                        logger.info('Recursion limit reached. Skipping %s', iurl)
+
+                    save_soup(soup, res)
+                    self.localize_link(tag, 'href', res)
+                except Exception, err:
+                    logger.warning('Could not fetch link %s', iurl)
+                    logger.debug('Error: %s', str(err), exc_info=True)
+                finally:
+                    self.current_dir = diskpath
+                    self.files += 1
+        finally:
+            self.current_dir = prev_dir
+        print
+        return res
+
+    def __del__(self):
+        socket.setdefaulttimeout(self.default_timeout)
+
+def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com'):
+    parser = OptionParser(usage=usage, version=__appname__+' '+__version__,
+                          epilog='Created by ' + __author__)
+    parser.add_option('-d', '--base-dir', help='Base directory into which URL is saved. Default is %default',
+                      default='.', type='string', dest='dir')
+    parser.add_option('-t', '--timeout', help='Timeout in seconds to wait for a response from the server. Default: %default s',
+                      default=10, type='int', dest='timeout')
+    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %default',
+                      default=1, type='int', dest='max_recursions')
+    parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files',
+                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %default')
+    parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
+                      help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
+    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
+                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --match-regexp is ignored.')
+    parser.add_option('--verbose', help='Show detailed output information. Useful for debugging',
+                      default=False, action='store_true', dest='verbose')
+    return parser
+
+def main(args=sys.argv):
+    parser = option_parser()
+    options, args = parser.parse_args(args)
+    if len(args) != 2:
+        parser.print_help()
+        return 1
+    level = logging.DEBUG if options.verbose else logging.WARNING
+    setup_cli_handlers(logger, level)
+
+    fetcher = RecursiveFetcher(options)
+    fetcher.start_fetch(args[1])
+
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
diff --git a/windows_installer.py b/windows_installer.py
index 529b74f8d0..eff29da240 100644
--- a/windows_installer.py
+++ b/windows_installer.py
@@ -237,7 +237,8 @@ SectionEnd
 class WixInstaller(object):
     '''
     Make a .msi installer. Can't get the driver installation to play well with
-    an existing installation of the connect USB driver.
+    an existing installation of the connect USB driver. Pick this up again when
+    libusb1.dll is released based on winusb.
     '''
     TEMPLATE=\
 r'''