Initial implementation of web2disk

commit 00a50740fa
parent aa06fcfb1a
Kovid Goyal  2007-07-13 03:09:33 +00:00

10 changed files with 406 additions and 11 deletions

View File

@@ -28,6 +28,7 @@ entry_points = {
         'markdown = libprs500.ebooks.markdown.markdown:main',\
         'lit2lrf = libprs500.ebooks.lrf.lit.convert_from:main',\
         'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main',\
+        'web2disk = libprs500.web.fetch.simple:main',\
     ],
     'gui_scripts' : [ APPNAME+' = libprs500.gui.main:main']
 }
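
For orientation, the new console_scripts entry maps a web2disk command to libprs500.web.fetch.simple:main. A rough sketch of what the generated wrapper amounts to (the literal file setuptools writes differs, but the effect is the same):

    import sys
    from libprs500.web.fetch.simple import main

    if __name__ == '__main__':
        sys.exit(main())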

View File

@@ -13,12 +13,12 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ''' E-book management software'''
-__version__ = "0.3.68"
+__version__ = "0.3.69"
 __docformat__ = "epytext"
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__ = 'libprs500'
-import sys, os
+import sys, os, logging
 
 iswindows = 'win32' in sys.platform.lower()
 isosx = 'darwin' in sys.platform.lower()
@@ -28,6 +28,23 @@ if iswindows:
     except:
         pass
 
+def setup_cli_handlers(logger, level):
+    logger.setLevel(level)
+    if level == logging.WARNING:
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
+        handler.setLevel(logging.WARNING)
+    elif level == logging.INFO:
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setFormatter(logging.Formatter())
+        handler.setLevel(logging.INFO)
+    elif level == logging.DEBUG:
+        handler = logging.StreamHandler(sys.stderr)
+        handler.setLevel(logging.DEBUG)
+        handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)s: %(message)s'))
+
+    logger.addHandler(handler)
+
 def load_library(name, cdll):
     if iswindows:
         return cdll.LoadLibrary(name)
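
As a usage note, a minimal sketch of how a command-line module is meant to drive the new setup_cli_handlers helper; this mirrors the main() added in the web2disk module below, and the logger name here is purely illustrative:

    import logging
    from libprs500 import setup_cli_handlers

    logger = logging.getLogger('libprs500.example')  # illustrative name

    def main(verbose=False):
        # DEBUG for verbose runs, WARNING otherwise; the helper attaches the handler.
        level = logging.DEBUG if verbose else logging.WARNING
        setup_cli_handlers(logger, level)
        logger.debug('shown only in verbose mode, on stderr with file/line info')
        logger.warning('always printed')
        return 0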

View File

@@ -26,11 +26,11 @@ from libprs500.ebooks.lrf.pylrs.pylrs import TextBlock, Header, PutObj, \
                     Paragraph, TextStyle, BlockStyle
 from libprs500.ebooks.lrf.fonts import FONT_FILE_MAP
 from libprs500.ebooks import ConversionError
-from libprs500 import __version__ as VERSION
+from libprs500 import __appname__, __version__, __author__
 from libprs500 import iswindows
 
 __docformat__ = "epytext"
-__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 
 class PRS500_PROFILE(object):
     screen_width = 600
@@ -69,8 +69,8 @@ def font_family(option, opt_str, value, parser):
 
 def option_parser(usage):
-    parser = OptionParser(usage=usage, version='libprs500 '+VERSION,
-                          epilog='Created by Kovid Goyal')
+    parser = OptionParser(usage=usage, version=__appname__+' '+__version__,
+                          epilog='Created by '+__author__)
     metadata = parser.add_option_group('METADATA OPTIONS')
     metadata.add_option('--header', action='store_true', default=False, dest='header',
                         help='Add a header to all the pages with title and author.')

View File

@@ -36,7 +36,8 @@ from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, \
                     Comment, Tag, NavigableString, Declaration, ProcessingInstruction
 from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
                     TextBlock, ImageBlock, JumpButton, CharButton, Bold, Space, \
-                    Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps
+                    Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
+                    LrsError
 from libprs500.ebooks.lrf.pylrs.pylrs import Span as _Span
 from libprs500.ebooks.lrf import option_parser, Book, PRS500_PROFILE
 from libprs500.ebooks import ConversionError
@@ -584,7 +585,7 @@ class HTMLConverter(object):
             elif self.link_level < self.max_link_levels:
                 try: # os.access raises Exceptions if path has null bytes
                     if not os.access(path.encode('utf8', 'replace'), os.R_OK):
-                        raise Exception()
+                        continue
                 except Exception:
                     if self.verbose:
                         print "Skipping", link
@@ -859,7 +860,10 @@ class HTMLConverter(object):
             return
         if not self.images.has_key(path):
-            self.images[path] = ImageStream(path)
+            try:
+                self.images[path] = ImageStream(path)
+            except LrsError:
+                return
         im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
                    xsize=width, ysize=height)

View File

@@ -47,6 +47,9 @@ class MetaInformation(object):
         self.category = None
         self.classification = None
         self.publisher = None
+        self.series = None
+        self.series_index = None
+        self.rating = None
 
     def __str__(self):
         ans = ''

View File

@@ -17,7 +17,7 @@ Backend that implements storage of ebooks in an sqlite database.
 """
 import sqlite3 as sqlite
 import os, datetime, re
-from zlib import compress, decompress
+from zlib import compressobj, decompress
 from stat import ST_SIZE
 
 class Concatenate(object):
@@ -690,6 +690,17 @@ class LibraryDatabase(object):
         self.conn.execute('INSERT INTO books_publishers_link(book, publisher) VALUES (?,?)', (id, aid))
         self.conn.commit()
 
+    def set_series(self, id, series):
+        self.conn.execute('DELETE FROM books_series_link WHERE book=?',(id,))
+        if series:
+            s = self.conn.execute('SELECT id from series WHERE name=?', (series,)).fetchone()
+            if s:
+                aid = s[0]
+            else:
+                aid = self.conn.execute('INSERT INTO series(name) VALUES (?)', (series,)).lastrowid
+            self.conn.execute('INSERT INTO books_series_link(book, series) VALUES (?,?)', (id, aid))
+        self.conn.commit()
+
     def set_rating(self, id, rating):
         rating = int(rating)
         self.conn.execute('DELETE FROM books_ratings_link WHERE book=?',(id,))
@@ -698,3 +709,28 @@ class LibraryDatabase(object):
         self.conn.execute('INSERT INTO books_ratings_link(book, rating) VALUES (?,?)', (id, rat))
         self.conn.commit()
 
+    def add_book(self, stream, format, mi, uri=None):
+        if not mi.author:
+            mi.author = 'Unknown'
+        obj = self.conn.execute('INSERT INTO books(title, uri, series_index) VALUES (?, ?, ?)',
+                                (mi.title, uri, mi.series_index))
+        id = obj.lastrowid
+        self.conn.commit()
+        temp = mi.author.split(',')
+        authors = []
+        for a in temp:
+            authors += a.split('&')
+        self.set_authors(id, authors)
+        if mi.publisher:
+            self.set_publisher(id, mi.publisher)
+        if mi.rating:
+            self.set_rating(id, mi.rating)
+        if mi.series:
+            self.set_series(id, mi.series)
+        stream.seek(0, 2)
+        usize = stream.tell()
+        stream.seek(0)
+        self.conn.execute('INSERT INTO data(book, format, uncompressed_size, data) VALUES (?,?,?,?)',
+                          (id, format, usize, compressobj().compress(stream)))
+        self.conn.commit()
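
For completeness, a small sketch of exercising the new set_series and set_rating helpers on an open database. The module path and the LibraryDatabase constructor argument below are assumptions for illustration; they are not shown in this commit:

    from libprs500.library.database import LibraryDatabase  # module path assumed

    db = LibraryDatabase('/tmp/library.db')   # constructor argument assumed
    book_id = 1                               # id of an existing row in the books table
    db.set_series(book_id, 'Example Series')  # inserts the series row if it does not exist
    db.set_rating(book_id, 4)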

View File

@@ -0,0 +1,18 @@
## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

View File

@@ -0,0 +1,14 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

View File

@@ -0,0 +1,301 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Fetch a webpage and its links recursively.
'''
import sys, socket, urllib2, os, urlparse, codecs, logging, re
from urllib import url2pathname
from httplib import responses
from optparse import OptionParser

from libprs500 import __version__, __appname__, __author__, setup_cli_handlers
from libprs500.ebooks.BeautifulSoup import BeautifulSoup

logger = logging.getLogger('libprs500.web.fetch.simple')

class FetchError(Exception):
    pass

def fetch_url(url):
    f = None
    logger.info('Fetching %s', url)
    try:
        f = urllib2.urlopen(url)
    except urllib2.URLError, err:
        if hasattr(err, 'code') and responses.has_key(err.code):
            raise FetchError, responses[err.code]
        raise err
    return f

def basename(url):
    parts = urlparse.urlsplit(url)
    path = url2pathname(parts.path)
    res = os.path.basename(path)
    if not os.path.splitext(res)[1]:
        return 'index.html'
    return res

def save_soup(soup, target):
    f = codecs.open(target, 'w', 'utf8')
    f.write(unicode(soup))
    f.close()

class RecursiveFetcher(object):
    LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                        ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
    CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)

    def __init__(self, options):
        self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
        if not os.path.exists(self.base_dir):
            os.makedirs(self.base_dir)
        self.default_timeout = socket.getdefaulttimeout()
        socket.setdefaulttimeout(options.timeout)
        self.max_recursions = options.max_recursions
        self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
        self.filter_regexps = [re.compile(i, re.IGNORECASE) for i in options.filter_regexps]
        self.max_files = options.max_files
        self.filemap = {}
        self.imagemap = {}
        self.stylemap = {}
        self.current_dir = self.base_dir
        self.files = 0

    def start_fetch(self, url):
        soup = BeautifulSoup('<a href="'+url+'" />')
        print 'Working',
        res = self.process_links(soup, url, 0, into_dir='')
        print '%s saved to %s'%(url, res)
        return res

    def is_link_ok(self, url):
        for i in self.__class__.LINK_FILTER:
            if i.search(url):
                return False
        return True

    def is_link_wanted(self, url):
        if self.filter_regexps:
            for f in self.filter_regexps:
                if f.search(url):
                    return False
            return True
        elif self.match_regexps:
            for m in self.match_regexps:
                if m.search(url):
                    return True
            return False
        return True

    def process_stylesheets(self, soup, baseurl):
        diskpath = os.path.join(self.current_dir, 'stylesheets')
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        c = 0
        for tag in soup.findAll(lambda tag: tag.name.lower() in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css'):
            if tag.has_key('href'):
                iurl = tag['href']
                if not urlparse.urlsplit(iurl).scheme:
                    iurl = urlparse.urljoin(baseurl, iurl, False)
                if self.stylemap.has_key(iurl):
                    tag['href'] = self.stylemap[iurl]
                    continue
                try:
                    f = fetch_url(iurl)
                except Exception, err:
                    logger.warning('Could not fetch stylesheet %s', iurl)
                    logger.debug('Error: %s', str(err), exc_info=True)
                    continue
                c += 1
                stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                self.stylemap[iurl] = stylepath
                open(stylepath, 'wb').write(f.read())
                tag['href'] = stylepath
            else:
                for ns in tag.findAll(text=True):
                    src = str(ns)
                    m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                    if m:
                        iurl = m.group(1)
                        if not urlparse.urlsplit(iurl).scheme:
                            iurl = urlparse.urljoin(baseurl, iurl, False)
                        if self.stylemap.has_key(iurl):
                            ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                            continue
                        try:
                            f = fetch_url(iurl)
                        except Exception, err:
                            logger.warning('Could not fetch stylesheet %s', iurl)
                            logger.debug('Error: %s', str(err), exc_info=True)
                            continue
                        c += 1
                        stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                        self.stylemap[iurl] = stylepath
                        open(stylepath, 'wb').write(f.read())
                        ns.replaceWith(src.replace(m.group(1), stylepath))

    def process_images(self, soup, baseurl):
        diskpath = os.path.join(self.current_dir, 'images')
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        c = 0
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            iurl, ext = tag['src'], os.path.splitext(tag['src'])[1]
            if not ext:
                logger.info('Skipping extensionless image %s', iurl)
                continue
            if not urlparse.urlsplit(iurl).scheme:
                iurl = urlparse.urljoin(baseurl, iurl, False)
            if self.imagemap.has_key(iurl):
                tag['src'] = self.imagemap[iurl]
                continue
            try:
                f = fetch_url(iurl)
            except Exception, err:
                logger.warning('Could not fetch image %s', iurl)
                logger.debug('Error: %s', str(err), exc_info=True)
                continue
            c += 1
            imgpath = os.path.join(diskpath, 'img'+str(c)+ext)
            self.imagemap[iurl] = imgpath
            open(imgpath, 'wb').write(f.read())
            tag['src'] = imgpath

    def absurl(self, baseurl, tag, key):
        iurl = tag[key]
        parts = urlparse.urlsplit(iurl)
        if not parts.netloc and not parts.path:
            return None
        if not parts.scheme:
            iurl = urlparse.urljoin(baseurl, iurl, False)
        if not self.is_link_ok(iurl):
            logger.info('Skipping invalid link: %s', iurl)
            return None
        return iurl

    def normurl(self, url):
        parts = list(urlparse.urlsplit(url))
        parts[4] = ''
        return urlparse.urlunsplit(parts)

    def localize_link(self, tag, key, path):
        parts = urlparse.urlsplit(tag[key])
        suffix = '#'+parts.fragment if parts.fragment else ''
        tag[key] = path+suffix

    def process_return_links(self, soup, baseurl):
        for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
            iurl = self.absurl(baseurl, tag, 'href')
            if not iurl:
                continue
            nurl = self.normurl(iurl)
            if self.filemap.has_key(nurl):
                self.localize_link(tag, 'href', self.filemap[nurl])

    def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
        c, res = 0, ''
        diskpath = os.path.join(self.current_dir, into_dir)
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        prev_dir = self.current_dir
        try:
            self.current_dir = diskpath
            for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
                print '.',
                sys.stdout.flush()
                iurl = self.absurl(baseurl, tag, 'href')
                if not iurl:
                    continue
                nurl = self.normurl(iurl)
                if self.filemap.has_key(nurl):
                    self.localize_link(tag, 'href', self.filemap[nurl])
                    continue
                if self.files > self.max_files:
                    return res
                c += 1
                linkdir = 'link'+str(c) if into_dir else ''
                linkdiskpath = os.path.join(diskpath, linkdir)
                if not os.path.exists(linkdiskpath):
                    os.mkdir(linkdiskpath)
                try:
                    self.current_dir = linkdiskpath
                    f = fetch_url(iurl)
                    soup = BeautifulSoup(f.read())
                    logger.info('Processing images...')
                    self.process_images(soup, f.geturl())
                    self.process_stylesheets(soup, f.geturl())
                    res = os.path.join(linkdiskpath, basename(iurl))
                    self.filemap[nurl] = res
                    if recursion_level < self.max_recursions:
                        logger.info('Processing links...')
                        self.process_links(soup, iurl, recursion_level+1)
                    else:
                        self.process_return_links(soup, iurl)
                        logger.info('Recursion limit reached. Skipping %s', iurl)
                    save_soup(soup, res)
                    self.localize_link(tag, 'href', res)
                except Exception, err:
                    logger.warning('Could not fetch link %s', iurl)
                    logger.debug('Error: %s', str(err), exc_info=True)
                finally:
                    self.current_dir = diskpath
                    self.files += 1
        finally:
            self.current_dir = prev_dir
        print
        return res

    def __del__(self):
        socket.setdefaulttimeout(self.default_timeout)

def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com'):
    parser = OptionParser(usage=usage, version=__appname__+' '+__version__,
                          epilog='Created by ' + __author__)
    parser.add_option('-d', '--base-dir', help='Base directory into which URL is saved. Default is %default',
                      default='.', type='string', dest='dir')
    parser.add_option('-t', '--timeout', help='Timeout in seconds to wait for a response from the server. Default: %default s',
                      default=10, type='int', dest='timeout')
    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %default',
                      default=1, type='int', dest='max_recursions')
    parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files',
                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %default')
    parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
                      help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --match-regexp is ignored.')
    parser.add_option('--verbose', help='Show detailed output information. Useful for debugging',
                      default=False, action='store_true', dest='verbose')
    return parser

def main(args=sys.argv):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        return 1
    level = logging.DEBUG if options.verbose else logging.WARNING
    setup_cli_handlers(logger, level)
    fetcher = RecursiveFetcher(options)
    fetcher.start_fetch(args[1])

if __name__ == '__main__':
    sys.exit(main())
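
Taken together, a hedged sketch of driving the fetcher programmatically by reusing the option parser for its defaults; the URL and directory below are placeholders, and the effect is the same as running the new script as: web2disk -d /tmp/example http://example.com

    from libprs500.web.fetch.simple import option_parser, RecursiveFetcher

    def fetch(url, base_dir='.'):
        # Reuse the CLI parser purely to obtain an options object with the defaults above.
        parser = option_parser()
        options, args = parser.parse_args(['-d', base_dir, url])
        fetcher = RecursiveFetcher(options)
        return fetcher.start_fetch(url)  # returns the path of the saved top-level page

    if __name__ == '__main__':
        print fetch('http://example.com', base_dir='/tmp/example')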

View File

@@ -237,7 +237,8 @@ SectionEnd
 class WixInstaller(object):
     '''
     Make a .msi installer. Can't get the driver installation to play well with
-    an existing installation of the connect USB driver.
+    an existing installation of the connect USB driver. Pick this up again when
+    libusb1.dll is released based on winusb.
     '''
     TEMPLATE=\
 r'''<?xml version='1.0' encoding='windows-1252'?>