mirror of https://github.com/kovidgoyal/calibre.git
Migrate html2lrf, web2lrf to python logging framework
parent 1f24807b87
commit 0b68623f86
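The diff below replaces the converters' direct print statements with per-tool logging.Logger objects configured through setup_cli_handlers, and threads those loggers through HTMLConverter, Span and RecursiveFetcher. A minimal sketch of the caller-side pattern the commit introduces (the logger names 'html2lrf' and 'web2disk' and the DEBUG/INFO level choice come from the diff; the helper function below is illustrative only, not part of the commit):

    import logging
    from libprs500 import setup_cli_handlers

    def get_cli_logger(name, verbose):
        # Same level selection as process_file() and create_fetcher() below.
        level = logging.DEBUG if verbose else logging.INFO
        logger = logging.getLogger(name)
        setup_cli_handlers(logger, level)
        return logger

    logger = get_cli_logger('html2lrf', verbose=True)
    logger.info('Output written to %s', 'mybook.lrf')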
@@ -28,6 +28,9 @@ if iswindows:
    except:
        pass

+class CommandLineError(Exception):
+    pass
+
def setup_cli_handlers(logger, level):
    logger.setLevel(level)
    if level == logging.WARNING:
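The hunk above shows only the first lines of setup_cli_handlers. A plausible completion, assuming the usual pattern of attaching a stream handler whose format depends on the level (this body is an assumption, not the committed code):

    import logging, sys

    def setup_cli_handlers(logger, level):
        logger.setLevel(level)
        if level == logging.WARNING:
            # Assumed: terse output when only warnings are wanted.
            handler = logging.StreamHandler(sys.stderr)
            handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
        elif level == logging.INFO:
            # Assumed: plain messages for normal progress output.
            handler = logging.StreamHandler(sys.stdout)
            handler.setFormatter(logging.Formatter('%(message)s'))
        else:
            # Assumed: verbose format for --verbose / DEBUG runs.
            handler = logging.StreamHandler(sys.stderr)
            handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)d: %(message)s'))
        handler.setLevel(level)
        logger.addHandler(handler)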
@@ -20,7 +20,7 @@ Code to convert HTML ebooks into LRF ebooks.
I am indebted to esperanc for the initial CSS->Xylog Style conversion code
and to Falstaff for pylrs.
"""
-import os, re, sys, shutil, traceback, copy, glob
+import os, re, sys, shutil, copy, glob, logging
from htmlentitydefs import name2codepoint
from urllib import unquote
from urlparse import urlparse
@@ -43,7 +43,7 @@ from libprs500.ebooks.lrf import Book
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks import ConversionError
from libprs500.ebooks.lrf.html.table import Table
-from libprs500 import extract, filename_to_utf8
+from libprs500 import extract, filename_to_utf8, setup_cli_handlers
from libprs500.ptempfile import PersistentTemporaryFile

class Span(_Span):
@@ -84,7 +84,7 @@ class Span(_Span):
        return result

    @staticmethod
-    def translate_attrs(d, dpi, fonts, font_delta=0, memory=None):
+    def translate_attrs(d, dpi, fonts, logger, font_delta=0, memory=None):
        """
        Receives a dictionary of html attributes and styles and returns
        approximate Xylog equivalents in a new dictionary
@@ -211,20 +211,20 @@ class Span(_Span):
                    else:
                        memory.append(key)
                if report:
-                    print >>sys.stderr, 'Unhandled/malformed CSS key:', key, d[key]
+                    logger.info('Unhandled/malformed CSS key: %s: %s', key, d[key])
        t['fontfacename'] = (family, font_key(family, style, weight))
        if t.has_key('fontsize') and int(t['fontsize']) > 120:
            t['wordspace'] = 50
        return t

-    def __init__(self, ns, css, memory, dpi, fonts, font_delta=0, normal_font_size=100):
+    def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta=0, normal_font_size=100):
        src = ns.string if hasattr(ns, 'string') else ns
        src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces
        for pat, repl in Span.rules:
            src = pat.sub(repl, src)
        if not src:
            raise ConversionError('No point in adding an empty string to a Span')
-        attrs = Span.translate_attrs(css, dpi, fonts, font_delta=font_delta, memory=memory)
+        attrs = Span.translate_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory)
        if 'fontsize' in attrs.keys():
            normal_font_size = int(attrs['fontsize'])
        variant = attrs.pop('fontvariant', None)
@@ -323,7 +323,7 @@ class HTMLConverter(object):
        else:
            object.__setattr__(self, attr, val)

-    def __init__(self, book, fonts, path, options, link_level=0, is_root=True):
+    def __init__(self, book, fonts, path, options, logger, link_level=0, is_root=True):
        '''
        Convert HTML file at C{path} and add it to C{book}. After creating
        the object, you must call L{self.process_links} on it to create the links and
@@ -356,7 +356,8 @@ class HTMLConverter(object):
                        th = {'font-size' : 'large', 'font-weight':'bold'},
                        big = {'font-size' : 'large', 'font-weight':'bold'},
                        )
-        self.css['.libprs500_dropcaps'] = {'font-size': 'xx-large'}
+        self.css['.libprs500_dropcaps'] = {'font-size': 'xx-large'}
+        self.logger = logger
        self.fonts = fonts #: dict specifting font families to use
        self.scaled_images = {} #: Temporary files with scaled version of images
        self.rotated_images = {} #: Temporary files with rotated version of images
@@ -385,8 +386,7 @@ class HTMLConverter(object):
        path = os.path.abspath(path)
        os.chdir(os.path.dirname(path))
        self.file_name = os.path.basename(path)
-        print "Processing", self.file_name
-        print '\tParsing HTML...',
+        self.logger.info('Processing %s\n\tParsing HTML...', self.file_name)
        sys.stdout.flush()
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
@@ -400,7 +400,7 @@ class HTMLConverter(object):
        self.soup = BeautifulSoup(raw,
                        convertEntities=BeautifulSoup.HTML_ENTITIES,
                        markupMassage=nmassage)
-        print 'done\n\tConverting to BBeB...',
+        logger.info('\tConverting to BBeB...')
        sys.stdout.flush()
        self.current_page = None
        self.current_para = None
@@ -411,7 +411,6 @@ class HTMLConverter(object):
        self.page_break_found = True
        self.parse_file()
        HTMLConverter.processed_files[path] = self
-        print 'done'

    def parse_css(self, style):
        """
@@ -554,8 +553,8 @@ class HTMLConverter(object):
            if target.parent != None and \
               hasattr(target.parent, 'objId'):
                self.book.addTocEntry(ascii_text, tb)
-            elif self.verbose:
-                print "Cannot add link", ascii_text, "to TOC"
+            else:
+                self.logger.debug("Cannot add link %s to TOC", ascii_text)


        def get_target_block(fragment, targets):
@@ -624,21 +623,21 @@ class HTMLConverter(object):
                    if not os.access(path.encode('utf8', 'replace'), os.R_OK):
                        continue
                except Exception:
-                    if self.verbose:
-                        print "Skipping", link
+                    self.logger.exception('Skipping %s', link)
                    continue
                path = os.path.abspath(path)
                if not path in HTMLConverter.processed_files.keys():
                    try:
                        self.files[path] = HTMLConverter(
                                    self.book, self.fonts, path, self.options,
+                                    self.logger,
                                    link_level = self.link_level+1,
                                    is_root = False,)
                        HTMLConverter.processed_files[path] = self.files[path]
                    except Exception:
-                        print >>sys.stderr, 'Unable to process', path
+                        self.logger.warning('Unable to process %s', path)
                        if self.verbose:
-                            traceback.print_exc()
+                            self.logger.exception('')
                        continue
                    finally:
                        os.chdir(cwd)
@@ -759,12 +758,12 @@ class HTMLConverter(object):
                else:
                    self.process_alignment(css)
                try:
-                    self.current_para.append(Span(src, self.sanctify_css(css), self.memory,\
-                                self.profile.dpi, self.fonts, font_delta=self.font_delta))
+                    self.current_para.append(Span(src, self.sanctify_css(css), self.memory,
+                                self.profile.dpi, self.fonts, self.logger,
+                                font_delta=self.font_delta))
                    self.current_para.normalize_spaces()
-                except ConversionError, err:
-                    if self.verbose:
-                        print >>sys.stderr, err
+                except ConversionError:
+                    self.logger.exception('Bad text')

    def sanctify_css(self, css):
        """ Return a copy of C{css} that is safe for use in a SPAM Xylog tag """
@@ -809,7 +808,7 @@ class HTMLConverter(object):
        try:
            im = PILImage.open(path)
        except IOError, err:
-            print >>sys.stderr, 'Unable to process:', path, err
+            self.logger.warning('Unable to process image: %s\n%s', path, err)
            return


@@ -826,7 +825,7 @@ class HTMLConverter(object):
                self.scaled_images[path] = pt
                return pt.name
            except IOError: # PIL chokes on interlaced PNG images
-                print >>sys.stderr, 'Unable to process interlaced PNG', path
+                self.logger.warning('Unable to process interlaced PNG %s', path)
                return None

        pheight = int(self.current_page.pageStyle.attrs['textheight'])
@@ -863,10 +862,8 @@ class HTMLConverter(object):
                    path = pt.name
                    self.rotated_images[path] = pt
                    width, height = im.size
-                except IOError, err: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
-                    if self.verbose:
-                        print >>sys.stderr, 'Unable to autorotate interlaced PNG', path
-                        print >>sys.stderr, err
+                except IOError: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
+                    self.logger.debug('Unable to process interlaced PNG %s', path)
                finally:
                    pt.close()

@@ -945,8 +942,7 @@ class HTMLConverter(object):
        if not self.page_break_found and self.page_break.match(tagname):
            if len(self.current_page.contents) > 3:
                self.end_page()
-                if self.verbose:
-                    print 'Forcing page break at', tagname
+                self.logger.debug('Forcing page break at %s', tagname)
        return end_page

    def parse_tag(self, tag, parent_css):
@@ -1048,8 +1044,7 @@ class HTMLConverter(object):
                dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
                self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
            else:
-                if self.verbose:
-                    print >>sys.stderr, "Failed to process:", tag
+                self.logger.debug("Failed to process: %s", str(tag))
        elif tagname in ['style', 'link']:
            def update_css(ncss):
                for key in ncss.keys():
@@ -1083,7 +1078,8 @@ class HTMLConverter(object):
                    c.replaceWith(self.get_text(c))
            self.end_current_para()
            self.current_block.append_to(self.current_page)
-            attrs = Span.translate_attrs(tag_css, self.profile.dpi, self.fonts, self.font_delta, self.memory)
+            attrs = Span.translate_attrs(tag_css, self.profile.dpi, self.fonts,
+                                         self.logger, self.font_delta, self.memory)
            attrs['fontfacename'] = self.fonts['mono']['normal'][1]
            ts = self.book.create_text_style(**self.unindented_style.attrs)
            ts.attrs.update(attrs)
@@ -1185,8 +1181,7 @@ class HTMLConverter(object):
            src = self.get_text(tag, limit=1000)
            if self.chapter_detection and tagname.startswith('h'):
                if self.chapter_regex.search(src):
-                    if self.verbose:
-                        print 'Detected chapter', src
+                    self.logger.debug('Detected chapter %s', src)
                    self.end_page()
                    self.page_break_found = True
            self.end_current_para()
@@ -1241,9 +1236,8 @@ class HTMLConverter(object):
            try:
                self.process_table(tag, tag_css)
            except Exception, err:
-                print 'WARNING: An error occurred while processing a table:', err
-                print 'Ignoring table markup for table:'
-                print str(tag)[:300]
+                self.logger.warning('An error occurred while processing a table: %s', str(err))
+                self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
            self.in_table = False
            self.process_children(tag, tag_css)
        else:
@@ -1275,16 +1269,20 @@ class HTMLConverter(object):
        for _file in self.scaled_images.values() + self.rotated_images.values():
            _file.__del__()

-def process_file(path, options):
+def process_file(path, options, logger=None):
    if re.match('http://|https://', path):
        raise ConversionError, 'You have to save the website %s as an html file first and then run html2lrf on it.'%(path,)
+    if logger is None:
+        level = logging.DEBUG if options.verbose else logging.INFO
+        logger = logging.getLogger('html2lrf')
+        setup_cli_handlers(logger, level)
    cwd = os.getcwd()
    dirpath = None
    default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0])
    try:
        dirpath, path = get_path(path)
        cpath, tpath = '', ''
-        try_opf(path, options)
+        try_opf(path, options, logger)
        if options.cover:
            options.cover = os.path.abspath(os.path.expanduser(options.cover))
            cpath = options.cover
@@ -1347,7 +1345,7 @@ def process_file(path, options):
            fpba = ['$', '', '$']
        options.force_page_break_attr = [re.compile(fpba[0], re.IGNORECASE), fpba[1],
                                         re.compile(fpba[2], re.IGNORECASE)]
-        conv = HTMLConverter(book, fonts, path, options)
+        conv = HTMLConverter(book, fonts, path, options, logger)
        conv.process_links()
        oname = options.output
        if not oname:
@@ -1356,7 +1354,7 @@ def process_file(path, options):
            oname = os.path.join(cwd,name)
        oname = os.path.abspath(os.path.expanduser(oname))
        conv.writeto(oname, lrs=options.lrs)
-        print 'Output written to', oname
+        logger.info('Output written to %s', oname)
        conv.cleanup()
        return oname
    finally:
@@ -1364,7 +1362,7 @@ def process_file(path, options):
        if dirpath:
            shutil.rmtree(dirpath, True)

-def try_opf(path, options):
+def try_opf(path, options, logger):
    try:
        opf = glob.glob(os.path.join(os.path.dirname(path),'*.opf'))[0]
    except IndexError:
@@ -1419,12 +1417,9 @@ def try_opf(path, options):
                if not os.access(options.cover, os.R_OK):
                    options.cover = None
        except:
-            if options.verbose:
-                traceback.print_exc()
-    except Exception, err:
-        if options.verbose:
-            print >>sys.stderr, 'Failed to process opf file', err
-        pass
+            logger.exception('Could not load cover')
+    except Exception:
+        logger.exception('Failed to process opf file')

def option_parser():
    return lrf_option_parser('''Usage: %prog [options] mybook.[html|rar|zip]\n\n'''
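With the html2lrf changes above, process_file() now takes an optional logger: called on its own it builds and configures an 'html2lrf' logger from options.verbose, while web2lrf (next file) passes in a shared one. A small usage sketch (the input file name is a placeholder; options come from the module's own option_parser):

    import logging
    from libprs500.ebooks.lrf.html.convert_from import option_parser, process_file

    options = option_parser().parse_args([])[0]  # default options

    # Standalone: process_file sets up its own 'html2lrf' logger.
    process_file('mybook.html', options)

    # Embedded (what web2lrf does below): reuse an already-configured logger.
    process_file('mybook.html', options, logging.getLogger('web2lrf'))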
@@ -14,14 +14,13 @@
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Convert known websites into LRF files.'''

-import sys, time, tempfile, shutil, os
+import sys, time, tempfile, shutil, os, logging
from urlparse import urlsplit

-from libprs500 import __appname__
+from libprs500 import __appname__, setup_cli_handlers, CommandLineError
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks.lrf.html.convert_from import process_file
from libprs500.ebooks.lrf.web.profiles import profiles
-from libprs500.web.fetch.simple import setup_logger as web2disk_setup_logger
from libprs500.web.fetch.simple import create_fetcher

available_profiles = profiles.keys()
@@ -57,14 +56,14 @@ def option_parser():
                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
    return parser

-def fetch_website(options):
+def fetch_website(options, logger):
    tdir = tempfile.mkdtemp(prefix=__appname__+'_' )
    options.dir = tdir
-    fetcher = create_fetcher(options)
+    fetcher = create_fetcher(options, logger)
    fetcher.preprocess_regexps = options.preprocess_regexps
    return fetcher.start_fetch(options.url), tdir

-def create_lrf(htmlfile, options):
+def create_lrf(htmlfile, options, logger):
    if not options.author:
        options.author = __appname__
    options.header = True
@@ -73,20 +72,16 @@ def create_lrf(htmlfile, options):
    else:
        options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf')))

-    process_file(htmlfile, options)
+    process_file(htmlfile, options, logger)

-def main(args=sys.argv):
-    parser = option_parser()
-    options, args = parser.parse_args(args)
-    web2disk_setup_logger(options)
-    if len(args) > 2:
-        parser.print_help()
-        return 1
+def process_profile(args, options, logger=None):
+    if logger is None:
+        level = logging.DEBUG if options.verbose else logging.INFO
+        logger = logging.getLogger('web2lrf')
+        setup_cli_handlers(logger, level)
    if len(args) == 2:
        if not profiles.has_key(args[1]):
-            print >>sys.stderr, 'Unknown profile', args[1]
-            print >>sys.stderr, 'Valid profiles:', profiles.keys()
-            return 1
+            raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
    profile = profiles[args[1]] if len(args) == 2 else profiles['default']

    if profile.has_key('initialize'):
@@ -98,11 +93,7 @@ def main(args=sys.argv):
            setattr(options, opt, profile[opt])

    if not options.url:
-        parser.print_help()
-        print >>sys.stderr
-        print >>sys.stderr, 'You must specify the --url option or a profile from one of:',
-        print >>sys.stderr, available_profiles
-        return 1
+        raise CommandLineError('You must specify the --url option or a profile from one of: %s', available_profiles)

    if not options.title:
        title = profile['title']
@@ -114,12 +105,24 @@ def main(args=sys.argv):
        options.preprocess_regexps = profile['preprocess_regexps']
        options.filter_regexps += profile['filter_regexps']

-    htmlfile, tdir = fetch_website(options)
-    create_lrf(htmlfile, options)
+    htmlfile, tdir = fetch_website(options, logger)
+    create_lrf(htmlfile, options, logger)
    if profile.has_key('finalize'):
        profile['finalize'](profile)
    shutil.rmtree(tdir)


+def main(args=sys.argv):
+    parser = option_parser()
+    options, args = parser.parse_args(args)
+    if len(args) > 2:
+        parser.print_help()
+        return 1
+    try:
+        process_profile(args, options)
+    except CommandLineError, err:
+        print >>sys.stderr, err
+    return 0

if __name__ == '__main__':
@@ -23,8 +23,6 @@ from optparse import OptionParser
from libprs500 import __version__, __appname__, __author__, setup_cli_handlers
from libprs500.ebooks.BeautifulSoup import BeautifulSoup

-logger = logging.getLogger('libprs500.web.fetch.simple')
-
class FetchError(Exception):
    pass

@@ -52,7 +50,8 @@ class RecursiveFetcher(object):
    #              )
    CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)

-    def __init__(self, options):
+    def __init__(self, options, logger):
+        self.logger = logger
        self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
        if not os.path.exists(self.base_dir):
            os.makedirs(self.base_dir)
@@ -80,7 +79,7 @@ class RecursiveFetcher(object):

    def fetch_url(self, url):
        f = None
-        logger.info('Fetching %s', url)
+        self.logger.debug('Fetching %s', url)
        delta = time.time() - self.last_fetch_at
        if delta < self.delay:
            time.sleep(delta)
@@ -138,8 +137,8 @@ class RecursiveFetcher(object):
                try:
                    f = self.fetch_url(iurl)
                except Exception, err:
-                    logger.warning('Could not fetch stylesheet %s', iurl)
-                    logger.debug('Error: %s', str(err), exc_info=True)
+                    self.logger.warning('Could not fetch stylesheet %s', iurl)
+                    self.logger.debug('Error: %s', str(err), exc_info=True)
                    continue
                c += 1
                stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@@ -160,8 +159,8 @@ class RecursiveFetcher(object):
                    try:
                        f = self.fetch_url(iurl)
                    except Exception, err:
-                        logger.warning('Could not fetch stylesheet %s', iurl)
-                        logger.debug('Error: %s', str(err), exc_info=True)
+                        self.logger.warning('Could not fetch stylesheet %s', iurl)
+                        self.logger.debug('Error: %s', str(err), exc_info=True)
                        continue
                    c += 1
                    stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@@ -179,7 +178,7 @@ class RecursiveFetcher(object):
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            iurl, ext = tag['src'], os.path.splitext(tag['src'])[1]
            if not ext:
-                logger.info('Skipping extensionless image %s', iurl)
+                self.logger.debug('Skipping extensionless image %s', iurl)
                continue
            if not urlparse.urlsplit(iurl).scheme:
                iurl = urlparse.urljoin(baseurl, iurl, False)
@@ -189,8 +188,8 @@ class RecursiveFetcher(object):
            try:
                f = self.fetch_url(iurl)
            except Exception, err:
-                logger.warning('Could not fetch image %s', iurl)
-                logger.debug('Error: %s', str(err), exc_info=True)
+                self.logger.warning('Could not fetch image %s', iurl)
+                self.logger.debug('Error: %s', str(err), exc_info=True)
                continue
            c += 1
            imgpath = os.path.join(diskpath, 'img'+str(c)+ext)
@@ -206,7 +205,7 @@ class RecursiveFetcher(object):
        if not parts.scheme:
            iurl = urlparse.urljoin(baseurl, iurl, False)
        if not self.is_link_ok(iurl):
-            logger.info('Skipping invalid link: %s', iurl)
+            self.logger.debug('Skipping invalid link: %s', iurl)
            return None
        return iurl

@@ -258,7 +257,7 @@ class RecursiveFetcher(object):
                    self.current_dir = linkdiskpath
                    f = self.fetch_url(iurl)
                    soup = self.get_soup(f.read())
-                    logger.info('Processing images...')
+                    self.logger.debug('Processing images...')
                    self.process_images(soup, f.geturl())
                    if self.download_stylesheets:
                        self.process_stylesheets(soup, f.geturl())
@@ -266,17 +265,17 @@ class RecursiveFetcher(object):
                        res = os.path.join(linkdiskpath, basename(iurl))
                        self.filemap[nurl] = res
                        if recursion_level < self.max_recursions:
-                            logger.info('Processing links...')
+                            self.logger.debug('Processing links...')
                            self.process_links(soup, iurl, recursion_level+1)
                        else:
                            self.process_return_links(soup, iurl)
-                            logger.info('Recursion limit reached. Skipping %s', iurl)
+                            self.logger.debug('Recursion limit reached. Skipping %s', iurl)

                        save_soup(soup, res)
                        self.localize_link(tag, 'href', res)
                except Exception, err:
-                    logger.warning('Could not fetch link %s', iurl)
-                    logger.debug('Error: %s', str(err), exc_info=True)
+                    self.logger.warning('Could not fetch link %s', iurl)
+                    self.logger.debug('Error: %s', str(err), exc_info=True)
                finally:
                    self.current_dir = diskpath
                    self.files += 1
@@ -313,12 +312,12 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
    return parser


-def create_fetcher(options):
-    return RecursiveFetcher(options)
-
-def setup_logger(options):
-    level = logging.DEBUG if options.verbose else logging.WARNING
-    setup_cli_handlers(logger, level)
+def create_fetcher(options, logger=None):
+    if logger is None:
+        level = logging.DEBUG if options.verbose else logging.INFO
+        logger = logging.getLogger('web2disk')
+        setup_cli_handlers(logger, level)
+    return RecursiveFetcher(options, logger)

def main(args=sys.argv):
    parser = option_parser()
@@ -327,7 +326,6 @@ def main(args=sys.argv):
        parser.print_help()
        return 1

-    setup_logger(options)
    fetcher = create_fetcher(options)
    fetcher.start_fetch(args[1])
