Migrate html2lrf, web2lrf to python logging framework

Kovid Goyal 2007-08-12 21:59:36 +00:00
parent 1f24807b87
commit 0b68623f86
4 changed files with 98 additions and 99 deletions
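In outline, the change replaces the scattered print statements in html2lrf, web2lrf and the web2disk fetcher with per-tool named loggers, configured once through the new setup_cli_handlers helper, and it passes the logger object down through the converter classes instead of checking a verbose flag at each call site. A condensed sketch of the pattern, using only names that appear in the diff below (the helper body is elided):

    import logging
    from libprs500 import setup_cli_handlers

    def process_file(path, options, logger=None):
        # Library callers may inject their own logger; the command-line
        # path builds a named one and derives the level from --verbose.
        if logger is None:
            level = logging.DEBUG if options.verbose else logging.INFO
            logger = logging.getLogger('html2lrf')
            setup_cli_handlers(logger, level)
        # ... conversion work ...
        logger.info('Output written to %s', path)  # formerly: print 'Output written to', path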

View File

@ -28,6 +28,9 @@ if iswindows:
except:
pass
class CommandLineError(Exception):
pass
def setup_cli_handlers(logger, level):
logger.setLevel(level)
if level == logging.WARNING:
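The body of setup_cli_handlers is truncated in this hunk; only the setLevel call and the start of a branch on logging.WARNING are visible. As a rough illustrative sketch, not the actual implementation, a helper with this signature would typically attach a single stderr handler and vary the message format with the requested level:

    import logging, sys

    def setup_cli_handlers(logger, level):
        # Hypothetical reconstruction for illustration only.
        logger.setLevel(level)
        handler = logging.StreamHandler(sys.stderr)
        if level == logging.WARNING:
            fmt = '%(levelname)s: %(message)s'
        elif level == logging.INFO:
            fmt = '%(message)s'
        else:  # DEBUG and below
            fmt = '[%(levelname)s] %(filename)s:%(lineno)s: %(message)s'
        handler.setFormatter(logging.Formatter(fmt))
        handler.setLevel(level)
        logger.addHandler(handler)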

View File

@ -20,7 +20,7 @@ Code to convert HTML ebooks into LRF ebooks.
I am indebted to esperanc for the initial CSS->Xylog Style conversion code
and to Falstaff for pylrs.
"""
import os, re, sys, shutil, traceback, copy, glob
import os, re, sys, shutil, copy, glob, logging
from htmlentitydefs import name2codepoint
from urllib import unquote
from urlparse import urlparse
@ -43,7 +43,7 @@ from libprs500.ebooks.lrf import Book
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks import ConversionError
from libprs500.ebooks.lrf.html.table import Table
from libprs500 import extract, filename_to_utf8
from libprs500 import extract, filename_to_utf8, setup_cli_handlers
from libprs500.ptempfile import PersistentTemporaryFile
class Span(_Span):
@ -84,7 +84,7 @@ class Span(_Span):
return result
@staticmethod
def translate_attrs(d, dpi, fonts, font_delta=0, memory=None):
def translate_attrs(d, dpi, fonts, logger, font_delta=0, memory=None):
"""
Receives a dictionary of html attributes and styles and returns
approximate Xylog equivalents in a new dictionary
@ -211,20 +211,20 @@ class Span(_Span):
else:
memory.append(key)
if report:
print >>sys.stderr, 'Unhandled/malformed CSS key:', key, d[key]
logger.info('Unhandled/malformed CSS key: %s: %s', key, d[key])
t['fontfacename'] = (family, font_key(family, style, weight))
if t.has_key('fontsize') and int(t['fontsize']) > 120:
t['wordspace'] = 50
return t
def __init__(self, ns, css, memory, dpi, fonts, font_delta=0, normal_font_size=100):
def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta=0, normal_font_size=100):
src = ns.string if hasattr(ns, 'string') else ns
src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces
for pat, repl in Span.rules:
src = pat.sub(repl, src)
if not src:
raise ConversionError('No point in adding an empty string to a Span')
attrs = Span.translate_attrs(css, dpi, fonts, font_delta=font_delta, memory=memory)
attrs = Span.translate_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory)
if 'fontsize' in attrs.keys():
normal_font_size = int(attrs['fontsize'])
variant = attrs.pop('fontvariant', None)
@ -323,7 +323,7 @@ class HTMLConverter(object):
else:
object.__setattr__(self, attr, val)
def __init__(self, book, fonts, path, options, link_level=0, is_root=True):
def __init__(self, book, fonts, path, options, logger, link_level=0, is_root=True):
'''
Convert HTML file at C{path} and add it to C{book}. After creating
the object, you must call L{self.process_links} on it to create the links and
@ -356,7 +356,8 @@ class HTMLConverter(object):
th = {'font-size' : 'large', 'font-weight':'bold'},
big = {'font-size' : 'large', 'font-weight':'bold'},
)
self.css['.libprs500_dropcaps'] = {'font-size': 'xx-large'}
self.css['.libprs500_dropcaps'] = {'font-size': 'xx-large'}
self.logger = logger
self.fonts = fonts #: dict specifying font families to use
self.scaled_images = {} #: Temporary files with scaled version of images
self.rotated_images = {} #: Temporary files with rotated version of images
@ -385,8 +386,7 @@ class HTMLConverter(object):
path = os.path.abspath(path)
os.chdir(os.path.dirname(path))
self.file_name = os.path.basename(path)
print "Processing", self.file_name
print '\tParsing HTML...',
self.logger.info('Processing %s\n\tParsing HTML...', self.file_name)
sys.stdout.flush()
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
@ -400,7 +400,7 @@ class HTMLConverter(object):
self.soup = BeautifulSoup(raw,
convertEntities=BeautifulSoup.HTML_ENTITIES,
markupMassage=nmassage)
print 'done\n\tConverting to BBeB...',
logger.info('\tConverting to BBeB...')
sys.stdout.flush()
self.current_page = None
self.current_para = None
@ -411,7 +411,6 @@ class HTMLConverter(object):
self.page_break_found = True
self.parse_file()
HTMLConverter.processed_files[path] = self
print 'done'
def parse_css(self, style):
"""
@ -554,8 +553,8 @@ class HTMLConverter(object):
if target.parent != None and \
hasattr(target.parent, 'objId'):
self.book.addTocEntry(ascii_text, tb)
elif self.verbose:
print "Cannot add link", ascii_text, "to TOC"
else:
self.logger.debug("Cannot add link %s to TOC", ascii_text)
def get_target_block(fragment, targets):
@ -624,21 +623,21 @@ class HTMLConverter(object):
if not os.access(path.encode('utf8', 'replace'), os.R_OK):
continue
except Exception:
if self.verbose:
print "Skipping", link
self.logger.exception('Skipping %s', link)
continue
path = os.path.abspath(path)
if not path in HTMLConverter.processed_files.keys():
try:
self.files[path] = HTMLConverter(
self.book, self.fonts, path, self.options,
self.logger,
link_level = self.link_level+1,
is_root = False,)
HTMLConverter.processed_files[path] = self.files[path]
except Exception:
print >>sys.stderr, 'Unable to process', path
self.logger.warning('Unable to process %s', path)
if self.verbose:
traceback.print_exc()
self.logger.exception('')
continue
finally:
os.chdir(cwd)
@ -759,12 +758,12 @@ class HTMLConverter(object):
else:
self.process_alignment(css)
try:
self.current_para.append(Span(src, self.sanctify_css(css), self.memory,\
self.profile.dpi, self.fonts, font_delta=self.font_delta))
self.current_para.append(Span(src, self.sanctify_css(css), self.memory,
self.profile.dpi, self.fonts, self.logger,
font_delta=self.font_delta))
self.current_para.normalize_spaces()
except ConversionError, err:
if self.verbose:
print >>sys.stderr, err
except ConversionError:
self.logger.exception('Bad text')
def sanctify_css(self, css):
""" Return a copy of C{css} that is safe for use in a SPAM Xylog tag """
@ -809,7 +808,7 @@ class HTMLConverter(object):
try:
im = PILImage.open(path)
except IOError, err:
print >>sys.stderr, 'Unable to process:', path, err
self.logger.warning('Unable to process image: %s\n%s', path, err)
return
@ -826,7 +825,7 @@ class HTMLConverter(object):
self.scaled_images[path] = pt
return pt.name
except IOError: # PIL chokes on interlaced PNG images
print >>sys.stderr, 'Unable to process interlaced PNG', path
self.logger.warning('Unable to process interlaced PNG %s', path)
return None
pheight = int(self.current_page.pageStyle.attrs['textheight'])
@ -863,10 +862,8 @@ class HTMLConverter(object):
path = pt.name
self.rotated_images[path] = pt
width, height = im.size
except IOError, err: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
if self.verbose:
print >>sys.stderr, 'Unable to autorotate interlaced PNG', path
print >>sys.stderr, err
except IOError: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
self.logger.debug('Unable to process interlaced PNG %s', path)
finally:
pt.close()
@ -945,8 +942,7 @@ class HTMLConverter(object):
if not self.page_break_found and self.page_break.match(tagname):
if len(self.current_page.contents) > 3:
self.end_page()
if self.verbose:
print 'Forcing page break at', tagname
self.logger.debug('Forcing page break at %s', tagname)
return end_page
def parse_tag(self, tag, parent_css):
@ -1048,8 +1044,7 @@ class HTMLConverter(object):
dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
else:
if self.verbose:
print >>sys.stderr, "Failed to process:", tag
self.logger.debug("Failed to process: %s", str(tag))
elif tagname in ['style', 'link']:
def update_css(ncss):
for key in ncss.keys():
@ -1083,7 +1078,8 @@ class HTMLConverter(object):
c.replaceWith(self.get_text(c))
self.end_current_para()
self.current_block.append_to(self.current_page)
attrs = Span.translate_attrs(tag_css, self.profile.dpi, self.fonts, self.font_delta, self.memory)
attrs = Span.translate_attrs(tag_css, self.profile.dpi, self.fonts,
self.logger, self.font_delta, self.memory)
attrs['fontfacename'] = self.fonts['mono']['normal'][1]
ts = self.book.create_text_style(**self.unindented_style.attrs)
ts.attrs.update(attrs)
@ -1185,8 +1181,7 @@ class HTMLConverter(object):
src = self.get_text(tag, limit=1000)
if self.chapter_detection and tagname.startswith('h'):
if self.chapter_regex.search(src):
if self.verbose:
print 'Detected chapter', src
self.logger.debug('Detected chapter %s', src)
self.end_page()
self.page_break_found = True
self.end_current_para()
@ -1241,9 +1236,8 @@ class HTMLConverter(object):
try:
self.process_table(tag, tag_css)
except Exception, err:
print 'WARNING: An error occurred while processing a table:', err
print 'Ignoring table markup for table:'
print str(tag)[:300]
self.logger.warning('An error occurred while processing a table: %s', str(err))
self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
self.in_table = False
self.process_children(tag, tag_css)
else:
@ -1275,16 +1269,20 @@ class HTMLConverter(object):
for _file in self.scaled_images.values() + self.rotated_images.values():
_file.__del__()
def process_file(path, options):
def process_file(path, options, logger=None):
if re.match('http://|https://', path):
raise ConversionError, 'You have to save the website %s as an html file first and then run html2lrf on it.'%(path,)
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('html2lrf')
setup_cli_handlers(logger, level)
cwd = os.getcwd()
dirpath = None
default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0])
try:
dirpath, path = get_path(path)
cpath, tpath = '', ''
try_opf(path, options)
try_opf(path, options, logger)
if options.cover:
options.cover = os.path.abspath(os.path.expanduser(options.cover))
cpath = options.cover
@ -1347,7 +1345,7 @@ def process_file(path, options):
fpba = ['$', '', '$']
options.force_page_break_attr = [re.compile(fpba[0], re.IGNORECASE), fpba[1],
re.compile(fpba[2], re.IGNORECASE)]
conv = HTMLConverter(book, fonts, path, options)
conv = HTMLConverter(book, fonts, path, options, logger)
conv.process_links()
oname = options.output
if not oname:
@ -1356,7 +1354,7 @@ def process_file(path, options):
oname = os.path.join(cwd,name)
oname = os.path.abspath(os.path.expanduser(oname))
conv.writeto(oname, lrs=options.lrs)
print 'Output written to', oname
logger.info('Output written to %s', oname)
conv.cleanup()
return oname
finally:
@ -1364,7 +1362,7 @@ def process_file(path, options):
if dirpath:
shutil.rmtree(dirpath, True)
def try_opf(path, options):
def try_opf(path, options, logger):
try:
opf = glob.glob(os.path.join(os.path.dirname(path),'*.opf'))[0]
except IndexError:
@ -1419,12 +1417,9 @@ def try_opf(path, options):
if not os.access(options.cover, os.R_OK):
options.cover = None
except:
if options.verbose:
traceback.print_exc()
except Exception, err:
if options.verbose:
print >>sys.stderr, 'Failed to process opf file', err
pass
logger.exception('Could not load cover')
except Exception:
logger.exception('Failed to process opf file')
def option_parser():
return lrf_option_parser('''Usage: %prog [options] mybook.[html|rar|zip]\n\n'''

View File

@ -14,14 +14,13 @@
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Convert known websites into LRF files.'''
import sys, time, tempfile, shutil, os
import sys, time, tempfile, shutil, os, logging
from urlparse import urlsplit
from libprs500 import __appname__
from libprs500 import __appname__, setup_cli_handlers, CommandLineError
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks.lrf.html.convert_from import process_file
from libprs500.ebooks.lrf.web.profiles import profiles
from libprs500.web.fetch.simple import setup_logger as web2disk_setup_logger
from libprs500.web.fetch.simple import create_fetcher
available_profiles = profiles.keys()
@ -57,14 +56,14 @@ def option_parser():
help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
return parser
def fetch_website(options):
def fetch_website(options, logger):
tdir = tempfile.mkdtemp(prefix=__appname__+'_' )
options.dir = tdir
fetcher = create_fetcher(options)
fetcher = create_fetcher(options, logger)
fetcher.preprocess_regexps = options.preprocess_regexps
return fetcher.start_fetch(options.url), tdir
def create_lrf(htmlfile, options):
def create_lrf(htmlfile, options, logger):
if not options.author:
options.author = __appname__
options.header = True
@ -73,20 +72,16 @@ def create_lrf(htmlfile, options):
else:
options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf')))
process_file(htmlfile, options)
process_file(htmlfile, options, logger)
def main(args=sys.argv):
parser = option_parser()
options, args = parser.parse_args(args)
web2disk_setup_logger(options)
if len(args) > 2:
parser.print_help()
return 1
def process_profile(args, options, logger=None):
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('web2lrf')
setup_cli_handlers(logger, level)
if len(args) == 2:
if not profiles.has_key(args[1]):
print >>sys.stderr, 'Unknown profile', args[1]
print >>sys.stderr, 'Valid profiles:', profiles.keys()
return 1
raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
profile = profiles[args[1]] if len(args) == 2 else profiles['default']
if profile.has_key('initialize'):
@ -98,11 +93,7 @@ def main(args=sys.argv):
setattr(options, opt, profile[opt])
if not options.url:
parser.print_help()
print >>sys.stderr
print >>sys.stderr, 'You must specify the --url option or a profile from one of:',
print >>sys.stderr, available_profiles
return 1
raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,))
if not options.title:
title = profile['title']
@ -114,12 +105,24 @@ def main(args=sys.argv):
options.preprocess_regexps = profile['preprocess_regexps']
options.filter_regexps += profile['filter_regexps']
htmlfile, tdir = fetch_website(options)
create_lrf(htmlfile, options)
htmlfile, tdir = fetch_website(options, logger)
create_lrf(htmlfile, options, logger)
if profile.has_key('finalize'):
profile['finalize'](profile)
shutil.rmtree(tdir)
def main(args=sys.argv):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) > 2:
parser.print_help()
return 1
try:
process_profile(args, options)
except CommandLineError, err:
print >>sys.stderr, err
return 0
if __name__ == '__main__':

View File

@ -23,8 +23,6 @@ from optparse import OptionParser
from libprs500 import __version__, __appname__, __author__, setup_cli_handlers
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
logger = logging.getLogger('libprs500.web.fetch.simple')
class FetchError(Exception):
pass
@ -52,7 +50,8 @@ class RecursiveFetcher(object):
# )
CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
def __init__(self, options):
def __init__(self, options, logger):
self.logger = logger
self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
if not os.path.exists(self.base_dir):
os.makedirs(self.base_dir)
@ -80,7 +79,7 @@ class RecursiveFetcher(object):
def fetch_url(self, url):
f = None
logger.info('Fetching %s', url)
self.logger.debug('Fetching %s', url)
delta = time.time() - self.last_fetch_at
if delta < self.delay:
time.sleep(delta)
@ -138,8 +137,8 @@ class RecursiveFetcher(object):
try:
f = self.fetch_url(iurl)
except Exception, err:
logger.warning('Could not fetch stylesheet %s', iurl)
logger.debug('Error: %s', str(err), exc_info=True)
self.logger.warning('Could not fetch stylesheet %s', iurl)
self.logger.debug('Error: %s', str(err), exc_info=True)
continue
c += 1
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@ -160,8 +159,8 @@ class RecursiveFetcher(object):
try:
f = self.fetch_url(iurl)
except Exception, err:
logger.warning('Could not fetch stylesheet %s', iurl)
logger.debug('Error: %s', str(err), exc_info=True)
self.logger.warning('Could not fetch stylesheet %s', iurl)
self.logger.debug('Error: %s', str(err), exc_info=True)
continue
c += 1
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@ -179,7 +178,7 @@ class RecursiveFetcher(object):
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl, ext = tag['src'], os.path.splitext(tag['src'])[1]
if not ext:
logger.info('Skipping extensionless image %s', iurl)
self.logger.debug('Skipping extensionless image %s', iurl)
continue
if not urlparse.urlsplit(iurl).scheme:
iurl = urlparse.urljoin(baseurl, iurl, False)
@ -189,8 +188,8 @@ class RecursiveFetcher(object):
try:
f = self.fetch_url(iurl)
except Exception, err:
logger.warning('Could not fetch image %s', iurl)
logger.debug('Error: %s', str(err), exc_info=True)
self.logger.warning('Could not fetch image %s', iurl)
self.logger.debug('Error: %s', str(err), exc_info=True)
continue
c += 1
imgpath = os.path.join(diskpath, 'img'+str(c)+ext)
@ -206,7 +205,7 @@ class RecursiveFetcher(object):
if not parts.scheme:
iurl = urlparse.urljoin(baseurl, iurl, False)
if not self.is_link_ok(iurl):
logger.info('Skipping invalid link: %s', iurl)
self.logger.debug('Skipping invalid link: %s', iurl)
return None
return iurl
@ -258,7 +257,7 @@ class RecursiveFetcher(object):
self.current_dir = linkdiskpath
f = self.fetch_url(iurl)
soup = self.get_soup(f.read())
logger.info('Processing images...')
self.logger.debug('Processing images...')
self.process_images(soup, f.geturl())
if self.download_stylesheets:
self.process_stylesheets(soup, f.geturl())
@ -266,17 +265,17 @@ class RecursiveFetcher(object):
res = os.path.join(linkdiskpath, basename(iurl))
self.filemap[nurl] = res
if recursion_level < self.max_recursions:
logger.info('Processing links...')
self.logger.debug('Processing links...')
self.process_links(soup, iurl, recursion_level+1)
else:
self.process_return_links(soup, iurl)
logger.info('Recursion limit reached. Skipping %s', iurl)
self.logger.debug('Recursion limit reached. Skipping %s', iurl)
save_soup(soup, res)
self.localize_link(tag, 'href', res)
except Exception, err:
logger.warning('Could not fetch link %s', iurl)
logger.debug('Error: %s', str(err), exc_info=True)
self.logger.warning('Could not fetch link %s', iurl)
self.logger.debug('Error: %s', str(err), exc_info=True)
finally:
self.current_dir = diskpath
self.files += 1
@ -313,12 +312,12 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
return parser
def create_fetcher(options):
return RecursiveFetcher(options)
def setup_logger(options):
level = logging.DEBUG if options.verbose else logging.WARNING
setup_cli_handlers(logger, level)
def create_fetcher(options, logger=None):
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('web2disk')
setup_cli_handlers(logger, level)
return RecursiveFetcher(options, logger)
def main(args=sys.argv):
parser = option_parser()
@ -327,7 +326,6 @@ def main(args=sys.argv):
parser.print_help()
return 1
setup_logger(options)
fetcher = create_fetcher(options)
fetcher.start_fetch(args[1])