diff --git a/src/libprs500/__init__.py b/src/libprs500/__init__.py
index 4fb40760fe..e5413c8ddc 100644
--- a/src/libprs500/__init__.py
+++ b/src/libprs500/__init__.py
@@ -28,6 +28,9 @@ if iswindows:
     except:
         pass
 
+class CommandLineError(Exception):
+    pass
+
 def setup_cli_handlers(logger, level):
     logger.setLevel(level)
     if level == logging.WARNING:
diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py
index 47651a284f..5e9ea60fda 100644
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -20,7 +20,7 @@ Code to convert HTML ebooks into LRF ebooks.
 I am indebted to esperanc for the initial CSS->Xylog Style conversion code
 and to Falstaff for pylrs.
 """
-import os, re, sys, shutil, traceback, copy, glob
+import os, re, sys, shutil, copy, glob, logging
 from htmlentitydefs import name2codepoint
 from urllib import unquote
 from urlparse import urlparse
@@ -43,7 +43,7 @@ from libprs500.ebooks.lrf import Book
 from libprs500.ebooks.lrf import option_parser as lrf_option_parser
 from libprs500.ebooks import ConversionError
 from libprs500.ebooks.lrf.html.table import Table
-from libprs500 import extract, filename_to_utf8
+from libprs500 import extract, filename_to_utf8, setup_cli_handlers
 from libprs500.ptempfile import PersistentTemporaryFile
 
 class Span(_Span):
@@ -84,7 +84,7 @@ class Span(_Span):
         return result
 
     @staticmethod
-    def translate_attrs(d, dpi, fonts, font_delta=0, memory=None):
+    def translate_attrs(d, dpi, fonts, logger, font_delta=0, memory=None):
         """
         Receives a dictionary of html attributes and styles and returns
         approximate Xylog equivalents in a new dictionary
@@ -211,20 +211,20 @@ class Span(_Span):
             else:
                 memory.append(key)
         if report:
-            print >>sys.stderr, 'Unhandled/malformed CSS key:', key, d[key]
+            logger.info('Unhandled/malformed CSS key: %s: %s', key, d[key])
         t['fontfacename'] = (family, font_key(family, style, weight))
         if t.has_key('fontsize') and int(t['fontsize']) > 120:
             t['wordspace'] = 50
         return t
 
-    def __init__(self, ns, css, memory, dpi, fonts, font_delta=0, normal_font_size=100):
+    def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta=0, normal_font_size=100):
         src = ns.string if hasattr(ns, 'string') else ns
         src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces
         for pat, repl in Span.rules:
             src = pat.sub(repl, src)
         if not src:
             raise ConversionError('No point in adding an empty string to a Span')
-        attrs = Span.translate_attrs(css, dpi, fonts, font_delta=font_delta, memory=memory)
+        attrs = Span.translate_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory)
         if 'fontsize' in attrs.keys():
             normal_font_size = int(attrs['fontsize'])
         variant = attrs.pop('fontvariant', None)
@@ -323,7 +323,7 @@ class HTMLConverter(object):
         else:
             object.__setattr__(self, attr, val)
 
-    def __init__(self, book, fonts, path, options, link_level=0, is_root=True):
+    def __init__(self, book, fonts, path, options, logger, link_level=0, is_root=True):
         '''
         Convert HTML file at C{path} and add it to C{book}. After creating
         the object, you must call L{self.process_links} on it to create the links and
@@ -356,7 +356,8 @@ class HTMLConverter(object):
             th     = {'font-size' : 'large', 'font-weight':'bold'},
             big    = {'font-size' : 'large', 'font-weight':'bold'},
             )
-        self.css['.libprs500_dropcaps'] = {'font-size': 'xx-large'}
+        self.css['.libprs500_dropcaps'] = {'font-size': 'xx-large'}
+        self.logger = logger
         self.fonts = fonts #: dict specifting font families to use
         self.scaled_images = {}  #: Temporary files with scaled version of images
         self.rotated_images = {} #: Temporary files with rotated version of images
@@ -385,8 +386,7 @@ class HTMLConverter(object):
         path = os.path.abspath(path)
         os.chdir(os.path.dirname(path))
         self.file_name = os.path.basename(path)
-        print "Processing", self.file_name
-        print '\tParsing HTML...',
+        self.logger.info('Processing %s\n\tParsing HTML...', self.file_name)
         sys.stdout.flush()
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
@@ -400,7 +400,7 @@ class HTMLConverter(object):
             self.soup = BeautifulSoup(raw,
                          convertEntities=BeautifulSoup.HTML_ENTITIES,
                          markupMassage=nmassage)
-        print 'done\n\tConverting to BBeB...',
+        logger.info('\tConverting to BBeB...')
         sys.stdout.flush()
         self.current_page = None
         self.current_para = None
@@ -411,7 +411,6 @@ class HTMLConverter(object):
             self.page_break_found = True
         self.parse_file()
         HTMLConverter.processed_files[path] = self
-        print 'done'
 
     def parse_css(self, style):
         """
@@ -554,8 +553,8 @@ class HTMLConverter(object):
                 if target.parent != None and \
                    hasattr(target.parent, 'objId'):
                     self.book.addTocEntry(ascii_text, tb)
-                elif self.verbose:
-                    print "Cannot add link", ascii_text, "to TOC"
+                else:
+                    self.logger.debug("Cannot add link %s to TOC", ascii_text)
 
 
         def get_target_block(fragment, targets):
@@ -624,21 +623,21 @@ class HTMLConverter(object):
                     if not os.access(path.encode('utf8', 'replace'), os.R_OK):
                         continue
                 except Exception:
-                    if self.verbose:
-                        print "Skipping", link
+                    self.logger.exception('Skipping %s', link)
                     continue
                 path = os.path.abspath(path)
                 if not path in HTMLConverter.processed_files.keys():
                     try:
                         self.files[path] = HTMLConverter(
                                      self.book, self.fonts, path, self.options,
+                                     self.logger,
                                      link_level = self.link_level+1,
                                      is_root = False,)
                         HTMLConverter.processed_files[path] = self.files[path]
                     except Exception:
-                        print >>sys.stderr, 'Unable to process', path
+                        self.logger.warning('Unable to process %s', path)
                         if self.verbose:
-                            traceback.print_exc()
+                            self.logger.exception('')
                         continue
                     finally:
                         os.chdir(cwd)
@@ -759,12 +758,12 @@ class HTMLConverter(object):
         else:
             self.process_alignment(css)
         try:
-            self.current_para.append(Span(src, self.sanctify_css(css), self.memory,\
-                                          self.profile.dpi, self.fonts, font_delta=self.font_delta))
+            self.current_para.append(Span(src, self.sanctify_css(css), self.memory,
+                                          self.profile.dpi, self.fonts, self.logger,
+                                          font_delta=self.font_delta))
             self.current_para.normalize_spaces()
-        except ConversionError, err:
-            if self.verbose:
-                print >>sys.stderr, err
+        except ConversionError:
+            self.logger.exception('Bad text')
 
     def sanctify_css(self, css):
         """ Return a copy of C{css} that is safe for use in a SPAM Xylog tag """
@@ -809,7 +808,7 @@ class HTMLConverter(object):
         try:
             im = PILImage.open(path)
         except IOError, err:
-            print >>sys.stderr, 'Unable to process:', path, err
+            self.logger.warning('Unable to process image: %s\n%s', path, err)
            return


@@ -826,7 +825,7 @@ class HTMLConverter(object):
                     self.scaled_images[path] = pt
                     return pt.name
                 except IOError: # PIL chokes on interlaced PNG images
-                    print >>sys.stderr, 'Unable to process interlaced PNG', path
+                    self.logger.warning('Unable to process interlaced PNG %s', path)
                     return None

         pheight = int(self.current_page.pageStyle.attrs['textheight'])
@@ -863,10 +862,8 @@ class HTMLConverter(object):
                     path = pt.name
                     self.rotated_images[path] = pt
                     width, height = im.size
-                except IOError, err: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
-                    if self.verbose:
-                        print >>sys.stderr, 'Unable to autorotate interlaced PNG', path
-                        print >>sys.stderr, err
+                except IOError: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
+                    self.logger.debug('Unable to process interlaced PNG %s', path)
                 finally:
                     pt.close()

@@ -945,8 +942,7 @@ class HTMLConverter(object):
             if not self.page_break_found and self.page_break.match(tagname):
                 if len(self.current_page.contents) > 3:
                     self.end_page()
-                    if self.verbose:
-                        print 'Forcing page break at', tagname
+                    self.logger.debug('Forcing page break at %s', tagname)
         return end_page

     def parse_tag(self, tag, parent_css):
@@ -1048,8 +1044,7 @@ class HTMLConverter(object):
                     dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
                     self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
                 else:
-                    if self.verbose:
-                        print >>sys.stderr, "Failed to process:", tag
+                    self.logger.debug("Failed to process: %s", str(tag))
             elif tagname in ['style', 'link']:
                 def update_css(ncss):
                     for key in ncss.keys():
@@ -1083,7 +1078,8 @@ class HTMLConverter(object):
                     c.replaceWith(self.get_text(c))
                 self.end_current_para()
                 self.current_block.append_to(self.current_page)
-                attrs = Span.translate_attrs(tag_css, self.profile.dpi, self.fonts, self.font_delta, self.memory)
+                attrs = Span.translate_attrs(tag_css, self.profile.dpi, self.fonts,
+                                             self.logger, self.font_delta, self.memory)
                 attrs['fontfacename'] = self.fonts['mono']['normal'][1]
                 ts = self.book.create_text_style(**self.unindented_style.attrs)
                 ts.attrs.update(attrs)
@@ -1185,8 +1181,7 @@ class HTMLConverter(object):
             src = self.get_text(tag, limit=1000)
             if self.chapter_detection and tagname.startswith('h'):
                 if self.chapter_regex.search(src):
-                    if self.verbose:
-                        print 'Detected chapter', src
+                    self.logger.debug('Detected chapter %s', src)
                     self.end_page()
                     self.page_break_found = True
                     self.end_current_para()
@@ -1241,9 +1236,8 @@ class HTMLConverter(object):
                 try:
                     self.process_table(tag, tag_css)
                 except Exception, err:
-                    print 'WARNING: An error occurred while processing a table:', err
-                    print 'Ignoring table markup for table:'
-                    print str(tag)[:300]
+                    self.logger.warning('An error occurred while processing a table: %s', str(err))
+                    self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
                     self.in_table = False
                     self.process_children(tag, tag_css)
             else:
@@ -1275,16 +1269,20 @@ class HTMLConverter(object):
         for _file in self.scaled_images.values() + self.rotated_images.values():
             _file.__del__()

-def process_file(path, options):
+def process_file(path, options, logger=None):
     if re.match('http://|https://', path):
         raise ConversionError, 'You have to save the website %s as an html file first and then run html2lrf on it.'%(path,)
+    if logger is None:
+        level = logging.DEBUG if options.verbose else logging.INFO
+        logger = logging.getLogger('html2lrf')
+        setup_cli_handlers(logger, level)
     cwd = os.getcwd()
     dirpath = None
     default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0])
     try:
         dirpath, path = get_path(path)
         cpath, tpath = '', ''
-        try_opf(path, options)
+        try_opf(path, options, logger)
         if options.cover:
             options.cover = os.path.abspath(os.path.expanduser(options.cover))
             cpath = options.cover
@@ -1347,7 +1345,7 @@ def process_file(path, options):
             fpba = ['$', '', '$']
         options.force_page_break_attr = [re.compile(fpba[0], re.IGNORECASE), fpba[1],
                                          re.compile(fpba[2], re.IGNORECASE)]
-        conv = HTMLConverter(book, fonts, path, options)
+        conv = HTMLConverter(book, fonts, path, options, logger)
         conv.process_links()
         oname = options.output
         if not oname:
@@ -1356,7 +1354,7 @@ def process_file(path, options):
             oname = os.path.join(cwd,name)
         oname = os.path.abspath(os.path.expanduser(oname))
         conv.writeto(oname, lrs=options.lrs)
-        print 'Output written to', oname
+        logger.info('Output written to %s', oname)
         conv.cleanup()
         return oname
     finally:
@@ -1364,7 +1362,7 @@ def process_file(path, options):
         if dirpath:
             shutil.rmtree(dirpath, True)

-def try_opf(path, options):
+def try_opf(path, options, logger):
     try:
         opf = glob.glob(os.path.join(os.path.dirname(path),'*.opf'))[0]
     except IndexError:
@@ -1419,12 +1417,9 @@ def try_opf(path, options):
                 if not os.access(options.cover, os.R_OK):
                     options.cover = None
         except:
-            if options.verbose:
-                traceback.print_exc()
-    except Exception, err:
-        if options.verbose:
-            print >>sys.stderr, 'Failed to process opf file', err
-        pass
+            logger.exception('Could not load cover')
+    except Exception:
+        logger.exception('Failed to process opf file')

 def option_parser():
     return lrf_option_parser('''Usage: %prog [options] mybook.[html|rar|zip]\n\n'''
diff --git a/src/libprs500/ebooks/lrf/web/convert_from.py b/src/libprs500/ebooks/lrf/web/convert_from.py
index c8e2a5e56a..11690b0e60 100644
--- a/src/libprs500/ebooks/lrf/web/convert_from.py
+++ b/src/libprs500/ebooks/lrf/web/convert_from.py
@@ -14,14 +14,13 @@
 ##  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''Convert known websites into LRF files.'''

-import sys, time, tempfile, shutil, os
+import sys, time, tempfile, shutil, os, logging
 from urlparse import urlsplit

-from libprs500 import __appname__
+from libprs500 import __appname__, setup_cli_handlers, CommandLineError
 from libprs500.ebooks.lrf import option_parser as lrf_option_parser
 from libprs500.ebooks.lrf.html.convert_from import process_file
 from libprs500.ebooks.lrf.web.profiles import profiles
-from libprs500.web.fetch.simple import setup_logger as web2disk_setup_logger
 from libprs500.web.fetch.simple import create_fetcher

 available_profiles = profiles.keys()
@@ -57,14 +56,14 @@ def option_parser():
               help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
     return parser

-def fetch_website(options):
+def fetch_website(options, logger):
     tdir = tempfile.mkdtemp(prefix=__appname__+'_' )
     options.dir = tdir
-    fetcher = create_fetcher(options)
+    fetcher = create_fetcher(options, logger)
     fetcher.preprocess_regexps = options.preprocess_regexps
     return fetcher.start_fetch(options.url), tdir

-def create_lrf(htmlfile, options):
+def create_lrf(htmlfile, options, logger):
     if not options.author:
         options.author = __appname__
     options.header = True
@@ -73,20 +72,16 @@ def create_lrf(htmlfile, options):
     else:
         options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf')))
-    process_file(htmlfile, options)
+    process_file(htmlfile, options, logger)

-def main(args=sys.argv):
-    parser = option_parser()
-    options, args = parser.parse_args(args)
-    web2disk_setup_logger(options)
-    if len(args) > 2:
-        parser.print_help()
-        return 1
+def process_profile(args, options, logger=None):
+    if logger is None:
+        level = logging.DEBUG if options.verbose else logging.INFO
+        logger = logging.getLogger('web2lrf')
+        setup_cli_handlers(logger, level)

     if len(args) == 2:
         if not profiles.has_key(args[1]):
-            print >>sys.stderr, 'Unknown profile', args[1]
-            print >>sys.stderr, 'Valid profiles:', profiles.keys()
-            return 1
+            raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
     profile = profiles[args[1]] if len(args) == 2 else profiles['default']

     if profile.has_key('initialize'):
@@ -98,11 +93,7 @@ def main(args=sys.argv):
             setattr(options, opt, profile[opt])

     if not options.url:
-        parser.print_help()
-        print >>sys.stderr
-        print >>sys.stderr, 'You must specify the --url option or a profile from one of:',
-        print >>sys.stderr, available_profiles
-        return 1
+        raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,))

     if not options.title:
         title = profile['title']
@@ -114,12 +105,24 @@
         options.preprocess_regexps = profile['preprocess_regexps']
         options.filter_regexps += profile['filter_regexps']

-    htmlfile, tdir = fetch_website(options)
-    create_lrf(htmlfile, options)
+    htmlfile, tdir = fetch_website(options, logger)
+    create_lrf(htmlfile, options, logger)
     if profile.has_key('finalize'):
         profile['finalize'](profile)
     shutil.rmtree(tdir)
-    
+
+
+
+def main(args=sys.argv):
+    parser = option_parser()
+    options, args = parser.parse_args(args)
+    if len(args) > 2:
+        parser.print_help()
+        return 1
+    try:
+        process_profile(args, options)
+    except CommandLineError, err:
+        print >>sys.stderr, err
     return 0

 if __name__ == '__main__':
diff --git a/src/libprs500/web/fetch/simple.py b/src/libprs500/web/fetch/simple.py
index feb7bdd808..b9f6bbe388 100644
--- a/src/libprs500/web/fetch/simple.py
+++ b/src/libprs500/web/fetch/simple.py
@@ -23,8 +23,6 @@ from optparse import OptionParser
 from libprs500 import __version__, __appname__, __author__, setup_cli_handlers
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup

-logger = logging.getLogger('libprs500.web.fetch.simple')
-
 class FetchError(Exception):
     pass

@@ -52,7 +50,8 @@ class RecursiveFetcher(object):
     #                       )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)

-    def __init__(self, options):
+    def __init__(self, options, logger):
+        self.logger = logger
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
         if not os.path.exists(self.base_dir):
             os.makedirs(self.base_dir)
@@ -80,7 +79,7 @@ class RecursiveFetcher(object):

     def fetch_url(self, url):
         f = None
-        logger.info('Fetching %s', url)
+        self.logger.debug('Fetching %s', url)
         delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(delta)
@@ -138,8 +137,8 @@ class RecursiveFetcher(object):
                 try:
                     f = self.fetch_url(iurl)
                 except Exception, err:
-                    logger.warning('Could not fetch stylesheet %s', iurl)
-                    logger.debug('Error: %s', str(err), exc_info=True)
+                    self.logger.warning('Could not fetch stylesheet %s', iurl)
+                    self.logger.debug('Error: %s', str(err), exc_info=True)
                     continue
                 c += 1
                 stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@@ -160,8 +159,8 @@ class RecursiveFetcher(object):
                     try:
                         f = self.fetch_url(iurl)
                     except Exception, err:
-                        logger.warning('Could not fetch stylesheet %s', iurl)
-                        logger.debug('Error: %s', str(err), exc_info=True)
+                        self.logger.warning('Could not fetch stylesheet %s', iurl)
+                        self.logger.debug('Error: %s', str(err), exc_info=True)
                         continue
                     c += 1
                     stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@@ -179,7 +178,7 @@ class RecursiveFetcher(object):
         for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
             iurl, ext = tag['src'], os.path.splitext(tag['src'])[1]
             if not ext:
-                logger.info('Skipping extensionless image %s', iurl)
+                self.logger.debug('Skipping extensionless image %s', iurl)
                 continue
             if not urlparse.urlsplit(iurl).scheme:
                 iurl = urlparse.urljoin(baseurl, iurl, False)
@@ -189,8 +188,8 @@ class RecursiveFetcher(object):
             try:
                 f = self.fetch_url(iurl)
             except Exception, err:
-                logger.warning('Could not fetch image %s', iurl)
-                logger.debug('Error: %s', str(err), exc_info=True)
+                self.logger.warning('Could not fetch image %s', iurl)
+                self.logger.debug('Error: %s', str(err), exc_info=True)
                 continue
             c += 1
             imgpath = os.path.join(diskpath, 'img'+str(c)+ext)
@@ -206,7 +205,7 @@ class RecursiveFetcher(object):
         if not parts.scheme:
             iurl = urlparse.urljoin(baseurl, iurl, False)
         if not self.is_link_ok(iurl):
-            logger.info('Skipping invalid link: %s', iurl)
+            self.logger.debug('Skipping invalid link: %s', iurl)
             return None
         return iurl

@@ -258,7 +257,7 @@ class RecursiveFetcher(object):
                         self.current_dir = linkdiskpath
                         f = self.fetch_url(iurl)
                         soup = self.get_soup(f.read())
-                        logger.info('Processing images...')
+                        self.logger.debug('Processing images...')
                         self.process_images(soup, f.geturl())
                         if self.download_stylesheets:
                             self.process_stylesheets(soup, f.geturl())
@@ -266,17 +265,17 @@ class RecursiveFetcher(object):
                         res = os.path.join(linkdiskpath, basename(iurl))
                         self.filemap[nurl] = res
                         if recursion_level < self.max_recursions:
-                            logger.info('Processing links...')
+                            self.logger.debug('Processing links...')
                             self.process_links(soup, iurl, recursion_level+1)
                         else:
                             self.process_return_links(soup, iurl)
-                            logger.info('Recursion limit reached. Skipping %s', iurl)
+                            self.logger.debug('Recursion limit reached. Skipping %s', iurl)

                         save_soup(soup, res)
                         self.localize_link(tag, 'href', res)
                     except Exception, err:
-                        logger.warning('Could not fetch link %s', iurl)
-                        logger.debug('Error: %s', str(err), exc_info=True)
+                        self.logger.warning('Could not fetch link %s', iurl)
+                        self.logger.debug('Error: %s', str(err), exc_info=True)
                     finally:
                         self.current_dir = diskpath
                         self.files += 1
@@ -313,12 +312,12 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
     return parser


-def create_fetcher(options):
-    return RecursiveFetcher(options)
-
-def setup_logger(options):
-    level = logging.DEBUG if options.verbose else logging.WARNING
-    setup_cli_handlers(logger, level)
+def create_fetcher(options, logger=None):
+    if logger is None:
+        level = logging.DEBUG if options.verbose else logging.INFO
+        logger = logging.getLogger('web2disk')
+        setup_cli_handlers(logger, level)
+    return RecursiveFetcher(options, logger)

 def main(args=sys.argv):
     parser = option_parser()
@@ -327,7 +326,6 @@ def main(args=sys.argv):
         parser.print_help()
         return 1

-    setup_logger(options)
     fetcher = create_fetcher(options)
     fetcher.start_fetch(args[1])
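
Note on the pattern introduced above: each entry point (process_file, process_profile, create_fetcher) now accepts an optional logger and only creates and configures a named one, via setup_cli_handlers, when the caller does not supply its own, so a GUI or library caller can inject a single shared logger while the command-line scripts get a sensible default. The sketch below restates that fallback outside the patch. It is a minimal illustration only: setup_cli_handlers_sketch, process_something and ExampleOptions are hypothetical names, and the stderr handler configuration is an assumption, not the actual body of libprs500.setup_cli_handlers (which is not shown in this diff).

    # Minimal sketch of the logger-injection pattern used by this patch.
    # setup_cli_handlers_sketch stands in for libprs500.setup_cli_handlers,
    # whose implementation is not part of the diff; the stderr handler and
    # format below are assumptions for illustration.
    import logging, sys

    def setup_cli_handlers_sketch(logger, level):
        logger.setLevel(level)
        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
        logger.addHandler(handler)

    def process_something(options, logger=None):
        # Same fallback as process_file/process_profile/create_fetcher:
        # build a named logger only when the caller did not pass one in.
        if logger is None:
            level = logging.DEBUG if options.verbose else logging.INFO
            logger = logging.getLogger('example_tool')
            setup_cli_handlers_sketch(logger, level)
        logger.info('Processing %s', options.url)
        return logger

    if __name__ == '__main__':
        class ExampleOptions(object):
            # Hypothetical stand-in for the optparse options object.
            verbose = True
            url = 'http://example.com'
        process_something(ExampleOptions())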