diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 30f423fce3..484d46dc36 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -18,7 +18,7 @@ every time you add an HTML file to the library.\ file_types = set(['html', 'htm', 'xhtml', 'xhtm']) supported_platforms = ['windows', 'osx', 'linux'] on_import = True - + def run(self, htmlfile): of = self.temporary_file('_plugin_html2zip.zip') from calibre.ebooks.html import gui_main as html2oeb @@ -26,172 +26,173 @@ every time you add an HTML file to the library.\ return of.name class OPFMetadataReader(MetadataReaderPlugin): - + name = 'Read OPF metadata' file_types = set(['opf']) description = _('Read metadata from %s files')%'OPF' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.opf2 import OPF from calibre.ebooks.metadata import MetaInformation return MetaInformation(OPF(stream, os.getcwd())) class RTFMetadataReader(MetadataReaderPlugin): - - name = 'Read RTF metadata' + + name = 'Read RTF metadata' file_types = set(['rtf']) description = _('Read metadata from %s files')%'RTF' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.rtf import get_metadata return get_metadata(stream) class FB2MetadataReader(MetadataReaderPlugin): - + name = 'Read FB2 metadata' file_types = set(['fb2']) description = _('Read metadata from %s files')%'FB2' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.fb2 import get_metadata return get_metadata(stream) class LRFMetadataReader(MetadataReaderPlugin): - + name = 'Read LRF metadata' file_types = set(['lrf']) description = _('Read metadata from %s files')%'LRF' - + def get_metadata(self, stream, ftype): from calibre.ebooks.lrf.meta import get_metadata return get_metadata(stream) class PDFMetadataReader(MetadataReaderPlugin): - + name = 'Read PDF metadata' file_types = set(['pdf']) description = _('Read metadata from %s files')%'PDF' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.pdf import get_metadata return get_metadata(stream) class LITMetadataReader(MetadataReaderPlugin): - + name = 'Read LIT metadata' file_types = set(['lit']) description = _('Read metadata from %s files')%'LIT' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.lit import get_metadata return get_metadata(stream) class IMPMetadataReader(MetadataReaderPlugin): - + name = 'Read IMP metadata' file_types = set(['imp']) description = _('Read metadata from %s files')%'IMP' author = 'Ashish Kulkarni' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.imp import get_metadata return get_metadata(stream) class RBMetadataReader(MetadataReaderPlugin): - + name = 'Read RB metadata' file_types = set(['rb']) description = _('Read metadata from %s files')%'RB' author = 'Ashish Kulkarni' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.rb import get_metadata return get_metadata(stream) class EPUBMetadataReader(MetadataReaderPlugin): - + name = 'Read EPUB metadata' file_types = set(['epub']) description = _('Read metadata from %s files')%'EPUB' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.epub import get_metadata return get_metadata(stream) class HTMLMetadataReader(MetadataReaderPlugin): - + name = 'Read HTML metadata' file_types = set(['html']) description = _('Read metadata from %s files')%'HTML' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.html import get_metadata return 
get_metadata(stream) class MOBIMetadataReader(MetadataReaderPlugin): - + name = 'Read MOBI metadata' file_types = set(['mobi', 'prc', 'azw']) description = _('Read metadata from %s files')%'MOBI' - + def get_metadata(self, stream, ftype): from calibre.ebooks.mobi.reader import get_metadata return get_metadata(stream) class TOPAZMetadataReader(MetadataReaderPlugin): - + name = 'Read Topaz metadata' file_types = set(['tpz', 'azw1']) description = _('Read metadata from %s files')%'MOBI' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.topaz import get_metadata return get_metadata(stream) class ODTMetadataReader(MetadataReaderPlugin): - + name = 'Read ODT metadata' file_types = set(['odt']) description = _('Read metadata from %s files')%'ODT' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.odt import get_metadata return get_metadata(stream) - + class TXTMetadataReader(MetadataReaderPlugin): - + name = 'Read TXT metadata' file_types = set(['txt']) description = _('Read metadata from %s files') % 'TXT' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.txt import get_metadata return get_metadata(stream) class LRXMetadataReader(MetadataReaderPlugin): - + name = 'Read LRX metadata' file_types = set(['lrx']) description = _('Read metadata from %s files')%'LRX' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.lrx import get_metadata return get_metadata(stream) class ComicMetadataReader(MetadataReaderPlugin): - + name = 'Read comic metadata' file_types = set(['cbr', 'cbz']) description = _('Extract cover from comic files') - + def get_metadata(self, stream, ftype): if ftype == 'cbr': from calibre.libunrar import extract_member as extract_first + extract_first else: from calibre.libunzip import extract_member as extract_first - from calibre.ebooks.metadata import MetaInformation + from calibre.ebooks.metadata import MetaInformation ret = extract_first(stream) mi = MetaInformation(None, None) if ret is not None: @@ -199,65 +200,65 @@ class ComicMetadataReader(MetadataReaderPlugin): ext = os.path.splitext(path)[1][1:] mi.cover_data = (ext.lower(), data) return mi - + class ZipMetadataReader(MetadataReaderPlugin): - + name = 'Read ZIP metadata' file_types = set(['zip', 'oebzip']) description = _('Read metadata from ebooks in ZIP archives') - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.zip import get_metadata return get_metadata(stream) class RARMetadataReader(MetadataReaderPlugin): - + name = 'Read RAR metadata' file_types = set(['rar']) description = _('Read metadata from ebooks in RAR archives') - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.rar import get_metadata return get_metadata(stream) class EPUBMetadataWriter(MetadataWriterPlugin): - + name = 'Set EPUB metadata' file_types = set(['epub']) description = _('Set metadata in %s files')%'EPUB' - + def set_metadata(self, stream, mi, type): from calibre.ebooks.metadata.epub import set_metadata set_metadata(stream, mi) - + class LRFMetadataWriter(MetadataWriterPlugin): - + name = 'Set LRF metadata' file_types = set(['lrf']) description = _('Set metadata in %s files')%'LRF' - + def set_metadata(self, stream, mi, type): from calibre.ebooks.lrf.meta import set_metadata set_metadata(stream, mi) class RTFMetadataWriter(MetadataWriterPlugin): - + name = 'Set RTF metadata' file_types = set(['rtf']) description = _('Set metadata in %s files')%'RTF' - + def set_metadata(self, stream, mi, type): from 
calibre.ebooks.metadata.rtf import set_metadata
         set_metadata(stream, mi)
 
 class MOBIMetadataWriter(MetadataWriterPlugin):
-
+
     name = 'Set MOBI metadata'
     file_types = set(['mobi', 'prc', 'azw'])
     description = _('Set metadata in %s files')%'MOBI'
     author = 'Marshall T. Vandegrift'
-
+
     def set_metadata(self, stream, mi, type):
         from calibre.ebooks.metadata.mobi import set_metadata
         set_metadata(stream, mi)
 
@@ -267,14 +268,16 @@ from calibre.ebooks.epub.input import EPUBInput
 from calibre.ebooks.mobi.input import MOBIInput
 from calibre.ebooks.pdf.input import PDFInput
 from calibre.ebooks.txt.input import TXTInput
+from calibre.ebooks.html.input import HTMLInput
 from calibre.ebooks.oeb.output import OEBOutput
 from calibre.ebooks.txt.output import TXTOutput
 from calibre.ebooks.pdf.output import PDFOutput
 from calibre.customize.profiles import input_profiles, output_profiles
-plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, TXTInput, OEBOutput, TXTOutput, PDFOutput]
+plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
+           TXTInput, OEBOutput, TXTOutput, PDFOutput]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
         x.__name__.endswith('MetadataReader')]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
         x.__name__.endswith('MetadataWriter')]
-plugins += input_profiles + output_profiles
\ No newline at end of file
+plugins += input_profiles + output_profiles
diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py
index c531a15e34..77cdb0b7da 100644
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -163,9 +163,9 @@ class InputFormatPlugin(Plugin):
         for x in os.listdir('.'):
             shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
-
         ret = self.convert(stream, options, file_ext, log, accelerators)
+
         if options.debug_input is not None:
             options.debug_input = os.path.abspath(options.debug_input)
             if not os.path.exists(options.debug_input):
diff --git a/src/calibre/ebooks/html/__init__.py b/src/calibre/ebooks/html/__init__.py
index 9a8f8e2d20..d026256ee8 100644
--- a/src/calibre/ebooks/html/__init__.py
+++ b/src/calibre/ebooks/html/__init__.py
@@ -17,7 +17,7 @@ def tostring(root, strip_comments=False, pretty_print=False):
     root.set('xmlns', 'http://www.w3.org/1999/xhtml')
     root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink')
     for x in root.iter():
-        if x.tag.rpartition('}')[-1].lower() == 'svg':
+        if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg':
             x.set('xmlns', 'http://www.w3.org/2000/svg')
     ans = _tostring(root, encoding='utf-8', pretty_print=pretty_print)
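
[Reviewer note, not part of the patch] The hasattr() guard added to tostring() above is needed because lxml trees can contain comment and processing-instruction nodes whose .tag is a callable rather than a string, so calling rpartition() on it raises a TypeError. A minimal standalone sketch of the same defensive check; svg_elements() and the sample document are illustrative, not calibre code:

    from lxml import etree

    def svg_elements(root):
        """Yield <svg> elements, skipping comment/PI nodes whose .tag is not a string."""
        for node in root.iter():
            tag = node.tag
            if not isinstance(tag, str):
                # Comments and processing instructions expose a callable tag; skip them.
                continue
            if tag.rpartition('}')[-1].lower() == 'svg':
                yield node

    doc = etree.fromstring(
        '<html><!-- a comment node --><svg xmlns="http://www.w3.org/2000/svg"/></html>')
    print([el.tag for el in svg_elements(doc)])
    # ['{http://www.w3.org/2000/svg}svg'] -- the guard keeps the comment node from crashing the loop
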
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index dd9aa0285c..951b0824a5 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -11,14 +11,12 @@ __docformat__ = 'restructuredtext en'
 Input plugin for HTML or OPF ebooks.
 '''
 
-import os, re, sys, cStringIO
+import os, re, sys
 
 from urlparse import urlparse, urlunparse
 from urllib import unquote
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.metadata.meta import get_metadata
-from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
-from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.customize.conversion import OptionRecommendation
 from calibre import unicode_path
@@ -213,72 +211,21 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
         sys.setrecursionlimit(orec)
 
-def opf_traverse(opf_reader, verbose=0, encoding=None):
-    '''
-    Return a list of :class:`HTMLFile` objects in the order specified by the
-    `<spine>` element of the OPF.
-
-    :param opf_reader: An :class:`calibre.ebooks.metadata.opf2.OPF` instance.
-    :param encoding: Specify character encoding of HTML files. If `None` it is
-                     auto-detected.
-    '''
-    if not opf_reader.spine:
-        raise ValueError('OPF does not have a spine')
-    flat = []
-    for path in opf_reader.spine.items():
-        path = os.path.abspath(path)
-        if path not in flat:
-            flat.append(os.path.abspath(path))
-    for item in opf_reader.manifest:
-        if 'html' in item.mime_type:
-            path = os.path.abspath(item.path)
-            if path not in flat:
-                flat.append(path)
-    for i, path in enumerate(flat):
-        if not os.path.exists(path):
-            path = path.replace('&', '%26')
-            if os.path.exists(path):
-                flat[i] = path
-    for item in opf_reader.itermanifest():
-        item.set('href', item.get('href').replace('&', '%26'))
-    ans = []
-    for path in flat:
-        if os.path.exists(path):
-            ans.append(HTMLFile(path, 0, encoding, verbose))
-        else:
-            print 'WARNING: OPF spine item %s does not exist'%path
-    ans = [f for f in ans if not f.is_binary]
-    return ans
-
-def search_for_opf(dir):
-    for f in os.listdir(dir):
-        if f.lower().endswith('.opf'):
-            return OPF(open(os.path.join(dir, f), 'rb'), dir)
-
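[Reviewer note, not part of the patch] With opf_traverse() and search_for_opf() removed, the get_filelist() hunk just below leans entirely on traverse(), which follows <a href> links out of the root HTML file and, judging from the [0 if opts.breadth_first else 1] subscript, returns the candidate files in both breadth-first and depth-first order. A self-contained sketch of that idea; local_links(), traverse_links() and the crude regex are illustrative stand-ins for calibre's real HTML handling:

    import os, re
    from collections import deque

    LINK_RE = re.compile(r'href=[\'"]([^\'"#]+\.x?html?)[\'"]', re.I)

    def local_links(path):
        # Yield absolute paths of local HTML files referenced by <a href="...">.
        with open(path, 'rb') as f:
            text = f.read().decode('utf-8', 'replace')
        base = os.path.dirname(path)
        for href in LINK_RE.findall(text):
            target = os.path.normpath(os.path.join(base, href))
            if os.path.exists(target):
                yield target

    def traverse_links(root, max_levels=5):
        """Return (breadth_first, depth_first) lists of linked HTML files."""
        breadth, seen = [], set()
        queue = deque([(os.path.abspath(root), 0)])
        while queue:
            path, level = queue.popleft()
            if path in seen or level > max_levels:
                continue
            seen.add(path)
            breadth.append(path)
            queue.extend((p, level + 1) for p in local_links(path))

        depth, seen = [], set()
        def walk(path, level):
            if path in seen or level > max_levels:
                return
            seen.add(path)
            depth.append(path)
            for p in local_links(path):
                walk(p, level + 1)
        walk(os.path.abspath(root), 0)
        return breadth, depth

    # usage: spine = traverse_links('index.html')[0 if breadth_first else 1]
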
 def get_filelist(htmlfile, dir, opts, log):
     '''
     Build list of files referenced by html file or try to detect and use an
     OPF file instead.
     '''
-    print 'Building file list...'
-    opf = search_for_opf(dir)
-    filelist = None
-    if opf is not None:
-        try:
-            filelist = opf_traverse(opf, verbose=opts.verbose,
-                                    encoding=opts.input_encoding)
-        except:
-            pass
-    if not filelist:
-        filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
-                            verbose=opts.verbose,
-                            encoding=opts.input_encoding)\
-                   [0 if opts.breadth_first else 1]
+    log.info('Building file list...')
+    filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
+                        verbose=opts.verbose,
+                        encoding=opts.input_encoding)\
+               [0 if opts.breadth_first else 1]
     if opts.verbose:
         log.debug('\tFound files...')
         for f in filelist:
             log.debug('\t\t', f)
-    return opf, filelist
+    return filelist
 
 
 class HTMLInput(InputFormatPlugin):
 
@@ -309,34 +256,32 @@ class HTMLInput(InputFormatPlugin):
 
     def convert(self, stream, opts, file_ext, log, accelerators):
+        from calibre.ebooks.metadata.meta import get_metadata
+
         basedir = os.getcwd()
+
         if hasattr(stream, 'name'):
             basedir = os.path.dirname(stream.name)
 
         if file_ext == 'opf':
-            opf = OPF(stream, basedir)
-            filelist = opf_traverse(opf, verbose=opts.verbose,
-                    encoding=opts.input_encoding)
-            mi = MetaInformation(opf)
+            opfpath = stream.name
         else:
-            opf, filelist = get_filelist(stream.name, basedir, opts, log)
-            mi = MetaInformation(opf)
-            mi.smart_update(get_metadata(stream, 'html'))
+            filelist = get_filelist(stream.name, basedir, opts, log)
+            mi = get_metadata(stream, 'html')
+            mi = OPFCreator(os.getcwdu(), mi)
+            mi.guide = None
+            entries = [(f.path, 'application/xhtml+xml') for f in filelist]
+            mi.create_manifest(entries)
+            mi.create_spine([f.path for f in filelist])
-        mi = OPFCreator(os.getcwdu(), mi)
-        mi.guide = None
-        entries = [(f.path, 'application/xhtml+xml') for f in filelist]
-        mi.create_manifest(entries)
-        mi.create_spine([f.path for f in filelist])
-
-        tocbuf = cStringIO.StringIO()
-        mi.render(open('metadata.opf', 'wb'), tocbuf, 'toc.ncx')
-        toc = tocbuf.getvalue()
-        if toc:
-            open('toc.ncx', 'wb').write(toc)
+            mi.render(open('metadata.opf', 'wb'))
+            opfpath = os.path.abspath('metadata.opf')
 
         from calibre.ebooks.conversion.plumber import create_oebbook
-        return create_oebbook(log, os.path.abspath('metadata.opf'))
-
-
+        oeb = create_oebbook(log, opfpath)
+
+        from calibre.ebooks.oeb.transforms.package import Package
+        Package(os.getcwdu())(oeb, opts)
+
+        return oeb
diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py
index 0c5a4ad97c..faeff4b825 100644
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@@ -573,7 +573,7 @@ class OEBReader(object):
         item = self._find_ncx(opf)
         self._toc_from_opf(opf, item)
         self._pages_from_opf(opf, item)
-        self._ensure_cover_image()
+        #self._ensure_cover_image()
 
 
 def main(argv=sys.argv):
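
[Reviewer note, not part of the patch] Condensed paraphrase of the new HTMLInput.convert() flow above for plain HTML input: sniff metadata from the HTML, synthesize an OPF on disk with OPFCreator, hand it to create_oebbook(), then run the Package transform so every referenced resource is copied under the working directory with its links rewritten. The OPF branch and error handling are omitted, os.getcwd() stands in for the patch's os.getcwdu(), and this only runs where calibre itself is importable:

    import os

    def convert_html_sketch(stream, opts, log):
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.html.input import get_filelist     # module helper patched above
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.transforms.package import Package

        basedir = os.path.dirname(stream.name)
        filelist = get_filelist(stream.name, basedir, opts, log)
        mi = get_metadata(stream, 'html')                       # title/author guessed from the HTML

        opf = OPFCreator(os.getcwd(), mi)
        opf.guide = None
        opf.create_manifest([(f.path, 'application/xhtml+xml') for f in filelist])
        opf.create_spine([f.path for f in filelist])
        with open('metadata.opf', 'wb') as f:
            opf.render(f)

        oeb = create_oebbook(log, os.path.abspath('metadata.opf'))
        Package(os.getcwd())(oeb, opts)    # copy resources into cwd and rewrite links
        return oeb
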
diff --git a/src/calibre/ebooks/oeb/transforms/package.py b/src/calibre/ebooks/oeb/transforms/package.py
index de775f8865..faf5486475 100644
--- a/src/calibre/ebooks/oeb/transforms/package.py
+++ b/src/calibre/ebooks/oeb/transforms/package.py
@@ -6,13 +6,14 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal '
 __docformat__ = 'restructuredtext en'
 
-import os
+import os, re
 from urllib import unquote as urlunquote
 from functools import partial
 
 from lxml import etree
 import cssutils
 
+from calibre import sanitize_file_name
 from calibre.constants import islinux
 from calibre.ebooks.oeb.base import OEB_DOCS, urlnormalize, urldefrag, \
     rewrite_links
@@ -36,15 +37,21 @@ class Package(object):
         self.new_base_path = os.path.abspath(base)
 
     def rewrite_links_in(self, item):
-        base = os.path.join(self.new_base_path, *item.href.split('/'))
+        old_href = item.old_href.split('#')[0]
+        new_href = item.href.split('#')[0]
+        base = os.path.join(self.old_base_path, *old_href.split('/'))
         base = os.path.dirname(base)
+        self.log.debug('\tRewriting links in', base+'/'+
+                item.href.rpartition('/')[-1])
+        new_base = os.path.join(self.new_base_path, *new_href.split('/'))
+        new_base = os.path.dirname(new_base)
         if etree.iselement(item.data):
-            self.rewrite_links_in_xml(item.data, base)
+            self.rewrite_links_in_xml(item.data, base, new_base)
         elif hasattr(item.data, 'cssText'):
-            self.rewrite_links_in_css(item.data, base)
+            self.rewrite_links_in_css(item.data, base, new_base)
 
-    def link_replacer(self, link_, base=''):
+    def link_replacer(self, link_, base='', new_base=''):
         link = urlnormalize(link_)
         link, frag = urldefrag(link)
         link = urlunquote(link).replace('/', os.sep)
@@ -55,20 +62,33 @@ class Package(object):
             link = link.lower()
         if link not in self.map:
             return link_
-        nlink = os.path.relpath(self.map[link], base)
+        nlink = os.path.relpath(self.map[link], new_base)
         if frag:
-            nlink = '#'.join(nlink, frag)
+            nlink = '#'.join((nlink, frag))
         return nlink.replace(os.sep, '/')
 
-    def rewrite_links_in_css(self, sheet, base):
-        repl = partial(self.link_replacer, base=base)
+    def rewrite_links_in_css(self, sheet, base, new_base):
+        repl = partial(self.link_replacer, base=base, new_base=new_base)
         cssutils.replaceUrls(sheet, repl)
 
-    def rewrite_links_in_xml(self, root, base):
-        repl = partial(self.link_replacer, base=base)
+    def rewrite_links_in_xml(self, root, base, new_base):
+        repl = partial(self.link_replacer, base=base, new_base=new_base)
         rewrite_links(root, repl)
 
-    def move_manifest_item(self, item):
+    def uniqify_name(self, new_href, hrefs):
+        c = 0
+        while new_href in hrefs:
+            c += 1
+            parts = new_href.split('/')
+            name, ext = os.path.splitext(parts[-1])
+            name = re.sub(r'_\d+$', '', name)
+            name += '_%d'%c
+            parts[-1] = name + ext
+            new_href = '/'.join(parts)
+        return new_href
+
+
+    def move_manifest_item(self, item, hrefs):
         item.data # Make sure the data has been loaded and cached
         old_abspath = os.path.join(self.old_base_path,
                 *(urldefrag(item.href)[0].split('/')))
@@ -79,11 +99,17 @@ class Package(object):
             new_href = 'content/'
         elif item.href.lower().endswith('.ncx'):
             new_href = ''
-        new_href += bname
+        new_href += sanitize_file_name(bname)
+
+        if new_href in hrefs:
+            new_href = self.uniqify_name(new_href, hrefs)
+        hrefs.add(new_href)
 
         new_abspath = os.path.join(self.new_base_path, *new_href.split('/'))
         new_abspath = os.path.abspath(new_abspath)
+        item.old_href = self.oeb.manifest.hrefs.pop(item.href).href
         item.href = new_href
+        self.oeb.manifest.hrefs[item.href] = item
         if not islinux:
             old_abspath, new_abspath = old_abspath.lower(), new_abspath.lower()
         if old_abspath != new_abspath:
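
[Reviewer note, not part of the patch] sanitize_file_name() can flatten two different source names onto the same href, which is why move_manifest_item() now threads a hrefs set through and falls back to uniqify_name(). A standalone sketch of the same suffixing scheme; uniqify() and the sample hrefs are illustrative:

    import os, re

    def uniqify(new_href, taken):
        c = 0
        while new_href in taken:
            c += 1
            parts = new_href.split('/')
            name, ext = os.path.splitext(parts[-1])
            name = re.sub(r'_\d+$', '', name)   # strip a suffix added on a previous pass
            parts[-1] = name + '_%d' % c + ext
            new_href = '/'.join(parts)
        return new_href

    taken = set()
    for candidate in ('content/index.html', 'content/index.html', 'content/index.html'):
        href = uniqify(candidate, taken)
        taken.add(href)
        print(href)
    # content/index.html, content/index_1.html, content/index_2.html
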
@@ -91,25 +117,33 @@ class Package(object):
 
     def rewrite_links_in_toc(self, toc):
         if toc.href:
-            toc.href = self.link_replacer(toc.href, base=self.new_base_path)
+            toc.href = self.link_replacer(toc.href, base=self.old_base_path,
+                    new_base=self.new_base_path)
 
         for x in toc:
             self.rewrite_links_in_toc(x)
 
     def __call__(self, oeb, context):
         self.map = {}
-        self.log = self.oeb.log
+        self.log = oeb.log
+        self.oeb = oeb
         self.old_base_path = os.path.abspath(oeb.container.rootdir)
+        hrefs = set([])
         for item in self.oeb.manifest:
-            self.move_manifest_item(item)
+            self.move_manifest_item(item, hrefs)
 
+        self.log.debug('Rewriting links in OEB documents...')
         for item in self.oeb.manifest:
             self.rewrite_links_in(item)
 
         if getattr(oeb.toc, 'nodes', False):
+            self.log.debug('Rewriting links in TOC...')
             self.rewrite_links_in_toc(oeb.toc)
 
         if hasattr(oeb, 'guide'):
+            self.log.debug('Rewriting links in guide...')
             for ref in oeb.guide.values():
-                ref.href = self.link_replacer(ref.href, base=self.new_base_path)
+                ref.href = self.link_replacer(ref.href,
+                        base=self.old_base_path,
+                        new_base=self.new_base_path)
diff --git a/src/calibre/ebooks/oeb/writer.py b/src/calibre/ebooks/oeb/writer.py
index 1e5e5aea11..ef72414f5a 100644
--- a/src/calibre/ebooks/oeb/writer.py
+++ b/src/calibre/ebooks/oeb/writer.py
@@ -48,7 +48,8 @@ class OEBWriter(object):
                              pretty_print=pretty_print)
 
     def __call__(self, oeb, path):
-        """Read the book in the :class:`OEBBook` object :param:`oeb` to a file
+        """
+        Read the book in the :class:`OEBBook` object :param:`oeb` to a file
         at :param:`path`.
         """
         version = int(self.version[0])
diff --git a/src/calibre/web/feeds/main.py b/src/calibre/web/feeds/main.py
index faa132bef4..61bfa97e11 100644
--- a/src/calibre/web/feeds/main.py
+++ b/src/calibre/web/feeds/main.py
@@ -1,11 +1,11 @@
-#!/usr/bin/env python
+#!/usr/bin/env python
 __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal '
 '''
 CLI for downloading feeds.
 '''
-import sys, os, logging
+import sys, os
 from calibre.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
 from calibre.web.fetch.simple import option_parser as _option_parser
 from calibre.web.feeds.news import BasicNewsRecipe
@@ -14,13 +14,13 @@ from calibre.utils.config import Config, StringConfig
 def config(defaults=None):
     desc = _('Options to control the fetching of periodical content from the web.')
     c = Config('feeds2disk', desc) if defaults is None else StringConfig(defaults, desc)
-
+
     web2disk = c.add_group('web2disk', _('Customize the download engine'))
-    web2disk('timeout', ['-t', '--timeout'], default=10.0,
+    web2disk('timeout', ['-t', '--timeout'], default=10.0,
              help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),)
-    web2disk('delay', ['--delay'], default=0,
+    web2disk('delay', ['--delay'], default=0,
              help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
-    web2disk('encoding', ['--encoding'], default=None,
+    web2disk('encoding', ['--encoding'], default=None,
             help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))
     web2disk('match_regexps', ['--match-regexp'], default=[], action='append',
             help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.'))
@@ -28,42 +28,42 @@ def config(defaults=None):
            help=_('Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.'))
     web2disk('no_stylesheets', ['--dont-download-stylesheets'], action='store_true', default=False,
              help=_('Do not download CSS stylesheets.'))
-
+
     c.add_opt('feeds', ['--feeds'], default=None,
-              help=_('''Specify a list of feeds to download. 
For example: "['http://feeds.newsweek.com/newsweek/TopNews', 'http://feeds.newsweek.com/headlines/politics']" If you specify this option, any argument to %prog is ignored and a default recipe is used to download the feeds.''')) c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', help=_('''Be more verbose while processing.''')) c.add_opt('title', ['--title'], default=None, help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.')) - c.add_opt('username', ['-u', '--username'], default=None, + c.add_opt('username', ['-u', '--username'], default=None, help=_('Username for sites that require a login to access content.')) - c.add_opt('password', ['-p', '--password'], default=None, + c.add_opt('password', ['-p', '--password'], default=None, help=_('Password for sites that require a login to access content.')) - c.add_opt('lrf', ['--lrf'], default=False, action='store_true', + c.add_opt('lrf', ['--lrf'], default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.') - c.add_opt('epub', ['--epub'], default=False, action='store_true', + c.add_opt('epub', ['--epub'], default=False, action='store_true', help='Optimize fetching for subsequent conversion to EPUB.') - c.add_opt('mobi', ['--mobi'], default=False, action='store_true', + c.add_opt('mobi', ['--mobi'], default=False, action='store_true', help='Optimize fetching for subsequent conversion to MOBI.') c.add_opt('recursions', ['--recursions'], default=0, help=_('Number of levels of links to follow on webpages that are linked to from feeds. Defaul %default')) - c.add_opt('output_dir', ['--output-dir'], default='.', + c.add_opt('output_dir', ['--output-dir'], default='.', help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.')) c.add_opt('no_progress_bar', ['--no-progress-bar'], default=False, action='store_true', help=_("Don't show the progress bar")) c.add_opt('debug', ['--debug'], action='store_true', default=False, help=_('Very verbose output, useful for debugging.')) - c.add_opt('test', ['--test'], action='store_true', default=False, + c.add_opt('test', ['--test'], action='store_true', default=False, help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.')) - + return c - + USAGE=_('''\ %%prog [options] ARG -%%prog parses an online source of articles, like an RSS or ATOM feed and +%%prog parses an online source of articles, like an RSS or ATOM feed and fetches the article contents organized in a nice hierarchy. ARG can be one of: @@ -85,9 +85,9 @@ def option_parser(usage=USAGE): p.remove_option('--verbose') p.remove_option('--max-files') p.subsume('WEB2DISK OPTIONS', _('Options to control web2disk (used to fetch websites linked from feeds)')) - + p.add_option('--feeds', default=None, - help=_('''Specify a list of feeds to download. For example: + help=_('''Specify a list of feeds to download. 
For example: "['http://feeds.newsweek.com/newsweek/TopNews', 'http://feeds.newsweek.com/headlines/politics']" If you specify this option, any argument to %prog is ignored and a default recipe is used to download the feeds.''')) p.add_option('--verbose', default=False, action='store_true', @@ -99,70 +99,62 @@ If you specify this option, any argument to %prog is ignored and a default recip p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.') p.add_option('--recursions', default=0, type='int', help=_('Number of levels of links to follow on webpages that are linked to from feeds. Defaul %default')) - p.add_option('--output-dir', default=os.getcwd(), + p.add_option('--output-dir', default=os.getcwd(), help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.')) p.add_option('--no-progress-bar', dest='no_progress_bar', default=False, action='store_true', help=_('Dont show the progress bar')) p.add_option('--debug', action='store_true', default=False, help=_('Very verbose output, useful for debugging.')) - p.add_option('--test', action='store_true', default=False, + p.add_option('--test', action='store_true', default=False, help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.')) - + return p - + class RecipeError(Exception): pass -def run_recipe(opts, recipe_arg, parser, notification=None, handler=None): +def run_recipe(opts, recipe_arg, parser, notification=None): if notification is None: from calibre.utils.terminfo import TerminalController, ProgressBar term = TerminalController(sys.stdout) pb = ProgressBar(term, _('Fetching feeds...'), no_progress_bar=opts.no_progress_bar) notification = pb.update - + recipe = None if opts.feeds is not None: recipe = BasicNewsRecipe else: try: if os.access(recipe_arg, os.R_OK): - recipe = compile_recipe(open(recipe_arg).read()) + recipe = compile_recipe(open(recipe_arg).read()) else: raise Exception('not file') except: recipe = get_builtin_recipe(recipe_arg) if recipe is None: recipe = compile_recipe(recipe_arg) - + if recipe is None: raise RecipeError(recipe_arg+ ' is an invalid recipe') - - - if handler is None: - from calibre import ColoredFormatter - handler = logging.StreamHandler(sys.stdout) - handler.setLevel(logging.DEBUG if opts.debug else logging.INFO if opts.verbose else logging.WARN) - handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar - logging.getLogger('feeds2disk').addHandler(handler) - + recipe = recipe(opts, parser, notification) - + if not os.path.exists(recipe.output_dir): os.makedirs(recipe.output_dir) recipe.download(for_lrf=True) - + return recipe -def main(args=sys.argv, notification=None, handler=None): +def main(args=sys.argv, notification=None): p = option_parser() opts, args = p.parse_args(args=args[1:]) - + if len(args) != 1 and opts.feeds is None: p.print_help() return 1 recipe_arg = args[0] if len(args) > 0 else None - run_recipe(opts, recipe_arg, p, notification=notification, handler=handler) - + run_recipe(opts, recipe_arg, p, notification=notification) + return 0 if __name__ == '__main__': diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index efcfdff94b..4ee6753180 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -7,7 +7,7 @@ Defines various abstract base classes that can be subclassed to create powerful __docformat__ = "restructuredtext 
en" -import logging, os, cStringIO, time, traceback, re, urlparse, sys +import os, time, traceback, re, urlparse, sys from collections import defaultdict from functools import partial from contextlib import nested, closing @@ -27,6 +27,7 @@ from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed from calibre.web.fetch.simple import option_parser as web2disk_option_parser from calibre.web.fetch.simple import RecursiveFetcher from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending +from calibre.utils.logging import Log from calibre.ptempfile import PersistentTemporaryFile, \ PersistentTemporaryDirectory @@ -423,7 +424,7 @@ class BasicNewsRecipe(object): ''' raise NotImplementedError - def get_obfuscated_article(self, url, logger): + def get_obfuscated_article(self, url): ''' If you set :member:`articles_are_obfuscated` this method is called with every article URL. It should return the path to a file on the filesystem @@ -443,6 +444,7 @@ class BasicNewsRecipe(object): :param parser: Command line option parser. Used to intelligently merge options. :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional. ''' + self.log = Log() if not isinstance(self.title, unicode): self.title = unicode(self.title, 'utf-8', 'replace') @@ -455,7 +457,6 @@ class BasicNewsRecipe(object): if self.debug: - logging.getLogger('feeds2disk').setLevel(logging.DEBUG) self.verbose = True self.report_progress = progress_reporter @@ -560,20 +561,20 @@ class BasicNewsRecipe(object): res = self.build_index() self.report_progress(1, _('Download finished')) if self.failed_downloads: - self.log_warning(_('Failed to download the following articles:')) + self.log.warning(_('Failed to download the following articles:')) for feed, article, debug in self.failed_downloads: - self.log_warning(article.title+_(' from ')+feed.title) - self.log_debug(article.url) - self.log_debug(debug) + self.log.warning(article.title+_(' from ')+feed.title) + self.log.debug(article.url) + self.log.debug(debug) if self.partial_failures: - self.log_warning(_('Failed to download parts of the following articles:')) + self.log.warning(_('Failed to download parts of the following articles:')) for feed, atitle, aurl, debug in self.partial_failures: - self.log_warning(atitle + _(' from ') + feed) - self.log_debug(aurl) - self.log_warning(_('\tFailed links:')) + self.log.warning(atitle + _(' from ') + feed) + self.log.debug(aurl) + self.log.warning(_('\tFailed links:')) for l, tb in debug: - self.log_warning(l) - self.log_debug(tb) + self.log.warning(l) + self.log.debug(tb) return res finally: self.cleanup() @@ -636,20 +637,11 @@ class BasicNewsRecipe(object): extra_css=self.extra_css).render(doctype='xhtml') - def create_logger(self, feed_number, article_number): - logger = logging.getLogger('feeds2disk.article_%d_%d'%(feed_number, article_number)) - out = cStringIO.StringIO() - handler = logging.StreamHandler(out) - handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s')) - handler.setLevel(logging.INFO if self.verbose else logging.WARNING) - if self.debug: - handler.setLevel(logging.DEBUG) - logger.addHandler(handler) - return logger, out - - def _fetch_article(self, url, dir, logger, f, a, num_of_feeds): + def _fetch_article(self, url, dir, f, a, num_of_feeds): self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser - fetcher = RecursiveFetcher(self.web2disk_options, 
logger, self.image_map, self.css_map, (url, f, a, num_of_feeds)) + fetcher = RecursiveFetcher(self.web2disk_options, self.log, + self.image_map, self.css_map, + (url, f, a, num_of_feeds)) fetcher.base_dir = dir fetcher.current_dir = dir fetcher.show_progress = False @@ -661,21 +653,21 @@ class BasicNewsRecipe(object): raise Exception(_('Could not fetch article. Run with --debug to see the reason')) return res, path, failures - def fetch_article(self, url, dir, logger, f, a, num_of_feeds): - return self._fetch_article(url, dir, logger, f, a, num_of_feeds) + def fetch_article(self, url, dir, f, a, num_of_feeds): + return self._fetch_article(url, dir, f, a, num_of_feeds) - def fetch_obfuscated_article(self, url, dir, logger, f, a, num_of_feeds): - path = os.path.abspath(self.get_obfuscated_article(url, logger)) + def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds): + path = os.path.abspath(self.get_obfuscated_article(url)) url = ('file:'+path) if iswindows else ('file://'+path) - return self._fetch_article(url, dir, logger, f, a, num_of_feeds) + return self._fetch_article(url, dir, f, a, num_of_feeds) - def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds): + def fetch_embedded_article(self, article, dir, f, a, num_of_feeds): templ = templates.EmbeddedContent() raw = templ.generate(article).render('html') with PersistentTemporaryFile('_feeds2disk.html') as pt: pt.write(raw) url = ('file:'+pt.name) if iswindows else ('file://'+pt.name) - return self._fetch_article(url, dir, logger, f, a, num_of_feeds) + return self._fetch_article(url, dir, f, a, num_of_feeds) def build_index(self): @@ -716,7 +708,6 @@ class BasicNewsRecipe(object): art_dir = os.path.join(feed_dir, 'article_%d'%a) if not os.path.isdir(art_dir): os.makedirs(art_dir) - logger, stream = self.create_logger(f, a) try: url = self.print_version(article.url) except NotImplementedError: @@ -726,10 +717,9 @@ class BasicNewsRecipe(object): func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \ ((self.fetch_obfuscated_article if self.articles_are_obfuscated \ else self.fetch_article), url) - req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)), + req = WorkRequest(func, (arg, art_dir, f, a, len(feed)), {}, (f, a), self.article_downloaded, self.error_in_article_download) - req.stream = stream req.feed = feed req.article = article req.feed_dir = feed_dir @@ -768,8 +758,8 @@ class BasicNewsRecipe(object): cu = self.get_cover_url() except Exception, err: cu = None - self.log_error(_('Could not download cover: %s')%str(err)) - self.log_debug(traceback.format_exc()) + self.log.error(_('Could not download cover: %s')%str(err)) + self.log.debug(traceback.format_exc()) if cu is not None: ext = cu.rpartition('.')[-1] if '?' 
in ext: @@ -841,8 +831,8 @@ class BasicNewsRecipe(object): f.write(html.encode('utf-8')) renderer = render_html(hf) if renderer.tb is not None: - self.logger.warning('Failed to render default cover') - self.logger.debug(renderer.tb) + self.log.warning('Failed to render default cover') + self.log.debug(renderer.tb) else: cover_file.write(renderer.data) cover_file.flush() @@ -863,7 +853,7 @@ class BasicNewsRecipe(object): manifest.append(os.path.join(dir, 'index.ncx')) cpath = getattr(self, 'cover_path', None) if cpath is None: - pf = PersistentTemporaryFile('_recipe_cover.jpg') + pf = open(os.path.join(dir, 'cover.jpg'), 'wb') self.default_cover(pf) cpath = pf.name if cpath is not None and os.access(cpath, os.R_OK): @@ -944,7 +934,7 @@ class BasicNewsRecipe(object): a = request.requestID[1] article = request.article - self.log_debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore'))) + self.log.debug(_('\nDownloaded article %s from %s')%(article.title, article.url)) article.orig_url = article.url article.url = 'article_%d/index.html'%a article.downloaded = True @@ -956,11 +946,11 @@ class BasicNewsRecipe(object): def error_in_article_download(self, request, traceback): self.jobs_done += 1 - self.log_error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url)) + self.log.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url)) debug = request.stream.getvalue().decode('utf-8', 'ignore') - self.log_debug(debug) - self.log_debug(traceback) - self.log_debug('\n') + self.log.debug(debug) + self.log.debug(traceback) + self.log.debug('\n') self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title) self.failed_downloads.append((request.feed, request.article, debug)) @@ -990,7 +980,7 @@ class BasicNewsRecipe(object): feed.populate_from_preparsed_feed(msg, []) feed.description = unicode(err) parsed_feeds.append(feed) - self.log_exception(msg) + self.log.exception(msg) return parsed_feeds @@ -1057,7 +1047,7 @@ class CustomIndexRecipe(BasicNewsRecipe): index = os.path.abspath(self.custom_index()) url = 'file:'+index if iswindows else 'file://'+index self.web2disk_options.browser = self.browser - fetcher = RecursiveFetcher(self.web2disk_options, self.logger) + fetcher = RecursiveFetcher(self.web2disk_options, self.log) fetcher.base_dir = self.output_dir fetcher.current_dir = self.output_dir fetcher.show_progress = False @@ -1069,7 +1059,7 @@ class AutomaticNewsRecipe(BasicNewsRecipe): keep_only_tags = [dict(name=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])] - def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds): + def fetch_embedded_article(self, article, dir, f, a, num_of_feeds): if self.use_embedded_content: self.web2disk_options.keep_only_tags = [] - return BasicNewsRecipe.fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds) + return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds) diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index 51a4554a50..2ae705e01a 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -7,18 +7,19 @@ __copyright__ = '2008, Kovid Goyal ' Fetch a webpage and its links recursively. The webpages are saved to disk in UTF-8 encoding with any charset declarations removed. 
''' -import sys, socket, os, urlparse, logging, re, time, copy, urllib2, threading, traceback +import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback from urllib import url2pathname, quote from threading import RLock from httplib import responses from PIL import Image from cStringIO import StringIO -from calibre import setup_cli_handlers, browser, sanitize_file_name, \ +from calibre import browser, sanitize_file_name, \ relpath, unicode_path from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.config import OptionParser +from calibre.utils.logging import Log class FetchError(Exception): pass @@ -28,10 +29,10 @@ class closing(object): def __init__(self, thing): self.thing = thing - + def __enter__(self): return self.thing - + def __exit__(self, *exc_info): try: self.thing.close() @@ -55,47 +56,48 @@ def save_soup(soup, target): for meta in metas: if 'charset' in meta.get('content', '').lower(): meta.replaceWith(nm) - + selfdir = os.path.dirname(target) - + for tag in soup.findAll(['img', 'link', 'a']): for key in ('src', 'href'): path = tag.get(key, None) if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path): tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/')) - + html = unicode(soup) with open(target, 'wb') as f: f.write(html.encode('utf-8')) - + class response(str): - + def __new__(cls, *args): obj = super(response, cls).__new__(cls, *args) obj.newurl = None return obj - + class DummyLock(object): - + def __enter__(self, *args): return self def __exit__(self, *args): pass class RecursiveFetcher(object): - LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in + LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$')) #ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in # ( - # + # # ) # ) CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE) default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__ DUMMY_LOCK = DummyLock() - - def __init__(self, options, logger, image_map={}, css_map={}, job_info=None): + + def __init__(self, options, log, image_map={}, css_map={}, job_info=None): self.base_dir = os.path.abspath(os.path.expanduser(options.dir)) if not os.path.exists(self.base_dir): os.makedirs(self.base_dir) + self.log = log self.default_timeout = socket.getdefaulttimeout() socket.setdefaulttimeout(options.timeout) self.verbose = options.verbose @@ -122,19 +124,19 @@ class RecursiveFetcher(object): self.remove_tags_after = getattr(options, 'remove_tags_after', None) self.remove_tags_before = getattr(options, 'remove_tags_before', None) self.keep_only_tags = getattr(options, 'keep_only_tags', []) - self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup) + self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup) self.postprocess_html_ext= getattr(options, 'postprocess_html', None) self.download_stylesheets = not options.no_stylesheets self.show_progress = True self.failed_links = [] self.job_info = job_info - + def get_soup(self, src): nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) nmassage.extend(self.preprocess_regexps) nmassage += [(re.compile(r'', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage) - + if 
self.keep_only_tags: body = Tag(soup, 'body') try: @@ -146,7 +148,7 @@ class RecursiveFetcher(object): soup.find('body').replaceWith(body) except AttributeError: # soup has no body element pass - + def remove_beyond(tag, next): while tag is not None and tag.name != 'body': after = getattr(tag, next) @@ -155,27 +157,27 @@ class RecursiveFetcher(object): after.extract() after = ns tag = tag.parent - + if self.remove_tags_after is not None: rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after for spec in rt: tag = soup.find(**spec) remove_beyond(tag, 'nextSibling') - + if self.remove_tags_before is not None: tag = soup.find(**self.remove_tags_before) remove_beyond(tag, 'previousSibling') - + for kwds in self.remove_tags: for tag in soup.findAll(**kwds): tag.extract() return self.preprocess_html_ext(soup) - - + + def fetch_url(self, url): data = None - self.log_debug('Fetching %s', url) - delta = time.time() - self.last_fetch_at + self.log.debug('Fetching', url) + delta = time.time() - self.last_fetch_at if delta < self.delay: time.sleep(delta) if re.search(r'\s+', url) is not None: @@ -190,43 +192,43 @@ class RecursiveFetcher(object): raise FetchError, responses[err.code] if getattr(err, 'reason', [0])[0] == 104 or \ getattr(getattr(err, 'args', [None])[0], 'errno', None) == -2: # Connection reset by peer or Name or service not know - self.log_debug('Temporary error, retrying in 1 second') + self.log.debug('Temporary error, retrying in 1 second') time.sleep(1) with closing(self.browser.open(url)) as f: data = response(f.read()+f.read()) data.newurl = f.geturl() - else: + else: raise err finally: self.last_fetch_at = time.time() return data - + def start_fetch(self, url): soup = BeautifulSoup(u'') - self.log_info('Downloading') + self.log.debug('Downloading') res = self.process_links(soup, url, 0, into_dir='') - self.log_info('%s saved to %s', url, res) + self.log.debug('%s saved to %s'%( url, res)) return res - + def is_link_ok(self, url): for i in self.__class__.LINK_FILTER: if i.search(url): return False return True - + def is_link_wanted(self, url): if self.filter_regexps: for f in self.filter_regexps: if f.search(url): - return False + return False if self.match_regexps: for m in self.match_regexps: if m.search(url): return True return False return True - + def process_stylesheets(self, soup, baseurl): diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets')) if not os.path.exists(diskpath): @@ -243,8 +245,7 @@ class RecursiveFetcher(object): try: data = self.fetch_url(iurl) except Exception, err: - self.log_debug('Could not fetch stylesheet %s', iurl) - self.log_debug('Error: %s', str(err), exc_info=True) + self.log.exception('Could not fetch stylesheet %s'% iurl) continue stylepath = os.path.join(diskpath, 'style'+str(c)+'.css') with self.stylemap_lock: @@ -253,7 +254,7 @@ class RecursiveFetcher(object): x.write(data) tag['href'] = stylepath else: - for ns in tag.findAll(text=True): + for ns in tag.findAll(text=True): src = str(ns) m = self.__class__.CSS_IMPORT_PATTERN.search(src) if m: @@ -267,8 +268,7 @@ class RecursiveFetcher(object): try: data = self.fetch_url(iurl) except Exception, err: - self.log_warning('Could not fetch stylesheet %s', iurl) - self.log_debug('Error: %s', str(err), exc_info=True) + self.log.exception('Could not fetch stylesheet %s'% iurl) continue c += 1 stylepath = os.path.join(diskpath, 'style'+str(c)+'.css') @@ -277,9 +277,9 @@ class RecursiveFetcher(object): with open(stylepath, 'wb') as x: 
x.write(data) ns.replaceWith(src.replace(m.group(1), stylepath)) - - - + + + def process_images(self, soup, baseurl): diskpath = unicode_path(os.path.join(self.current_dir, 'images')) if not os.path.exists(diskpath): @@ -291,9 +291,6 @@ class RecursiveFetcher(object): iurl = self.image_url_processor(baseurl, iurl) ext = os.path.splitext(iurl)[1] ext = ext[:5] - #if not ext: - # self.log_debug('Skipping extensionless image %s', iurl) - # continue if not urlparse.urlsplit(iurl).scheme: iurl = urlparse.urljoin(baseurl, iurl, False) with self.imagemap_lock: @@ -303,8 +300,7 @@ class RecursiveFetcher(object): try: data = self.fetch_url(iurl) except Exception, err: - self.log_warning('Could not fetch image %s', iurl) - self.log_debug('Error: %s', str(err), exc_info=True) + self.log.exception('Could not fetch image %s'% iurl) continue c += 1 fname = sanitize_file_name('img'+str(c)+ext) @@ -322,7 +318,7 @@ class RecursiveFetcher(object): traceback.print_exc() continue - def absurl(self, baseurl, tag, key, filter=True): + def absurl(self, baseurl, tag, key, filter=True): iurl = tag[key] parts = urlparse.urlsplit(iurl) if not parts.netloc and not parts.path: @@ -330,32 +326,32 @@ class RecursiveFetcher(object): if not parts.scheme: iurl = urlparse.urljoin(baseurl, iurl, False) if not self.is_link_ok(iurl): - self.log_debug('Skipping invalid link: %s', iurl) + self.log.debug('Skipping invalid link:', iurl) return None if filter and not self.is_link_wanted(iurl): - self.log_debug('Filtered link: '+iurl) + self.log.debug('Filtered link: '+iurl) return None return iurl - + def normurl(self, url): parts = list(urlparse.urlsplit(url)) parts[4] = '' return urlparse.urlunsplit(parts) - + def localize_link(self, tag, key, path): parts = urlparse.urlsplit(tag[key]) suffix = '#'+parts.fragment if parts.fragment else '' tag[key] = path+suffix - + def process_return_links(self, soup, baseurl): for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')): - iurl = self.absurl(baseurl, tag, 'href') + iurl = self.absurl(baseurl, tag, 'href') if not iurl: continue nurl = self.normurl(iurl) if self.filemap.has_key(nurl): self.localize_link(tag, 'href', self.filemap[nurl]) - + def process_links(self, soup, baseurl, recursion_level, into_dir='links'): res = '' diskpath = os.path.join(self.current_dir, into_dir) @@ -365,7 +361,7 @@ class RecursiveFetcher(object): try: self.current_dir = diskpath tags = list(soup.findAll('a', href=True)) - + for c, tag in enumerate(tags): if self.show_progress: print '.', @@ -395,17 +391,17 @@ class RecursiveFetcher(object): dsrc = dsrc.decode(self.encoding, 'ignore') else: dsrc = xml_to_unicode(dsrc, self.verbose)[0] - + soup = self.get_soup(dsrc) - + base = soup.find('base', href=True) if base is not None: newbaseurl = base['href'] - self.log_debug('Processing images...') + self.log.debug('Processing images...') self.process_images(soup, newbaseurl) if self.download_stylesheets: self.process_stylesheets(soup, newbaseurl) - + _fname = basename(iurl) if not isinstance(_fname, unicode): _fname.decode('latin1', 'replace') @@ -416,56 +412,55 @@ class RecursiveFetcher(object): self.downloaded_paths.append(res) self.filemap[nurl] = res if recursion_level < self.max_recursions: - self.log_debug('Processing links...') + self.log.debug('Processing links...') self.process_links(soup, newbaseurl, recursion_level+1) else: - self.process_return_links(soup, newbaseurl) - self.log_debug('Recursion limit reached. 
Skipping links in %s', iurl) - + self.process_return_links(soup, newbaseurl) + self.log.debug('Recursion limit reached. Skipping links in', iurl) + if callable(self.postprocess_html_ext): - soup = self.postprocess_html_ext(soup, + soup = self.postprocess_html_ext(soup, c==0 and recursion_level==0 and not getattr(self, 'called_first', False), self.job_info) - + if c==0 and recursion_level == 0: self.called_first = True - + save_soup(soup, res) self.localize_link(tag, 'href', res) except Exception, err: self.failed_links.append((iurl, traceback.format_exc())) - self.log_warning('Could not fetch link %s', iurl) - self.log_debug('Error: %s', str(err), exc_info=True) + self.log.exception('Could not fetch link', iurl) finally: self.current_dir = diskpath - self.files += 1 + self.files += 1 finally: self.current_dir = prev_dir if self.show_progress: print return res - + def __del__(self): dt = getattr(self, 'default_timeout', None) if dt is not None: socket.setdefaulttimeout(dt) - + def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.com')): parser = OptionParser(usage=usage) - parser.add_option('-d', '--base-dir', + parser.add_option('-d', '--base-dir', help=_('Base directory into which URL is saved. Default is %default'), default='.', type='string', dest='dir') - parser.add_option('-t', '--timeout', + parser.add_option('-t', '--timeout', help=_('Timeout in seconds to wait for a response from the server. Default: %default s'), default=10.0, type='float', dest='timeout') - parser.add_option('-r', '--max-recursions', default=1, + parser.add_option('-r', '--max-recursions', default=1, help=_('Maximum number of levels to recurse i.e. depth of links to follow. Default %default'), type='int', dest='max_recursions') parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files', help=_('The maximum number of files to download. This only applies to files from tags. Default is %default')) parser.add_option('--delay', default=0, dest='delay', type='int', help=_('Minimum interval in seconds between consecutive fetches. Default is %default s')) - parser.add_option('--encoding', default=None, + parser.add_option('--encoding', default=None, help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.')) parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps', help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. 
By default all links are followed.')) @@ -478,23 +473,21 @@ def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.c return parser -def create_fetcher(options, logger=None, image_map={}): - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('web2disk') - setup_cli_handlers(logger, level) - return RecursiveFetcher(options, logger, image_map={}) +def create_fetcher(options, image_map={}, log=None): + if log is None: + log = Log() + return RecursiveFetcher(options, log, image_map={}) def main(args=sys.argv): - parser = option_parser() + parser = option_parser() options, args = parser.parse_args(args) if len(args) != 2: parser.print_help() return 1 - - fetcher = create_fetcher(options) - fetcher.start_fetch(args[1]) - -if __name__ == '__main__': + fetcher = create_fetcher(options) + fetcher.start_fetch(args[1]) + + +if __name__ == '__main__': sys.exit(main())
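
[Reviewer note, not part of the patch] The common thread in the web/feeds changes above is that recipes and RecursiveFetcher no longer wire up stdlib logging handlers; they now expect a calibre.utils.logging.Log instance, and create_fetcher() builds one when none is passed. A minimal usage sketch of the new signature, assuming calibre is importable; the URL and option values are placeholders:

    from calibre.utils.logging import Log
    from calibre.web.fetch.simple import option_parser, create_fetcher

    parser = option_parser()
    options, args = parser.parse_args(['-d', 'out', '-r', '1', 'http://example.com'])

    log = Log()                                # writes to stdout by default
    fetcher = create_fetcher(options, log=log)
    fetcher.start_fetch(args[0])               # saves the fetched page tree under ./out
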