Working HTML/OPF input plugin. Also fixed feeds download and removed cover processing from OEBBook
commit 95d1b58ae3
parent 296853cd43
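In outline, the HTML/OPF input path added here works as follows; this is a condensed sketch assembled from the hunks below, not a literal excerpt. An OPF input is used as-is, while an HTML input is crawled for linked files, described by a generated metadata.opf, and the resulting OEB book is then normalized by the new Package transform.

    # Condensed from HTMLInput.convert() in this commit
    if file_ext == 'opf':
        opfpath = stream.name
    else:
        filelist = get_filelist(stream.name, basedir, opts, log)
        mi = get_metadata(stream, 'html')   # best-effort metadata from the HTML
        mi = OPFCreator(os.getcwdu(), mi)
        mi.guide = None
        mi.create_manifest([(f.path, 'application/xhtml+xml') for f in filelist])
        mi.create_spine([f.path for f in filelist])
        mi.render(open('metadata.opf', 'wb'))
        opfpath = os.path.abspath('metadata.opf')

    oeb = create_oebbook(log, opfpath)      # parse the OPF into an OEBBook
    Package(os.getcwdu())(oeb, opts)        # flatten hrefs and rewrite links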
@@ -189,6 +189,7 @@ class ComicMetadataReader(MetadataReaderPlugin):
     def get_metadata(self, stream, ftype):
         if ftype == 'cbr':
             from calibre.libunrar import extract_member as extract_first
+            extract_first
         else:
             from calibre.libunzip import extract_member as extract_first
         from calibre.ebooks.metadata import MetaInformation
@@ -267,12 +268,14 @@ from calibre.ebooks.epub.input import EPUBInput
 from calibre.ebooks.mobi.input import MOBIInput
 from calibre.ebooks.pdf.input import PDFInput
 from calibre.ebooks.txt.input import TXTInput
+from calibre.ebooks.html.input import HTMLInput
 from calibre.ebooks.oeb.output import OEBOutput
 from calibre.ebooks.txt.output import TXTOutput
 from calibre.ebooks.pdf.output import PDFOutput
 from calibre.customize.profiles import input_profiles, output_profiles
 
-plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, TXTInput, OEBOutput, TXTOutput, PDFOutput]
+plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
+           TXTInput, OEBOutput, TXTOutput, PDFOutput]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
         x.__name__.endswith('MetadataReader')]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
@@ -163,9 +163,9 @@ class InputFormatPlugin(Plugin):
         for x in os.listdir('.'):
             shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
 
         ret = self.convert(stream, options, file_ext,
                            log, accelerators)
 
         if options.debug_input is not None:
             options.debug_input = os.path.abspath(options.debug_input)
             if not os.path.exists(options.debug_input):
@@ -17,7 +17,7 @@ def tostring(root, strip_comments=False, pretty_print=False):
     root.set('xmlns', 'http://www.w3.org/1999/xhtml')
     root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink')
     for x in root.iter():
-        if x.tag.rpartition('}')[-1].lower() == 'svg':
+        if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg':
             x.set('xmlns', 'http://www.w3.org/2000/svg')
 
     ans = _tostring(root, encoding='utf-8', pretty_print=pretty_print)
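The new hasattr guard matters because root.iter() also yields comment and processing-instruction nodes, whose .tag is a callable rather than a string, so calling .rpartition on it raised an AttributeError. A minimal sketch of the case being guarded against:

    from lxml import etree

    root = etree.fromstring('<html><!-- note --><svg/></html>')
    for x in root.iter():
        # comments have x.tag == etree.Comment (a callable, not a string)
        if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg':
            x.set('xmlns', 'http://www.w3.org/2000/svg')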
@@ -11,14 +11,12 @@ __docformat__ = 'restructuredtext en'
 Input plugin for HTML or OPF ebooks.
 '''
 
-import os, re, sys, cStringIO
+import os, re, sys
 from urlparse import urlparse, urlunparse
 from urllib import unquote
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.metadata.meta import get_metadata
-from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
-from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.customize.conversion import OptionRecommendation
 from calibre import unicode_path
@@ -213,72 +211,21 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
         sys.setrecursionlimit(orec)
 
 
-def opf_traverse(opf_reader, verbose=0, encoding=None):
-    '''
-    Return a list of :class:`HTMLFile` objects in the order specified by the
-    `<spine>` element of the OPF.
-
-    :param opf_reader: An :class:`calibre.ebooks.metadata.opf2.OPF` instance.
-    :param encoding: Specify character encoding of HTML files. If `None` it is
-                     auto-detected.
-    '''
-    if not opf_reader.spine:
-        raise ValueError('OPF does not have a spine')
-    flat = []
-    for path in opf_reader.spine.items():
-        path = os.path.abspath(path)
-        if path not in flat:
-            flat.append(os.path.abspath(path))
-    for item in opf_reader.manifest:
-        if 'html' in item.mime_type:
-            path = os.path.abspath(item.path)
-            if path not in flat:
-                flat.append(path)
-    for i, path in enumerate(flat):
-        if not os.path.exists(path):
-            path = path.replace('&', '%26')
-            if os.path.exists(path):
-                flat[i] = path
-                for item in opf_reader.itermanifest():
-                    item.set('href', item.get('href').replace('&', '%26'))
-    ans = []
-    for path in flat:
-        if os.path.exists(path):
-            ans.append(HTMLFile(path, 0, encoding, verbose))
-        else:
-            print 'WARNING: OPF spine item %s does not exist'%path
-    ans = [f for f in ans if not f.is_binary]
-    return ans
-
-
-def search_for_opf(dir):
-    for f in os.listdir(dir):
-        if f.lower().endswith('.opf'):
-            return OPF(open(os.path.join(dir, f), 'rb'), dir)
-
-
 def get_filelist(htmlfile, dir, opts, log):
     '''
     Build list of files referenced by html file or try to detect and use an
     OPF file instead.
     '''
-    print 'Building file list...'
-    opf = search_for_opf(dir)
-    filelist = None
-    if opf is not None:
-        try:
-            filelist = opf_traverse(opf, verbose=opts.verbose,
-                                    encoding=opts.input_encoding)
-        except:
-            pass
-    if not filelist:
-        filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
-                            verbose=opts.verbose,
-                            encoding=opts.input_encoding)\
-                            [0 if opts.breadth_first else 1]
+    log.info('Building file list...')
+    filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
+                        verbose=opts.verbose,
+                        encoding=opts.input_encoding)\
+                        [0 if opts.breadth_first else 1]
     if opts.verbose:
         log.debug('\tFound files...')
         for f in filelist:
             log.debug('\t\t', f)
-    return opf, filelist
+    return filelist
 
 
 class HTMLInput(InputFormatPlugin):
@@ -309,34 +256,32 @@ class HTMLInput(InputFormatPlugin):
 
     def convert(self, stream, opts, file_ext, log,
                 accelerators):
+        from calibre.ebooks.metadata.meta import get_metadata
 
         basedir = os.getcwd()
 
         if hasattr(stream, 'name'):
             basedir = os.path.dirname(stream.name)
         if file_ext == 'opf':
-            opf = OPF(stream, basedir)
-            filelist = opf_traverse(opf, verbose=opts.verbose,
-                                    encoding=opts.input_encoding)
-            mi = MetaInformation(opf)
+            opfpath = stream.name
         else:
-            opf, filelist = get_filelist(stream.name, basedir, opts, log)
-            mi = MetaInformation(opf)
-        mi.smart_update(get_metadata(stream, 'html'))
-
-        mi = OPFCreator(os.getcwdu(), mi)
-        mi.guide = None
-        entries = [(f.path, 'application/xhtml+xml') for f in filelist]
-        mi.create_manifest(entries)
-        mi.create_spine([f.path for f in filelist])
-
-        tocbuf = cStringIO.StringIO()
-        mi.render(open('metadata.opf', 'wb'), tocbuf, 'toc.ncx')
-        toc = tocbuf.getvalue()
-        if toc:
-            open('toc.ncx', 'wb').write(toc)
+            filelist = get_filelist(stream.name, basedir, opts, log)
+            mi = get_metadata(stream, 'html')
+            mi = OPFCreator(os.getcwdu(), mi)
+            mi.guide = None
+            entries = [(f.path, 'application/xhtml+xml') for f in filelist]
+            mi.create_manifest(entries)
+            mi.create_spine([f.path for f in filelist])
 
+            mi.render(open('metadata.opf', 'wb'))
+            opfpath = os.path.abspath('metadata.opf')
 
         from calibre.ebooks.conversion.plumber import create_oebbook
-        return create_oebbook(log, os.path.abspath('metadata.opf'))
+        oeb = create_oebbook(log, opfpath)
+
+        from calibre.ebooks.oeb.transforms.package import Package
+        Package(os.getcwdu())(oeb, opts)
+
+        return oeb
@@ -573,7 +573,7 @@ class OEBReader(object):
         item = self._find_ncx(opf)
         self._toc_from_opf(opf, item)
         self._pages_from_opf(opf, item)
-        self._ensure_cover_image()
+        #self._ensure_cover_image()
 
 
 def main(argv=sys.argv):
@@ -6,13 +6,14 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import os
+import os, re
 from urllib import unquote as urlunquote
 from functools import partial
 
 from lxml import etree
 import cssutils
 
+from calibre import sanitize_file_name
 from calibre.constants import islinux
 from calibre.ebooks.oeb.base import OEB_DOCS, urlnormalize, urldefrag, \
     rewrite_links
@@ -36,15 +37,21 @@ class Package(object):
         self.new_base_path = os.path.abspath(base)
 
     def rewrite_links_in(self, item):
-        base = os.path.join(self.new_base_path, *item.href.split('/'))
+        old_href = item.old_href.split('#')[0]
+        new_href = item.href.split('#')[0]
+        base = os.path.join(self.old_base_path, *old_href.split('/'))
         base = os.path.dirname(base)
+        self.log.debug('\tRewriting links in', base+'/'+
+                item.href.rpartition('/')[-1])
+        new_base = os.path.join(self.new_base_path, *new_href.split('/'))
+        new_base = os.path.dirname(new_base)
 
         if etree.iselement(item.data):
-            self.rewrite_links_in_xml(item.data, base)
+            self.rewrite_links_in_xml(item.data, base, new_base)
         elif hasattr(item.data, 'cssText'):
-            self.rewrite_links_in_css(item.data, base)
+            self.rewrite_links_in_css(item.data, base, new_base)
 
-    def link_replacer(self, link_, base=''):
+    def link_replacer(self, link_, base='', new_base=''):
         link = urlnormalize(link_)
         link, frag = urldefrag(link)
         link = urlunquote(link).replace('/', os.sep)
@@ -55,20 +62,33 @@ class Package(object):
             link = link.lower()
         if link not in self.map:
             return link_
-        nlink = os.path.relpath(self.map[link], base)
+        nlink = os.path.relpath(self.map[link], new_base)
         if frag:
-            nlink = '#'.join(nlink, frag)
+            nlink = '#'.join((nlink, frag))
         return nlink.replace(os.sep, '/')
 
-    def rewrite_links_in_css(self, sheet, base):
-        repl = partial(self.link_replacer, base=base)
+    def rewrite_links_in_css(self, sheet, base, new_base):
+        repl = partial(self.link_replacer, base=base, new_base=new_base)
         cssutils.replaceUrls(sheet, repl)
 
-    def rewrite_links_in_xml(self, root, base):
-        repl = partial(self.link_replacer, base=base)
+    def rewrite_links_in_xml(self, root, base, new_base):
+        repl = partial(self.link_replacer, base=base, new_base=new_base)
         rewrite_links(root, repl)
 
-    def move_manifest_item(self, item):
+    def uniqify_name(self, new_href, hrefs):
+        c = 0
+        while new_href in hrefs:
+            c += 1
+            parts = new_href.split('/')
+            name, ext = os.path.splitext(parts[-1])
+            name = re.sub(r'_\d+$', '', name)
+            name += '_%d'%c
+            parts[-1] = name + ext
+            new_href = '/'.join(parts)
+        return new_href
+
+    def move_manifest_item(self, item, hrefs):
         item.data # Make sure the data has been loaded and cached
         old_abspath = os.path.join(self.old_base_path,
                 *(urldefrag(item.href)[0].split('/')))
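uniqify_name, added above, resolves href collisions that become possible once manifest items are flattened into a single content/ directory: it strips any existing _N suffix from the basename and retries with an incrementing counter until the name is unused. A standalone trace of the same logic, with hypothetical hrefs:

    import os, re

    def uniqify_name(new_href, hrefs):
        c = 0
        while new_href in hrefs:
            c += 1
            parts = new_href.split('/')
            name, ext = os.path.splitext(parts[-1])
            name = re.sub(r'_\d+$', '', name)   # drop a previous counter suffix
            name += '_%d'%c
            parts[-1] = name + ext
            new_href = '/'.join(parts)
        return new_href

    hrefs = set(['content/index.html', 'content/index_1.html'])
    print uniqify_name('content/index.html', hrefs)  # -> content/index_2.html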
@@ -79,11 +99,17 @@ class Package(object):
             new_href = 'content/'
         elif item.href.lower().endswith('.ncx'):
             new_href = ''
-        new_href += bname
+        new_href += sanitize_file_name(bname)
 
+        if new_href in hrefs:
+            new_href = self.uniqify_name(new_href, hrefs)
+        hrefs.add(new_href)
+
         new_abspath = os.path.join(self.new_base_path, *new_href.split('/'))
         new_abspath = os.path.abspath(new_abspath)
+        item.old_href = self.oeb.manifest.hrefs.pop(item.href).href
         item.href = new_href
+        self.oeb.manifest.hrefs[item.href] = item
         if not islinux:
             old_abspath, new_abspath = old_abspath.lower(), new_abspath.lower()
         if old_abspath != new_abspath:
@@ -91,25 +117,33 @@ class Package(object):
 
     def rewrite_links_in_toc(self, toc):
         if toc.href:
-            toc.href = self.link_replacer(toc.href, base=self.new_base_path)
+            toc.href = self.link_replacer(toc.href, base=self.old_base_path,
+                    new_base=self.new_base_path)
 
         for x in toc:
             self.rewrite_links_in_toc(x)
 
     def __call__(self, oeb, context):
         self.map = {}
-        self.log = self.oeb.log
+        self.log = oeb.log
+        self.oeb = oeb
         self.old_base_path = os.path.abspath(oeb.container.rootdir)
 
+        hrefs = set([])
         for item in self.oeb.manifest:
-            self.move_manifest_item(item)
+            self.move_manifest_item(item, hrefs)
 
+        self.log.debug('Rewriting links in OEB documents...')
         for item in self.oeb.manifest:
             self.rewrite_links_in(item)
 
         if getattr(oeb.toc, 'nodes', False):
+            self.log.debug('Rewriting links in TOC...')
             self.rewrite_links_in_toc(oeb.toc)
 
         if hasattr(oeb, 'guide'):
+            self.log.debug('Rewriting links in guide...')
             for ref in oeb.guide.values():
-                ref.href = self.link_replacer(ref.href, base=self.new_base_path)
+                ref.href = self.link_replacer(ref.href,
+                        base=self.old_base_path,
+                        new_base=self.new_base_path)
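Two details in the link rewriting above are worth noting. First, relative links are now computed against the item's destination directory (new_base) rather than its source directory, since move_manifest_item relocates files (for example into content/). Second, '#'.join(nlink, frag) was a latent bug: str.join takes a single iterable, so the two-argument call raised a TypeError as soon as a link carried a fragment; the fix passes a tuple. A small worked example of the corrected computation, with hypothetical paths:

    import os

    link_map = {'images/cover.jpg': '/out/content/cover.jpg'}  # old link -> new abspath
    new_base = '/out/content'                # directory of the rewritten document
    nlink = os.path.relpath(link_map['images/cover.jpg'], new_base)  # -> 'cover.jpg'
    nlink = '#'.join((nlink, 'top'))         # fragment preserved
    print nlink.replace(os.sep, '/')         # -> cover.jpg#top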
@@ -48,7 +48,8 @@ class OEBWriter(object):
                 pretty_print=pretty_print)
 
     def __call__(self, oeb, path):
-        """Read the book in the :class:`OEBBook` object :param:`oeb` to a file
+        """
+        Read the book in the :class:`OEBBook` object :param:`oeb` to a file
         at :param:`path`.
         """
         version = int(self.version[0])
@@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 CLI for downloading feeds.
 '''
 
-import sys, os, logging
+import sys, os
 from calibre.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
 from calibre.web.fetch.simple import option_parser as _option_parser
 from calibre.web.feeds.news import BasicNewsRecipe
@@ -113,7 +113,7 @@ If you specify this option, any argument to %prog is ignored and a default recip
 class RecipeError(Exception):
     pass
 
-def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
+def run_recipe(opts, recipe_arg, parser, notification=None):
     if notification is None:
         from calibre.utils.terminfo import TerminalController, ProgressBar
         term = TerminalController(sys.stdout)
@@ -137,14 +137,6 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
     if recipe is None:
         raise RecipeError(recipe_arg+ ' is an invalid recipe')
 
-
-    if handler is None:
-        from calibre import ColoredFormatter
-        handler = logging.StreamHandler(sys.stdout)
-        handler.setLevel(logging.DEBUG if opts.debug else logging.INFO if opts.verbose else logging.WARN)
-        handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
-        logging.getLogger('feeds2disk').addHandler(handler)
-
     recipe = recipe(opts, parser, notification)
 
     if not os.path.exists(recipe.output_dir):
@@ -153,7 +145,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None):
 
     return recipe
 
-def main(args=sys.argv, notification=None, handler=None):
+def main(args=sys.argv, notification=None):
     p = option_parser()
     opts, args = p.parse_args(args=args[1:])
 
@@ -161,7 +153,7 @@ def main(args=sys.argv, notification=None):
         p.print_help()
         return 1
     recipe_arg = args[0] if len(args) > 0 else None
-    run_recipe(opts, recipe_arg, p, notification=notification, handler=handler)
+    run_recipe(opts, recipe_arg, p, notification=notification)
 
     return 0
 
@@ -7,7 +7,7 @@ Defines various abstract base classes that can be subclassed to create powerful
 __docformat__ = "restructuredtext en"
 
 
-import logging, os, cStringIO, time, traceback, re, urlparse, sys
+import os, time, traceback, re, urlparse, sys
 from collections import defaultdict
 from functools import partial
 from contextlib import nested, closing
@@ -27,6 +27,7 @@ from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
 from calibre.web.fetch.simple import option_parser as web2disk_option_parser
 from calibre.web.fetch.simple import RecursiveFetcher
 from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
+from calibre.utils.logging import Log
 from calibre.ptempfile import PersistentTemporaryFile, \
                               PersistentTemporaryDirectory
 
@@ -423,7 +424,7 @@ class BasicNewsRecipe(object):
         '''
         raise NotImplementedError
 
-    def get_obfuscated_article(self, url, logger):
+    def get_obfuscated_article(self, url):
         '''
         If you set :member:`articles_are_obfuscated` this method is called with
         every article URL. It should return the path to a file on the filesystem
@@ -443,6 +444,7 @@ class BasicNewsRecipe(object):
         :param parser: Command line option parser. Used to intelligently merge options.
         :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
         '''
+        self.log = Log()
         if not isinstance(self.title, unicode):
             self.title = unicode(self.title, 'utf-8', 'replace')
 
@@ -455,7 +457,6 @@ class BasicNewsRecipe(object):
 
 
         if self.debug:
-            logging.getLogger('feeds2disk').setLevel(logging.DEBUG)
             self.verbose = True
         self.report_progress = progress_reporter
 
@@ -560,20 +561,20 @@ class BasicNewsRecipe(object):
             res = self.build_index()
             self.report_progress(1, _('Download finished'))
             if self.failed_downloads:
-                self.log_warning(_('Failed to download the following articles:'))
+                self.log.warning(_('Failed to download the following articles:'))
                 for feed, article, debug in self.failed_downloads:
-                    self.log_warning(article.title+_(' from ')+feed.title)
-                    self.log_debug(article.url)
-                    self.log_debug(debug)
+                    self.log.warning(article.title+_(' from ')+feed.title)
+                    self.log.debug(article.url)
+                    self.log.debug(debug)
             if self.partial_failures:
-                self.log_warning(_('Failed to download parts of the following articles:'))
+                self.log.warning(_('Failed to download parts of the following articles:'))
                 for feed, atitle, aurl, debug in self.partial_failures:
-                    self.log_warning(atitle + _(' from ') + feed)
-                    self.log_debug(aurl)
-                    self.log_warning(_('\tFailed links:'))
+                    self.log.warning(atitle + _(' from ') + feed)
+                    self.log.debug(aurl)
+                    self.log.warning(_('\tFailed links:'))
                     for l, tb in debug:
-                        self.log_warning(l)
-                        self.log_debug(tb)
+                        self.log.warning(l)
+                        self.log.debug(tb)
             return res
         finally:
             self.cleanup()
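From here on, the recipe and fetcher code reports through a single calibre.utils.logging.Log object instead of per-article stdlib logging handlers (create_logger and its StringIO capture are deleted below). Inferring the interface from the call sites in this commit, Log exposes debug/info/warning/error/exception methods; a minimal usage sketch under that assumption:

    from calibre.utils.logging import Log

    log = Log()
    log.info('Building file list...')
    log.warning('Failed to download the following articles:')
    try:
        open('/nonexistent')
    except:
        # exception() is used where the old code logged err with exc_info=True
        log.exception('Could not fetch stylesheet %s' % 'style.css')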
@@ -636,20 +637,11 @@ class BasicNewsRecipe(object):
                 extra_css=self.extra_css).render(doctype='xhtml')
 
 
-    def create_logger(self, feed_number, article_number):
-        logger = logging.getLogger('feeds2disk.article_%d_%d'%(feed_number, article_number))
-        out = cStringIO.StringIO()
-        handler = logging.StreamHandler(out)
-        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
-        handler.setLevel(logging.INFO if self.verbose else logging.WARNING)
-        if self.debug:
-            handler.setLevel(logging.DEBUG)
-        logger.addHandler(handler)
-        return logger, out
-
-    def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
+    def _fetch_article(self, url, dir, f, a, num_of_feeds):
         self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser
-        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
+        fetcher = RecursiveFetcher(self.web2disk_options, self.log,
+                self.image_map, self.css_map,
+                (url, f, a, num_of_feeds))
         fetcher.base_dir = dir
         fetcher.current_dir = dir
         fetcher.show_progress = False
@@ -661,21 +653,21 @@ class BasicNewsRecipe(object):
             raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
         return res, path, failures
 
-    def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
-        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
+    def fetch_article(self, url, dir, f, a, num_of_feeds):
+        return self._fetch_article(url, dir, f, a, num_of_feeds)
 
-    def fetch_obfuscated_article(self, url, dir, logger, f, a, num_of_feeds):
-        path = os.path.abspath(self.get_obfuscated_article(url, logger))
+    def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds):
+        path = os.path.abspath(self.get_obfuscated_article(url))
         url = ('file:'+path) if iswindows else ('file://'+path)
-        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
+        return self._fetch_article(url, dir, f, a, num_of_feeds)
 
-    def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
+    def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
         templ = templates.EmbeddedContent()
         raw = templ.generate(article).render('html')
         with PersistentTemporaryFile('_feeds2disk.html') as pt:
             pt.write(raw)
             url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
-            return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
+            return self._fetch_article(url, dir, f, a, num_of_feeds)
 
 
     def build_index(self):
@@ -716,7 +708,6 @@ class BasicNewsRecipe(object):
                 art_dir = os.path.join(feed_dir, 'article_%d'%a)
                 if not os.path.isdir(art_dir):
                     os.makedirs(art_dir)
-                logger, stream = self.create_logger(f, a)
                 try:
                     url = self.print_version(article.url)
                 except NotImplementedError:
@@ -726,10 +717,9 @@ class BasicNewsRecipe(object):
                 func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
                             ((self.fetch_obfuscated_article if self.articles_are_obfuscated \
                               else self.fetch_article), url)
-                req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)),
+                req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
                                       {}, (f, a), self.article_downloaded,
                                       self.error_in_article_download)
-                req.stream = stream
                 req.feed = feed
                 req.article = article
                 req.feed_dir = feed_dir
@@ -768,8 +758,8 @@ class BasicNewsRecipe(object):
                 cu = self.get_cover_url()
             except Exception, err:
                 cu = None
-                self.log_error(_('Could not download cover: %s')%str(err))
-                self.log_debug(traceback.format_exc())
+                self.log.error(_('Could not download cover: %s')%str(err))
+                self.log.debug(traceback.format_exc())
             if cu is not None:
                 ext = cu.rpartition('.')[-1]
                 if '?' in ext:
@@ -841,8 +831,8 @@ class BasicNewsRecipe(object):
             f.write(html.encode('utf-8'))
         renderer = render_html(hf)
         if renderer.tb is not None:
-            self.logger.warning('Failed to render default cover')
-            self.logger.debug(renderer.tb)
+            self.log.warning('Failed to render default cover')
+            self.log.debug(renderer.tb)
         else:
             cover_file.write(renderer.data)
             cover_file.flush()
@@ -863,7 +853,7 @@ class BasicNewsRecipe(object):
         manifest.append(os.path.join(dir, 'index.ncx'))
         cpath = getattr(self, 'cover_path', None)
         if cpath is None:
-            pf = PersistentTemporaryFile('_recipe_cover.jpg')
+            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
             self.default_cover(pf)
             cpath = pf.name
         if cpath is not None and os.access(cpath, os.R_OK):
@@ -944,7 +934,7 @@ class BasicNewsRecipe(object):
         a = request.requestID[1]
 
         article = request.article
-        self.log_debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
+        self.log.debug(_('\nDownloaded article %s from %s')%(article.title, article.url))
         article.orig_url = article.url
         article.url = 'article_%d/index.html'%a
         article.downloaded = True
@@ -956,11 +946,11 @@ class BasicNewsRecipe(object):
 
     def error_in_article_download(self, request, traceback):
         self.jobs_done += 1
-        self.log_error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
+        self.log.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
         debug = request.stream.getvalue().decode('utf-8', 'ignore')
-        self.log_debug(debug)
-        self.log_debug(traceback)
-        self.log_debug('\n')
+        self.log.debug(debug)
+        self.log.debug(traceback)
+        self.log.debug('\n')
         self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
         self.failed_downloads.append((request.feed, request.article, debug))
 
@@ -990,7 +980,7 @@ class BasicNewsRecipe(object):
                 feed.populate_from_preparsed_feed(msg, [])
                 feed.description = unicode(err)
                 parsed_feeds.append(feed)
-                self.log_exception(msg)
+                self.log.exception(msg)
 
 
         return parsed_feeds
@@ -1057,7 +1047,7 @@ class CustomIndexRecipe(BasicNewsRecipe):
         index = os.path.abspath(self.custom_index())
         url = 'file:'+index if iswindows else 'file://'+index
         self.web2disk_options.browser = self.browser
-        fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
+        fetcher = RecursiveFetcher(self.web2disk_options, self.log)
         fetcher.base_dir = self.output_dir
         fetcher.current_dir = self.output_dir
         fetcher.show_progress = False
@@ -1069,7 +1059,7 @@ class AutomaticNewsRecipe(BasicNewsRecipe):
 
     keep_only_tags = [dict(name=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
 
-    def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
+    def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
         if self.use_embedded_content:
             self.web2disk_options.keep_only_tags = []
-        return BasicNewsRecipe.fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds)
+        return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds)
@@ -7,18 +7,19 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 Fetch a webpage and its links recursively. The webpages are saved to disk in
 UTF-8 encoding with any charset declarations removed.
 '''
-import sys, socket, os, urlparse, logging, re, time, copy, urllib2, threading, traceback
+import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
 from urllib import url2pathname, quote
 from threading import RLock
 from httplib import responses
 from PIL import Image
 from cStringIO import StringIO
 
-from calibre import setup_cli_handlers, browser, sanitize_file_name, \
+from calibre import browser, sanitize_file_name, \
                     relpath, unicode_path
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.config import OptionParser
+from calibre.utils.logging import Log
 
 class FetchError(Exception):
     pass
@@ -92,10 +93,11 @@ class RecursiveFetcher(object):
     default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
     DUMMY_LOCK = DummyLock()
 
-    def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
+    def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
         if not os.path.exists(self.base_dir):
             os.makedirs(self.base_dir)
+        self.log = log
         self.default_timeout = socket.getdefaulttimeout()
         socket.setdefaulttimeout(options.timeout)
         self.verbose = options.verbose
@@ -174,7 +176,7 @@ class RecursiveFetcher(object):
 
     def fetch_url(self, url):
         data = None
-        self.log_debug('Fetching %s', url)
+        self.log.debug('Fetching', url)
         delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(delta)
@@ -190,7 +192,7 @@ class RecursiveFetcher(object):
                     raise FetchError, responses[err.code]
                 if getattr(err, 'reason', [0])[0] == 104 or \
                     getattr(getattr(err, 'args', [None])[0], 'errno', None) == -2: # Connection reset by peer or Name or service not know
-                    self.log_debug('Temporary error, retrying in 1 second')
+                    self.log.debug('Temporary error, retrying in 1 second')
                     time.sleep(1)
                     with closing(self.browser.open(url)) as f:
                         data = response(f.read()+f.read())
@@ -204,9 +206,9 @@ class RecursiveFetcher(object):
 
     def start_fetch(self, url):
         soup = BeautifulSoup(u'<a href="'+url+'" />')
-        self.log_info('Downloading')
+        self.log.debug('Downloading')
         res = self.process_links(soup, url, 0, into_dir='')
-        self.log_info('%s saved to %s', url, res)
+        self.log.debug('%s saved to %s'%( url, res))
         return res
 
     def is_link_ok(self, url):
@@ -243,8 +245,7 @@ class RecursiveFetcher(object):
                 try:
                     data = self.fetch_url(iurl)
                 except Exception, err:
-                    self.log_debug('Could not fetch stylesheet %s', iurl)
-                    self.log_debug('Error: %s', str(err), exc_info=True)
+                    self.log.exception('Could not fetch stylesheet %s'% iurl)
                     continue
                 stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                 with self.stylemap_lock:
@@ -267,8 +268,7 @@ class RecursiveFetcher(object):
                     try:
                         data = self.fetch_url(iurl)
                     except Exception, err:
-                        self.log_warning('Could not fetch stylesheet %s', iurl)
-                        self.log_debug('Error: %s', str(err), exc_info=True)
+                        self.log.exception('Could not fetch stylesheet %s'% iurl)
                         continue
                     c += 1
                     stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@@ -291,9 +291,6 @@ class RecursiveFetcher(object):
             iurl = self.image_url_processor(baseurl, iurl)
             ext = os.path.splitext(iurl)[1]
             ext = ext[:5]
-            #if not ext:
-            #    self.log_debug('Skipping extensionless image %s', iurl)
-            #    continue
             if not urlparse.urlsplit(iurl).scheme:
                 iurl = urlparse.urljoin(baseurl, iurl, False)
             with self.imagemap_lock:
@@ -303,8 +300,7 @@ class RecursiveFetcher(object):
             try:
                 data = self.fetch_url(iurl)
             except Exception, err:
-                self.log_warning('Could not fetch image %s', iurl)
-                self.log_debug('Error: %s', str(err), exc_info=True)
+                self.log.exception('Could not fetch image %s'% iurl)
                 continue
             c += 1
             fname = sanitize_file_name('img'+str(c)+ext)
@@ -330,10 +326,10 @@ class RecursiveFetcher(object):
         if not parts.scheme:
             iurl = urlparse.urljoin(baseurl, iurl, False)
         if not self.is_link_ok(iurl):
-            self.log_debug('Skipping invalid link: %s', iurl)
+            self.log.debug('Skipping invalid link:', iurl)
            return None
         if filter and not self.is_link_wanted(iurl):
-            self.log_debug('Filtered link: '+iurl)
+            self.log.debug('Filtered link: '+iurl)
             return None
         return iurl
 
@@ -401,7 +397,7 @@ class RecursiveFetcher(object):
                     base = soup.find('base', href=True)
                     if base is not None:
                         newbaseurl = base['href']
-                    self.log_debug('Processing images...')
+                    self.log.debug('Processing images...')
                     self.process_images(soup, newbaseurl)
                     if self.download_stylesheets:
                         self.process_stylesheets(soup, newbaseurl)
@@ -416,11 +412,11 @@ class RecursiveFetcher(object):
                     self.downloaded_paths.append(res)
                     self.filemap[nurl] = res
                     if recursion_level < self.max_recursions:
-                        self.log_debug('Processing links...')
+                        self.log.debug('Processing links...')
                         self.process_links(soup, newbaseurl, recursion_level+1)
                     else:
                         self.process_return_links(soup, newbaseurl)
-                        self.log_debug('Recursion limit reached. Skipping links in %s', iurl)
+                        self.log.debug('Recursion limit reached. Skipping links in', iurl)
 
                     if callable(self.postprocess_html_ext):
                         soup = self.postprocess_html_ext(soup,
@@ -434,8 +430,7 @@ class RecursiveFetcher(object):
                         self.localize_link(tag, 'href', res)
             except Exception, err:
                 self.failed_links.append((iurl, traceback.format_exc()))
-                self.log_warning('Could not fetch link %s', iurl)
-                self.log_debug('Error: %s', str(err), exc_info=True)
+                self.log.exception('Could not fetch link', iurl)
             finally:
                 self.current_dir = diskpath
                 self.files += 1
@@ -478,12 +473,10 @@ def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.c
     return parser
 
 
-def create_fetcher(options, logger=None, image_map={}):
-    if logger is None:
-        level = logging.DEBUG if options.verbose else logging.INFO
-        logger = logging.getLogger('web2disk')
-        setup_cli_handlers(logger, level)
-    return RecursiveFetcher(options, logger, image_map={})
+def create_fetcher(options, image_map={}, log=None):
+    if log is None:
+        log = Log()
+    return RecursiveFetcher(options, log, image_map={})
 
 def main(args=sys.argv):
     parser = option_parser()
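With the logging rework, callers no longer construct stdlib logging handlers; a fetcher can be obtained with just the parsed options. A hypothetical invocation, assuming the option_parser defaults defined in this file supply options.dir and the other attributes RecursiveFetcher reads:

    from calibre.web.fetch.simple import option_parser, create_fetcher

    parser = option_parser()
    opts, args = parser.parse_args(['http://example.com'])
    # log=None makes create_fetcher build a calibre.utils.logging.Log itself
    fetcher = create_fetcher(opts, log=None)
    fetcher.start_fetch('http://example.com')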