News download: Full WebKit based framework
News download: Add a framework for scraping javascript heavy sites using a full javascript enabled WebKit based browser.
This commit is contained in:
parent 29b4c093f6
commit c4c63b3a78
src/calibre/web/feeds/jsnews.py (new file, 341 lines)
@@ -0,0 +1,341 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

import os, re
from io import BytesIO
from functools import partial

from calibre import force_unicode, walk
from calibre.constants import __appname__
from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.fetch.javascript import fetch_page, AbortFetch, links_from_selectors
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations


def image_data_to_url(data, base='cover'):
    from calibre.utils.imghdr import what
    ans = BytesIO(data)
    ext = what(None, data)
    if not ext:
        if data.startswith(b'%PDF-'):
            ext = 'pdf'
        else:
            ext = 'jpg'
    ans.name = 'cover.' + ext
    return ans

class JavascriptRecipe(BasicNewsRecipe):

    #: Minimum calibre version needed to use this recipe
    requires_version = (0, 9, 34)

    #: List of tags to be removed. Specified tags are removed from downloaded HTML.
    #: A tag is specified using CSS selectors.
    #: A common example::
    #:
    #:   remove_tags = ['div.advert', 'div.tools']
    #:
    #: This will remove all `<div class="advert">` and `<div class="tools">` tags and all
    #: their children from the downloaded :term:`HTML`.
    remove_tags = ()

    #: Remove all tags that occur after the specified tag.
    #: A tag is specified using CSS selectors.
    #: For example::
    #:
    #:   remove_tags_after = '#content'
    #:
    #: will remove all tags after the first element with `id="content"`.
    remove_tags_after = None

    #: Remove all tags that occur before the specified tag.
    #: A tag is specified using CSS selectors.
    #: For example::
    #:
    #:   remove_tags_before = '#content'
    #:
    #: will remove all tags before the first element with `id="content"`.
    remove_tags_before = None

    #: Keep only the specified tags and their children.
    #: Uses the CSS selector syntax.
    #: If this list is not empty, then the `<body>` tag will be emptied and re-filled with
    #: the tags that match the entries in this list. For example::
    #:
    #:   keep_only_tags = ['#content', '#heading']
    #:
    #: will keep only tags that have an `id` attribute of `"content"` or `"heading"`.
    keep_only_tags = ()

    #: A list of selectors that match <a href> elements that you want followed.
    #: For this to work you must also set recursions to at least 1.
    #: You can get more control by re-implementing :meth:`select_links` in your sub-class.
    links_from_selectors = ()

    def select_links(self, browser, url, recursion_level):
        '''
        Override this method in your sub-class to implement arbitrary link following logic. It must return a
        list of URLs, each of which will be downloaded in turn.
        '''
        return links_from_selectors(self.links_from_selectors, self.recursions, browser, url, recursion_level)

    def get_jsbrowser(self, *args, **kwargs):
        from calibre.web.jsbrowser.browser import Browser
        return Browser(default_timeout=kwargs.get('default_timeout', 120))

    def do_login(self, browser, username, password):
        '''
        This method is used to login to a website that uses a paywall. Implement it in
        your recipe if the site uses a paywall. An example implementation::

            def do_login(self, browser, username, password):
                browser.visit('http://some-page-that-has-a-login')
                form = browser.select_form(nr=0)  # Select the first form on the page
                form['username'] = username
                form['password'] = password
                browser.submit(timeout=120)  # Submit the form and wait at most two minutes for loading to complete

        Note that you can also select forms with CSS2 selectors, like this::

            browser.select_form('form#login_form')
            browser.select_form('form[name="someform"]')
        '''

        pass

    def get_publication_data(self, browser):
        '''
        Download the cover, the masthead image and the list of sections/articles.
        Should return a dictionary with keys 'index', 'cover' and 'masthead'.
        'cover' and 'masthead' are optional, if not present, they will be auto-generated.
        The index must be in the same format as described in :meth:`parse_index`.
        '''
        raise NotImplementedError('You must implement this method in your recipe')

    def load_complete(self, browser, url, recursion_level):
        '''
        This method is called after every page on the website is loaded. To be
        precise, it is called when the DOM is ready. If further checks need to
        be made, they should be made here. For example, if you want to check
        that some element in the DOM is present, you would use::

            def load_complete(self, browser, url, rl):
                browser.wait_for_element('#article-footer')
                return True

        where article-footer is the id of the element you want to wait for.
        '''
        return True

    def abort_article(self, msg=None):
        raise AbortFetch(msg or 'Article fetch aborted')

    def preprocess_stage1(self, article, browser, url, recursion_level):
        pass

    def preprocess_stage2(self, article, browser, url, recursion_level):
        pass

    def postprocess_html(self, article, root, url, recursion_level):
        return root

    def index_to_soup(self, url_or_raw, raw=False):
        '''
        Convenience method that takes a URL to the index page and returns
        a parsed lxml tree representation of it.

        `url_or_raw`: Either a URL or the downloaded index page as a string
        '''
        if re.match(r'\w+://', url_or_raw):
            self.jsbrowser.start_load(url_or_raw)
            html = self.jsbrowser.html
        else:
            html = url_or_raw
            if isinstance(html, bytes):
                html = xml_to_unicode(html)[0]
            html = strip_encoding_declarations(html)
        if raw:
            return html
        import html5lib
        root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
        return root

    # ***************************** Internal API *****************************

    def _preprocess_browser(self, article, browser, url, stage, recursion_level):
        func = getattr(self, 'preprocess_stage%d' % stage)
        return func(article, browser, url, recursion_level)

    def _postprocess_html(self, article, feed_num, art_num, feed_len, root, url, recursion_level):
        from lxml.html.builder import STYLE
        if self.no_stylesheets:
            for link in root.xpath('//link[@href]'):
                if (link.get('type', '') or 'text/css') == 'text/css':
                    link.getparent().remove(link)
            for style in root.xpath('//style'):
                style.getparent().remove(style)
        head = root.xpath('//head|//body')
        head = head[0] if head else next(root.iterdescendants())
        head.append(STYLE(self.template_css + '\n\n' + (self.extra_css or '')))

        if recursion_level == 0:
            body = root.xpath('//body')
            if body:
                templ = self.navbar.generate(
                    False, feed_num, art_num, feed_len, not self.has_single_feed, url,
                    __appname__, center=self.center_navbar,
                    extra_css=self.extra_css)
                body[0].insert(0, templ.root.xpath('//div')[0])

        remove_attrs = set(self.remove_attributes)
        if self.remove_javascript:
            remove_attrs.add('onload')
            for script in root.xpath('//*[name()="script" or name()="noscript"]'):
                script.getparent().remove(script)

        for attr in remove_attrs:
            for tag in root.xpath('//*[@%s]' % attr):
                tag.attrib.pop(attr, None)

        nuke = ['base', 'iframe', 'canvas', 'embed', 'command', 'datalist', 'video', 'audio', 'form']
        for tag in root.xpath('|'.join('//%s' % tag for tag in nuke)):
            tag.getparent().remove(tag)

        root = self.postprocess_html(article, root, url, recursion_level)
        if root is not None:
            # Nuke HTML5 tags
            tags = ['article', 'aside', 'header', 'footer', 'nav', 'figcaption', 'figure', 'section']
            for tag in root.xpath('|'.join('//%s' % tag for tag in tags)):
                tag.tag = 'div'

            self.populate_article_metadata(article, root, recursion_level == 0)

        return root

    def download(self):
        browser = self.jsbrowser = self.get_jsbrowser()
        with browser:
            try:
                if self.needs_subscription and self.username and self.password:
                    self.do_login(browser, self.username, self.password)
                data = self.get_publication_data(browser)

                # Process cover, if any
                cdata = data.get('cover', None)
                if cdata:
                    self.cover_url = image_data_to_url(cdata)
                self.download_cover()

                # Process masthead, if any
                mdata = data.get('masthead', None)
                if mdata:
                    self.masthead_url = image_data_to_url(mdata)
                self.resolve_masthead()

                # Process the list of sections/articles
                return self.build_index(data, browser)
            finally:
                self.cleanup()

    def build_index(self, data, browser):
        sections = data.get('index', None)
        if not sections:
            raise ValueError('No articles found, aborting')

        feeds = feeds_from_index(sections, oldest_article=self.oldest_article,
                                 max_articles_per_feed=self.max_articles_per_feed,
                                 log=self.log)
        if not feeds:
            raise ValueError('No articles found, aborting')
        if self.ignore_duplicate_articles is not None:
            feeds = self.remove_duplicate_articles(feeds)
        if self.test:
            feeds = feeds[:2]
        self.has_single_feed = len(feeds) == 1
        index = os.path.join(self.output_dir, 'index.html')

        html = self.feeds2index(feeds)
        with open(index, 'wb') as fi:
            fi.write(html)

        if self.reverse_article_order:
            for feed in feeds:
                if hasattr(feed, 'reverse'):
                    feed.reverse()

        self.report_progress(0, _('Got feeds from index page'))
        resource_cache = {}

        total = 0
        for feed in feeds:
            total += min(self.max_articles_per_feed, len(feed))
        num = 0

        for f, feed in enumerate(feeds):
            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
            if not os.path.isdir(feed_dir):
                os.makedirs(feed_dir)

            for a, article in enumerate(feed):
                if a >= self.max_articles_per_feed:
                    break
                num += 1
                art_dir = os.path.join(feed_dir, 'article_%d'%a)
                if not os.path.isdir(art_dir):
                    os.makedirs(art_dir)
                try:
                    url = self.print_version(article.url)
                except NotImplementedError:
                    url = article.url
                except:
                    self.log.exception('Failed to find print version for: '+article.url)
                    url = None
                if not url:
                    continue

                self.log('Fetching article:', article.title, 'from', url)
                try:
                    pages = fetch_page(
                        url,
                        load_complete=self.load_complete,
                        links=self.select_links,
                        remove=self.remove_tags,
                        keep_only=self.keep_only_tags,
                        preprocess_browser=partial(self._preprocess_browser, article),
                        postprocess_html=partial(self._postprocess_html, article, f, a, len(feed)),
                        remove_before=self.remove_tags_before,
                        remove_after=self.remove_tags_after,
                        remove_javascript=self.remove_javascript,
                        resource_cache=resource_cache, output_dir=art_dir, browser=browser)
                except AbortFetch:
                    self.log.exception('Fetching of article: %r aborted' % article.title)
                    continue
                except Exception:
                    self.log.exception('Fetching of article: %r failed' % article.title)
                    continue
                self.log.debug('Downloaded article:', article.title, 'from', article.url)
                article.orig_url = article.url
                article.url = 'article_%d/index.html'%a
                article.downloaded = True
                article.sub_pages = pages[1:]
                self.report_progress(float(num)/total,
                    _(u'Article downloaded: %s')%force_unicode(article.title))

        for f, feed in enumerate(feeds):
            html = self.feed2index(f, feeds)
            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
            with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
                fi.write(html)
        if self.no_stylesheets:
            for f in walk(self.output_dir):
                if f.endswith('.css'):
                    os.remove(f)
        self.create_opf(feeds)
        self.report_progress(1, _('Download finished'))
        return index
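
For orientation, a concrete recipe built on this class just fills in the hooks documented above. The sketch below is illustrative only and is not part of this commit; the site URL, selectors and section name are hypothetical placeholders, and get_publication_data returns an 'index' in the (section title, list of article dicts) format that parse_index documents.

    # Hypothetical example recipe (not part of this commit)
    class ExampleJSRecipe(JavascriptRecipe):
        title = 'Example JS Paper'
        recursions = 1  # needed for links_from_selectors to have any effect
        keep_only_tags = ('article.post',)
        remove_tags = ('.advert', '.sharing-tools')
        links_from_selectors = ('.pagination a.next[href]',)

        def get_publication_data(self, browser):
            browser.visit('http://example.com/front-page')  # placeholder URL
            root = self.index_to_soup(browser.html)
            articles = [{'title': a.text_content().strip(), 'url': a.get('href')}
                        for a in root.xpath('//div[@class="teaser"]/a[@href]')]
            # 'cover' and 'masthead' (raw image bytes) are optional keys
            return {'index': [('Front Page', articles)]}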
@@ -7,11 +7,12 @@ Builtin recipes.
 import re, time, io
 from calibre.web.feeds.news import (BasicNewsRecipe, CustomIndexRecipe,
                                     AutomaticNewsRecipe, CalibrePeriodical)
+from calibre.web.feeds.jsnews import JavascriptRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.utils.config import JSONConfig

 basic_recipes = (BasicNewsRecipe, AutomaticNewsRecipe, CustomIndexRecipe,
-                 CalibrePeriodical)
+                 CalibrePeriodical, JavascriptRecipe)

 custom_recipes = JSONConfig('custom_recipes/index.json')

src/calibre/web/fetch/javascript.py (new file, 258 lines)
@@ -0,0 +1,258 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

import time, os, hashlib
from operator import attrgetter
from collections import defaultdict
from functools import partial

from calibre import jsbrowser
from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.utils.imghdr import what
from calibre.web.jsbrowser.browser import Timeout

# remove_comments() {{{
remove_comments = '''
function remove_comments(node) {
    var nodes = node.childNodes, i=0, t;
    while((t = nodes.item(i++))) {
        switch(t.nodeType){
            case Node.ELEMENT_NODE:
                remove_comments(t);
                break;
            case Node.COMMENT_NODE:
                node.removeChild(t);
                i--;
        }
    }
}
remove_comments(document)
'''  # }}}


class AbortFetch(ValueError):
    pass


def children(elem):
    elem = elem.firstChild()
    while not elem.isNull():
        yield elem
        elem = elem.nextSibling()

def apply_keep_only(browser, keep_only):
    mf = browser.page.mainFrame()
    body = mf.findFirstElement('body')
    if body.isNull():
        browser.log.error('Document has no body, cannot apply keep_only')
        return
    keep = []
    for selector in keep_only:
        keep.extend(x for x in mf.findAllElements(selector))
    if not keep:
        browser.log.error('Failed to find any elements matching the keep_only selectors: %r' % keep_only)
        return
    for elem in keep:
        body.appendInside(elem)
    for elem in tuple(children(body)):
        preserve = False
        for x in keep:
            if x == elem:
                preserve = True
                break
        if preserve:
            break
        elem.removeFromDocument()


def apply_remove(browser, remove):
    mf = browser.page.mainFrame()
    for selector in remove:
        for elem in mf.findAllElements(selector):
            if not elem.isNull():
                elem.removeFromDocument()


def remove_beyond(browser, selector, before=True):
    mf = browser.page.mainFrame()
    elem = mf.findFirstElement(selector)
    if elem.isNull():
        browser.log('Failed to find any element matching the selector: %s' % selector)
        return
    next_sibling = attrgetter('previousSibling' if before else 'nextSibling')

    while not elem.isNull() and unicode(elem.tagName()) != 'body':
        remove = []
        after = next_sibling(elem)()
        while not after.isNull():
            remove.append(after)
            after = next_sibling(after)()
        for x in remove:
            x.removeFromDocument()
        elem = elem.parent()


def is_tag(elem, name):
    return unicode(elem.tagName()).lower() == name.lower()

def download_resources(browser, resource_cache, output_dir):
    img_counter = style_counter = 0
    resources = defaultdict(list)
    for img in browser.css_select('img[src]', all=True):
        # Using javascript ensures that absolute URLs are returned, direct
        # attribute access does not do that
        src = unicode(img.evaluateJavaScript('this.src').toString()).strip()
        if src:
            resources[src].append(img)
    for link in browser.css_select('link[href]', all=True):
        lt = unicode(link.attribute('type')).strip() or 'text/css'
        rel = unicode(link.attribute('rel')).strip() or 'stylesheet'
        if lt == 'text/css' and rel == 'stylesheet':
            href = unicode(link.evaluateJavaScript('this.href').toString()).strip()
            if href:
                resources[href].append(link)
            else:
                link.removeFromDocument()
        else:
            link.removeFromDocument()
    loaded_resources = browser.wait_for_resources(resources)
    for url, raw in loaded_resources.iteritems():
        h = hashlib.sha1(raw).digest()
        if h in resource_cache:
            href = os.path.relpath(resource_cache[h], output_dir).replace(os.sep, '/')
        else:
            elem = resources[url][0]
            if is_tag(elem, 'link'):
                style_counter += 1
                href = 'style_%d.css' % style_counter
            else:
                img_counter += 1
                ext = what(None, raw) or 'jpg'
                href = 'img_%d.%s' % (img_counter, ext)
            dest = os.path.join(output_dir, href)
            resource_cache[h] = dest
            with open(dest, 'wb') as f:
                f.write(raw)
        for elem in resources[url]:
            elem.setAttribute('href' if is_tag(elem, 'link') else 'src', href)

    failed = set(resources) - set(loaded_resources)
    for url in failed:
        for elem in resources[url]:
            attr = 'href' if is_tag(elem, 'link') else 'src'
            elem.setAttribute(attr, '')

def save_html(browser, output_dir, postprocess_html, url, recursion_level):
    html = strip_encoding_declarations(browser.html)
    import html5lib
    root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
    root = postprocess_html(root, url, recursion_level)
    if root is None:
        # user wants this page to be aborted
        raise AbortFetch('%s was aborted during postprocess' % url)
    with open(os.path.join(output_dir, 'index.html'), 'wb') as f:
        from lxml.html import tostring
        f.write(tostring(root, include_meta_content_type=True, encoding='utf-8', pretty_print=True))
        return f.name


def links_from_selectors(selectors, recursions, browser, url, recursion_level):
    ans = []
    if recursions > recursion_level:
        for selector in selectors:
            for a in browser.css_select(selector, all=True):
                href = unicode(a.evaluateJavaScript('this.href').toString()).strip()
                if href:
                    ans.append(href)
    return ans

def clean_dom(
    browser, url, recursion_level, preprocess_browser, remove_javascript,
    keep_only, remove_after, remove_before, remove):

    # Remove comments as otherwise we can end up with nested comments, which
    # cause problems later
    browser.page.mainFrame().evaluateJavaScript(remove_comments)

    preprocess_browser(browser, url, 1, recursion_level)
    if remove_javascript:
        for elem in browser.css_select('script', all=True):
            elem.removeFromDocument()
    if keep_only:
        apply_keep_only(browser, keep_only)
    if remove_after:
        remove_beyond(browser, remove_after, before=False)
    if remove_before:
        remove_beyond(browser, remove_before, before=True)
    if remove:
        apply_remove(browser, remove)
    preprocess_browser(browser, url, 2, recursion_level)

def fetch_page(
    url=None,
    load_complete=lambda browser, url, recursion_level: True,
    links=lambda browser, url, recursion_level: (),
    keep_only=(),
    remove_after=None,
    remove_before=None,
    remove=(),
    remove_javascript=True,
    preprocess_browser=lambda browser, url, stage, recursion_level: None,
    postprocess_html=lambda root, url, recursion_level: root,
    resource_cache={},
    output_dir=None,
    browser=None,
    recursion_level=0
):

    output_dir = output_dir or os.getcwdu()
    if browser is None:
        browser = jsbrowser()

    # Load the DOM
    if url is not None:
        start_time = time.time()
        browser.start_load(url)
        while not load_complete(browser, url, recursion_level):
            browser.run_for_a_time(0.1)
            if time.time() - start_time > browser.default_timeout:
                raise Timeout('Timed out while waiting for %s to load' % url)

    children = links(browser, url, recursion_level)

    # Cleanup the DOM
    clean_dom(
        browser, url, recursion_level, preprocess_browser,
        remove_javascript, keep_only, remove_after, remove_before, remove)

    # Download resources
    download_resources(browser, resource_cache, output_dir)

    # Get HTML from the DOM
    pages = [save_html(browser, output_dir, postprocess_html, url, recursion_level)]

    # Fetch the linked pages
    for i, curl in enumerate(children):
        odir = os.path.join(output_dir, 'link%d' % (i + 1))
        if not os.path.exists(odir):
            os.mkdir(odir)
        try:
            pages.extend(fetch_page(
                curl, load_complete=load_complete, links=links, keep_only=keep_only,
                remove_after=remove_after, remove_before=remove_before, remove=remove,
                preprocess_browser=preprocess_browser, postprocess_html=postprocess_html,
                resource_cache=resource_cache, output_dir=odir, browser=browser,
                recursion_level=recursion_level+1))
        except AbortFetch:
            continue
    return tuple(pages)

if __name__ == '__main__':
    browser = jsbrowser()
    fetch_page('http://www.time.com/time/magazine/article/0,9171,2145057,00.html', browser=browser,
               links=partial(links_from_selectors, ('.wp-paginate a.page[href]',), 1),
               keep_only=('article.post',), remove=('.entry-sharing', '.entry-footer', '.wp-paginate', '.post-rail'))
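
fetch_page() can also be driven with a custom load_complete callback when DOM-ready alone is not a sufficient signal, mirroring the load_complete() hook documented in jsnews.py. A minimal sketch with a placeholder URL and selector (not part of this commit):

    # Hypothetical standalone use with an explicit readiness check (not part of this commit)
    def wait_for_article(browser, url, recursion_level):
        browser.wait_for_element('#article-body')  # placeholder selector
        return True

    browser = jsbrowser()
    fetch_page('http://example.com/some-article', browser=browser,
               load_complete=wait_for_article,
               keep_only=('#article-body',))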