News download: Full WebKit-based framework

News download: Add a framework for scraping JavaScript-heavy sites using
a full JavaScript-enabled, WebKit-based browser.
Kovid Goyal 2013-06-11 11:22:27 +05:30
parent 29b4c093f6
commit c4c63b3a78
3 changed files with 601 additions and 1 deletion


@@ -0,0 +1,341 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os, re
from io import BytesIO
from functools import partial
from calibre import force_unicode, walk
from calibre.constants import __appname__
from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.fetch.javascript import fetch_page, AbortFetch, links_from_selectors
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
def image_data_to_url(data, base='cover'):
from calibre.utils.imghdr import what
ans = BytesIO(data)
ext = what(None, data)
if not ext:
if data.startswith(b'%PDF-'):
ext = 'pdf'
else:
ext = 'jpg'
    ans.name = base + '.' + ext
return ans
class JavascriptRecipe(BasicNewsRecipe):
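    '''
    Base class for recipes that need a full JavaScript enabled WebKit browser
    to download their content. A minimal sketch of a recipe built on this
    class (the site URL, selectors and section data below are purely
    illustrative)::

        class MySiteRecipe(JavascriptRecipe):
            title          = 'My Site'
            keep_only_tags = ['article.post']
            remove_tags    = ['.advert', '.comments']

            def get_publication_data(self, browser):
                browser.visit('http://example.com/latest')
                index = [('News', [
                    {'title': 'An article', 'url': 'http://example.com/article-1'},
                ])]
                return {'index': index}
    '''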
#: Minimum calibre version needed to use this recipe
requires_version = (0, 9, 34)
#: List of tags to be removed. Specified tags are removed from downloaded HTML.
#: A tag is specified using CSS selectors.
#: A common example::
#:
#: remove_tags = ['div.advert', 'div.tools']
#:
#: This will remove all `<div class="advert">` and `<div class="tools">` tags and all
#: their children from the downloaded :term:`HTML`.
remove_tags = ()
#: Remove all tags that occur after the specified tag.
#: A tag is specified using CSS selectors.
#: For example::
#:
    #:    remove_tags_after = '#content'
#:
#: will remove all tags after the first element with `id="content"`.
remove_tags_after = None
#: Remove all tags that occur before the specified tag.
#: A tag is specified using CSS selectors.
#: For example::
#:
    #:    remove_tags_before = '#content'
#:
#: will remove all tags before the first element with `id="content"`.
remove_tags_before = None
#: Keep only the specified tags and their children.
#: Uses the CSS selector syntax.
#: If this list is not empty, then the `<body>` tag will be emptied and re-filled with
#: the tags that match the entries in this list. For example::
#:
    #:    keep_only_tags = ['#content', '#heading']
#:
#: will keep only tags that have an `id` attribute of `"content"` or `"heading"`.
keep_only_tags = ()
#: A list of selectors that match <a href> elements that you want followed.
#: For this to work you must also set recursions to at least 1.
    #: You can get more control by re-implementing :meth:`select_links` in your sub-class.
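    #: For example, a recipe following (purely illustrative) pagination links
    #: might use::
    #:
    #:    recursions = 1
    #:    links_from_selectors = ['a.next-page[href]']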
links_from_selectors = ()
def select_links(self, browser, url, recursion_level):
'''
Override this method in your sub-class to implement arbitrary link following logic. It must return a
list of URLs, each of which will be downloaded in turn.
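
        For example, to follow at most the first two links matching a
        (purely illustrative) CSS selector::

            def select_links(self, browser, url, recursion_level):
                links = []
                for a in browser.css_select('a.pagination[href]', all=True):
                    href = unicode(a.evaluateJavaScript('this.href').toString()).strip()
                    if href:
                        links.append(href)
                return links[:2]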
'''
return links_from_selectors(self.links_from_selectors, self.recursions, browser, url, recursion_level)
def get_jsbrowser(self, *args, **kwargs):
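        '''
        Create the WebKit browser used for all downloads. Override this method
        in your recipe if you need to customize the browser, for example, to
        use a longer default timeout.
        '''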
from calibre.web.jsbrowser.browser import Browser
return Browser(default_timeout=kwargs.get('default_timeout', 120))
def do_login(self, browser, username, password):
'''
        This method is used to log in to a website that is behind a paywall.
        Implement it in your recipe if the site requires a login. An example
        implementation::
def do_login(self, browser, username, password):
browser.visit('http://some-page-that-has-a-login')
form = browser.select_form(nr=0) # Select the first form on the page
form['username'] = username
form['password'] = password
browser.submit(timeout=120) # Submit the form and wait at most two minutes for loading to complete
Note that you can also select forms with CSS2 selectors, like this::
browser.select_form('form#login_form')
            browser.select_form('form[name="someform"]')
'''
pass
def get_publication_data(self, browser):
'''
Download the cover, the masthead image and the list of sections/articles.
        Should return a dictionary with the keys 'index', 'cover' and 'masthead'.
        'cover' and 'masthead' are the raw image data; both are optional and, if
        not present, will be auto-generated.
The index must be in the same format as described in :meth:`parse_index`.
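
        For example (all values below are illustrative)::

            return {
                'cover'   : cover_image_data,     # raw image data (optional)
                'masthead': masthead_image_data,  # raw image data (optional)
                'index'   : [
                    ('Section title', [
                        {'title': 'Article title',
                         'url': 'http://example.com/article',
                         'description': 'Optional article summary'},
                    ]),
                ],
            }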
'''
raise NotImplementedError('You must implement this method in your recipe')
def load_complete(self, browser, url, recursion_level):
'''
This method is called after every page on the website is loaded. To be
precise, it is called when the DOM is ready. If further checks need to
be made, they should be made here. For example, if you want to check
that some element in the DOM is present, you would use::
def load_complete(self, browser, url, rl):
browser.wait_for_element('#article-footer')
return True
where article-footer is the id of the element you want to wait for.
'''
return True
def abort_article(self, msg=None):
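        '''
        Call this method (typically from one of the preprocess/postprocess
        hooks below) to abort the download of the current article.
        '''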
raise AbortFetch(msg or 'Article fetch aborted')
def preprocess_stage1(self, article, browser, url, recursion_level):
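        '''
        Called for every loaded page just after HTML comments have been
        removed, before any other cleanup (javascript removal, keep_only and
        remove processing) is done. Modify the DOM via the browser object
        here, if needed.
        '''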
pass
def preprocess_stage2(self, article, browser, url, recursion_level):
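        '''
        Called for every loaded page after all DOM cleanup (javascript
        removal, keep_only and remove processing) has been done, just before
        resources are downloaded and the HTML is saved.
        '''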
pass
def postprocess_html(self, article, root, url, recursion_level):
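        '''
        Called with the parsed lxml root of every downloaded page after its
        resources have been downloaded. Return the (possibly modified) root,
        or None to abort the article.
        '''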
return root
def index_to_soup(self, url_or_raw, raw=False):
'''
        Convenience method that takes a URL to the index page and returns
a parsed lxml tree representation of it.
`url_or_raw`: Either a URL or the downloaded index page as a string
'''
if re.match(r'\w+://', url_or_raw):
self.jsbrowser.start_load(url_or_raw)
html = self.jsbrowser.html
else:
html = url_or_raw
if isinstance(html, bytes):
html = xml_to_unicode(html)[0]
html = strip_encoding_declarations(html)
if raw:
return html
import html5lib
root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
return root
# ***************************** Internal API *****************************
def _preprocess_browser(self, article, browser, url, stage, recursion_level):
func = getattr(self, 'preprocess_stage%d' % stage)
return func(article, browser, url, recursion_level)
def _postprocess_html(self, article, feed_num, art_num, feed_len, root, url, recursion_level):
from lxml.html.builder import STYLE
if self.no_stylesheets:
for link in root.xpath('//link[@href]'):
                if (link.get('type', '') or 'text/css') == 'text/css':
link.getparent().remove(link)
for style in root.xpath('//style'):
style.getparent().remove(style)
head = root.xpath('//head|//body')
head = head[0] if head else next(root.iterdescendants())
head.append(STYLE(self.template_css + '\n\n' + (self.extra_css or '')))
if recursion_level == 0:
body = root.xpath('//body')
if body:
templ = self.navbar.generate(
False, feed_num, art_num, feed_len, not self.has_single_feed, url,
__appname__, center=self.center_navbar,
extra_css=self.extra_css)
                body[0].insert(0, templ.root.xpath('//div')[0])
remove_attrs = set(self.remove_attributes)
if self.remove_javascript:
remove_attrs.add('onload')
for script in root.xpath('//*[name()="script" or name()="noscript"]'):
script.getparent().remove(script)
for attr in remove_attrs:
for tag in root.xpath('//*[@%s]' % attr):
tag.attrib.pop(attr, None)
nuke = ['base', 'iframe', 'canvas', 'embed', 'command', 'datalist', 'video', 'audio', 'form']
for tag in root.xpath('|'.join('//%s' % tag for tag in nuke)):
tag.getparent().remove(tag)
root = self.postprocess_html(article, root, url, recursion_level)
if root is not None:
# Nuke HTML5 tags
tags = ['article', 'aside', 'header', 'footer', 'nav', 'figcaption', 'figure', 'section']
for tag in root.xpath('|'.join('//%s' % tag for tag in tags)):
tag.tag = 'div'
self.populate_article_metadata(article, root, recursion_level == 0)
return root
def download(self):
browser = self.jsbrowser = self.get_jsbrowser()
with browser:
try:
if self.needs_subscription and self.username and self.password:
self.do_login(browser, self.username, self.password)
data = self.get_publication_data(browser)
# Process cover, if any
cdata = data.get('cover', None)
if cdata:
self.cover_url = image_data_to_url(cdata)
self.download_cover()
# Process masthead, if any
mdata = data.get('masthead', None)
if mdata:
self.masthead_url = image_data_to_url(mdata)
self.resolve_masthead()
# Process the list of sections/articles
return self.build_index(data, browser)
finally:
self.cleanup()
def build_index(self, data, browser):
sections = data.get('index', None)
if not sections:
raise ValueError('No articles found, aborting')
feeds = feeds_from_index(sections, oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
log=self.log)
if not feeds:
raise ValueError('No articles found, aborting')
if self.ignore_duplicate_articles is not None:
feeds = self.remove_duplicate_articles(feeds)
if self.test:
feeds = feeds[:2]
self.has_single_feed = len(feeds) == 1
index = os.path.join(self.output_dir, 'index.html')
html = self.feeds2index(feeds)
with open(index, 'wb') as fi:
fi.write(html)
if self.reverse_article_order:
for feed in feeds:
if hasattr(feed, 'reverse'):
feed.reverse()
self.report_progress(0, _('Got feeds from index page'))
resource_cache = {}
total = 0
for feed in feeds:
total += min(self.max_articles_per_feed, len(feed))
num = 0
for f, feed in enumerate(feeds):
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
if not os.path.isdir(feed_dir):
os.makedirs(feed_dir)
for a, article in enumerate(feed):
if a >= self.max_articles_per_feed:
break
num += 1
art_dir = os.path.join(feed_dir, 'article_%d'%a)
if not os.path.isdir(art_dir):
os.makedirs(art_dir)
try:
url = self.print_version(article.url)
except NotImplementedError:
url = article.url
except:
self.log.exception('Failed to find print version for: '+article.url)
url = None
if not url:
continue
self.log('Fetching article:', article.title, 'from', url)
try:
pages = fetch_page(
url,
load_complete=self.load_complete,
links=self.select_links,
remove=self.remove_tags,
keep_only=self.keep_only_tags,
preprocess_browser=partial(self._preprocess_browser, article),
postprocess_html=partial(self._postprocess_html, article, f, a, len(feed)),
remove_before=self.remove_tags_before,
remove_after=self.remove_tags_after,
remove_javascript=self.remove_javascript,
resource_cache=resource_cache, output_dir=art_dir, browser=browser)
except AbortFetch:
self.log.exception('Fetching of article: %r aborted' % article.title)
continue
except Exception:
self.log.exception('Fetching of article: %r failed' % article.title)
continue
self.log.debug('Downloaded article:', article.title, 'from', article.url)
article.orig_url = article.url
article.url = 'article_%d/index.html'%a
article.downloaded = True
article.sub_pages = pages[1:]
self.report_progress(float(num)/total,
_(u'Article downloaded: %s')%force_unicode(article.title))
for f, feed in enumerate(feeds):
html = self.feed2index(f, feeds)
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
fi.write(html)
if self.no_stylesheets:
for f in walk(self.output_dir):
if f.endswith('.css'):
os.remove(f)
self.create_opf(feeds)
self.report_progress(1, _('Download finished'))
return index


@@ -7,11 +7,12 @@ Builtin recipes.
 import re, time, io
 from calibre.web.feeds.news import (BasicNewsRecipe, CustomIndexRecipe,
                                     AutomaticNewsRecipe, CalibrePeriodical)
+from calibre.web.feeds.jsnews import JavascriptRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.utils.config import JSONConfig
 
 basic_recipes = (BasicNewsRecipe, AutomaticNewsRecipe, CustomIndexRecipe,
-                 CalibrePeriodical)
+                 CalibrePeriodical, JavascriptRecipe)
 
 custom_recipes = JSONConfig('custom_recipes/index.json')


@@ -0,0 +1,258 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import time, os, hashlib
from operator import attrgetter
from collections import defaultdict
from functools import partial
from calibre import jsbrowser
from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.utils.imghdr import what
from calibre.web.jsbrowser.browser import Timeout
# remove_comments() {{{
remove_comments = '''
function remove_comments(node) {
var nodes = node.childNodes, i=0, t;
while((t = nodes.item(i++))) {
switch(t.nodeType){
case Node.ELEMENT_NODE:
remove_comments(t);
break;
case Node.COMMENT_NODE:
node.removeChild(t);
i--;
}
}
}
remove_comments(document)
''' # }}}
class AbortFetch(ValueError):
pass
def children(elem):
elem = elem.firstChild()
while not elem.isNull():
yield elem
elem = elem.nextSibling()
def apply_keep_only(browser, keep_only):
mf = browser.page.mainFrame()
body = mf.findFirstElement('body')
if body.isNull():
browser.log.error('Document has no body, cannot apply keep_only')
return
keep = []
for selector in keep_only:
keep.extend(x for x in mf.findAllElements(selector))
if not keep:
browser.log.error('Failed to find any elements matching the keep_only selectors: %r' % keep_only)
return
for elem in keep:
body.appendInside(elem)
for elem in tuple(children(body)):
preserve = False
for x in keep:
if x == elem:
preserve = True
break
if preserve:
break
elem.removeFromDocument()
def apply_remove(browser, remove):
mf = browser.page.mainFrame()
for selector in remove:
for elem in mf.findAllElements(selector):
if not elem.isNull():
elem.removeFromDocument()
def remove_beyond(browser, selector, before=True):
mf = browser.page.mainFrame()
elem = mf.findFirstElement(selector)
if elem.isNull():
browser.log('Failed to find any element matching the selector: %s' % selector)
return
next_sibling = attrgetter('previousSibling' if before else 'nextSibling')
    # QWebElement.tagName() returns upper case names for HTML elements
    while not elem.isNull() and unicode(elem.tagName()).lower() != 'body':
remove = []
after = next_sibling(elem)()
while not after.isNull():
remove.append(after)
after = next_sibling(after)()
for x in remove:
x.removeFromDocument()
elem = elem.parent()
def is_tag(elem, name):
return unicode(elem.tagName()).lower() == name.lower()
def download_resources(browser, resource_cache, output_dir):
img_counter = style_counter = 0
resources = defaultdict(list)
for img in browser.css_select('img[src]', all=True):
# Using javascript ensures that absolute URLs are returned, direct
# attribute access does not do that
src = unicode(img.evaluateJavaScript('this.src').toString()).strip()
if src:
resources[src].append(img)
for link in browser.css_select('link[href]', all=True):
lt = unicode(link.attribute('type')).strip() or 'text/css'
rel = unicode(link.attribute('rel')).strip() or 'stylesheet'
if lt == 'text/css' and rel == 'stylesheet':
href = unicode(link.evaluateJavaScript('this.href').toString()).strip()
if href:
resources[href].append(link)
else:
link.removeFromDocument()
else:
link.removeFromDocument()
loaded_resources = browser.wait_for_resources(resources)
for url, raw in loaded_resources.iteritems():
h = hashlib.sha1(raw).digest()
if h in resource_cache:
href = os.path.relpath(resource_cache[h], output_dir).replace(os.sep, '/')
else:
elem = resources[url][0]
if is_tag(elem, 'link'):
style_counter += 1
href = 'style_%d.css' % style_counter
else:
img_counter += 1
ext = what(None, raw) or 'jpg'
href = 'img_%d.%s' % (img_counter, ext)
dest = os.path.join(output_dir, href)
resource_cache[h] = dest
with open(dest, 'wb') as f:
f.write(raw)
for elem in resources[url]:
elem.setAttribute('href' if is_tag(elem, 'link') else 'src', href)
failed = set(resources) - set(loaded_resources)
for url in failed:
for elem in resources[url]:
attr = 'href' if is_tag(elem, 'link') else 'src'
elem.setAttribute(attr, '')
def save_html(browser, output_dir, postprocess_html, url, recursion_level):
html = strip_encoding_declarations(browser.html)
import html5lib
root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
root = postprocess_html(root, url, recursion_level)
if root is None:
# user wants this page to be aborted
raise AbortFetch('%s was aborted during postprocess' % url)
with open(os.path.join(output_dir, 'index.html'), 'wb') as f:
from lxml.html import tostring
f.write(tostring(root, include_meta_content_type=True, encoding='utf-8', pretty_print=True))
return f.name
def links_from_selectors(selectors, recursions, browser, url, recursion_level):
ans = []
if recursions > recursion_level:
for selector in selectors:
for a in browser.css_select(selector, all=True):
href = unicode(a.evaluateJavaScript('this.href').toString()).strip()
if href:
ans.append(href)
return ans
def clean_dom(
browser, url, recursion_level, preprocess_browser, remove_javascript,
keep_only, remove_after, remove_before, remove):
# Remove comments as otherwise we can end up with nested comments, which
# cause problems later
browser.page.mainFrame().evaluateJavaScript(remove_comments)
preprocess_browser(browser, url, 1, recursion_level)
if remove_javascript:
for elem in browser.css_select('script', all=True):
elem.removeFromDocument()
if keep_only:
apply_keep_only(browser, keep_only)
if remove_after:
remove_beyond(browser, remove_after, before=False)
if remove_before:
remove_beyond(browser, remove_before, before=True)
if remove:
apply_remove(browser, remove)
preprocess_browser(browser, url, 2, recursion_level)
def fetch_page(
url=None,
load_complete=lambda browser, url, recursion_level: True,
links=lambda browser, url, recursion_level: (),
keep_only=(),
remove_after=None,
remove_before=None,
remove=(),
remove_javascript=True,
preprocess_browser=lambda browser, url, stage, recursion_level:None,
postprocess_html=lambda root, url, recursion_level: root,
resource_cache={},
output_dir=None,
browser=None,
recursion_level=0
):
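    '''
    Load url in the browser, clean up the DOM, save the resulting HTML and its
    resources into output_dir and recursively fetch any pages linked via the
    links() callback. Returns a tuple of paths to the saved HTML files; the
    first entry is the page for url itself, the rest are the linked sub-pages.
    '''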
output_dir = output_dir or os.getcwdu()
if browser is None:
browser = jsbrowser()
# Load the DOM
if url is not None:
start_time = time.time()
browser.start_load(url)
while not load_complete(browser, url, recursion_level):
browser.run_for_a_time(0.1)
if time.time() - start_time > browser.default_timeout:
raise Timeout('Timed out while waiting for %s to load' % url)
children = links(browser, url, recursion_level)
# Cleanup the DOM
clean_dom(
browser, url, recursion_level, preprocess_browser,
remove_javascript, keep_only, remove_after, remove_before, remove)
# Download resources
download_resources(browser, resource_cache, output_dir)
# Get HTML from the DOM
pages = [save_html(browser, output_dir, postprocess_html, url, recursion_level)]
# Fetch the linked pages
for i, curl in enumerate(children):
odir = os.path.join(output_dir, 'link%d' % (i + 1))
if not os.path.exists(odir):
os.mkdir(odir)
try:
pages.extend(fetch_page(
curl, load_complete=load_complete, links=links, keep_only=keep_only,
remove_after=remove_after, remove_before=remove_before, remove=remove,
preprocess_browser=preprocess_browser, postprocess_html=postprocess_html,
resource_cache=resource_cache, output_dir=odir, browser=browser,
recursion_level=recursion_level+1))
except AbortFetch:
continue
return tuple(pages)
if __name__ == '__main__':
browser = jsbrowser()
fetch_page('http://www.time.com/time/magazine/article/0,9171,2145057,00.html', browser=browser,
links=partial(links_from_selectors, ('.wp-paginate a.page[href]',), 1),
keep_only=('article.post',), remove=('.entry-sharing', '.entry-footer', '.wp-paginate', '.post-rail'))