From c4c63b3a78e5b79f03d830d92cae7fb8730d2901 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 11 Jun 2013 11:22:27 +0530 Subject: [PATCH] News download: Full WebKit based framework News download: Add a framework for scraping javascript heavy sites using a full javascript enabled WebKit based browser. --- src/calibre/web/feeds/jsnews.py | 341 ++++++++++++++++++++++ src/calibre/web/feeds/recipes/__init__.py | 3 +- src/calibre/web/fetch/javascript.py | 258 ++++++++++++++++ 3 files changed, 601 insertions(+), 1 deletion(-) create mode 100644 src/calibre/web/feeds/jsnews.py create mode 100644 src/calibre/web/fetch/javascript.py diff --git a/src/calibre/web/feeds/jsnews.py b/src/calibre/web/feeds/jsnews.py new file mode 100644 index 0000000000..b23f97b3e0 --- /dev/null +++ b/src/calibre/web/feeds/jsnews.py @@ -0,0 +1,341 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' + +import os, re +from io import BytesIO +from functools import partial + +from calibre import force_unicode, walk +from calibre.constants import __appname__ +from calibre.web.feeds import feeds_from_index +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.web.fetch.javascript import fetch_page, AbortFetch, links_from_selectors +from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations + +def image_data_to_url(data, base='cover'): + from calibre.utils.imghdr import what + ans = BytesIO(data) + ext = what(None, data) + if not ext: + if data.startswith(b'%PDF-'): + ext = 'pdf' + else: + ext = 'jpg' + ans.name = 'cover.' + ext + return ans + +class JavascriptRecipe(BasicNewsRecipe): + + #: Minimum calibre version needed to use this recipe + requires_version = (0, 9, 34) + + #: List of tags to be removed. Specified tags are removed from downloaded HTML. + #: A tag is specified using CSS selectors. + #: A common example:: + #: + #: remove_tags = ['div.advert', 'div.tools'] + #: + #: This will remove all `