News download: Full WebKit-based framework

News download: Add a framework for scraping JavaScript-heavy sites using
a full JavaScript-enabled, WebKit-based browser.
Kovid Goyal 2013-06-11 11:22:27 +05:30
parent 29b4c093f6
commit c4c63b3a78
3 changed files with 601 additions and 1 deletion


@@ -0,0 +1,341 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os, re
from io import BytesIO
from functools import partial
from calibre import force_unicode, walk
from calibre.constants import __appname__
from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.fetch.javascript import fetch_page, AbortFetch, links_from_selectors
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
def image_data_to_url(data, base='cover'):
from calibre.utils.imghdr import what
ans = BytesIO(data)
ext = what(None, data)
if not ext:
if data.startswith(b'%PDF-'):
ext = 'pdf'
else:
ext = 'jpg'
    ans.name = base + '.' + ext
return ans
class JavascriptRecipe(BasicNewsRecipe):
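    '''
    Base class for recipes that need a full JavaScript enabled WebKit browser
    to download their content. A minimal sketch of a recipe built on this
    class (the site URL, selectors and section data below are purely
    illustrative)::

        class MySiteRecipe(JavascriptRecipe):
            title          = 'My Site'
            keep_only_tags = ['article.post']
            remove_tags    = ['.advert', '.comments']

            def get_publication_data(self, browser):
                browser.visit('http://example.com/latest')
                index = [('News', [
                    {'title': 'An article', 'url': 'http://example.com/article-1'},
                ])]
                return {'index': index}
    '''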
#: Minimum calibre version needed to use this recipe
requires_version = (0, 9, 34)
#: List of tags to be removed. Specified tags are removed from downloaded HTML.
#: A tag is specified using CSS selectors.
#: A common example::
#:
#: remove_tags = ['div.advert', 'div.tools']
#:
#: This will remove all `<div class="advert">` and `<div class="tools">` tags and all
#: their children from the downloaded :term:`HTML`.
remove_tags = ()
#: Remove all tags that occur after the specified tag.
#: A tag is specified using CSS selectors.
#: For example::
#:
    #:    remove_tags_after = '#content'
#:
#: will remove all tags after the first element with `id="content"`.
remove_tags_after = None
#: Remove all tags that occur before the specified tag.
#: A tag is specified using CSS selectors.
#: For example::
#:
    #:    remove_tags_before = '#content'
#:
#: will remove all tags before the first element with `id="content"`.
remove_tags_before = None
#: Keep only the specified tags and their children.
#: Uses the CSS selector syntax.
#: If this list is not empty, then the `<body>` tag will be emptied and re-filled with
#: the tags that match the entries in this list. For example::
#:
    #:    keep_only_tags = ['#content', '#heading']
#:
#: will keep only tags that have an `id` attribute of `"content"` or `"heading"`.
keep_only_tags = ()
#: A list of selectors that match <a href> elements that you want followed.
#: For this to work you must also set recursions to at least 1.
    #: You can get more control by re-implementing :meth:`select_links` in your sub-class.
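    #: For example, a recipe following (purely illustrative) pagination links
    #: might use::
    #:
    #:    recursions = 1
    #:    links_from_selectors = ['a.next-page[href]']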
links_from_selectors = ()
def select_links(self, browser, url, recursion_level):
'''
Override this method in your sub-class to implement arbitrary link following logic. It must return a
list of URLs, each of which will be downloaded in turn.
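
        For example, to follow at most the first two links matching a
        (purely illustrative) CSS selector::

            def select_links(self, browser, url, recursion_level):
                links = []
                for a in browser.css_select('a.pagination[href]', all=True):
                    href = unicode(a.evaluateJavaScript('this.href').toString()).strip()
                    if href:
                        links.append(href)
                return links[:2]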
'''
return links_from_selectors(self.links_from_selectors, self.recursions, browser, url, recursion_level)
def get_jsbrowser(self, *args, **kwargs):
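        '''
        Create the WebKit browser used for all downloads. Override this method
        in your recipe if you need to customize the browser, for example, to
        use a longer default timeout.
        '''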
from calibre.web.jsbrowser.browser import Browser
return Browser(default_timeout=kwargs.get('default_timeout', 120))
def do_login(self, browser, username, password):
'''
        This method is used to log in to a website that is behind a paywall.
        Implement it in your recipe if the site requires a login. An example
        implementation::
def do_login(self, browser, username, password):
browser.visit('http://some-page-that-has-a-login')
form = browser.select_form(nr=0) # Select the first form on the page
form['username'] = username
form['password'] = password
browser.submit(timeout=120) # Submit the form and wait at most two minutes for loading to complete
Note that you can also select forms with CSS2 selectors, like this::
browser.select_form('form#login_form')
            browser.select_form('form[name="someform"]')
'''
pass
def get_publication_data(self, browser):
'''
Download the cover, the masthead image and the list of sections/articles.
        Should return a dictionary with the keys 'index', 'cover' and 'masthead'.
        'cover' and 'masthead' are the raw image data; both are optional and, if
        not present, will be auto-generated.
The index must be in the same format as described in :meth:`parse_index`.
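
        For example (all values below are illustrative)::

            return {
                'cover'   : cover_image_data,     # raw image data (optional)
                'masthead': masthead_image_data,  # raw image data (optional)
                'index'   : [
                    ('Section title', [
                        {'title': 'Article title',
                         'url': 'http://example.com/article',
                         'description': 'Optional article summary'},
                    ]),
                ],
            }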
'''
raise NotImplementedError('You must implement this method in your recipe')
def load_complete(self, browser, url, recursion_level):
'''
This method is called after every page on the website is loaded. To be
precise, it is called when the DOM is ready. If further checks need to
be made, they should be made here. For example, if you want to check
that some element in the DOM is present, you would use::
def load_complete(self, browser, url, rl):
browser.wait_for_element('#article-footer')
return True
where article-footer is the id of the element you want to wait for.
'''
return True
def abort_article(self, msg=None):
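        '''
        Call this method (typically from one of the preprocess/postprocess
        hooks below) to abort the download of the current article.
        '''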
raise AbortFetch(msg or 'Article fetch aborted')
def preprocess_stage1(self, article, browser, url, recursion_level):
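        '''
        Called for every loaded page just after HTML comments have been
        removed, before any other cleanup (javascript removal, keep_only and
        remove processing) is done. Modify the DOM via the browser object
        here, if needed.
        '''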
pass
def preprocess_stage2(self, article, browser, url, recursion_level):
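        '''
        Called for every loaded page after all DOM cleanup (javascript
        removal, keep_only and remove processing) has been done, just before
        resources are downloaded and the HTML is saved.
        '''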
pass
def postprocess_html(self, article, root, url, recursion_level):
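        '''
        Called with the parsed lxml root of every downloaded page after its
        resources have been downloaded. Return the (possibly modified) root,
        or None to abort the article.
        '''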
return root
def index_to_soup(self, url_or_raw, raw=False):
'''
        Convenience method that takes a URL to the index page and returns
a parsed lxml tree representation of it.
`url_or_raw`: Either a URL or the downloaded index page as a string
'''
if re.match(r'\w+://', url_or_raw):
self.jsbrowser.start_load(url_or_raw)
html = self.jsbrowser.html
else:
html = url_or_raw
if isinstance(html, bytes):
html = xml_to_unicode(html)[0]
html = strip_encoding_declarations(html)
if raw:
return html
import html5lib
root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
return root
# ***************************** Internal API *****************************
def _preprocess_browser(self, article, browser, url, stage, recursion_level):
func = getattr(self, 'preprocess_stage%d' % stage)
return func(article, browser, url, recursion_level)
def _postprocess_html(self, article, feed_num, art_num, feed_len, root, url, recursion_level):
from lxml.html.builder import STYLE
if self.no_stylesheets:
for link in root.xpath('//link[@href]'):
                if (link.get('type', '') or 'text/css') == 'text/css':
link.getparent().remove(link)
for style in root.xpath('//style'):
style.getparent().remove(style)
head = root.xpath('//head|//body')
head = head[0] if head else next(root.iterdescendants())
head.append(STYLE(self.template_css + '\n\n' + (self.extra_css or '')))
if recursion_level == 0:
body = root.xpath('//body')
if body:
templ = self.navbar.generate(
False, feed_num, art_num, feed_len, not self.has_single_feed, url,
__appname__, center=self.center_navbar,
extra_css=self.extra_css)
                body[0].insert(0, templ.root.xpath('//div')[0])
remove_attrs = set(self.remove_attributes)
if self.remove_javascript:
remove_attrs.add('onload')
for script in root.xpath('//*[name()="script" or name()="noscript"]'):
script.getparent().remove(script)
for attr in remove_attrs:
for tag in root.xpath('//*[@%s]' % attr):
tag.attrib.pop(attr, None)
nuke = ['base', 'iframe', 'canvas', 'embed', 'command', 'datalist', 'video', 'audio', 'form']
for tag in root.xpath('|'.join('//%s' % tag for tag in nuke)):
tag.getparent().remove(tag)
root = self.postprocess_html(article, root, url, recursion_level)
if root is not None:
# Nuke HTML5 tags
tags = ['article', 'aside', 'header', 'footer', 'nav', 'figcaption', 'figure', 'section']
for tag in root.xpath('|'.join('//%s' % tag for tag in tags)):
tag.tag = 'div'
self.populate_article_metadata(article, root, recursion_level == 0)
return root
def download(self):
browser = self.jsbrowser = self.get_jsbrowser()
with browser:
try:
if self.needs_subscription and self.username and self.password:
self.do_login(browser, self.username, self.password)
data = self.get_publication_data(browser)
# Process cover, if any
cdata = data.get('cover', None)
if cdata:
self.cover_url = image_data_to_url(cdata)
self.download_cover()
# Process masthead, if any
mdata = data.get('masthead', None)
if mdata:
self.masthead_url = image_data_to_url(mdata)
self.resolve_masthead()
# Process the list of sections/articles
return self.build_index(data, browser)
finally:
self.cleanup()
def build_index(self, data, browser):
sections = data.get('index', None)
if not sections:
raise ValueError('No articles found, aborting')
feeds = feeds_from_index(sections, oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
log=self.log)
if not feeds:
raise ValueError('No articles found, aborting')
if self.ignore_duplicate_articles is not None:
feeds = self.remove_duplicate_articles(feeds)
if self.test:
feeds = feeds[:2]
self.has_single_feed = len(feeds) == 1
index = os.path.join(self.output_dir, 'index.html')
html = self.feeds2index(feeds)
with open(index, 'wb') as fi:
fi.write(html)
if self.reverse_article_order:
for feed in feeds:
if hasattr(feed, 'reverse'):
feed.reverse()
self.report_progress(0, _('Got feeds from index page'))
resource_cache = {}
total = 0
for feed in feeds:
total += min(self.max_articles_per_feed, len(feed))
num = 0
for f, feed in enumerate(feeds):
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
if not os.path.isdir(feed_dir):
os.makedirs(feed_dir)
for a, article in enumerate(feed):
if a >= self.max_articles_per_feed:
break
num += 1
art_dir = os.path.join(feed_dir, 'article_%d'%a)
if not os.path.isdir(art_dir):
os.makedirs(art_dir)
try:
url = self.print_version(article.url)
except NotImplementedError:
url = article.url
except:
self.log.exception('Failed to find print version for: '+article.url)
url = None
if not url:
continue
self.log('Fetching article:', article.title, 'from', url)
try:
pages = fetch_page(
url,
load_complete=self.load_complete,
links=self.select_links,
remove=self.remove_tags,
keep_only=self.keep_only_tags,
preprocess_browser=partial(self._preprocess_browser, article),
postprocess_html=partial(self._postprocess_html, article, f, a, len(feed)),
remove_before=self.remove_tags_before,
remove_after=self.remove_tags_after,
remove_javascript=self.remove_javascript,
resource_cache=resource_cache, output_dir=art_dir, browser=browser)
except AbortFetch:
self.log.exception('Fetching of article: %r aborted' % article.title)
continue
except Exception:
self.log.exception('Fetching of article: %r failed' % article.title)
continue
self.log.debug('Downloaded article:', article.title, 'from', article.url)
article.orig_url = article.url
article.url = 'article_%d/index.html'%a
article.downloaded = True
article.sub_pages = pages[1:]
self.report_progress(float(num)/total,
_(u'Article downloaded: %s')%force_unicode(article.title))
for f, feed in enumerate(feeds):
html = self.feed2index(f, feeds)
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
fi.write(html)
if self.no_stylesheets:
for f in walk(self.output_dir):
if f.endswith('.css'):
os.remove(f)
self.create_opf(feeds)
self.report_progress(1, _('Download finished'))
return index


@@ -7,11 +7,12 @@ Builtin recipes.
 import re, time, io
 from calibre.web.feeds.news import (BasicNewsRecipe, CustomIndexRecipe,
                                     AutomaticNewsRecipe, CalibrePeriodical)
+from calibre.web.feeds.jsnews import JavascriptRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.utils.config import JSONConfig
 
 basic_recipes = (BasicNewsRecipe, AutomaticNewsRecipe, CustomIndexRecipe,
-                 CalibrePeriodical)
+                 CalibrePeriodical, JavascriptRecipe)
 
 custom_recipes = JSONConfig('custom_recipes/index.json')


@@ -0,0 +1,258 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import time, os, hashlib
from operator import attrgetter
from collections import defaultdict
from functools import partial
from calibre import jsbrowser
from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.utils.imghdr import what
from calibre.web.jsbrowser.browser import Timeout
# remove_comments() {{{
remove_comments = '''
function remove_comments(node) {
var nodes = node.childNodes, i=0, t;
while((t = nodes.item(i++))) {
switch(t.nodeType){
case Node.ELEMENT_NODE:
remove_comments(t);
break;
case Node.COMMENT_NODE:
node.removeChild(t);
i--;
}
}
}
remove_comments(document)
''' # }}}
class AbortFetch(ValueError):
pass
def children(elem):
elem = elem.firstChild()
while not elem.isNull():
yield elem
elem = elem.nextSibling()
def apply_keep_only(browser, keep_only):
mf = browser.page.mainFrame()
body = mf.findFirstElement('body')
if body.isNull():
browser.log.error('Document has no body, cannot apply keep_only')
return
keep = []
for selector in keep_only:
keep.extend(x for x in mf.findAllElements(selector))
if not keep:
browser.log.error('Failed to find any elements matching the keep_only selectors: %r' % keep_only)
return
for elem in keep:
body.appendInside(elem)
for elem in tuple(children(body)):
preserve = False
for x in keep:
if x == elem:
preserve = True
break
if preserve:
break
elem.removeFromDocument()
def apply_remove(browser, remove):
mf = browser.page.mainFrame()
for selector in remove:
for elem in mf.findAllElements(selector):
if not elem.isNull():
elem.removeFromDocument()
def remove_beyond(browser, selector, before=True):
mf = browser.page.mainFrame()
elem = mf.findFirstElement(selector)
if elem.isNull():
browser.log('Failed to find any element matching the selector: %s' % selector)
return
next_sibling = attrgetter('previousSibling' if before else 'nextSibling')
    # QWebElement.tagName() returns upper case names for HTML elements
    while not elem.isNull() and unicode(elem.tagName()).lower() != 'body':
remove = []
after = next_sibling(elem)()
while not after.isNull():
remove.append(after)
after = next_sibling(after)()
for x in remove:
x.removeFromDocument()
elem = elem.parent()
def is_tag(elem, name):
return unicode(elem.tagName()).lower() == name.lower()
def download_resources(browser, resource_cache, output_dir):
img_counter = style_counter = 0
resources = defaultdict(list)
for img in browser.css_select('img[src]', all=True):
# Using javascript ensures that absolute URLs are returned, direct
# attribute access does not do that
src = unicode(img.evaluateJavaScript('this.src').toString()).strip()
if src:
resources[src].append(img)
for link in browser.css_select('link[href]', all=True):
lt = unicode(link.attribute('type')).strip() or 'text/css'
rel = unicode(link.attribute('rel')).strip() or 'stylesheet'
if lt == 'text/css' and rel == 'stylesheet':
href = unicode(link.evaluateJavaScript('this.href').toString()).strip()
if href:
resources[href].append(link)
else:
link.removeFromDocument()
else:
link.removeFromDocument()
loaded_resources = browser.wait_for_resources(resources)
for url, raw in loaded_resources.iteritems():
h = hashlib.sha1(raw).digest()
if h in resource_cache:
href = os.path.relpath(resource_cache[h], output_dir).replace(os.sep, '/')
else:
elem = resources[url][0]
if is_tag(elem, 'link'):
style_counter += 1
href = 'style_%d.css' % style_counter
else:
img_counter += 1
ext = what(None, raw) or 'jpg'
href = 'img_%d.%s' % (img_counter, ext)
dest = os.path.join(output_dir, href)
resource_cache[h] = dest
with open(dest, 'wb') as f:
f.write(raw)
for elem in resources[url]:
elem.setAttribute('href' if is_tag(elem, 'link') else 'src', href)
failed = set(resources) - set(loaded_resources)
for url in failed:
for elem in resources[url]:
attr = 'href' if is_tag(elem, 'link') else 'src'
elem.setAttribute(attr, '')
def save_html(browser, output_dir, postprocess_html, url, recursion_level):
html = strip_encoding_declarations(browser.html)
import html5lib
root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
root = postprocess_html(root, url, recursion_level)
if root is None:
# user wants this page to be aborted
raise AbortFetch('%s was aborted during postprocess' % url)
with open(os.path.join(output_dir, 'index.html'), 'wb') as f:
from lxml.html import tostring
f.write(tostring(root, include_meta_content_type=True, encoding='utf-8', pretty_print=True))
return f.name
def links_from_selectors(selectors, recursions, browser, url, recursion_level):
ans = []
if recursions > recursion_level:
for selector in selectors:
for a in browser.css_select(selector, all=True):
href = unicode(a.evaluateJavaScript('this.href').toString()).strip()
if href:
ans.append(href)
return ans
def clean_dom(
browser, url, recursion_level, preprocess_browser, remove_javascript,
keep_only, remove_after, remove_before, remove):
# Remove comments as otherwise we can end up with nested comments, which
# cause problems later
browser.page.mainFrame().evaluateJavaScript(remove_comments)
preprocess_browser(browser, url, 1, recursion_level)
if remove_javascript:
for elem in browser.css_select('script', all=True):
elem.removeFromDocument()
if keep_only:
apply_keep_only(browser, keep_only)
if remove_after:
remove_beyond(browser, remove_after, before=False)
if remove_before:
remove_beyond(browser, remove_before, before=True)
if remove:
apply_remove(browser, remove)
preprocess_browser(browser, url, 2, recursion_level)
def fetch_page(
url=None,
load_complete=lambda browser, url, recursion_level: True,
links=lambda browser, url, recursion_level: (),
keep_only=(),
remove_after=None,
remove_before=None,
remove=(),
remove_javascript=True,
preprocess_browser=lambda browser, url, stage, recursion_level:None,
postprocess_html=lambda root, url, recursion_level: root,
resource_cache={},
output_dir=None,
browser=None,
recursion_level=0
):
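    '''
    Load url in the browser, clean up the DOM, save the resulting HTML and its
    resources into output_dir and recursively fetch any pages linked via the
    links() callback. Returns a tuple of paths to the saved HTML files; the
    first entry is the page for url itself, the rest are the linked sub-pages.
    '''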
output_dir = output_dir or os.getcwdu()
if browser is None:
browser = jsbrowser()
# Load the DOM
if url is not None:
start_time = time.time()
browser.start_load(url)
while not load_complete(browser, url, recursion_level):
browser.run_for_a_time(0.1)
if time.time() - start_time > browser.default_timeout:
raise Timeout('Timed out while waiting for %s to load' % url)
children = links(browser, url, recursion_level)
# Cleanup the DOM
clean_dom(
browser, url, recursion_level, preprocess_browser,
remove_javascript, keep_only, remove_after, remove_before, remove)
# Download resources
download_resources(browser, resource_cache, output_dir)
# Get HTML from the DOM
pages = [save_html(browser, output_dir, postprocess_html, url, recursion_level)]
# Fetch the linked pages
for i, curl in enumerate(children):
odir = os.path.join(output_dir, 'link%d' % (i + 1))
if not os.path.exists(odir):
os.mkdir(odir)
try:
pages.extend(fetch_page(
curl, load_complete=load_complete, links=links, keep_only=keep_only,
remove_after=remove_after, remove_before=remove_before, remove=remove,
preprocess_browser=preprocess_browser, postprocess_html=postprocess_html,
resource_cache=resource_cache, output_dir=odir, browser=browser,
recursion_level=recursion_level+1))
except AbortFetch:
continue
return tuple(pages)
if __name__ == '__main__':
browser = jsbrowser()
fetch_page('http://www.time.com/time/magazine/article/0,9171,2145057,00.html', browser=browser,
links=partial(links_from_selectors, ('.wp-paginate a.page[href]',), 1),
keep_only=('article.post',), remove=('.entry-sharing', '.entry-footer', '.wp-paginate', '.post-rail'))