From 2af36a378b57867434a08f2b755b50ea48bcd3b1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 12 Sep 2015 18:46:09 +0530 Subject: [PATCH] ... --- recipes/natgeo.recipe | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/recipes/natgeo.recipe b/recipes/natgeo.recipe index 44531c5ebb..d68948ed44 100644 --- a/recipes/natgeo.recipe +++ b/recipes/natgeo.recipe @@ -2,6 +2,7 @@ # vim:fileencoding=utf-8 from __future__ import (unicode_literals, division, absolute_import, print_function) +import re from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Tag import html5lib @@ -40,6 +41,10 @@ class NatGeo(BasicNewsRecipe): def preprocess_raw_html(self, raw_html, url): # BeautifulSoup does not parse the natgeo html correctly, so we use a # custom cleanup routine + + # NatGeo embeds huge inline svg images as interactive graphics, this + # breaks conversion because split points cannot be found + raw_html = re.sub(r'<svg.+?</svg>', '', raw_html, flags=re.DOTALL) root = html5lib.parse(raw_html, namespaceHTMLElements=False, treebuilder='lxml') select = Select(root) keep = tuple(select('.mainArt')) + tuple(select('.byline')) + tuple(select('#article__body'))