mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
...
This commit is contained in:
parent
47dce862cc
commit
2af36a378b
@ -2,6 +2,7 @@
|
|||||||
# vim:fileencoding=utf-8
|
# vim:fileencoding=utf-8
|
||||||
from __future__ import (unicode_literals, division, absolute_import,
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
print_function)
|
print_function)
|
||||||
|
import re
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import Tag
|
from calibre.ebooks.BeautifulSoup import Tag
|
||||||
import html5lib
|
import html5lib
|
||||||
@ -40,6 +41,10 @@ class NatGeo(BasicNewsRecipe):
|
|||||||
def preprocess_raw_html(self, raw_html, url):
|
def preprocess_raw_html(self, raw_html, url):
|
||||||
# BeautifulSoup does not parse the natgeo html correctly, so we use a
|
# BeautifulSoup does not parse the natgeo html correctly, so we use a
|
||||||
# custom cleanup routine
|
# custom cleanup routine
|
||||||
|
|
||||||
|
# NatGeo embeds huge inline svg images as interactive graphics, this
|
||||||
|
# breaks conversion because split points cannot be found
|
||||||
|
raw_html = re.sub(r'<svg.+?</svg>', '', raw_html, flags=re.DOTALL)
|
||||||
root = html5lib.parse(raw_html, namespaceHTMLElements=False, treebuilder='lxml')
|
root = html5lib.parse(raw_html, namespaceHTMLElements=False, treebuilder='lxml')
|
||||||
select = Select(root)
|
select = Select(root)
|
||||||
keep = tuple(select('.mainArt')) + tuple(select('.byline')) + tuple(select('#article__body'))
|
keep = tuple(select('.mainArt')) + tuple(select('.byline')) + tuple(select('#article__body'))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user