This commit is contained in:
Kovid Goyal 2017-03-17 13:15:51 +05:30
parent f10ef8fe90
commit 07a716ef38
2 changed files with 26 additions and 18 deletions

View File

@@ -122,15 +122,19 @@ class Economist(BasicNewsRecipe):
return br return br
def preprocess_raw_html(self, raw, url): def preprocess_raw_html(self, raw, url):
soup = self.index_to_soup(raw) import html5lib
for div in soup.findAll(**classes('lazy-image')): root = html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml')
noscript = div.find('noscript') from lxml import etree
if noscript is not None: for div in root.xpath('//div[@class="lazy-image"]'):
img = noscript.find('img') noscript = list(div.iter('noscript'))
if img is not None: if noscript and noscript[0].text:
img.extract() img = list(html5lib.parse(noscript[0].text, namespaceHTMLElements=False, treebuilder='lxml').iter('img'))
noscript.replaceWith(img) if img:
return type(u'')(soup) p = noscript[0].getparent()
idx = p.index(noscript[0])
p.insert(idx, p.makeelement('img', src=img[0].get('src')))
p.remove(noscript[0])
return etree.tostring(root, encoding=unicode)
def parse_index(self): def parse_index(self):
# return [('Articles', [{'title':'test', # return [('Articles', [{'title':'test',

View File

@@ -122,15 +122,19 @@ class Economist(BasicNewsRecipe):
return br return br
def preprocess_raw_html(self, raw, url): def preprocess_raw_html(self, raw, url):
soup = self.index_to_soup(raw) import html5lib
for div in soup.findAll(**classes('lazy-image')): root = html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml')
noscript = div.find('noscript') from lxml import etree
if noscript is not None: for div in root.xpath('//div[@class="lazy-image"]'):
img = noscript.find('img') noscript = list(div.iter('noscript'))
if img is not None: if noscript and noscript[0].text:
img.extract() img = list(html5lib.parse(noscript[0].text, namespaceHTMLElements=False, treebuilder='lxml').iter('img'))
noscript.replaceWith(img) if img:
return type(u'')(soup) p = noscript[0].getparent()
idx = p.index(noscript[0])
p.insert(idx, p.makeelement('img', src=img[0].get('src')))
p.remove(noscript[0])
return etree.tostring(root, encoding=unicode)
def parse_index(self): def parse_index(self):
# return [('Articles', [{'title':'test', # return [('Articles', [{'title':'test',