...

2025-07-09 03:04:10 -04:00 · 2017-03-17 13:15:51 +05:30 · 2017-03-17 13:15:51 +05:30 · 07a716ef38
commit 07a716ef38
parent f10ef8fe90
2 changed files with 26 additions and 18 deletions
--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@@ -122,15 +122,50 @@ class Economist(BasicNewsRecipe):
        return br

    def preprocess_raw_html(self, raw, url):
-        soup = self.index_to_soup(raw)
-        for div in soup.findAll(**classes('lazy-image')):
-            noscript = div.find('noscript')
-            if noscript is not None:
-                img = noscript.find('img')
-                if img is not None:
-                    img.extract()
-                    noscript.replaceWith(img)
-        return type(u'')(soup)
+        """Replace lazy-loaded image placeholders with plain <img> tags.
+
+        For every <div> carrying the ``lazy-image`` class, the first
+        <noscript> fallback inside it is swapped for an <img> element that
+        has the fallback image's ``src``.
+
+        :param raw: Page markup to parse.
+        :param url: Not used by this method.
+        :return: The modified document serialized as a unicode string.
+        """
+        import html5lib
+        from lxml import etree
+
+        def parse(markup):
+            return html5lib.parse(markup, namespaceHTMLElements=False, treebuilder='lxml')
+
+        root = parse(raw)
+        # Compare against the whitespace separated class tokens so that
+        # class="lazy-image foo" is matched as well as class="lazy-image".
+        lazy = '//div[contains(concat(" ", normalize-space(@class), " "), " lazy-image ")]'
+        for div in root.xpath(lazy):
+            noscript = next(div.iter('noscript'), None)
+            if noscript is None:
+                continue
+            # The fallback may have been parsed into child elements or kept
+            # as raw text, depending on how the parser treats <noscript>.
+            img = next(noscript.iter('img'), None)
+            if img is None and noscript.text:
+                img = next(parse(noscript.text).iter('img'), None)
+            src = None if img is None else img.get('src')
+            if not src:
+                # makeelement() rejects a None attribute value, so skip
+                # fallbacks that carry no usable image address.
+                continue
+            parent = noscript.getparent()
+            replacement = parent.makeelement('img', src=src)
+            # Removing an lxml element drops its tail text too; move the tail
+            # to the replacement so text following the <noscript> survives.
+            replacement.tail = noscript.tail
+            parent.insert(parent.index(noscript), replacement)
+            parent.remove(noscript)
+        # type(u'') is unicode on Python 2 and str on Python 3, whereas the
+        # bare name ``unicode`` only exists on Python 2.
+        return etree.tostring(root, encoding=type(u''))

    def parse_index(self):
        # return [('Articles', [{'title':'test',
--- a/recipes/economist_free.recipe
+++ b/recipes/economist_free.recipe
@@ -122,15 +122,50 @@ class Economist(BasicNewsRecipe):
        return br

    def preprocess_raw_html(self, raw, url):
-        soup = self.index_to_soup(raw)
-        for div in soup.findAll(**classes('lazy-image')):
-            noscript = div.find('noscript')
-            if noscript is not None:
-                img = noscript.find('img')
-                if img is not None:
-                    img.extract()
-                    noscript.replaceWith(img)
-        return type(u'')(soup)
+        """Replace lazy-loaded image placeholders with plain <img> tags.
+
+        For every <div> carrying the ``lazy-image`` class, the first
+        <noscript> fallback inside it is swapped for an <img> element that
+        has the fallback image's ``src``.
+
+        :param raw: Page markup to parse.
+        :param url: Not used by this method.
+        :return: The modified document serialized as a unicode string.
+        """
+        import html5lib
+        from lxml import etree
+
+        def parse(markup):
+            return html5lib.parse(markup, namespaceHTMLElements=False, treebuilder='lxml')
+
+        root = parse(raw)
+        # Compare against the whitespace separated class tokens so that
+        # class="lazy-image foo" is matched as well as class="lazy-image".
+        lazy = '//div[contains(concat(" ", normalize-space(@class), " "), " lazy-image ")]'
+        for div in root.xpath(lazy):
+            noscript = next(div.iter('noscript'), None)
+            if noscript is None:
+                continue
+            # The fallback may have been parsed into child elements or kept
+            # as raw text, depending on how the parser treats <noscript>.
+            img = next(noscript.iter('img'), None)
+            if img is None and noscript.text:
+                img = next(parse(noscript.text).iter('img'), None)
+            src = None if img is None else img.get('src')
+            if not src:
+                # makeelement() rejects a None attribute value, so skip
+                # fallbacks that carry no usable image address.
+                continue
+            parent = noscript.getparent()
+            replacement = parent.makeelement('img', src=src)
+            # Removing an lxml element drops its tail text too; move the tail
+            # to the replacement so text following the <noscript> survives.
+            replacement.tail = noscript.tail
+            parent.insert(parent.index(noscript), replacement)
+            parent.remove(noscript)
+        # type(u'') is unicode on Python 2 and str on Python 3, whereas the
+        # bare name ``unicode`` only exists on Python 2.
+        return etree.tostring(root, encoding=type(u''))

    def parse_index(self):
        # return [('Articles', [{'title':'test',