Fix some recipes importing non-calibre BeautifulSoup

Fixes #1489658 [dependency on BeautifulSoup3?](https://bugs.launchpad.net/calibre/+bug/1489658)
2025-06-23 15:30:45 -04:00 · 2015-08-28 07:37:07 +05:30 · 2015-08-28 07:37:07 +05:30 · e64f766890
commit e64f766890
parent eba6551580
2 changed files with 88 additions and 89 deletions
--- a/recipes/metro_news_nl.recipe
+++ b/recipes/metro_news_nl.recipe
@@ -2,7 +2,7 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
 from calibre.utils.magick import Image
-from BeautifulSoup import BeautifulSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup

 ''' Version 1.2, updated cover image to match the changed website.
 added info date on title
@@ -61,13 +61,13 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    cover_url = 'http://www.readmetro.com/en/holland/metro-holland/image/large/last/'
    publication_type = 'newspaper'
    encoding = 'utf-8'
-    remove_attributes = ['style', 'font', 'width', 'height', 'itemtype', 'itemprop', 'itemscope']#, 'href']
+    remove_attributes = ['style', 'font', 'width', 'height', 'itemtype', 'itemprop', 'itemscope']  # , 'href']
    use_embedded_content = False
    extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar,.calibre5{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact{font-size:0.7em}.article-box-fact.module-title{margin:8px 0; font-size:0.8em}h2{font-size:1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2,.calibre5,.calibrenavbar{border:0;padding:0}.column1,h1,h2,.calibrenavbar{margin:0}'

-
    preprocess_regexps = [
-        (re.compile(r'(&nbsp;|\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)', re.DOTALL|re.IGNORECASE),lambda match: ' '),
+        (re.compile(r'(&nbsp;|\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)',
+         re.DOTALL|re.IGNORECASE),lambda match: ' '),
        #(re.compile(r'(&nbsp;|\s)+', re.DOTALL|re.IGNORECASE),lambda match:' '),
        #(re.compile(r'<(a |/a)[^>]*>', re.DOTALL|re.IGNORECASE),lambda match:'')
        #(re.compile('(</?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em')
@@ -75,15 +75,17 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):

    remove_tags_before= dict(id='subwrapper')
    remove_tags_after = dict(name='div', attrs={'class':['body-area','article-main-area']})
-#name='div', attrs={'class':['subwrapper']})]
-#'column-1-3','gallery-text']})]#id='share-and-byline')]
+# name='div', attrs={'class':['subwrapper']})]
+# 'column-1-3','gallery-text']})]#id='share-and-byline')]

    filter_regexps = [r'mailto:.*']

    remove_tags = [
        dict(name=['iframe','script','noscript','style']),
-        dict(name='div', attrs={'class':['fact-related-box','aside clearfix','aside clearfix middle-col-line','comments','share-tools','article-right-column','column-4-5','column-1-5','ad-msg','col-179 ','col-373 ','clear','ad','navigation',re.compile('share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)'),'promos','header-links','promo-2']}),
-        dict(id=['super-carousel','article-2','googleads','column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'adadcomp-4','margin-5','sidebar',re.compile('^article-\d'),'comments','gallery-1','sharez_container','ts-container','topshares','ts-title']),
+        dict(name='div', attrs={'class':['fact-related-box','aside clearfix','aside clearfix middle-col-line','comments','share-tools','article-right-column','column-4-5','column-1-5','ad-msg','col-179 ','col-373 ','clear','ad','navigation',re.compile(
+            'share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)'),'promos','header-links','promo-2']}),
+        dict(id=['super-carousel','article-2','googleads','column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'adadcomp-4',
+             'margin-5','sidebar',re.compile('^article-\d'),'comments','gallery-1','sharez_container','ts-container','topshares','ts-title']),
        dict(name='a', attrs={'name':'comments'}),
        #dict(name='div', attrs={'data-href'}),
        dict(name='img', attrs={'class':'top-line','title':'volledig scherm'}),
@@ -128,6 +130,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
        ]

 class MerryPreProcess():
+
    def optimizePicture(self,soup):
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            try:
@@ -142,6 +145,7 @@ class MerryPreProcess():
        return soup

 class MerryExtract():
+
    def safeRemovePart(self, killingSoup, soupIsArray):
        if killingSoup and not killingSoup == None:
            try:
@@ -174,15 +178,15 @@ class MerryProcess(BeautifulSoup):
                    self.myKiller.safeRemovePart(part, True)
        articlefacts = soup.find('div', {'class':'article-box-fact column'})
        if (articlefacts and not articlefacts==None):
-          try:
-            contenttag = soup.find('div', {'class':'article-body'})
-            foundrighttag = False
-            if contenttag and not contenttag == None:
-                foundrighttag = True
-            if foundrighttag == True:
-                contenttag.insert(0, allfactsparent)
-          except:
-              pass
+            try:
+                contenttag = soup.find('div', {'class':'article-body'})
+                foundrighttag = False
+                if contenttag and not contenttag == None:
+                    foundrighttag = True
+                if foundrighttag == True:
+                    contenttag.insert(0, allfactsparent)
+            except:
+                pass
        return soup

    def moveTitleAndAuthor(self, soup):
@@ -207,7 +211,7 @@ class MerryProcess(BeautifulSoup):
    def removeUnwantedTags(self,soup):
        self.insertFacts(soup)
        self.removeEmptyTags(soup)
-        self.removeArrayOfTags(soup.findAll(attrs={'class': 'share-tools-bottom'})) # at end to keep author
+        self.removeArrayOfTags(soup.findAll(attrs={'class': 'share-tools-bottom'}))  # at end to keep author
        return soup

    def removeArrayOfTags(self,souparray):
@@ -215,9 +219,10 @@ class MerryProcess(BeautifulSoup):

    def removeEmptyTags(self,soup,run=0):
        emptymatches = re.compile('^[&nbsp;\s\n\r\t ]*$')
-        emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing)
+        emptytags = soup.findAll(lambda tag: tag.find(True) is None and (
+            tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing)
        if emptytags and not (emptytags == None or emptytags == []):
            self.removeArrayOfTags(emptytags)
-            #recursive in case removing empty tag creates new empty tag
+            # recursive in case removing empty tag creates new empty tag
            self.removeEmptyTags(soup, run=run)
        return soup
--- a/recipes/revista_muy.recipe
+++ b/recipes/revista_muy.recipe
@@ -1,6 +1,6 @@
 from calibre.web.feeds.news import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from BeautifulSoup import Tag
+from calibre.ebooks.BeautifulSoup import Tag

 class RevistaMuyInteresante(BasicNewsRecipe):

@@ -17,27 +17,25 @@ class RevistaMuyInteresante(BasicNewsRecipe):

    extra_css              = ' .txt_articulo{ font-family: sans-serif; font-size: medium; text-align: justify } .contentheading{font-family: serif; font-size: large; font-weight: bold; color: #000000; text-align: center}'

-
    def preprocess_html(self, soup):
-            for item in soup.findAll(style=True):
-               del item['style']
-
-            for img_tag in soup.findAll('img'):
-                imagen = img_tag
-                new_tag = Tag(soup,'p')
-                img_tag.replaceWith(new_tag)
-                div = soup.find(attrs={'class':'article_category'})
-                div.insert(0,imagen)
-                break
-            return soup
+        for item in soup.findAll(style=True):
+            del item['style']

+        for img_tag in soup.findAll('img'):
+            imagen = img_tag
+            new_tag = Tag(soup,'p')
+            img_tag.replaceWith(new_tag)
+            div = soup.find(attrs={'class':'article_category'})
+            div.insert(0,imagen)
+            break
+        return soup

    preprocess_regexps = [
-        (re.compile(r'<td class="contentheading" width="100%">.*?</td>', re.DOTALL|re.IGNORECASE), lambda match: '<td class="contentheading">' + match.group().replace('<td class="contentheading" width="100%">','').strip().replace('</td>','').strip() + '</td>'),
+        (re.compile(r'<td class="contentheading" width="100%">.*?</td>', re.DOTALL|re.IGNORECASE), lambda match: '<td class="contentheading">' +
+         match.group().replace('<td class="contentheading" width="100%">','').strip().replace('</td>','').strip() + '</td>'),

    ]

-
    keep_only_tags = [dict(name='div', attrs={'class':['article']}),dict(name='td', attrs={'class':['txt_articulo']})]

    remove_tags        = [
@@ -51,65 +49,63 @@ class RevistaMuyInteresante(BasicNewsRecipe):

    remove_tags_after = dict(name='div', attrs={'class':'tags_articles'})

-
-    #TO GET ARTICLES IN SECTION
+    # TO GET ARTICLES IN SECTION
    def nz_parse_section(self, url):
-            soup = self.index_to_soup(url)
-            div = soup.find(attrs={'class':'contenido'})
-            current_articles = []
-            for x in div.findAllNext(attrs={'class':['headline']}):
-                    a = x.find('a', href=True)
-                    if a is None:
-                        continue
-                    title = self.tag_to_string(a)
-                    url = a.get('href', False)
-                    if not url or not title:
-                        continue
-                    if url.startswith('/'):
-                         url = 'http://www.muyinteresante.es'+url
+        soup = self.index_to_soup(url)
+        div = soup.find(attrs={'class':'contenido'})
+        current_articles = []
+        for x in div.findAllNext(attrs={'class':['headline']}):
+            a = x.find('a', href=True)
+            if a is None:
+                continue
+            title = self.tag_to_string(a)
+            url = a.get('href', False)
+            if not url or not title:
+                continue
+            if url.startswith('/'):
+                url = 'http://www.muyinteresante.es'+url
 #                    self.log('\t\tFound article:', title)
 #                    self.log('\t\t\t', url)
-                    current_articles.append({'title': title, 'url':url,
-                        'description':'', 'date':''})
-
-            return current_articles
+            current_articles.append({'title': title, 'url':url,
+                'description':'', 'date':''})

+        return current_articles

    # To GET SECTIONS
    def parse_index(self):
-            feeds = []
-            for title, url in [
-                ('Historia',
-                 'http://www.muyinteresante.es/historia-articulos'),
-                ('Ciencia',
-                 'http://www.muyinteresante.es/ciencia-articulos'),
-                ('Naturaleza',
-                 'http://www.muyinteresante.es/naturaleza-articulos'),
-                ('Tecnología',
-                 'http://www.muyinteresante.es/tecnologia-articulos'),
-                ('Salud',
-                 'http://www.muyinteresante.es/salud-articulos'),
-                ('Más Muy',
-                 'http://www.muyinteresante.es/muy'),
-                ('Innova - Automoción',
-                 'http://www.muyinteresante.es/articulos-innovacion-autos'),
-                ('Innova - Salud',
-                 'http://www.muyinteresante.es/articulos-innovacion-salud'),
-                ('Innova - Medio Ambiente',
-                 'http://www.muyinteresante.es/articulos-innovacion-medio-ambiente'),
-                ('Innova - Alimentación',
-                 'http://www.muyinteresante.es/articulos-innovacion-alimentacion'),
-                ('Innova - Sociedad',
-                 'http://www.muyinteresante.es/articulos-innovacion-sociedad'),
-                ('Innova - Tecnología',
-                 'http://www.muyinteresante.es/articulos-innovacion-tecnologia'),
-                ('Innova - Ocio',
-                 'http://www.muyinteresante.es/articulos-innovacion-ocio'),
-             ]:
-               articles = self.nz_parse_section(url)
-               if articles:
-                   feeds.append((title, articles))
-            return feeds
+        feeds = []
+        for title, url in [
+            ('Historia',
+             'http://www.muyinteresante.es/historia-articulos'),
+            ('Ciencia',
+             'http://www.muyinteresante.es/ciencia-articulos'),
+            ('Naturaleza',
+             'http://www.muyinteresante.es/naturaleza-articulos'),
+            ('Tecnología',
+             'http://www.muyinteresante.es/tecnologia-articulos'),
+            ('Salud',
+             'http://www.muyinteresante.es/salud-articulos'),
+            ('Más Muy',
+             'http://www.muyinteresante.es/muy'),
+            ('Innova - Automoción',
+             'http://www.muyinteresante.es/articulos-innovacion-autos'),
+            ('Innova - Salud',
+             'http://www.muyinteresante.es/articulos-innovacion-salud'),
+            ('Innova - Medio Ambiente',
+             'http://www.muyinteresante.es/articulos-innovacion-medio-ambiente'),
+            ('Innova - Alimentación',
+             'http://www.muyinteresante.es/articulos-innovacion-alimentacion'),
+            ('Innova - Sociedad',
+             'http://www.muyinteresante.es/articulos-innovacion-sociedad'),
+            ('Innova - Tecnología',
+             'http://www.muyinteresante.es/articulos-innovacion-tecnologia'),
+            ('Innova - Ocio',
+             'http://www.muyinteresante.es/articulos-innovacion-ocio'),
+         ]:
+            articles = self.nz_parse_section(url)
+            if articles:
+                feeds.append((title, articles))
+        return feeds

    def get_cover_url(self):
        index = 'http://www.muyinteresante.es/revista'
@@ -118,5 +114,3 @@ class RevistaMuyInteresante(BasicNewsRecipe):
        if link_item:
            cover_url = "http://www.muyinteresante.es"+link_item['src']
        return cover_url
-
-