Fix some recipes importing non-calibre BeautifulSoup

Fixes #1489658 [dependency on BeautifulSoup3?](https://bugs.launchpad.net/calibre/+bug/1489658)
This commit is contained in:
Kovid Goyal 2015-08-28 07:37:07 +05:30
parent eba6551580
commit e64f766890
2 changed files with 88 additions and 89 deletions

View File

@ -2,7 +2,7 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image
from BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import BeautifulSoup
''' Version 1.2, updated cover image to match the changed website.
added info date on title
@ -65,9 +65,9 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
use_embedded_content = False
extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar,.calibre5{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact{font-size:0.7em}.article-box-fact.module-title{margin:8px 0; font-size:0.8em}h2{font-size:1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2,.calibre5,.calibrenavbar{border:0;padding:0}.column1,h1,h2,.calibrenavbar{margin:0}'
preprocess_regexps = [
(re.compile(r'(&nbsp;|\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)', re.DOTALL|re.IGNORECASE),lambda match: ' '),
(re.compile(r'(&nbsp;|\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)',
re.DOTALL|re.IGNORECASE),lambda match: ' '),
#(re.compile(r'(&nbsp;|\s)+', re.DOTALL|re.IGNORECASE),lambda match:' '),
#(re.compile(r'<(a |/a)[^>]*>', re.DOTALL|re.IGNORECASE),lambda match:'')
#(re.compile('(</?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em')
@ -82,8 +82,10 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
remove_tags = [
dict(name=['iframe','script','noscript','style']),
dict(name='div', attrs={'class':['fact-related-box','aside clearfix','aside clearfix middle-col-line','comments','share-tools','article-right-column','column-4-5','column-1-5','ad-msg','col-179 ','col-373 ','clear','ad','navigation',re.compile('share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)'),'promos','header-links','promo-2']}),
dict(id=['super-carousel','article-2','googleads','column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'adadcomp-4','margin-5','sidebar',re.compile('^article-\d'),'comments','gallery-1','sharez_container','ts-container','topshares','ts-title']),
dict(name='div', attrs={'class':['fact-related-box','aside clearfix','aside clearfix middle-col-line','comments','share-tools','article-right-column','column-4-5','column-1-5','ad-msg','col-179 ','col-373 ','clear','ad','navigation',re.compile(
'share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)'),'promos','header-links','promo-2']}),
dict(id=['super-carousel','article-2','googleads','column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'adadcomp-4',
'margin-5','sidebar',re.compile('^article-\d'),'comments','gallery-1','sharez_container','ts-container','topshares','ts-title']),
dict(name='a', attrs={'name':'comments'}),
#dict(name='div', attrs={'data-href'}),
dict(name='img', attrs={'class':'top-line','title':'volledig scherm'}),
@ -128,6 +130,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
]
class MerryPreProcess():
def optimizePicture(self,soup):
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
try:
@ -142,6 +145,7 @@ class MerryPreProcess():
return soup
class MerryExtract():
def safeRemovePart(self, killingSoup, soupIsArray):
if killingSoup and not killingSoup == None:
try:
@ -215,7 +219,8 @@ class MerryProcess(BeautifulSoup):
def removeEmptyTags(self,soup,run=0):
emptymatches = re.compile('^[&nbsp;\s\n\r\t ]*$')
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing)
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (
tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing)
if emptytags and not (emptytags == None or emptytags == []):
self.removeArrayOfTags(emptytags)
# recursive in case removing empty tag creates new empty tag

View File

@ -1,6 +1,6 @@
from calibre.web.feeds.news import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from BeautifulSoup import Tag
from calibre.ebooks.BeautifulSoup import Tag
class RevistaMuyInteresante(BasicNewsRecipe):
@ -17,7 +17,6 @@ class RevistaMuyInteresante(BasicNewsRecipe):
extra_css = ' .txt_articulo{ font-family: sans-serif; font-size: medium; text-align: justify } .contentheading{font-family: serif; font-size: large; font-weight: bold; color: #000000; text-align: center}'
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
@ -31,13 +30,12 @@ class RevistaMuyInteresante(BasicNewsRecipe):
break
return soup
preprocess_regexps = [
(re.compile(r'<td class="contentheading" width="100%">.*?</td>', re.DOTALL|re.IGNORECASE), lambda match: '<td class="contentheading">' + match.group().replace('<td class="contentheading" width="100%">','').strip().replace('</td>','').strip() + '</td>'),
(re.compile(r'<td class="contentheading" width="100%">.*?</td>', re.DOTALL|re.IGNORECASE), lambda match: '<td class="contentheading">' +
match.group().replace('<td class="contentheading" width="100%">','').strip().replace('</td>','').strip() + '</td>'),
]
keep_only_tags = [dict(name='div', attrs={'class':['article']}),dict(name='td', attrs={'class':['txt_articulo']})]
remove_tags = [
@ -51,7 +49,6 @@ class RevistaMuyInteresante(BasicNewsRecipe):
remove_tags_after = dict(name='div', attrs={'class':'tags_articles'})
# TO GET ARTICLES IN SECTION
def nz_parse_section(self, url):
soup = self.index_to_soup(url)
@ -74,7 +71,6 @@ class RevistaMuyInteresante(BasicNewsRecipe):
return current_articles
# To GET SECTIONS
def parse_index(self):
feeds = []
@ -118,5 +114,3 @@ class RevistaMuyInteresante(BasicNewsRecipe):
if link_item:
cover_url = "http://www.muyinteresante.es"+link_item['src']
return cover_url