Fix some recipes importing non-calibre BeautifulSoup

Fixes #1489658 [dependency on BeautifulSoup3?](https://bugs.launchpad.net/calibre/+bug/1489658)
This commit is contained in:
Kovid Goyal 2015-08-28 07:37:07 +05:30
parent eba6551580
commit e64f766890
2 changed files with 88 additions and 89 deletions

View File

@ -2,7 +2,7 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
import re import re
from calibre.utils.magick import Image from calibre.utils.magick import Image
from BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
''' Version 1.2, updated cover image to match the changed website. ''' Version 1.2, updated cover image to match the changed website.
added info date on title added info date on title
@ -61,13 +61,13 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
cover_url = 'http://www.readmetro.com/en/holland/metro-holland/image/large/last/' cover_url = 'http://www.readmetro.com/en/holland/metro-holland/image/large/last/'
publication_type = 'newspaper' publication_type = 'newspaper'
encoding = 'utf-8' encoding = 'utf-8'
remove_attributes = ['style', 'font', 'width', 'height', 'itemtype', 'itemprop', 'itemscope']#, 'href'] remove_attributes = ['style', 'font', 'width', 'height', 'itemtype', 'itemprop', 'itemscope'] # , 'href']
use_embedded_content = False use_embedded_content = False
extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar,.calibre5{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact{font-size:0.7em}.article-box-fact.module-title{margin:8px 0; font-size:0.8em}h2{font-size:1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2,.calibre5,.calibrenavbar{border:0;padding:0}.column1,h1,h2,.calibrenavbar{margin:0}' extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar,.calibre5{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact{font-size:0.7em}.article-box-fact.module-title{margin:8px 0; font-size:0.8em}h2{font-size:1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact 
div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2,.calibre5,.calibrenavbar{border:0;padding:0}.column1,h1,h2,.calibrenavbar{margin:0}'
preprocess_regexps = [ preprocess_regexps = [
(re.compile(r'(&nbsp;|\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)', re.DOTALL|re.IGNORECASE),lambda match: ' '), (re.compile(r'(&nbsp;|\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)',
re.DOTALL|re.IGNORECASE),lambda match: ' '),
#(re.compile(r'(&nbsp;|\s)+', re.DOTALL|re.IGNORECASE),lambda match:' '), #(re.compile(r'(&nbsp;|\s)+', re.DOTALL|re.IGNORECASE),lambda match:' '),
#(re.compile(r'<(a |/a)[^>]*>', re.DOTALL|re.IGNORECASE),lambda match:'') #(re.compile(r'<(a |/a)[^>]*>', re.DOTALL|re.IGNORECASE),lambda match:'')
#(re.compile('(</?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em') #(re.compile('(</?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em')
@ -75,15 +75,17 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
remove_tags_before= dict(id='subwrapper') remove_tags_before= dict(id='subwrapper')
remove_tags_after = dict(name='div', attrs={'class':['body-area','article-main-area']}) remove_tags_after = dict(name='div', attrs={'class':['body-area','article-main-area']})
#name='div', attrs={'class':['subwrapper']})] # name='div', attrs={'class':['subwrapper']})]
#'column-1-3','gallery-text']})]#id='share-and-byline')] # 'column-1-3','gallery-text']})]#id='share-and-byline')]
filter_regexps = [r'mailto:.*'] filter_regexps = [r'mailto:.*']
remove_tags = [ remove_tags = [
dict(name=['iframe','script','noscript','style']), dict(name=['iframe','script','noscript','style']),
dict(name='div', attrs={'class':['fact-related-box','aside clearfix','aside clearfix middle-col-line','comments','share-tools','article-right-column','column-4-5','column-1-5','ad-msg','col-179 ','col-373 ','clear','ad','navigation',re.compile('share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)'),'promos','header-links','promo-2']}), dict(name='div', attrs={'class':['fact-related-box','aside clearfix','aside clearfix middle-col-line','comments','share-tools','article-right-column','column-4-5','column-1-5','ad-msg','col-179 ','col-373 ','clear','ad','navigation',re.compile(
dict(id=['super-carousel','article-2','googleads','column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'adadcomp-4','margin-5','sidebar',re.compile('^article-\d'),'comments','gallery-1','sharez_container','ts-container','topshares','ts-title']), 'share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)'),'promos','header-links','promo-2']}),
dict(id=['super-carousel','article-2','googleads','column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'adadcomp-4',
'margin-5','sidebar',re.compile('^article-\d'),'comments','gallery-1','sharez_container','ts-container','topshares','ts-title']),
dict(name='a', attrs={'name':'comments'}), dict(name='a', attrs={'name':'comments'}),
#dict(name='div', attrs={'data-href'}), #dict(name='div', attrs={'data-href'}),
dict(name='img', attrs={'class':'top-line','title':'volledig scherm'}), dict(name='img', attrs={'class':'top-line','title':'volledig scherm'}),
@ -128,6 +130,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
] ]
class MerryPreProcess(): class MerryPreProcess():
def optimizePicture(self,soup): def optimizePicture(self,soup):
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
try: try:
@ -142,6 +145,7 @@ class MerryPreProcess():
return soup return soup
class MerryExtract(): class MerryExtract():
def safeRemovePart(self, killingSoup, soupIsArray): def safeRemovePart(self, killingSoup, soupIsArray):
if killingSoup and not killingSoup == None: if killingSoup and not killingSoup == None:
try: try:
@ -174,15 +178,15 @@ class MerryProcess(BeautifulSoup):
self.myKiller.safeRemovePart(part, True) self.myKiller.safeRemovePart(part, True)
articlefacts = soup.find('div', {'class':'article-box-fact column'}) articlefacts = soup.find('div', {'class':'article-box-fact column'})
if (articlefacts and not articlefacts==None): if (articlefacts and not articlefacts==None):
try: try:
contenttag = soup.find('div', {'class':'article-body'}) contenttag = soup.find('div', {'class':'article-body'})
foundrighttag = False foundrighttag = False
if contenttag and not contenttag == None: if contenttag and not contenttag == None:
foundrighttag = True foundrighttag = True
if foundrighttag == True: if foundrighttag == True:
contenttag.insert(0, allfactsparent) contenttag.insert(0, allfactsparent)
except: except:
pass pass
return soup return soup
def moveTitleAndAuthor(self, soup): def moveTitleAndAuthor(self, soup):
@ -207,7 +211,7 @@ class MerryProcess(BeautifulSoup):
def removeUnwantedTags(self, soup):
    """Run the clean-up passes on ``soup`` and return it.

    Calls ``insertFacts`` and ``removeEmptyTags`` on the soup, then removes
    every element whose class is ``share-tools-bottom``.  The soup is
    modified in place; the same object is returned.
    """
    self.insertFacts(soup)
    self.removeEmptyTags(soup)
    # Done at the end to keep the author (per the original inline note).
    share_bars = soup.findAll(attrs={'class': 'share-tools-bottom'})
    self.removeArrayOfTags(share_bars)
    return soup
def removeArrayOfTags(self,souparray): def removeArrayOfTags(self,souparray):
@ -215,9 +219,10 @@ class MerryProcess(BeautifulSoup):
def removeEmptyTags(self, soup, run=0):
    """Repeatedly remove tags that have no child tags and no visible text.

    A tag counts as empty when it is not self-closing, contains no child
    tag, and its string is missing, whitespace-only, or made up solely of
    ``&nbsp;`` entities and whitespace.  Removing a tag can leave its parent
    empty, so the search is repeated until nothing more is found.

    ``run`` is unused; it is kept so existing callers that pass it still
    work.  Returns ``soup``, modified in place.
    """
    # An alternation, not a character class: "[&nbsp;...]" would also treat
    # ordinary text made of the letters n, b, s, p as empty.
    emptymatches = re.compile(r'^(?:&nbsp;|\s)*$')

    def is_empty(tag):
        if tag.isSelfClosing or tag.find(True) is not None:
            return False
        text = tag.string
        if text is None:
            return True
        stripped = text.strip()
        # The original compared the string to the pattern object with ==,
        # which is never true; match against the pattern instead.
        return stripped == "" or emptymatches.match(stripped) is not None

    previous_ids = None
    while True:
        emptytags = soup.findAll(is_empty)
        if not emptytags:
            break
        current_ids = [id(tag) for tag in emptytags]
        if current_ids == previous_ids:
            # The last removal pass changed nothing; stop instead of looping forever.
            break
        previous_ids = current_ids
        self.removeArrayOfTags(emptytags)
    return soup

View File

@ -1,6 +1,6 @@
from calibre.web.feeds.news import re from calibre.web.feeds.news import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from BeautifulSoup import Tag from calibre.ebooks.BeautifulSoup import Tag
class RevistaMuyInteresante(BasicNewsRecipe): class RevistaMuyInteresante(BasicNewsRecipe):
@ -17,27 +17,25 @@ class RevistaMuyInteresante(BasicNewsRecipe):
extra_css = ' .txt_articulo{ font-family: sans-serif; font-size: medium; text-align: justify } .contentheading{font-family: serif; font-size: large; font-weight: bold; color: #000000; text-align: center}' extra_css = ' .txt_articulo{ font-family: sans-serif; font-size: medium; text-align: justify } .contentheading{font-family: serif; font-size: large; font-weight: bold; color: #000000; text-align: center}'
def preprocess_html(self, soup):
    """Strip inline styles and move the first image into the category block.

    Every tag that carries a ``style`` attribute loses it.  The first
    ``<img>`` is then replaced by an empty ``<p>`` and re-inserted as the
    first child of the element with class ``article_category``.  When the
    page has no image or no such element, images are left where they are.

    Returns ``soup``, modified in place.
    """
    for styled in soup.findAll(style=True):
        del styled['style']

    images = soup.findAll('img')
    if images:
        # Look the target up before touching the image: the original replaced
        # the image first and then crashed on a missing target.
        target = soup.find(attrs={'class': 'article_category'})
        if target is not None:
            first_image = images[0]
            first_image.replaceWith(Tag(soup, 'p'))
            target.insert(0, first_image)
    return soup
# Rewrite the heading cell without its width="100%" attribute and with the
# whitespace around its contents trimmed.  The contents are captured by the
# pattern itself, so the rewrite also works when the matched tags differ in
# letter case (the pattern is case-insensitive).
preprocess_regexps = [
    (re.compile(r'<td class="contentheading" width="100%">(.*?)</td>', re.DOTALL | re.IGNORECASE),
     lambda match: '<td class="contentheading">' + match.group(1).strip() + '</td>'),
]
keep_only_tags = [dict(name='div', attrs={'class':['article']}),dict(name='td', attrs={'class':['txt_articulo']})] keep_only_tags = [dict(name='div', attrs={'class':['article']}),dict(name='td', attrs={'class':['txt_articulo']})]
remove_tags = [ remove_tags = [
@ -51,65 +49,63 @@ class RevistaMuyInteresante(BasicNewsRecipe):
remove_tags_after = dict(name='div', attrs={'class':'tags_articles'}) remove_tags_after = dict(name='div', attrs={'class':'tags_articles'})
# TO GET ARTICLES IN SECTION
#TO GET ARTICLES IN SECTION
def nz_parse_section(self, url):
    """Collect the articles linked from one section index page.

    Fetches ``url`` with ``index_to_soup``, finds the element with class
    ``contenido`` and reads every following element with class ``headline``.
    Headlines without a link, a link target or a title are skipped, and
    site-relative links (starting with ``/``) are made absolute.

    Returns a list of dicts with the keys ``title``, ``url``,
    ``description`` and ``date`` (the last two are always empty strings).
    The list is empty when the page has no ``contenido`` element.
    """
    soup = self.index_to_soup(url)
    container = soup.find(attrs={'class': 'contenido'})
    if container is None:
        # Page layout not as expected: report no articles rather than crash.
        return []

    current_articles = []
    for headline in container.findAllNext(attrs={'class': ['headline']}):
        link = headline.find('a', href=True)
        if link is None:
            continue
        title = self.tag_to_string(link)
        href = link.get('href', False)
        if not href or not title:
            continue
        if href.startswith('/'):
            href = 'http://www.muyinteresante.es' + href
        current_articles.append({'title': title, 'url': href,
                                 'description': '', 'date': ''})
    return current_articles
# To GET SECTIONS # To GET SECTIONS
def parse_index(self):
    """Build the feed list, one ``(section title, articles)`` pair per section.

    Each section URL is passed to ``nz_parse_section``; sections for which
    it returns nothing are left out.  Sections keep the order of the table
    below.
    """
    sections = (
        ('Historia',
         'http://www.muyinteresante.es/historia-articulos'),
        ('Ciencia',
         'http://www.muyinteresante.es/ciencia-articulos'),
        ('Naturaleza',
         'http://www.muyinteresante.es/naturaleza-articulos'),
        ('Tecnología',
         'http://www.muyinteresante.es/tecnologia-articulos'),
        ('Salud',
         'http://www.muyinteresante.es/salud-articulos'),
        ('Más Muy',
         'http://www.muyinteresante.es/muy'),
        ('Innova - Automoción',
         'http://www.muyinteresante.es/articulos-innovacion-autos'),
        ('Innova - Salud',
         'http://www.muyinteresante.es/articulos-innovacion-salud'),
        ('Innova - Medio Ambiente',
         'http://www.muyinteresante.es/articulos-innovacion-medio-ambiente'),
        ('Innova - Alimentación',
         'http://www.muyinteresante.es/articulos-innovacion-alimentacion'),
        ('Innova - Sociedad',
         'http://www.muyinteresante.es/articulos-innovacion-sociedad'),
        ('Innova - Tecnología',
         'http://www.muyinteresante.es/articulos-innovacion-tecnologia'),
        ('Innova - Ocio',
         'http://www.muyinteresante.es/articulos-innovacion-ocio'),
    )

    feeds = []
    for title, url in sections:
        articles = self.nz_parse_section(url)
        if articles:
            feeds.append((title, articles))
    return feeds
def get_cover_url(self): def get_cover_url(self):
index = 'http://www.muyinteresante.es/revista' index = 'http://www.muyinteresante.es/revista'
@ -118,5 +114,3 @@ class RevistaMuyInteresante(BasicNewsRecipe):
if link_item: if link_item:
cover_url = "http://www.muyinteresante.es"+link_item['src'] cover_url = "http://www.muyinteresante.es"+link_item['src']
return cover_url return cover_url