Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-07 10:14:46 -04:00)
Fix some recipes importing non-calibre BeautifulSoup
Fixes #1489658 [dependency on BeautifulSoup3?](https://bugs.launchpad.net/calibre/+bug/1489658)
This commit is contained in:
Parent: eba6551580
Commit: e64f766890
@ -2,7 +2,7 @@
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
import re
|
import re
|
||||||
from calibre.utils.magick import Image
|
from calibre.utils.magick import Image
|
||||||
from BeautifulSoup import BeautifulSoup
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||||
|
|
||||||
''' Version 1.2, updated cover image to match the changed website.
|
''' Version 1.2, updated cover image to match the changed website.
|
||||||
added info date on title
|
added info date on title
|
||||||
@ -61,13 +61,13 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
|||||||
cover_url = 'http://www.readmetro.com/en/holland/metro-holland/image/large/last/'
|
cover_url = 'http://www.readmetro.com/en/holland/metro-holland/image/large/last/'
|
||||||
publication_type = 'newspaper'
|
publication_type = 'newspaper'
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
remove_attributes = ['style', 'font', 'width', 'height', 'itemtype', 'itemprop', 'itemscope']#, 'href']
|
remove_attributes = ['style', 'font', 'width', 'height', 'itemtype', 'itemprop', 'itemscope'] # , 'href']
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar,.calibre5{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact{font-size:0.7em}.article-box-fact.module-title{margin:8px 0; font-size:0.8em}h2{font-size:1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2,.calibre5,.calibrenavbar{border:0;padding:0}.column1,h1,h2,.calibrenavbar{margin:0}'
|
extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar,.calibre5{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact{font-size:0.7em}.article-box-fact.module-title{margin:8px 0; font-size:0.8em}h2{font-size:1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2,.calibre5,.calibrenavbar{border:0;padding:0}.column1,h1,h2,.calibrenavbar{margin:0}'
|
||||||
|
|
||||||
|
|
||||||
preprocess_regexps = [
|
preprocess_regexps = [
|
||||||
(re.compile(r'( |\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)', re.DOTALL|re.IGNORECASE),lambda match: ' '),
|
(re.compile(r'( |\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)',
|
||||||
|
re.DOTALL|re.IGNORECASE),lambda match: ' '),
|
||||||
#(re.compile(r'( |\s)+', re.DOTALL|re.IGNORECASE),lambda match:' '),
|
#(re.compile(r'( |\s)+', re.DOTALL|re.IGNORECASE),lambda match:' '),
|
||||||
#(re.compile(r'<(a |/a)[^>]*>', re.DOTALL|re.IGNORECASE),lambda match:'')
|
#(re.compile(r'<(a |/a)[^>]*>', re.DOTALL|re.IGNORECASE),lambda match:'')
|
||||||
#(re.compile('(</?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em')
|
#(re.compile('(</?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em')
|
||||||
@ -75,15 +75,17 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
|||||||
|
|
||||||
remove_tags_before= dict(id='subwrapper')
|
remove_tags_before= dict(id='subwrapper')
|
||||||
remove_tags_after = dict(name='div', attrs={'class':['body-area','article-main-area']})
|
remove_tags_after = dict(name='div', attrs={'class':['body-area','article-main-area']})
|
||||||
#name='div', attrs={'class':['subwrapper']})]
|
# name='div', attrs={'class':['subwrapper']})]
|
||||||
#'column-1-3','gallery-text']})]#id='share-and-byline')]
|
# 'column-1-3','gallery-text']})]#id='share-and-byline')]
|
||||||
|
|
||||||
filter_regexps = [r'mailto:.*']
|
filter_regexps = [r'mailto:.*']
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name=['iframe','script','noscript','style']),
|
dict(name=['iframe','script','noscript','style']),
|
||||||
dict(name='div', attrs={'class':['fact-related-box','aside clearfix','aside clearfix middle-col-line','comments','share-tools','article-right-column','column-4-5','column-1-5','ad-msg','col-179 ','col-373 ','clear','ad','navigation',re.compile('share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)'),'promos','header-links','promo-2']}),
|
dict(name='div', attrs={'class':['fact-related-box','aside clearfix','aside clearfix middle-col-line','comments','share-tools','article-right-column','column-4-5','column-1-5','ad-msg','col-179 ','col-373 ','clear','ad','navigation',re.compile(
|
||||||
dict(id=['super-carousel','article-2','googleads','column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'adadcomp-4','margin-5','sidebar',re.compile('^article-\d'),'comments','gallery-1','sharez_container','ts-container','topshares','ts-title']),
|
'share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)'),'promos','header-links','promo-2']}),
|
||||||
|
dict(id=['super-carousel','article-2','googleads','column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'adadcomp-4',
|
||||||
|
'margin-5','sidebar',re.compile('^article-\d'),'comments','gallery-1','sharez_container','ts-container','topshares','ts-title']),
|
||||||
dict(name='a', attrs={'name':'comments'}),
|
dict(name='a', attrs={'name':'comments'}),
|
||||||
#dict(name='div', attrs={'data-href'}),
|
#dict(name='div', attrs={'data-href'}),
|
||||||
dict(name='img', attrs={'class':'top-line','title':'volledig scherm'}),
|
dict(name='img', attrs={'class':'top-line','title':'volledig scherm'}),
|
||||||
@ -128,6 +130,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
|||||||
]
|
]
|
||||||
|
|
||||||
class MerryPreProcess():
|
class MerryPreProcess():
|
||||||
|
|
||||||
def optimizePicture(self,soup):
|
def optimizePicture(self,soup):
|
||||||
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
|
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
|
||||||
try:
|
try:
|
||||||
@ -142,6 +145,7 @@ class MerryPreProcess():
|
|||||||
return soup
|
return soup
|
||||||
|
|
||||||
class MerryExtract():
|
class MerryExtract():
|
||||||
|
|
||||||
def safeRemovePart(self, killingSoup, soupIsArray):
|
def safeRemovePart(self, killingSoup, soupIsArray):
|
||||||
if killingSoup and not killingSoup == None:
|
if killingSoup and not killingSoup == None:
|
||||||
try:
|
try:
|
||||||
@ -215,9 +219,10 @@ class MerryProcess(BeautifulSoup):
|
|||||||
|
|
||||||
def removeEmptyTags(self,soup,run=0):
|
def removeEmptyTags(self,soup,run=0):
|
||||||
emptymatches = re.compile('^[ \s\n\r\t ]*$')
|
emptymatches = re.compile('^[ \s\n\r\t ]*$')
|
||||||
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing)
|
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (
|
||||||
|
tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing)
|
||||||
if emptytags and not (emptytags == None or emptytags == []):
|
if emptytags and not (emptytags == None or emptytags == []):
|
||||||
self.removeArrayOfTags(emptytags)
|
self.removeArrayOfTags(emptytags)
|
||||||
#recursive in case removing empty tag creates new empty tag
|
# recursive in case removing empty tag creates new empty tag
|
||||||
self.removeEmptyTags(soup, run=run)
|
self.removeEmptyTags(soup, run=run)
|
||||||
return soup
|
return soup
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from calibre.web.feeds.news import re
|
from calibre.web.feeds.news import re
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from BeautifulSoup import Tag
|
from calibre.ebooks.BeautifulSoup import Tag
|
||||||
|
|
||||||
class RevistaMuyInteresante(BasicNewsRecipe):
|
class RevistaMuyInteresante(BasicNewsRecipe):
|
||||||
|
|
||||||
@ -17,7 +17,6 @@ class RevistaMuyInteresante(BasicNewsRecipe):
|
|||||||
|
|
||||||
extra_css = ' .txt_articulo{ font-family: sans-serif; font-size: medium; text-align: justify } .contentheading{font-family: serif; font-size: large; font-weight: bold; color: #000000; text-align: center}'
|
extra_css = ' .txt_articulo{ font-family: sans-serif; font-size: medium; text-align: justify } .contentheading{font-family: serif; font-size: large; font-weight: bold; color: #000000; text-align: center}'
|
||||||
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for item in soup.findAll(style=True):
|
for item in soup.findAll(style=True):
|
||||||
del item['style']
|
del item['style']
|
||||||
@ -31,13 +30,12 @@ class RevistaMuyInteresante(BasicNewsRecipe):
|
|||||||
break
|
break
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
|
||||||
preprocess_regexps = [
|
preprocess_regexps = [
|
||||||
(re.compile(r'<td class="contentheading" width="100%">.*?</td>', re.DOTALL|re.IGNORECASE), lambda match: '<td class="contentheading">' + match.group().replace('<td class="contentheading" width="100%">','').strip().replace('</td>','').strip() + '</td>'),
|
(re.compile(r'<td class="contentheading" width="100%">.*?</td>', re.DOTALL|re.IGNORECASE), lambda match: '<td class="contentheading">' +
|
||||||
|
match.group().replace('<td class="contentheading" width="100%">','').strip().replace('</td>','').strip() + '</td>'),
|
||||||
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'class':['article']}),dict(name='td', attrs={'class':['txt_articulo']})]
|
keep_only_tags = [dict(name='div', attrs={'class':['article']}),dict(name='td', attrs={'class':['txt_articulo']})]
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
@ -51,8 +49,7 @@ class RevistaMuyInteresante(BasicNewsRecipe):
|
|||||||
|
|
||||||
remove_tags_after = dict(name='div', attrs={'class':'tags_articles'})
|
remove_tags_after = dict(name='div', attrs={'class':'tags_articles'})
|
||||||
|
|
||||||
|
# TO GET ARTICLES IN SECTION
|
||||||
#TO GET ARTICLES IN SECTION
|
|
||||||
def nz_parse_section(self, url):
|
def nz_parse_section(self, url):
|
||||||
soup = self.index_to_soup(url)
|
soup = self.index_to_soup(url)
|
||||||
div = soup.find(attrs={'class':'contenido'})
|
div = soup.find(attrs={'class':'contenido'})
|
||||||
@ -74,7 +71,6 @@ class RevistaMuyInteresante(BasicNewsRecipe):
|
|||||||
|
|
||||||
return current_articles
|
return current_articles
|
||||||
|
|
||||||
|
|
||||||
# To GET SECTIONS
|
# To GET SECTIONS
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
feeds = []
|
feeds = []
|
||||||
@ -118,5 +114,3 @@ class RevistaMuyInteresante(BasicNewsRecipe):
|
|||||||
if link_item:
|
if link_item:
|
||||||
cover_url = "http://www.muyinteresante.es"+link_item['src']
|
cover_url = "http://www.muyinteresante.es"+link_item['src']
|
||||||
return cover_url
|
return cover_url
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user