Fix #985353 (Updated Metro Nieuws NL)

This commit is contained in:
Kovid Goyal 2012-04-26 09:10:32 +05:30
parent 27845e11b1
commit 669fc85958

View File

@ -27,8 +27,13 @@ from BeautifulSoup import BeautifulSoup
Version 1.9.1 18-04-2012 Version 1.9.1 18-04-2012
removed some debug settings removed some debug settings
updated code to match new metro-layout updated code to match new metro-layout
Version 1.9.2 14-04-2012 Version 1.9.2 24-04-2012
updated code to match new metro-layout updated code to match new metro-layout
Version 1.9.3 25-04-2012
Changed a lot of custom code into calibre code, as the default calibre code has become much faster since the first version of this recipe
Added new feeds
Updated css
Changed order of regexes to speed up processing
''' '''
class AdvancedUserRecipe1306097511(BasicNewsRecipe): class AdvancedUserRecipe1306097511(BasicNewsRecipe):
@ -38,7 +43,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
__author__ = u'DrMerry' __author__ = u'DrMerry'
description = u'Metro Nederland' description = u'Metro Nederland'
language = u'nl' language = u'nl'
simultaneous_downloads = 3 simultaneous_downloads = 5
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif' masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
timeout = 10 timeout = 10
center_navbar = True center_navbar = True
@ -49,48 +54,39 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg' cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'
publication_type = 'newspaper' publication_type = 'newspaper'
encoding = 'utf-8' encoding = 'utf-8'
remove_attributes = ['style', 'font', 'width', 'height'] remove_attributes = ['style', 'font', 'width', 'height', 'itemtype', 'itemprop', 'itemscope']#, 'href']
use_embedded_content = False use_embedded_content = False
conversion_options = { extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact.module-title{margin:8px 0}.article-box-fact.module-title,h2{font-size:1.1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2{border:0;padding:0}.column1,h1,h2{margin:0}'
'authors' : 'Metro Nederland & calibre & DrMerry',
'author_sort' : 'Metro Nederland & calibre & DrMerry',
'publisher' : 'DrMerry/Metro Nederland'
}
extra_css = 'body {padding:5px 0; background-color:#fff;font-size: 1em}\
#date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {margin-bottom: 10px}\
#date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name, p.article-image-caption .credits {font-size:0.5em}\
.article-box-fact.module-title, #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {clear:both}\
.article-box-fact.module-title {padding: 8px 0}\
h1.title {color: #000;font-size: 1.4em}\
.article-box-fact.module-title, h2.subtitle {font-size: 1.2em}\
h1.title, h2.subtitle, .article-body p{padding-bottom:10px}\
h1.title, p.article-image-caption {font-weight: 300}\
div.column-1-3{margin-left: 19px;padding-right: 9px}\
div.column-1-2 {display: inline;padding-right: 7px}\
p.article-image-caption {font-size: 0.6em;margin-top: 5px}\
p.article-image-caption, #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {color: #616262}\
p.article-image-caption .credits {font-style: italic}\
div.article-image-caption {width: 246px;margin: 5px}\
div.article-image-caption-2column {width: 373px}\
div.article-image-caption-2column, div.article-image-caption-3column {margin-bottom: 5px}\
img {border:0}\
img, div.column-3 {padding:2px}\
hr.merryhr {width:30%; border-width:0; margin-left:5px; background-color: #24763b}\
div.column-3 {background-color:#eee; width:50%; margin:2px; float:right}\
div.column-3 module-title {border: 1px solid #aaa}\
div.article-box-fact div.subtitle, .article-box-fact.module-title, h2.subtitle {font-weight:bold}\
div.article-box-fact div.subtitle, hr.merryhr, .article-box-fact.module-title {color: #24763b}'
preprocess_regexps = [ preprocess_regexps = [
(re.compile(r'<img[^>]+top-line[^>]+>', re.DOTALL|re.IGNORECASE), (re.compile(r'(&nbsp;|\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)', re.DOTALL|re.IGNORECASE),lambda match: ' '),
lambda match: '<hr class="merryhr" />'), #(re.compile(r'(&nbsp;|\s)+', re.DOTALL|re.IGNORECASE),lambda match:' '),
(re.compile(r'<img[^>]+(metronieuws\.nl/[^>]+/templates/[^>]+jpe?g|metronieuws\.nl/internal\-roxen\-unit\.gif)[^>]+>', re.DOTALL|re.IGNORECASE), #(re.compile(r'<(a |/a)[^>]*>', re.DOTALL|re.IGNORECASE),lambda match:'')
lambda match: ''), #(re.compile('(</?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em')
] ]
# Article boundaries handed to calibre: the publication-date element marks the
# start of the content that is kept ...
remove_tags_before = dict(id='date')
# ... and the article column / gallery text marks its end.
remove_tags_after = [dict(name='div', attrs={'class': ['column-1-3', 'gallery-text']})]
# Elements stripped from inside the kept region. Each entry is one search
# specification; within a single ``attrs`` dict every attribute has to match,
# so independent criteria must live in separate entries.
remove_tags = [
    dict(name=['iframe', 'script', 'noscript', 'style']),
    dict(name='div', attrs={'class': [
        re.compile(r'column-[14]-5'),
        'col-179 ',
        'col-373 ',
        'clear',
        'ad',
        'navigation',
        re.compile(r'share-tools(-top)?'),
        'tools',
        'metroCommentFormWrap',
        'article-tools-below-title',
        'related-links',
        'padding-top-15',
        re.compile(r'^promo.*?$'),
        'teaser-component',
        re.compile(r'fb(-comments|_iframe_widget)'),
    ]}),
    dict(id=[
        'column-1-5-bottom',
        'column-4-5',
        re.compile(r'^ad(\d+|adcomp.*?)?$'),
        'sidebar',
        re.compile(r'^article-\d'),
        'comments',
        'gallery-1',
    ]),
    dict(name='a', attrs={'name': 'comments'}),
    dict(name='img', attrs={'class': 'top-line'}),
    # Hidden / masked elements and the "full screen" control are two unrelated
    # criteria: keeping them in one attrs dict would only remove tags that
    # carry BOTH the style and the title.
    dict(attrs={'style': re.compile(r'^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$')}),
    dict(attrs={'title': 'volledig scherm'}),
]
'''removed by before/after:
id:
column-1-5-top,'hidden_div','footer',
class:
'header',re.compile('^footer-[a-zA-Z0-9]+$'),'header-links',
'''
def preprocess_html(self, soup): def preprocess_html(self, soup):
myProcess = MerryProcess() myProcess = MerryProcess()
myProcess.moveTitleAndAuthor(soup)
myProcess.removeUnwantedTags(soup) myProcess.removeUnwantedTags(soup)
return soup return soup
@ -108,26 +104,30 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
(u'Buitenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-4'), (u'Buitenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-4'),
(u'Columns', u'http://www.metronieuws.nl/rss.xml?c=1277377288-17'), (u'Columns', u'http://www.metronieuws.nl/rss.xml?c=1277377288-17'),
(u'Entertainment', u'http://www.metronieuws.nl/rss.xml?c=1277377288-2'), (u'Entertainment', u'http://www.metronieuws.nl/rss.xml?c=1277377288-2'),
(u'Dot', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'), (u'Strips',u'http://www.metronieuws.nl/rss.xml?c=1325037714-0'),
(u'Tech', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
(u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'), (u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
(u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'), (u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
(u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'), (u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
(u'Carri&egrave;re', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'), (u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
(u'Wetenschap',u'http://www.metronieuws.nl/rss.xml?c=1303088437-0'),
(u'Planeet',u'http://www.metronieuws.nl/rss.xml?c=1277377288-14'),
(u'Gezondheid',u'http://www.metronieuws.nl/rss.xml?c=1277377288-15'),
(u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12') (u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12')
] ]
class MerryPreProcess(): class MerryPreProcess():
# Placeholder hook for replacing pictures in the parsed page. Nothing is
# replaced yet: the soup passed in is returned unchanged.
def replacePictures(self, soup):
#to be implemented
return soup
def optimizePicture(self,soup): def optimizePicture(self,soup):
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
try:
iurl = tag['src'] iurl = tag['src']
img = Image() img = Image()
img.open(iurl) img.open(iurl)
img.trim(0) img.trim(0)
img.save(iurl) img.save(iurl)
except:
print '\n!!image optimize failed!!\n'
continue
return soup return soup
class MerryExtract(): class MerryExtract():
@ -145,47 +145,8 @@ class MerryExtract():
return False return False
return killingSoup return killingSoup
class MerryReplace():
    """Replace anchor elements with their plain content.

    Uses the module's MerryExtract helper (``myKiller``) to delete each anchor
    once its content has been moved into the anchor's parent.
    """
    myKiller = MerryExtract()

    def replaceATag(self, soup):
        """Unwrap every ``<a>`` element found in *soup*.

        The children of each anchor are moved into the anchor's parent, at the
        anchor's own position when that position can be determined and at the
        end of the parent otherwise. The emptied anchor is then removed.

        :param soup: parsed document (or subtree) that is modified in place
        :return: the same *soup* object
        """
        for link in soup.findAll('a'):
            parent = link.parent
            if parent is None:
                # Detached anchor: there is nowhere to move its content to.
                continue
            try:
                self._unwrap(link, parent)
            except Exception:
                # Best effort: one awkward anchor must not stop the remaining
                # anchors from being processed.
                continue
        return soup

    def _unwrap(self, link, parent):
        """Move the children of *link* into *parent* and remove *link*."""
        try:
            position = parent.index(link)
        except (AttributeError, TypeError, ValueError):
            # Position lookup unavailable or anchor not found among the
            # parent's children: fall back to appending.
            position = None
        # Iterate over a copy: moving a child is expected to detach it from
        # link.contents (NOTE(review): confirm against the bundled
        # BeautifulSoup version).
        for child in list(link.contents):
            if position is None:
                parent.append(child)
            else:
                parent.insert(position, child)
                position += 1
        self.myKiller.safeRemovePart(link, False)
class MerryProcess(BeautifulSoup): class MerryProcess(BeautifulSoup):
myKiller = MerryExtract() myKiller = MerryExtract()
myReplacer = MerryReplace()
myPrepare = MerryPreProcess() myPrepare = MerryPreProcess()
def optimizeLayout(self,soup): def optimizeLayout(self,soup):
@ -193,9 +154,10 @@ class MerryProcess(BeautifulSoup):
return soup return soup
def insertFacts(self, soup): def insertFacts(self, soup):
allfacts = soup.findAll('div', {'class':re.compile('^article-box-fact.*$')}) thefactpart = re.compile('^article-box-fact.*$')
allfacts = soup.findAll('div', {'class':thefactpart})
if allfacts and not allfacts == None: if allfacts and not allfacts == None:
allfactsparent = soup.find('div', {'class':re.compile('^article-box-fact.*$')}).parent allfactsparent = soup.find('div', {'class':thefactpart}).parent
for part in allfactsparent: for part in allfactsparent:
if not part in allfacts: if not part in allfacts:
self.myKiller.safeRemovePart(part, True) self.myKiller.safeRemovePart(part, True)
@ -212,83 +174,39 @@ class MerryProcess(BeautifulSoup):
pass pass
return soup return soup
def previousNextSibRemover(self, soup, previous=True, soupIsArray=False): def moveTitleAndAuthor(self, soup):
findsibsof = soup moveitem = soup.h1
firstpart = previous pubdate = soup.find(id="date")
if findsibsof and not findsibsof == None: if moveitem and not moveitem == None and pubdate and not pubdate == None:
if soupIsArray == True: try:
for foundsib in findsibsof: pubdate.parent.insert(0, moveitem)
self.previousNextSibRemover(foundsib, firstpart, soupIsArray=False) except:
else: print '\n!!error in moving title!!\n'
if firstpart == True and soupIsArray == False: pass
sibs = findsibsof.previousSiblingGenerator() moveitem = None
else: moveitem = soup.find('div', {'class':'byline'})
sibs = findsibsof.nextSiblingGenerator() if moveitem and not moveitem == None:
for sib in sibs: try:
self.myKiller.safeRemovePart(sib, True) moveitem.parent.parent.insert(-1, moveitem)
return except:
print '\n!!error in moving byline!!\n'
pass
return soup
def removeUnwantedTags(self,soup): def removeUnwantedTags(self,soup):
self.removeTagsByName(soup)
self.insertFacts(soup) self.insertFacts(soup)
self.removeFirstAndLastPart(soup)
self.removeUnwantedParts(soup)
self.removeEmptyTags(soup) self.removeEmptyTags(soup)
self.myReplacer.replaceATag(soup) self.removeArrayOfTags(soup.findAll(attrs={'class': 'share-tools-bottom'})) # at end to keep author
return soup
def removeUnwantedParts(self, soup):
    """Run the id-, class- and style-based removal passes over *soup*.

    The passes run in that order; *soup* is returned afterwards.
    """
    for cleanup in (self.removeUnwantedTagsByID,
                    self.removeUnwantedTagsByClass,
                    self.removeUnwantedTagsByStyle):
        cleanup(soup)
    return soup
def removeUnwantedTagsByStyle(self,soup):
self.removeArrayOfTags(soup.findAll(attrs={'style':re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")}))
self.removeArrayOfTags(soup.findAll(attrs={'title':'volledig scherm'}))
return soup return soup
def removeArrayOfTags(self,souparray): def removeArrayOfTags(self,souparray):
return self.myKiller.safeRemovePart(souparray, True) return self.myKiller.safeRemovePart(souparray, True)
def removeUnwantedTagsByClass(self, soup):
    """Remove every ``<div>`` whose class matches a known boilerplate class.

    :param soup: parsed document searched for matching divs
    :return: the same *soup* object
    """
    unwanted_class = re.compile('^(promo.*?|share-tools-top|share-tools-bottom|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15|footer-[a-zA-Z0-9]+|column-4-5)$')
    doomed_divs = soup.findAll("div", {"class": unwanted_class})
    self.removeArrayOfTags(doomed_divs)
    return soup
def removeUnwantedTagsByID(self, soup):
    """Remove every element whose ``id`` is a known non-article id.

    Each entry (literal id or compiled pattern) is looked up with its own
    ``findAll`` call and the matches are passed to ``removeArrayOfTags``.

    :param soup: parsed document searched for matching ids
    :return: the same *soup* object
    """
    # Each id is listed once so that no id triggers a second, redundant
    # search of the whole tree; patterns are raw strings so that ``\d`` is
    # not treated as a string escape.
    defaultids = (
        'footer-extra',
        re.compile(r'^ad(\d+|adcomp.*?)?$'),
        'column-4-5',
        'navigation',
        'header',
        re.compile(r'^column-1-5-(top|bottom)$'),
        'footer',
        'hidden_div',
        'sidebar',
        re.compile(r'^article-\d$'),
        'comments',
        'gallery-1',
    )
    for removeid in defaultids:
        self.removeArrayOfTags(soup.findAll(id=removeid))
    return soup
# def safeRemoveTag(self, subtree):
# return self.myKiller.safeRemovePart(subtree, True)
def removeTagsByName(self, soup):
    """Remove all script, iframe, style and noscript elements from *soup*.

    ``soup.script`` and friends only return the first matching element, so
    every occurrence is collected with ``findAll`` and removed one by one.

    :param soup: parsed document that is modified in place
    :return: the same *soup* object
    """
    for tagname in ('script', 'iframe', 'style', 'noscript'):
        for tag in soup.findAll(tagname):
            self.myKiller.safeRemovePart(tag, True)
    return soup
def removeEmptyTags(self,soup,run=0): def removeEmptyTags(self,soup,run=0):
emptymatches = re.compile('^(&nbsp;|\s|\n|\r|\t)*$') emptymatches = re.compile('^[&nbsp;\s\n\r\t ]*$')
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing) emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing)
if emptytags and not (emptytags == None or emptytags == []): if emptytags and not (emptytags == None or emptytags == []):
self.removeArrayOfTags(emptytags) self.removeArrayOfTags(emptytags)
#recursive in case removing empty tag creates new empty tag #recursive in case removing empty tag creates new empty tag
self.removeEmptyTags(soup, run=run) self.removeEmptyTags(soup, run=run)
return soup return soup
def removeFirstAndLastPart(self, soup):
    """Trim the page around the article.

    Calls ``previousNextSibRemover`` on the ``id="date"`` element and on its
    ancestors with ``previous=True``, then on the ``id="share-and-byline"``
    element and the ``gallery-text`` div (and their ancestors) with
    ``previous=False``.

    :param soup: parsed document that is modified in place
    :return: the same *soup* object
    """
    def ancestors_of(tag):
        # A missing tag has no ancestors to walk; hand None on to the remover.
        if not tag or tag == None:
            return None
        return tag.findParents()

    def strip_around(tag, before):
        # First the tag itself, then every ancestor of the tag.
        self.previousNextSibRemover(tag, previous=before, soupIsArray=False)
        self.previousNextSibRemover(ancestors_of(tag), previous=before, soupIsArray=True)

    start_marker = soup.find(id="date")
    strip_around(start_marker, True)

    # Both end markers are looked up before either of them is processed.
    share_marker = soup.find(id="share-and-byline")
    gallery_marker = soup.find("div", {"class": "gallery-text"})
    for end_marker in (share_marker, gallery_marker):
        strip_around(end_marker, False)
    return soup