Fix #985353 (Updated Metro Nieuws NL)

This commit is contained in:
Kovid Goyal 2012-04-19 09:05:46 +05:30
parent e67d9a0057
commit 11569fddac

View File

@ -3,25 +3,6 @@ from calibre.web.feeds.news import BasicNewsRecipe
import re import re
from calibre.utils.magick import Image from calibre.utils.magick import Image
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
try:
from calibre_plugins.drMerry.debug import debuglogger as mlog
print 'drMerry debuglogger found, debug options can be used'
from calibre_plugins.drMerry.stats import statslogger as mstat
print 'drMerry stats tracker found, stat can be tracked'
mlog.setLoglevel(1) #-1 == no log; 0 for normal output
mstat.calculateStats(False) #track stats (to track stats loglevel must be > 0
KEEPSTATS = mstat.keepmystats()
SHOWDEBUG0 = mlog.showdebuglevel(0)
SHOWDEBUG1 = mlog.showdebuglevel(1)
SHOWDEBUG2 = mlog.showdebuglevel(2)
except:
#print 'drMerry debuglogger not found, skipping debug options'
SHOWDEBUG0 = False
SHOWDEBUG1 = False
SHOWDEBUG2 = False
KEEPSTATS = False
#print ('level0: %s\nlevel1: %s\nlevel2: %s' % (SHOWDEBUG0,SHOWDEBUG1,SHOWDEBUG2))
''' Version 1.2, updated cover image to match the changed website. ''' Version 1.2, updated cover image to match the changed website.
added info date on title added info date on title
@ -43,6 +24,9 @@ except:
extended timeout from 2 to 10 extended timeout from 2 to 10
changed oldest article from 10 to 1.2 changed oldest article from 10 to 1.2
changed max articles from 15 to 25 changed max articles from 15 to 25
Version 1.9.1 18-04-2012
removed some debug settings
updated code to match new metro-layout
''' '''
class AdvancedUserRecipe1306097511(BasicNewsRecipe): class AdvancedUserRecipe1306097511(BasicNewsRecipe):
@ -70,34 +54,40 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
'author_sort' : 'Metro Nederland & calibre & DrMerry', 'author_sort' : 'Metro Nederland & calibre & DrMerry',
'publisher' : 'DrMerry/Metro Nederland' 'publisher' : 'DrMerry/Metro Nederland'
} }
extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\ extra_css = 'body {padding:5px 0; background-color:#fff;font-size: 1em}\
#date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {clear: both;margin-bottom: 10px;font-size:0.5em; color: #616262;}\ #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {margin-bottom: 10px}\
.article-box-fact.module-title {clear:both;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\ #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name, p.article-image-caption .credits {font-size:0.5em}\
h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\ .article-box-fact.module-title, #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {clear:both}\
.article-body p{padding-bottom:10px;}div.column-1-3{margin-left: 19px;padding-right: 9px;}\ .article-box-fact.module-title {padding: 8px 0}\
div.column-1-2 {display: inline;padding-right: 7px;}\ h1.title {color: #000;font-size: 1.4em}\
p.article-image-caption {font-size: 12px;font-weight: 300;color: #616262;margin-top: 5px;} \ .article-box-fact.module-title, h2.subtitle {font-size: 1.2em}\
p.article-image-caption .credits {font-style: italic;font-size: 10px;}\ h1.title, h2.subtitle, .article-body p{padding-bottom:10px}\
div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\ h1.title, p.article-image-caption {font-weight: 300}\
div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\ div.column-1-3{margin-left: 19px;padding-right: 9px}\
img {border:0px; padding:2px;} hr.merryhr {width:30%; border-width:0px; color:green; margin-left:5px; background-color: green} div.column-3 {background-color:#eee; width:50%; margin:2px; float:right; padding:2px;} div.column-3 module-title {border: 1px solid #aaa} div.article-box-fact div.subtitle {font-weight:bold; color:green;}' div.column-1-2 {display: inline;padding-right: 7px}\
p.article-image-caption {font-size: 0.6em;margin-top: 5px}\
p.article-image-caption, #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {color: #616262}\
p.article-image-caption .credits {font-style: italic}\
div.article-image-caption {width: 246px;margin: 5px}\
div.article-image-caption-2column {width: 373px}\
div.article-image-caption-2column, div.article-image-caption-3column {margin-bottom: 5px}\
img {border:0}\
img, div.column-3 {padding:2px}\
hr.merryhr {width:30%; border-width:0; margin-left:5px; background-color: #24763b}\
div.column-3 {background-color:#eee; width:50%; margin:2px; float:right}\
div.column-3 module-title {border: 1px solid #aaa}\
div.article-box-fact div.subtitle, .article-box-fact.module-title, h2.subtitle {font-weight:bold}\
div.article-box-fact div.subtitle, hr.merryhr, .article-box-fact.module-title {color: #24763b}'
preprocess_regexps = [ preprocess_regexps = [
(re.compile(r'<img[^>]+top-line[^>]+>', re.DOTALL|re.IGNORECASE), (re.compile(r'<img[^>]+top-line[^>]+>', re.DOTALL|re.IGNORECASE),
lambda match: '<hr class="merryhr" />'), lambda match: '<hr class="merryhr" />'),
(re.compile(r'(<img[^>]+metronieuws\.nl/[^>]+/templates/[^>]+jpe?g[^>]+>|metronieuws\.nl/internal\-roxen\-unit\.gif)', re.DOTALL|re.IGNORECASE), (re.compile(r'<img[^>]+(metronieuws\.nl/[^>]+/templates/[^>]+jpe?g|metronieuws\.nl/internal\-roxen\-unit\.gif)[^>]+>', re.DOTALL|re.IGNORECASE),
lambda match: ''), lambda match: ''),
] ]
def preprocess_html(self, soup): def preprocess_html(self, soup):
if SHOWDEBUG0 == True:
mlog.setdefaults()
mlog.addTextAndTag(['Show debug = on with level'], [str(mlog.debuglevel)])
if KEEPSTATS == True:
mlog.addDebug('Stats will be calculated')
else:
mlog.addTextAndTag(['Stats won\'t be calculated\nTo be enabled, stats must be true, currently','and debug level must be 1 or higher, currently'],[mstat.dokeepmystats, mlog.debuglevel])
mlog.showDebug()
myProcess = MerryProcess() myProcess = MerryProcess()
myProcess.removeUnwantedTags(soup) myProcess.removeUnwantedTags(soup)
return soup return soup
@ -105,18 +95,6 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
def postprocess_html(self, soup, first): def postprocess_html(self, soup, first):
myProcess = MerryProcess() myProcess = MerryProcess()
myProcess.optimizeLayout(soup) myProcess.optimizeLayout(soup)
if SHOWDEBUG0 == True:
if KEEPSTATS == True:
statinfo = 'generated stats:'
statinfo += str(mstat.stats(mstat.statslist))
print statinfo
statinfo = 'generated stats (for removed tags):'
statinfo += str(mstat.stats(mstat.removedtagslist))
print statinfo
#show all Debug info we forgot to report
#Using print to be sure that this text will not be added at the end of the log.
print '\n!!!!!unreported messages:\n(should be empty)\n'
mlog.showDebug()
return soup return soup
feeds = [ feeds = [
@ -142,44 +120,24 @@ class MerryPreProcess():
return soup return soup
def optimizePicture(self,soup): def optimizePicture(self,soup):
if SHOWDEBUG0 == True:
mlog.addDebug('start image optimize')
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src'] iurl = tag['src']
img = Image() img = Image()
img.open(iurl) img.open(iurl)
img.trim(0) img.trim(0)
img.save(iurl) img.save(iurl)
if SHOWDEBUG0 == True:
mlog.addDebug('Images optimized')
mlog.showDebug()
return soup return soup
class MerryExtract(): class MerryExtract():
def safeRemovePart(self, killingSoup, soupIsArray): def safeRemovePart(self, killingSoup, soupIsArray):
if killingSoup and not killingSoup == None: if killingSoup and not killingSoup == None:
if SHOWDEBUG2 == True:
mlog.addTextAndTag(['items to remove'],[killingSoup])
try: try:
if soupIsArray == True: if soupIsArray == True:
for killer in killingSoup: for killer in killingSoup:
killer.extract() killer.extract()
else: else:
killingSoup.extract() killingSoup.extract()
if SHOWDEBUG1 == True:
mlog.addDebug('tag extracted')
mlog.showDebug()
if KEEPSTATS == True:
try:
mstat.addstat(mstat.removedtagslist,str(killingSoup.name))
except: except:
mstat.addstat(mstat.removedtagslist,'unknown')
except:
if SHOWDEBUG1 == True:
mlog.addDebug('tag extraction failed')
mlog.showDebug()
if KEEPSTATS == True:
mstat.addstat(mstat.removedtagslist,'exception')
return False return False
else: else:
return False return False
@ -230,60 +188,30 @@ class MerryProcess(BeautifulSoup):
def optimizeLayout(self,soup): def optimizeLayout(self,soup):
self.myPrepare.optimizePicture(soup) self.myPrepare.optimizePicture(soup)
if SHOWDEBUG0 == True:
mlog.addDebug('End of Optimize Layout')
mlog.showDebug()
return soup return soup
def insertFacts(self, soup): def insertFacts(self, soup):
allfacts = soup.findAll('div', {'class':re.compile('^article-box-fact.*$')}) allfacts = soup.findAll('div', {'class':re.compile('^article-box-fact.*$')})
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['allfacts'],[allfacts])
mlog.showDebug()
if allfacts and not allfacts == None: if allfacts and not allfacts == None:
allfactsparent = soup.find('div', {'class':re.compile('^article-box-fact.*$')}).parent allfactsparent = soup.find('div', {'class':re.compile('^article-box-fact.*$')}).parent
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['allfactsparent'],[allfactsparent])
mlog.showDebug()
for part in allfactsparent: for part in allfactsparent:
if not part in allfacts: if not part in allfacts:
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['FOUND A non-fact'],[part])
mlog.showDebug()
self.myKiller.safeRemovePart(part, True) self.myKiller.safeRemovePart(part, True)
if SHOWDEBUG1 == True:
mlog.addTextAndTag(['New All Facts'],[allfacts])
mlog.showDebug()
articlefacts = soup.find('div', {'class':'article-box-fact column'}) articlefacts = soup.find('div', {'class':'article-box-fact column'})
errorOccured=False errorOccured=False
if (articlefacts and not articlefacts==None): if (articlefacts and not articlefacts==None):
try: try:
contenttag = soup.find('div', {'class':'article-body'}) contenttag = soup.find('div', {'class':'article-body'})
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['curcontag'],[contenttag])
mlog.showDebug()
foundrighttag = False foundrighttag = False
if contenttag and not contenttag == None: if contenttag and not contenttag == None:
foundrighttag = True foundrighttag = True
if SHOWDEBUG0 == True:
if errorOccured == False:
mlog.addTextAndTag(['type','curcontag (in while)'],[type(contenttag),contenttag])
else:
mlog.addDebug('Could not find right parent tag. Error Occured')
mlog.showDebug()
if foundrighttag == True: if foundrighttag == True:
contenttag.insert(0, allfactsparent) contenttag.insert(0, allfactsparent)
if SHOWDEBUG2 == True:
mlog.addTextAndTag(['added parent'],[soup.prettify()])
mlog.showDebug()
except: except:
errorOccured=True errorOccured=True
mlog.addTrace() mlog.addTrace()
else: else:
errorOccured=True errorOccured=True
if SHOWDEBUG0 == True and errorOccured == True:
mlog.addTextAndTag(['no articlefacts'],[articlefacts])
mlog.showDebug()
return soup return soup
def previousNextSibRemover(self, soup, previous=True, soupIsArray=False): def previousNextSibRemover(self, soup, previous=True, soupIsArray=False):
@ -300,71 +228,38 @@ class MerryProcess(BeautifulSoup):
sibs = findsibsof.nextSiblingGenerator() sibs = findsibsof.nextSiblingGenerator()
for sib in sibs: for sib in sibs:
self.myKiller.safeRemovePart(sib, True) self.myKiller.safeRemovePart(sib, True)
else:
if SHOWDEBUG1 == True:
mlog.addDebug('Not any sib found')
return return
def removeUnwantedTags(self,soup): def removeUnwantedTags(self,soup):
if SHOWDEBUG1 == True:
mlog.addTextAndTag(['Len of Soup before RemoveTagsByName'],[len(str(soup))])
mlog.showDebug()
self.removeTagsByName(soup) self.removeTagsByName(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before firstandlastpart: %s' % len(str(soup)))
mlog.showDebug()
self.insertFacts(soup) self.insertFacts(soup)
self.removeFirstAndLastPart(soup) self.removeFirstAndLastPart(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before unwantedpart: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedParts(soup) self.removeUnwantedParts(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before EmptyParts: %s' % len(str(soup)))
mlog.showDebug()
self.removeEmptyTags(soup) self.removeEmptyTags(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup after EmptyParts: %s' % len(str(soup)))
mlog.showDebug()
self.myReplacer.replaceATag(soup) self.myReplacer.replaceATag(soup)
return soup return soup
def removeUnwantedParts(self, soup): def removeUnwantedParts(self, soup):
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before UnwantedID: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByID(soup) self.removeUnwantedTagsByID(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before Class: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByClass(soup) self.removeUnwantedTagsByClass(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before Style: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByStyle(soup) self.removeUnwantedTagsByStyle(soup)
return soup return soup
def removeUnwantedTagsByStyle(self,soup): def removeUnwantedTagsByStyle(self,soup):
self.removeArrayOfTags(soup.findAll(attrs={'style':re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")})) self.removeArrayOfTags(soup.findAll(attrs={'style':re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")}))
if SHOWDEBUG0 == True: self.removeArrayOfTags(soup.findAll(attrs={'title':'volledig scherm'}))
mlog.addDebug('end remove by style')
return soup return soup
def removeArrayOfTags(self,souparray): def removeArrayOfTags(self,souparray):
return self.myKiller.safeRemovePart(souparray, True) return self.myKiller.safeRemovePart(souparray, True)
def removeUnwantedTagsByClass(self,soup): def removeUnwantedTagsByClass(self,soup):
if SHOWDEBUG0 == True: self.removeArrayOfTags(soup.findAll("div", { "class" :re.compile('^(promo.*?|share-tools-top|share-tools-bottom|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15|footer-[a-zA-Z0-9]+)$')}))
mlog.addDebug('start remove by class')
self.removeArrayOfTags(soup.findAll("div", { "class" :re.compile('^(promo.*?|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15)$')}))
return soup return soup
def removeUnwantedTagsByID(self,soup): def removeUnwantedTagsByID(self,soup):
defaultids = ['footer-extra',re.compile('^ad(\d+|adcomp.*?)?$'),'column-4-5','navigation','header',re.compile('^column-1-5-(top|bottom)$'),'footer','hidden_div','sidebar',re.compile('^article-\d$'),'comments','footer'] defaultids = ['footer-extra',re.compile('^ad(\d+|adcomp.*?)?$'),'column-4-5','navigation','header',re.compile('^column-1-5-(top|bottom)$'),'footer','hidden_div','sidebar',re.compile('^article-\d$'),'comments','footer','gallery-1']
for removeid in defaultids: for removeid in defaultids:
if SHOWDEBUG1 == True:
mlog.addDebug('RemoveTagByID, tag: %s, Len of Soup: %s' % (str(removeid), len(str(soup))))
mlog.showDebug()
self.removeArrayOfTags(soup.findAll(id=removeid)) self.removeArrayOfTags(soup.findAll(id=removeid))
return soup return soup
@ -380,33 +275,12 @@ class MerryProcess(BeautifulSoup):
return soup return soup
def removeEmptyTags(self,soup,run=0): def removeEmptyTags(self,soup,run=0):
if SHOWDEBUG0 == True:
mlog.addDebug('starting removeEmptyTags')
if SHOWDEBUG1 == True:
run += 1
mlog.addDebug(run)
if SHOWDEBUG2 == True:
mlog.addDebug(str(soup.prettify()))
mlog.showDebug()
emptymatches = re.compile('^(&nbsp;|\s|\n|\r|\t)*$') emptymatches = re.compile('^(&nbsp;|\s|\n|\r|\t)*$')
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing) emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing)
if emptytags and not (emptytags == None or emptytags == []): if emptytags and not (emptytags == None or emptytags == []):
if SHOWDEBUG1 == True:
mlog.addDebug('tags found')
mlog.addDebug(str(emptytags))
self.removeArrayOfTags(emptytags) self.removeArrayOfTags(emptytags)
#recursive in case removing empty tag creates new empty tag #recursive in case removing empty tag creates new empty tag
self.removeEmptyTags(soup, run=run) self.removeEmptyTags(soup, run=run)
else:
if SHOWDEBUG1 == True:
mlog.addDebug('no empty tags found')
mlog.showDebug()
if SHOWDEBUG0 == True:
if SHOWDEBUG2 == True:
mlog.addDebug('new soup:')
mlog.addDebug(str(soup.prettify()))
mlog.addDebug('RemoveEmptyTags Completed')
mlog.showDebug()
return soup return soup
def removeFirstAndLastPart(self,soup): def removeFirstAndLastPart(self,soup):