Fix #898884 (New version of Metro NL)

This commit is contained in:
Kovid Goyal 2011-12-02 07:49:22 +05:30
parent c202ad15f2
commit 315c50a1aa

View File

@ -2,7 +2,26 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
import re import re
from calibre.utils.magick import Image from calibre.utils.magick import Image
from BeautifulSoup import BeautifulSoup
try:
from calibre_plugins.drMerry.debug import debuglogger as mlog
print 'drMerry debuglogger found, debug options can be used'
from calibre_plugins.drMerry.stats import statslogger as mstat
print 'drMerry stats tracker found, stat can be tracked'
mlog.setLoglevel(1) #-1 == no log; 0 for normal output
mstat.calculateStats(False) #track stats (to track stats loglevel must be > 0
KEEPSTATS = mstat.keepmystats()
SHOWDEBUG0 = mlog.showdebuglevel(0)
SHOWDEBUG1 = mlog.showdebuglevel(1)
SHOWDEBUG2 = mlog.showdebuglevel(2)
except:
print 'drMerry debuglogger not found, skipping debug options'
SHOWDEBUG0 = False
SHOWDEBUG1 = False
SHOWDEBUG2 = False
KEEPSTATS = False
print ('level0: %s\nlevel1: %s\nlevel2: %s' % (SHOWDEBUG0,SHOWDEBUG1,SHOWDEBUG2))
''' Version 1.2, updated cover image to match the changed website. ''' Version 1.2, updated cover image to match the changed website.
added info date on title added info date on title
@ -17,39 +36,37 @@ from calibre.utils.magick import Image
changed è into è changed è into è
updated remove tags updated remove tags
removed keep_only tags removed keep_only tags
Version 1.8 26-11-2022
added remove tag: article-slideshow
''' '''
class AdvancedUserRecipe1306097511(BasicNewsRecipe): class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro Nieuws NL' title = u'Metro Nieuws NL'
oldest_article = 1.5 oldest_article = 10
max_articles_per_feed = 100 max_articles_per_feed = 15
__author__ = u'DrMerry' __author__ = u'DrMerry'
description = u'Metro Nederland' description = u'Metro Nederland'
language = u'nl' language = u'nl'
simultaneous_downloads = 5 simultaneous_downloads = 5
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
timeout = 2 timeout = 2
#delay = 1
center_navbar = True center_navbar = True
#auto_cleanup = True
#auto_cleanup_keep = '//div[@class="article-image-caption-2column"]/*|//div[@id="date"]/*|//div[@class="article-image-caption-3column"]/*'
timefmt = ' [%A, %d %b %Y]' timefmt = ' [%A, %d %b %Y]'
no_stylesheets = True no_stylesheets = True
remove_javascript = True remove_javascript = True
remove_empty_feeds = True remove_empty_feeds = True
cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg' cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'
publication_type = 'newspaper' publication_type = 'newspaper'
remove_tags_before = dict(id='date')
remove_tags_after = dict(name='div', attrs={'class':'article-body'})
encoding = 'utf-8' encoding = 'utf-8'
remove_attributes = ['style', 'font', 'width', 'height'] remove_attributes = ['style', 'font', 'width', 'height']
use_embedded_content = False use_embedded_content = False
conversion_options = { conversion_options = {
'authors' : 'Metro Nederland', 'authors' : 'Metro Nederland & calibre & DrMerry',
'author_sort' : 'Metro Nederland', 'author_sort' : 'Metro Nederland & calibre & DrMerry',
'publisher' : 'DrMerry/Metro Nederland' 'publisher' : 'DrMerry/Metro Nederland'
} }
extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\ extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\
#date {clear: both;margin-left: 19px;font-size: 11px;font-weight: 300;color: #616262;height: 15px;}\ #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {clear: both;margin-bottom: 10px;font-size:0.5em; color: #616262;}\
.article-box-fact.module-title {clear:both;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\ .article-box-fact.module-title {clear:both;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\
h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\ h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\
.article-body p{padding-bottom:10px;}div.column-1-3{margin-left: 19px;padding-right: 9px;}\ .article-body p{padding-bottom:10px;}div.column-1-3{margin-left: 19px;padding-right: 9px;}\
@ -58,28 +75,43 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
p.article-image-caption .credits {font-style: italic;font-size: 10px;}\ p.article-image-caption .credits {font-style: italic;font-size: 10px;}\
div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\ div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\
div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\ div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\
img {border:0px;} .img-mask {position:absolute;top:0px;left:0px;}' img {border:0px; padding:2px;} hr.merryhr {width:30%; border-width:0px; color:green; margin-left:5px; background-color: green} div.column-3 {background-color:#eee; width:50%; margin:2px; float:right; padding:2px;} div.column-3 module-title {border: 1px solid #aaa} div.article-box-fact div.subtitle {font-weight:bold; color:green;}'
remove_tags = [dict(name='div', attrs={'class':[ 'metroCommentFormWrap', 'related-links' preprocess_regexps = [
'commentForm', 'metroCommentInnerWrap', 'article-slideshow-counter-container', 'article-slideshow-control', 'ad', 'header-links', (re.compile(r'<img[^>]+top-line[^>]+>', re.DOTALL|re.IGNORECASE),
'art-rgt','pluck-app pluck-comm', 'share-and-byline', 'article-tools-below-title', 'col-179 ', 'related-links', 'clear padding-top-15', 'share-tools', lambda match: '<hr class="merryhr" />'),
'article1','article-page-auto-pushes', 'footer-edit','clear']}), (re.compile(r'(<img[^>]+metronieuws\.nl/[^>]+/templates/[^>]+jpe?g[^>]+>|metronieuws\.nl/internal\-roxen\-unit\.gif)', re.DOTALL|re.IGNORECASE),
dict(name='div', attrs={'id':['article-2', 'article-4', 'article-1', 'navigation', 'footer', 'header', 'comments', 'sidebar', 'share-and-byline']}), lambda match: ''),
dict(name='iframe')]
preprocess_regexps = [(re.compile(r'(<p>(&nbsp;|\s)*</p>|<a[^>]*>Tweet</a>|<a[^>]*>|</a>|<!--.*?-->)', re.DOTALL|re.IGNORECASE),lambda match: ''),
(re.compile(r'(&nbsp;|\s\s)+\s*', re.DOTALL|re.IGNORECASE),lambda match: ' '),
(re.compile(r'([\s>])([^\s>]+)(<span[^>]+) />', re.DOTALL|re.IGNORECASE),
lambda match: match.group(1) + match.group(3) + '>' + match.group(2) + '</span>'),
] ]
def preprocess_html(self, soup):
if SHOWDEBUG0 == True:
mlog.setdefaults()
mlog.addTextAndTag(['Show debug = on with level'], [str(mlog.debuglevel)])
if KEEPSTATS == True:
mlog.addDebug('Stats will be calculated')
else:
mlog.addTextAndTag(['Stats won\'t be calculated\nTo be enabled, stats must be true, currently','and debug level must be 1 or higher, currently'],[mstat.dokeepmystats, mlog.debuglevel])
mlog.showDebug()
myProcess = MerryProcess()
myProcess.removeUnwantedTags(soup)
return soup
def postprocess_html(self, soup, first): def postprocess_html(self, soup, first):
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): myProcess = MerryProcess()
iurl = tag['src'] myProcess.optimizeLayout(soup)
img = Image() if SHOWDEBUG0 == True:
img.open(iurl) if KEEPSTATS == True:
img.trim(0) statinfo = 'generated stats:'
img.save(iurl) statinfo += str(mstat.stats(mstat.statslist))
print statinfo
statinfo = 'generated stats (for removed tags):'
statinfo += str(mstat.stats(mstat.removedtagslist))
print statinfo
#show all Debug info we forgot to report
#Using print to be sure that this text will not be added at the end of the log.
print '\n!!!!!unreported messages:\n(should be empty)\n'
mlog.showDebug()
return soup return soup
feeds = [ feeds = [
@ -95,6 +127,291 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
(u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'), (u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
(u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'), (u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
(u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'), (u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
(u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'), (u'Carri&egrave;re', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
(u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12') (u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12')
] ]
class MerryPreProcess():
def replacePictures(self, soup):
#to be implemented
return soup
def optimizePicture(self,soup):
if SHOWDEBUG0 == True:
mlog.addDebug('start image optimize')
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
img.trim(0)
img.save(iurl)
if SHOWDEBUG0 == True:
mlog.addDebug('Images optimized')
mlog.showDebug()
return soup
class MerryExtract():
def safeRemovePart(self, killingSoup, soupIsArray):
if killingSoup and not killingSoup == None:
if SHOWDEBUG2 == True:
mlog.addTextAndTag(['items to remove'],[killingSoup])
try:
if soupIsArray == True:
for killer in killingSoup:
killer.extract()
else:
killingSoup.extract()
if SHOWDEBUG1 == True:
mlog.addDebug('tag extracted')
mlog.showDebug()
if KEEPSTATS == True:
try:
mstat.addstat(mstat.removedtagslist,str(killingSoup.name))
except:
mstat.addstat(mstat.removedtagslist,'unknown')
except:
if SHOWDEBUG1 == True:
mlog.addDebug('tag extraction failed')
mlog.showDebug()
if KEEPSTATS == True:
mstat.addstat(mstat.removedtagslist,'exception')
return False
else:
return False
return killingSoup
class MerryReplace():
myKiller = MerryExtract()
def replaceATag(self, soup):
anchors = []
anchors = soup.findAll('a')
if anchors and not (anchors == None or anchors == []):
try:
for link in anchors:
# print str(link)
if link and not link == None:
# print ('type: %s'%(str(type(link))))
# print ('link: %s' % (link))
myParent = link.parent
# print str('parent: %s'%(myParent))
try:
myIndex = link.parent.index(link)
hasIndex = True
except:
myIndex = 0
hasIndex = False
# print str('index %s'%(myIndex))
if not link.string == None:
# print 'link=notnone'
if hasIndex == True:
myParent.insert(myIndex, link.string)
else:
myParent.append(link.string)
else:
# print 'link=none'
myParent.insert(myIndex, link.contents)
self.myKiller.safeRemovePart(link, False)
else:
notshown = 'tag received is empty' # print
except:
notshown = 'tag received is empty' # print
notshown
return soup
class MerryProcess(BeautifulSoup):
myKiller = MerryExtract()
myReplacer = MerryReplace()
myPrepare = MerryPreProcess()
def optimizeLayout(self,soup):
self.myPrepare.optimizePicture(soup)
if SHOWDEBUG0 == True:
mlog.addDebug('End of Optimize Layout')
mlog.showDebug()
return soup
def insertFacts(self, soup):
allfacts = soup.findAll('div', {'class':re.compile('^article-box-fact.*$')})
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['allfacts'],[allfacts])
mlog.showDebug()
if allfacts and not allfacts == None:
allfactsparent = soup.find('div', {'class':re.compile('^article-box-fact.*$')}).parent
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['allfactsparent'],[allfactsparent])
mlog.showDebug()
for part in allfactsparent:
if not part in allfacts:
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['FOUND A non-fact'],[part])
mlog.showDebug()
self.myKiller.safeRemovePart(part, True)
if SHOWDEBUG1 == True:
mlog.addTextAndTag(['New All Facts'],[allfacts])
mlog.showDebug()
articlefacts = soup.find('div', {'class':'article-box-fact column'})
errorOccured=False
if (articlefacts and not articlefacts==None):
try:
contenttag = soup.find('div', {'class':'article-body'})
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['curcontag'],[contenttag])
mlog.showDebug()
foundrighttag = False
if contenttag and not contenttag == None:
foundrighttag = True
if SHOWDEBUG0 == True:
if errorOccured == False:
mlog.addTextAndTag(['type','curcontag (in while)'],[type(contenttag),contenttag])
else:
mlog.addDebug('Could not find right parent tag. Error Occured')
mlog.showDebug()
if foundrighttag == True:
contenttag.insert(0, allfactsparent)
if SHOWDEBUG2 == True:
mlog.addTextAndTag(['added parent'],[soup.prettify()])
mlog.showDebug()
except:
errorOccured=True
mlog.addTrace()
else:
errorOccured=True
if SHOWDEBUG0 == True and errorOccured == True:
mlog.addTextAndTag(['no articlefacts'],[articlefacts])
mlog.showDebug()
return soup
def previousNextSibRemover(self, soup, previous=True, soupIsArray=False):
findsibsof = soup
firstpart = previous
if findsibsof and not findsibsof == None:
if soupIsArray == True:
for foundsib in findsibsof:
self.previousNextSibRemover(foundsib, firstpart, soupIsArray=False)
else:
if firstpart == True and soupIsArray == False:
sibs = findsibsof.previousSiblingGenerator()
else:
sibs = findsibsof.nextSiblingGenerator()
for sib in sibs:
self.myKiller.safeRemovePart(sib, True)
else:
if SHOWDEBUG1 == True:
mlog.addDebug('Not any sib found')
return
def removeUnwantedTags(self,soup):
if SHOWDEBUG1 == True:
mlog.addTextAndTag(['Len of Soup before RemoveTagsByName'],[len(str(soup))])
mlog.showDebug()
self.removeTagsByName(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before firstandlastpart: %s' % len(str(soup)))
mlog.showDebug()
self.insertFacts(soup)
self.removeFirstAndLastPart(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before unwantedpart: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedParts(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before EmptyParts: %s' % len(str(soup)))
mlog.showDebug()
self.removeEmptyTags(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup after EmptyParts: %s' % len(str(soup)))
mlog.showDebug()
self.myReplacer.replaceATag(soup)
return soup
def removeUnwantedParts(self, soup):
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before UnwantedID: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByID(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before Class: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByClass(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before Style: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByStyle(soup)
return soup
def removeUnwantedTagsByStyle(self,soup):
self.removeArrayOfTags(soup.findAll(attrs={'style' : re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")}))
if SHOWDEBUG0 == True:
mlog.addDebug('end remove by style')
return soup
def removeArrayOfTags(self,souparray):
return self.myKiller.safeRemovePart(souparray, True)
def removeUnwantedTagsByClass(self,soup):
if SHOWDEBUG0 == True:
mlog.addDebug('start remove by class')
self.removeArrayOfTags(soup.findAll("div", { "class" :re.compile('^(promo.*?|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15)$')}))
return soup
def removeUnwantedTagsByID(self,soup):
defaultids = ['footer-extra',re.compile('^ad(\d+|adcomp.*?)?$'),'column-4-5','navigation','header',re.compile('^column-1-5-(top|bottom)$'),'footer','hidden_div','sidebar',re.compile('^article-\d$'),'comments','footer']
for removeid in defaultids:
if SHOWDEBUG1 == True:
mlog.addDebug('RemoveTagByID, tag: %s, Len of Soup: %s' % (str(removeid), len(str(soup))))
mlog.showDebug()
self.removeArrayOfTags(soup.findAll(id=removeid))
return soup
# def safeRemoveTag(self, subtree):
# return self.myKiller.safeRemovePart(subtree, True)
def removeTagsByName(self, soup):
self.myKiller.safeRemovePart(soup.script, True)
self.myKiller.safeRemovePart(soup.iframe, True)
self.myKiller.safeRemovePart(soup.style, True)
self.myKiller.safeRemovePart(soup.noscript, True)
return soup
def removeEmptyTags(self,soup,run=0):
if SHOWDEBUG0 == True:
mlog.addDebug('starting removeEmptyTags')
if SHOWDEBUG1 == True:
run += 1
mlog.addDebug(run)
if SHOWDEBUG2 == True:
mlog.addDebug(str(soup.prettify()))
mlog.showDebug()
emptymatches = re.compile('^(&nbsp;|\s|\n|\r|\t)*$')
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing)
if emptytags and not (emptytags == None or emptytags == []):
if SHOWDEBUG1 == True:
mlog.addDebug('tags found')
mlog.addDebug(str(emptytags))
self.removeArrayOfTags(emptytags)
#recursive in case removing empty tag creates new empty tag
self.removeEmptyTags(soup, run=run)
else:
if SHOWDEBUG1 == True:
mlog.addDebug('no empty tags found')
mlog.showDebug()
if SHOWDEBUG0 == True:
if SHOWDEBUG2 == True:
mlog.addDebug('new soup:')
mlog.addDebug(str(soup.prettify()))
mlog.addDebug('RemoveEmptyTags Completed')
mlog.showDebug()
return soup
def removeFirstAndLastPart(self,soup):
def findparenttag(lookuptag):
if lookuptag and not lookuptag == None:
return lookuptag.findParents()
findtag = soup.find(id="date")
self.previousNextSibRemover(findtag, previous=True, soupIsArray=False)
self.previousNextSibRemover(findparenttag(findtag), previous=True, soupIsArray=True)
for endtag in [soup.find(id="share-and-byline"), soup.find("div", { "class" : "gallery-text" })]:
self.previousNextSibRemover(endtag, previous=False, soupIsArray=False)
self.previousNextSibRemover(findparenttag(endtag), previous=False, soupIsArray=True)
return soup