merge from trunk

This commit is contained in:
ldolse 2010-09-20 00:44:34 +08:00
commit e303babf89
3 changed files with 59 additions and 14 deletions

View File

@ -0,0 +1,43 @@
__author__ = ' (lrfurtado@yahoo.com.br)'
from calibre.web.feeds.news import BasicNewsRecipe
class LeJournalDeMontrealRecipe(BasicNewsRecipe):
    '''
    News recipe for Le Journal de Montreal.

    Articles are collected from the newspaper's RSS feeds hosted on
    canoe.com (see ``feeds``); the elements listed in ``remove_tags`` are
    stripped from each downloaded page.
    '''

    title = u'Le Journal de Montreal'
    description = u'Le Journal de Montreal'
    __author__ = 'Luciano Furtado'
    language = 'fr'

    # Article age limit (in days, per the calibre recipe API).
    oldest_article = 7
    # The recipe API documents this setting as None/True/False; use the
    # boolean rather than the integer 0 so the "feeds do not embed the full
    # article" intent is explicit.
    use_embedded_content = False
    max_articles_per_feed = 15

    # Tag specifications (tag name plus id/class attribute) to remove from
    # the fetched pages.
    remove_tags = [
        dict(name='ul', attrs={'id': 'mainNav'}),
        dict(name='div', attrs={'id': 'boxPolitique'}),
        dict(name='div', attrs={'id': 'boxScoop'}),
        dict(name='div', attrs={'id': 'DossierSpec'}),
        dict(name='div', attrs={'id': 'channelBoxes'}),
        dict(name='div', attrs={'id': 'sectionBoxes'}),
        dict(name='div', attrs={'id': 'header'}),
        dict(name='div', attrs={'id': 'footer'}),
        dict(name='div', attrs={'id': 'navbarCanoe_container'}),
        dict(name='div', attrs={'id': 'popularCanoe'}),
        dict(name='div', attrs={'id': 'textAds'}),
        dict(name='div', attrs={'id': '24heures'}),
        dict(name='div', attrs={'class': 'bottomBox clear'}),
        dict(name='div', attrs={'class': 'articleControls thin'}),
    ]

    # (section title, RSS feed URL) pairs.
    feeds = [
        (u'Actualites',
         u'http://www.canoe.com/rss/feed/nouvelles/ljm_actualites.xml'),
        (u'Arts et spectacle',
         u'http://www.canoe.com/rss/feed/nouvelles/ljm_arts.xml'),
        (u'Sports',
         u'http://www.canoe.com/rss/feed/nouvelles/ljm_sports.xml'),
        (u'Chroniques',
         u'http://www.canoe.com/rss/feed/nouvelles/ljm_chroniques.xml'),
    ]

View File

@ -114,20 +114,22 @@ class Dehyphenator(object):
retain hyphens.
'''
def __init__(self):
    '''
    Precompile the case-insensitive patterns that dehyphenate() uses to
    reduce a rejoined word to the stem it looks up in the text.
    '''
    flags = re.IGNORECASE

    # Add common suffixes to the pattern below to increase the likelihood
    # of a match - don't add suffixes which are also complete words, such
    # as 'able' or 'sex'
    suffix_pattern = r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$"

    # A prefix is only removed if it was not already the point of
    # hyphenation: the first pattern matches a fragment that is exactly a
    # prefix, the second matches the same prefixes at the start of a word.
    whole_prefix_pattern = r'^(un|in|ex)$'
    leading_prefix_pattern = r'^(un|in|ex)'

    self.removesuffixes = re.compile(suffix_pattern, flags)
    self.prefixes = re.compile(whole_prefix_pattern, flags)
    self.removeprefix = re.compile(leading_prefix_pattern, flags)
def dehyphenate(self, match):
firsthalf = match.group('firstpart')
secondhalf = match.group('secondpart')
hyphenated = str(firsthalf) + "-" + str(secondhalf)
dehyphenated = str(firsthalf) + str(secondhalf)
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
lookupword = removesuffixes.sub('', dehyphenated)
# remove prefixes if the prefix was not already the point of hyphenation
prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
if prefixes.match(firsthalf) is None:
lookupword = removeprefix.sub('', lookupword)
lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
match = booklookup.search(self.html)

View File

@ -184,7 +184,7 @@ class ContentServer(object):
if path and os.path.exists(path):
updated = fromtimestamp(os.stat(path).st_mtime)
cherrypy.response.headers['Last-Modified'] = self.last_modified(updated)
return fmt.read()
return fmt
# }}}