merge from trunk

This commit is contained in:
ldolse 2010-09-20 00:44:34 +08:00
commit e303babf89
3 changed files with 59 additions and 14 deletions

View File

@ -0,0 +1,43 @@
__author__ = ' (lrfurtado@yahoo.com.br)'
from calibre.web.feeds.news import BasicNewsRecipe
class LeJournalDeMontrealRecipe(BasicNewsRecipe):
    """Recipe for the French-language newspaper Le Journal de Montreal.

    Articles are discovered through the newspaper's RSS feeds hosted on
    canoe.com; the page chrome listed in ``remove_tags`` is stripped from
    every downloaded article.
    """

    # --- Publication metadata -------------------------------------------
    title = u'Le Journal de Montreal'
    description = u'Le Journal de Montreal'
    __author__ = 'Luciano Furtado'
    language = 'fr'

    # --- Download limits ------------------------------------------------
    # Per the BasicNewsRecipe API, ``oldest_article`` is expressed in days.
    oldest_article = 7
    # A real boolean instead of ``0``: the feed entries are not used as the
    # article body.
    use_embedded_content = False
    max_articles_per_feed = 15

    # --- Clean-up -------------------------------------------------------
    # Navigation, promotional boxes, ads, header/footer and article
    # controls, identified by element id or CSS class.
    remove_tags = [
        dict(name='ul', attrs={'id': 'mainNav'}),
        dict(name='div', attrs={'id': 'boxPolitique'}),
        dict(name='div', attrs={'id': 'boxScoop'}),
        dict(name='div', attrs={'id': 'DossierSpec'}),
        dict(name='div', attrs={'id': 'channelBoxes'}),
        dict(name='div', attrs={'id': 'sectionBoxes'}),
        dict(name='div', attrs={'id': 'header'}),
        dict(name='div', attrs={'id': 'footer'}),
        dict(name='div', attrs={'id': 'navbarCanoe_container'}),
        dict(name='div', attrs={'id': 'popularCanoe'}),
        dict(name='div', attrs={'id': 'textAds'}),
        dict(name='div', attrs={'id': '24heures'}),
        dict(name='div', attrs={'class': 'bottomBox clear'}),
        dict(name='div', attrs={'class': 'articleControls thin'}),
    ]

    # --- Feeds ----------------------------------------------------------
    # (section title, RSS URL) pairs.
    feeds = [
        (u'Actualites',
         u'http://www.canoe.com/rss/feed/nouvelles/ljm_actualites.xml'),
        (u'Arts et spectacle',
         u'http://www.canoe.com/rss/feed/nouvelles/ljm_arts.xml'),
        (u'Sports',
         u'http://www.canoe.com/rss/feed/nouvelles/ljm_sports.xml'),
        (u'Chroniques',
         u'http://www.canoe.com/rss/feed/nouvelles/ljm_chroniques.xml'),
    ]

View File

@ -108,26 +108,28 @@ def line_length(format, raw, percent):
class Dehyphenator(object): class Dehyphenator(object):
''' '''
Analyzes words to determine whether hyphens should be retained/removed. Uses the document Analyzes words to determine whether hyphens should be retained/removed. Uses the document
itself as a dictionary. This method handles all languages along with uncommon, made-up, and itself as a dictionary. This method handles all languages along with uncommon, made-up, and
scientific words. The primary disadvantage is that words appearing only once in the document scientific words. The primary disadvantage is that words appearing only once in the document
retain hyphens. retain hyphens.
''' '''
# Constructor: compile the affix patterns once per instance so that
# dehyphenate() can reuse them instead of rebuilding them on every call.
def __init__(self):
# Suffix pattern, anchored at the end of the word and case-insensitive.
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
# Remove prefixes if the prefix was not already the point of hyphenation:
# `prefixes` matches a string consisting solely of un/in/ex, while
# `removeprefix` matches one of those prefixes at the start of a longer word.
self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
def dehyphenate(self, match): def dehyphenate(self, match):
firsthalf = match.group('firstpart') firsthalf = match.group('firstpart')
secondhalf = match.group('secondpart') secondhalf = match.group('secondpart')
hyphenated = str(firsthalf) + "-" + str(secondhalf) hyphenated = str(firsthalf) + "-" + str(secondhalf)
dehyphenated = str(firsthalf) + str(secondhalf) dehyphenated = str(firsthalf) + str(secondhalf)
# Add common suffixes to the regex below to increase the likelihood of a match - lookupword = self.removesuffixes.sub('', dehyphenated)
# don't add suffixes which are also complete words, such as 'able' or 'sex' if self.prefixes.match(firsthalf) is None:
removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE) lookupword = self.removeprefix.sub('', lookupword)
lookupword = removesuffixes.sub('', dehyphenated)
# remove prefixes if the prefix was not already the point of hyphenation
prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
if prefixes.match(firsthalf) is None:
lookupword = removeprefix.sub('', lookupword)
booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE) booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
match = booklookup.search(self.html) match = booklookup.search(self.html)
@ -137,7 +139,7 @@ class Dehyphenator(object):
else: else:
#print "returned hyphenated word: " + str(hyphenated) #print "returned hyphenated word: " + str(hyphenated)
return hyphenated return hyphenated
def __call__(self, html, format, length=1): def __call__(self, html, format, length=1):
self.html = html self.html = html
if format == 'html': if format == 'html':

View File

@ -184,7 +184,7 @@ class ContentServer(object):
if path and os.path.exists(path): if path and os.path.exists(path):
updated = fromtimestamp(os.stat(path).st_mtime) updated = fromtimestamp(os.stat(path).st_mtime)
cherrypy.response.headers['Last-Modified'] = self.last_modified(updated) cherrypy.response.headers['Last-Modified'] = self.last_modified(updated)
return fmt.read() return fmt
# }}} # }}}