mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
merge from trunk
This commit is contained in:
commit
e303babf89
43
resources/recipes/le_journal.recipe
Normal file
43
resources/recipes/le_journal.recipe
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
__author__ = ' (lrfurtado@yahoo.com.br)'
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class LeJournalDeMontrealRecipe(BasicNewsRecipe):
    # News recipe for Le Journal de Montreal. It is purely declarative: it
    # only sets class attributes that the BasicNewsRecipe base class reads.

    # --- Publication metadata -------------------------------------------
    title = u'Le Journal de Montreal'
    description = u'Le Journal de Montreal'
    __author__ = 'Luciano Furtado'
    language = 'fr'

    # --- Download limits ------------------------------------------------
    # NOTE(review): semantics of these settings are defined by
    # BasicNewsRecipe (see the calibre recipe API docs); the values are
    # kept exactly as committed, including the integer 0 used as a flag.
    oldest_article = 7
    use_embedded_content = 0
    max_articles_per_feed = 15

    # --- Page clean-up --------------------------------------------------
    # Tag specifications (element name + attribute match) for the site
    # navigation, promo boxes, header/footer and ad containers.
    # Presumably these are stripped from every fetched article page by the
    # base class -- confirm against the BasicNewsRecipe documentation.
    remove_tags = [
        {'name': 'ul', 'attrs': {'id': 'mainNav'}},
        {'name': 'div', 'attrs': {'id': 'boxPolitique'}},
        {'name': 'div', 'attrs': {'id': 'boxScoop'}},
        {'name': 'div', 'attrs': {'id': 'DossierSpec'}},
        {'name': 'div', 'attrs': {'id': 'channelBoxes'}},
        {'name': 'div', 'attrs': {'id': 'sectionBoxes'}},
        {'name': 'div', 'attrs': {'id': 'header'}},
        {'name': 'div', 'attrs': {'id': 'footer'}},
        {'name': 'div', 'attrs': {'id': 'navbarCanoe_container'}},
        {'name': 'div', 'attrs': {'id': 'popularCanoe'}},
        {'name': 'div', 'attrs': {'id': 'textAds'}},
        {'name': 'div', 'attrs': {'id': '24heures'}},
        {'name': 'div', 'attrs': {'class': 'bottomBox clear'}},
        {'name': 'div', 'attrs': {'class': 'articleControls thin'}},
    ]

    # --- Feeds ----------------------------------------------------------
    # (section title, RSS URL) pairs, one per section of the newspaper.
    feeds = [
        (u'Actualites', u'http://www.canoe.com/rss/feed/nouvelles/ljm_actualites.xml'),
        (u'Arts et spectacle', u'http://www.canoe.com/rss/feed/nouvelles/ljm_arts.xml'),
        (u'Sports', u'http://www.canoe.com/rss/feed/nouvelles/ljm_sports.xml'),
        (u'Chroniques', u'http://www.canoe.com/rss/feed/nouvelles/ljm_chroniques.xml'),
    ]
|
@ -108,26 +108,28 @@ def line_length(format, raw, percent):
|
|||||||
|
|
||||||
class Dehyphenator(object):
|
class Dehyphenator(object):
|
||||||
'''
|
'''
|
||||||
Analyzes words to determine whether hyphens should be retained/removed. Uses the document
|
Analyzes words to determine whether hyphens should be retained/removed. Uses the document
|
||||||
itself is as a dictionary. This method handles all languages along with uncommon, made-up, and
|
itself is as a dictionary. This method handles all languages along with uncommon, made-up, and
|
||||||
scientific words. The primary disadvantage is that words appearing only once in the document
|
scientific words. The primary disadvantage is that words appearing only once in the document
|
||||||
retain hyphens.
|
retain hyphens.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
# Add common suffixes to the regex below to increase the likelihood of a match -
|
||||||
|
# don't add suffixes which are also complete words, such as 'able' or 'sex'
|
||||||
|
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
|
||||||
|
# remove prefixes if the prefix was not already the point of hyphenation
|
||||||
|
self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
|
||||||
|
self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
|
||||||
|
|
||||||
def dehyphenate(self, match):
|
def dehyphenate(self, match):
|
||||||
firsthalf = match.group('firstpart')
|
firsthalf = match.group('firstpart')
|
||||||
secondhalf = match.group('secondpart')
|
secondhalf = match.group('secondpart')
|
||||||
hyphenated = str(firsthalf) + "-" + str(secondhalf)
|
hyphenated = str(firsthalf) + "-" + str(secondhalf)
|
||||||
dehyphenated = str(firsthalf) + str(secondhalf)
|
dehyphenated = str(firsthalf) + str(secondhalf)
|
||||||
# Add common suffixes to the regex below to increase the likelihood of a match -
|
lookupword = self.removesuffixes.sub('', dehyphenated)
|
||||||
# don't add suffixes which are also complete words, such as 'able' or 'sex'
|
if self.prefixes.match(firsthalf) is None:
|
||||||
removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
|
lookupword = self.removeprefix.sub('', lookupword)
|
||||||
lookupword = removesuffixes.sub('', dehyphenated)
|
|
||||||
# remove prefixes if the prefix was not already the point of hyphenation
|
|
||||||
prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
|
|
||||||
removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
|
|
||||||
if prefixes.match(firsthalf) is None:
|
|
||||||
lookupword = removeprefix.sub('', lookupword)
|
|
||||||
booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
|
booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
|
||||||
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
|
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
|
||||||
match = booklookup.search(self.html)
|
match = booklookup.search(self.html)
|
||||||
@ -137,7 +139,7 @@ class Dehyphenator(object):
|
|||||||
else:
|
else:
|
||||||
#print "returned hyphenated word: " + str(hyphenated)
|
#print "returned hyphenated word: " + str(hyphenated)
|
||||||
return hyphenated
|
return hyphenated
|
||||||
|
|
||||||
def __call__(self, html, format, length=1):
|
def __call__(self, html, format, length=1):
|
||||||
self.html = html
|
self.html = html
|
||||||
if format == 'html':
|
if format == 'html':
|
||||||
|
@ -184,7 +184,7 @@ class ContentServer(object):
|
|||||||
if path and os.path.exists(path):
|
if path and os.path.exists(path):
|
||||||
updated = fromtimestamp(os.stat(path).st_mtime)
|
updated = fromtimestamp(os.stat(path).st_mtime)
|
||||||
cherrypy.response.headers['Last-Modified'] = self.last_modified(updated)
|
cherrypy.response.headers['Last-Modified'] = self.last_modified(updated)
|
||||||
return fmt.read()
|
return fmt
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user