merge from trunk

2025-12-08 06:05:04 -05:00 · 2010-09-20 00:44:34 +08:00 · 2010-09-20 00:44:34 +08:00 · e303babf89
commit e303babf89
parent d335bccd67 23cd4fd783
3 changed files with 59 additions and 14 deletions
--- a/resources/recipes/le_journal.recipe
+++ b/resources/recipes/le_journal.recipe
@ -0,0 +1,43 @@
+__author__ = ' (lrfurtado@yahoo.com.br)'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+# News-download recipe for "Le Journal de Montreal": a purely declarative
+# BasicNewsRecipe subclass that sets feed URLs, limits and the page elements
+# to strip. It defines no methods of its own.
+class LeJournalDeMontrealRecipe(BasicNewsRecipe):
+
+     # Metadata shown for the generated publication.
+     title       = u'Le Journal de Montreal'
+     description = u'Le Journal de Montreal'
+     __author__  = 'Luciano Furtado'
+     language = 'fr'
+
+     # Download limits read by the BasicNewsRecipe base class.
+     # NOTE(review): oldest_article is presumably an age in days and
+     # use_embedded_content=0 presumably means "fetch the linked article page
+     # instead of the feed body" - confirm against the calibre recipe API.
+     oldest_article = 7
+     use_embedded_content=0
+     max_articles_per_feed = 15
+
+     # Elements to strip from each downloaded page. Every entry selects a tag
+     # by name plus either its 'id' or its exact 'class' attribute value.
+     remove_tags = [
+                        dict(name='ul',attrs={'id':'mainNav'}),
+                        dict(name='div',attrs={'id':'boxPolitique'}),
+                        dict(name='div',attrs={'id':'boxScoop'}),
+                        dict(name='div',attrs={'id':'DossierSpec'}),
+                        dict(name='div',attrs={'id':'channelBoxes'}),
+                        dict(name='div',attrs={'id':'sectionBoxes'}),
+                        dict(name='div',attrs={'id':'header'}),
+                        dict(name='div',attrs={'id':'footer'}),
+ dict(name='div',attrs={'id':'navbarCanoe_container'}),
+                        dict(name='div',attrs={'id':'popularCanoe'}),
+                        dict(name='div',attrs={'id':'textAds'}),
+                        dict(name='div',attrs={'id':'24heures'}),
+                        dict(name='div',attrs={'class':'bottomBox clear'}),
+                        dict(name='div',attrs={'class':'articleControls thin'}),
+                  ]
+
+
+     # (section title, RSS URL) pairs; all four feeds are served from canoe.com.
+     feeds          = [
+                        (u'Actualites',
+ u'http://www.canoe.com/rss/feed/nouvelles/ljm_actualites.xml'),
+                        (u'Arts et spectacle',
+ u'http://www.canoe.com/rss/feed/nouvelles/ljm_arts.xml'),
+                        (u'Sports',
+ u'http://www.canoe.com/rss/feed/nouvelles/ljm_sports.xml'),
+                        (u'Chroniques',
+ u'http://www.canoe.com/rss/feed/nouvelles/ljm_chroniques.xml'),
+                     ]
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -114,20 +114,22 @@ class Dehyphenator(object):
    retain hyphens.
    '''

+    def __init__(self):
+        # Compile the affix patterns once per instance and keep them on self;
+        # dehyphenate() reads self.removesuffixes, self.prefixes and
+        # self.removeprefix instead of building the regexes itself.
+        #
+        # removesuffixes: matches a single common suffix anchored at the end of
+        # the word ($), case-insensitively.
+        # Add common suffixes to the regex below to increase the likelihood of a match -
+        # don't add suffixes which are also complete words, such as 'able' or 'sex'
+        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+        # remove prefixes if the prefix was not already the point of hyphenation
+        # prefixes: matches only when the whole string is exactly 'un', 'in' or
+        # 'ex' (anchored with ^ and $), i.e. the first half IS the prefix.
+        self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
+        # removeprefix: matches a leading 'un', 'in' or 'ex' (anchored with ^
+        # only) so it can be stripped from the start of a longer word.
+        self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
+
    def dehyphenate(self, match):
        firsthalf = match.group('firstpart')
        secondhalf = match.group('secondpart')
        hyphenated = str(firsthalf) + "-" + str(secondhalf)
        dehyphenated = str(firsthalf) + str(secondhalf)
-        # Add common suffixes to the regex below to increase the likelihood of a match -   
-        # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
-        lookupword = removesuffixes.sub('', dehyphenated)
-        # remove prefixes if the prefix was not already the point of hyphenation
-        prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
-        removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
-        if prefixes.match(firsthalf) is None:
-           lookupword = removeprefix.sub('', lookupword)
+        lookupword = self.removesuffixes.sub('', dehyphenated)
+        if self.prefixes.match(firsthalf) is None:
+           lookupword = self.removeprefix.sub('', lookupword)
        booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
        match = booklookup.search(self.html)
--- a/src/calibre/library/server/content.py
+++ b/src/calibre/library/server/content.py
@ -184,7 +184,7 @@ class ContentServer(object):
        if path and os.path.exists(path):
            updated = fromtimestamp(os.stat(path).st_mtime)
            cherrypy.response.headers['Last-Modified'] = self.last_modified(updated)
-        return fmt.read()
+        return fmt
    # }}}