merge from trunk

2025-12-08 06:05:04 -05:00 · 2010-09-20 00:44:34 +08:00 · 2010-09-20 00:44:34 +08:00 · e303babf89
commit e303babf89
parent d335bccd67 23cd4fd783
3 changed files with 59 additions and 14 deletions
--- a/resources/recipes/le_journal.recipe
+++ b/resources/recipes/le_journal.recipe
@ -0,0 +1,43 @@
 __author__ = ' (lrfurtado@yahoo.com.br)'
 from calibre.web.feeds.news import BasicNewsRecipe
 class LeJournalDeMontrealRecipe(BasicNewsRecipe):
     # Declarative settings for fetching "Le Journal de Montreal" from its
     # canoe.com RSS feeds. Every attribute below is consumed by
     # BasicNewsRecipe, which is not defined in this file.
     # NOTE(review): confirm units and exact semantics of each setting
     # against the calibre recipe API before changing any value.
     __author__ = 'Luciano Furtado'
     title = u'Le Journal de Montreal'
     description = u'Le Journal de Montreal'
     language = 'fr'
     # Fetch limits -- presumably an age limit and a per-feed article cap;
     # verify against BasicNewsRecipe.
     oldest_article = 7
     max_articles_per_feed = 15
     use_embedded_content = 0
     # Element matchers, each given as a tag name plus the attributes to
     # match. Judging by the attribute name these elements are removed from
     # the downloaded pages (navigation, side boxes, ads, header/footer).
     remove_tags = [
         {'name': 'ul', 'attrs': {'id': 'mainNav'}},
         {'name': 'div', 'attrs': {'id': 'boxPolitique'}},
         {'name': 'div', 'attrs': {'id': 'boxScoop'}},
         {'name': 'div', 'attrs': {'id': 'DossierSpec'}},
         {'name': 'div', 'attrs': {'id': 'channelBoxes'}},
         {'name': 'div', 'attrs': {'id': 'sectionBoxes'}},
         {'name': 'div', 'attrs': {'id': 'header'}},
         {'name': 'div', 'attrs': {'id': 'footer'}},
         {'name': 'div', 'attrs': {'id': 'navbarCanoe_container'}},
         {'name': 'div', 'attrs': {'id': 'popularCanoe'}},
         {'name': 'div', 'attrs': {'id': 'textAds'}},
         {'name': 'div', 'attrs': {'id': '24heures'}},
         {'name': 'div', 'attrs': {'class': 'bottomBox clear'}},
         {'name': 'div', 'attrs': {'class': 'articleControls thin'}},
     ]
     # (section title, RSS feed URL) pairs, one per section.
     feeds = [
         (u'Actualites', u'http://www.canoe.com/rss/feed/nouvelles/ljm_actualites.xml'),
         (u'Arts et spectacle', u'http://www.canoe.com/rss/feed/nouvelles/ljm_arts.xml'),
         (u'Sports', u'http://www.canoe.com/rss/feed/nouvelles/ljm_sports.xml'),
         (u'Chroniques', u'http://www.canoe.com/rss/feed/nouvelles/ljm_chroniques.xml'),
     ]
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -108,26 +108,28 @@ def line_length(format, raw, percent):
 class Dehyphenator(object):
    '''
-    Analyzes words to determine whether hyphens should be retained/removed.  Uses the document 
+    Analyzes words to determine whether hyphens should be retained/removed.  Uses the document
-    itself is as a dictionary. This method handles all languages along with uncommon, made-up, and 
+    itself is as a dictionary. This method handles all languages along with uncommon, made-up, and
-    scientific words. The primary disadvantage is that words appearing only once in the document 
+    scientific words. The primary disadvantage is that words appearing only once in the document
    retain hyphens.
    '''
    def __init__(self):
        '''
        Compile the regular expressions that dehyphenate() uses to reduce a
        candidate word to the form it looks up in the document.
        '''
        # Matches one known suffix at the very end of a word (case-insensitive);
        # dehyphenate() strips it from the joined word before the lookup.
        # Add common suffixes to the regex below to increase the likelihood of a match -
        # don't add suffixes which are also complete words, such as 'able' or 'sex'
        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
        # remove prefixes if the prefix was not already the point of hyphenation
        # `prefixes` matches only when the whole string is exactly one of the
        # prefixes (anchored at both ends); dehyphenate() tests the first half
        # of the hyphenated word against it.
        self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
        # `removeprefix` matches the same prefixes at the start of a longer
        # word, so that substituting '' removes the prefix from the lookup word.
        self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
    def dehyphenate(self, match):
        firsthalf = match.group('firstpart')
        secondhalf = match.group('secondpart')
        hyphenated = str(firsthalf) + "-" + str(secondhalf)
        dehyphenated = str(firsthalf) + str(secondhalf)
-        # Add common suffixes to the regex below to increase the likelihood of a match -   
+        lookupword = self.removesuffixes.sub('', dehyphenated)
-        # don't add suffixes which are also complete words, such as 'able' or 'sex'
+        if self.prefixes.match(firsthalf) is None:
-        removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+           lookupword = self.removeprefix.sub('', lookupword)
        lookupword = removesuffixes.sub('', dehyphenated)
        # remove prefixes if the prefix was not already the point of hyphenation
        prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
        removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
        if prefixes.match(firsthalf) is None:
           lookupword = removeprefix.sub('', lookupword)
        booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
        match = booklookup.search(self.html)
@ -137,7 +139,7 @@ class Dehyphenator(object):
        else:
            #print "returned hyphenated word: " + str(hyphenated)
            return hyphenated
-            
+
    def __call__(self, html, format, length=1):
        self.html = html
        if format == 'html':
--- a/src/calibre/library/server/content.py
+++ b/src/calibre/library/server/content.py
@ -184,7 +184,7 @@ class ContentServer(object):
        if path and os.path.exists(path):
            updated = fromtimestamp(os.stat(path).st_mtime)
            cherrypy.response.headers['Last-Modified'] = self.last_modified(updated)
-        return fmt.read()
+        return fmt
    # }}}