diff --git a/resources/recipes/le_journal.recipe b/resources/recipes/le_journal.recipe new file mode 100644 index 0000000000..24a7d52164 --- /dev/null +++ b/resources/recipes/le_journal.recipe @@ -0,0 +1,43 @@ +__author__ = ' (lrfurtado@yahoo.com.br)' + +from calibre.web.feeds.news import BasicNewsRecipe + +class LeJournalDeMontrealRecipe(BasicNewsRecipe): + + title = u'Le Journal de Montreal' + description = u'Le Journal de Montreal' + __author__ = 'Luciano Furtado' + language = 'fr' + + oldest_article = 7 + use_embedded_content=0 + max_articles_per_feed = 15 + + remove_tags = [ + dict(name='ul',attrs={'id':'mainNav'}), + dict(name='div',attrs={'id':'boxPolitique'}), + dict(name='div',attrs={'id':'boxScoop'}), + dict(name='div',attrs={'id':'DossierSpec'}), + dict(name='div',attrs={'id':'channelBoxes'}), + dict(name='div',attrs={'id':'sectionBoxes'}), + dict(name='div',attrs={'id':'header'}), + dict(name='div',attrs={'id':'footer'}), + dict(name='div',attrs={'id':'navbarCanoe_container'}), + dict(name='div',attrs={'id':'popularCanoe'}), + dict(name='div',attrs={'id':'textAds'}), + dict(name='div',attrs={'id':'24heures'}), + dict(name='div',attrs={'class':'bottomBox clear'}), + dict(name='div',attrs={'class':'articleControls thin'}), + ] + + + feeds = [ + (u'Actualites', + u'http://www.canoe.com/rss/feed/nouvelles/ljm_actualites.xml'), + (u'Arts et spectacle', + u'http://www.canoe.com/rss/feed/nouvelles/ljm_arts.xml'), + (u'Sports', + u'http://www.canoe.com/rss/feed/nouvelles/ljm_sports.xml'), + (u'Chroniques', + u'http://www.canoe.com/rss/feed/nouvelles/ljm_chroniques.xml'), + ] diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 7f13cefcaa..3b1239814a 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -108,26 +108,28 @@ def line_length(format, raw, percent): class Dehyphenator(object): ''' - Analyzes words to determine whether hyphens should be 
retained/removed. Uses the document - itself is as a dictionary. This method handles all languages along with uncommon, made-up, and - scientific words. The primary disadvantage is that words appearing only once in the document + Analyzes words to determine whether hyphens should be retained/removed. Uses the document + itself as a dictionary. This method handles all languages along with uncommon, made-up, and + scientific words. The primary disadvantage is that words appearing only once in the document retain hyphens. ''' + def __init__(self): + # Add common suffixes to the regex below to increase the likelihood of a match - + # don't add suffixes which are also complete words, such as 'able' or 'sex' + self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE) + # remove prefixes if the prefix was not already the point of hyphenation + self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE) + self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE) + def dehyphenate(self, match): firsthalf = match.group('firstpart') secondhalf = match.group('secondpart') hyphenated = str(firsthalf) + "-" + str(secondhalf) dehyphenated = str(firsthalf) + str(secondhalf) - # Add common suffixes to the regex below to increase the likelihood of a match - - # don't add suffixes which are also complete words, such as 'able' or 'sex' - removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE) - lookupword = removesuffixes.sub('', dehyphenated) - # remove prefixes if the prefix was not already the point of hyphenation - prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE) - removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE) - if prefixes.match(firsthalf) is 
None: - lookupword = removeprefix.sub('', lookupword) + lookupword = self.removesuffixes.sub('', dehyphenated) + if self.prefixes.match(firsthalf) is None: + lookupword = self.removeprefix.sub('', lookupword) booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE) #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) match = booklookup.search(self.html) @@ -137,7 +139,7 @@ class Dehyphenator(object): else: #print "returned hyphenated word: " + str(hyphenated) return hyphenated - + def __call__(self, html, format, length=1): self.html = html if format == 'html': diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py index 95794a8c1d..aeba8a3218 100644 --- a/src/calibre/library/server/content.py +++ b/src/calibre/library/server/content.py @@ -184,7 +184,7 @@ class ContentServer(object): if path and os.path.exists(path): updated = fromtimestamp(os.stat(path).st_mtime) cherrypy.response.headers['Last-Modified'] = self.last_modified(updated) - return fmt.read() + return fmt # }}}