diff --git a/resources/images/news/popscience.png b/resources/images/news/popscience.png
new file mode 100644
index 0000000000..ff33483b10
Binary files /dev/null and b/resources/images/news/popscience.png differ
diff --git a/resources/recipes/le_journal.recipe b/resources/recipes/le_journal.recipe
new file mode 100644
index 0000000000..24a7d52164
--- /dev/null
+++ b/resources/recipes/le_journal.recipe
@@ -0,0 +1,43 @@
+__author__ = 'Luciano Furtado (lrfurtado@yahoo.com.br)'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class LeJournalDeMontrealRecipe(BasicNewsRecipe):
+
+ title = u'Le Journal de Montreal'
+ description = u'Le Journal de Montreal'
+ __author__ = 'Luciano Furtado'
+ language = 'fr'
+
+ oldest_article = 7
+    use_embedded_content = False
+ max_articles_per_feed = 15
+
+ remove_tags = [
+ dict(name='ul',attrs={'id':'mainNav'}),
+ dict(name='div',attrs={'id':'boxPolitique'}),
+ dict(name='div',attrs={'id':'boxScoop'}),
+ dict(name='div',attrs={'id':'DossierSpec'}),
+ dict(name='div',attrs={'id':'channelBoxes'}),
+ dict(name='div',attrs={'id':'sectionBoxes'}),
+ dict(name='div',attrs={'id':'header'}),
+ dict(name='div',attrs={'id':'footer'}),
+ dict(name='div',attrs={'id':'navbarCanoe_container'}),
+ dict(name='div',attrs={'id':'popularCanoe'}),
+ dict(name='div',attrs={'id':'textAds'}),
+ dict(name='div',attrs={'id':'24heures'}),
+ dict(name='div',attrs={'class':'bottomBox clear'}),
+ dict(name='div',attrs={'class':'articleControls thin'}),
+ ]
+
+
+ feeds = [
+ (u'Actualites',
+ u'http://www.canoe.com/rss/feed/nouvelles/ljm_actualites.xml'),
+ (u'Arts et spectacle',
+ u'http://www.canoe.com/rss/feed/nouvelles/ljm_arts.xml'),
+ (u'Sports',
+ u'http://www.canoe.com/rss/feed/nouvelles/ljm_sports.xml'),
+ (u'Chroniques',
+ u'http://www.canoe.com/rss/feed/nouvelles/ljm_chroniques.xml'),
+ ]
diff --git a/resources/recipes/popscience.recipe b/resources/recipes/popscience.recipe
new file mode 100644
index 0000000000..a1ea91a6ae
--- /dev/null
+++ b/resources/recipes/popscience.recipe
@@ -0,0 +1,59 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1282101454(BasicNewsRecipe):
+ title = 'Popular Science'
+ language = 'en'
+ __author__ = 'TonytheBookworm'
+ description = 'Popular Science'
+ publisher = 'Popular Science'
+ category = 'gadgets,science'
+    oldest_article = 7 # change this if you want only more recent articles; a week back is a reasonable default
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ remove_javascript = True
+
+ masthead_url = 'http://www.raytheon.com/newsroom/rtnwcm/groups/Public/documents/masthead/rtn08_popscidec_masthead.jpg'
+
+ remove_tags = [dict(name='div', attrs={'id':['toolbar','main_supplements']}),
+ dict(name='span', attrs={'class':['comments']}),
+ dict(name='div', attrs={'class':['relatedinfo related-right','node_navigation','content2']}),
+ dict(name='ul', attrs={'class':['item-list clear-block']})]
+    feeds = [
+              ('Gadgets', 'http://www.popsci.com/full-feed/gadgets'),
+              ('Cars', 'http://www.popsci.com/full-feed/cars'),
+              ('Science', 'http://www.popsci.com/full-feed/science'),
+              ('Technology', 'http://www.popsci.com/full-feed/technology'),
+              ('DIY', 'http://www.popsci.com/full-feed/diy'),
+            ]
+
+
+    # The following gets rid of "Gallery:" links when found
+
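+    # e.g. a heading like "<h2>Gallery: Concept Cars</h2>" (hypothetical title,
+    # for illustration) is removed together with its parent container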
+    def preprocess_html(self, soup):
+        # findAll returns a (possibly empty) list, so it can be iterated directly
+        weblinks = soup.findAll(['head', 'h2'])
+        for link in weblinks:
+            if re.search('Gallery:', str(link)):
+                # drop the enclosing block of the "Gallery:" heading
+                link.parent.extract()
+        return soup
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index e87a8021f9..3b1239814a 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -106,6 +106,52 @@ def line_length(format, raw, percent):
return lengths[index]
+class Dehyphenator(object):
+ '''
+    Analyzes words to determine whether hyphens should be retained or removed. Uses the document
+    itself as a dictionary. This method handles all languages, along with uncommon, made-up, and
+ scientific words. The primary disadvantage is that words appearing only once in the document
+ retain hyphens.
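+
+    For example, if "encyclo-" and "pedia" are split across a line break, the hyphen
+    is removed whenever "encyclopedia" also occurs elsewhere in the text, while a word
+    that only ever appears in its hyphenated form keeps the hyphen.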
+ '''
+
+ def __init__(self):
+ # Add common suffixes to the regex below to increase the likelihood of a match -
+ # don't add suffixes which are also complete words, such as 'able' or 'sex'
+ self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+ # remove prefixes if the prefix was not already the point of hyphenation
+ self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
+ self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
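+        # Illustrative walk-through (hypothetical word): for "informa-tion", the joined
+        # candidate "information" loses the suffix "ation", giving "inform"; since
+        # "informa" is not itself a bare prefix, "in" is stripped as well and the
+        # document is then searched for "form"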
+
+ def dehyphenate(self, match):
+ firsthalf = match.group('firstpart')
+ secondhalf = match.group('secondpart')
+ hyphenated = str(firsthalf) + "-" + str(secondhalf)
+ dehyphenated = str(firsthalf) + str(secondhalf)
+ lookupword = self.removesuffixes.sub('', dehyphenated)
+ if self.prefixes.match(firsthalf) is None:
+ lookupword = self.removeprefix.sub('', lookupword)
+        # escape the candidate in case the word fragment contains regex metacharacters
+        booklookup = re.compile(u'%s' % re.escape(lookupword), re.IGNORECASE)
+ #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+ match = booklookup.search(self.html)
+ if match:
+ #print "returned dehyphenated word: " + str(dehyphenated)
+ return dehyphenated
+ else:
+ #print "returned hyphenated word: " + str(hyphenated)
+ return hyphenated
+
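+    # Usage sketch: dehyphenator = Dehyphenator(); html = dehyphenator(html, format, length),
+    # where format selects the 'html' or 'pdf' pattern and length is the minimum number of
+    # characters that must precede the hyphen for it to be treated as a wrap artifact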
+ def __call__(self, html, format, length=1):
+ self.html = html
+ if format == 'html':
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)-\s*(?=<)(?P<wraptags></[iub]>\s*<[iub][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+        elif format == 'pdf':
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)' % length)
+        html = intextmatch.sub(self.dehyphenate, html)
+        return html
@@ ... @@ class HTMLPreProcessor(object):
                  # unwrap/delete soft hyphens
                  end_rules.append((re.compile(u'[\u00ad](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
                  # unwrap/delete soft hyphens with formatting
                  end_rules.append((re.compile(u'[\u00ad]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
@@ -350,7 +395,7 @@ class HTMLPreProcessor(object):
# print "The pdf line length returned is " + str(length)
end_rules.append(
# Un wrap using punctuation
-                  (re.compile(r'(?<=.{%i}([a-z,:)\-IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                  (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ ... @@ class PreProcessor(object):
         html = re.sub(r"\s*</p>", "</p>\n", html)
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
         # detect chapters/sections to match xpath or splitting logic
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)