From 1a5b92d6d915775428c38b6f99768c6def9bf012 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 18 Sep 2010 20:17:30 -0600
Subject: [PATCH 1/7] Popular Science by Tony Stegall
---
resources/images/news/popscience.png | Bin 0 -> 737 bytes
resources/recipes/popscience.recipe | 59 +++++++++++++++++++++++++++
2 files changed, 59 insertions(+)
create mode 100644 resources/images/news/popscience.png
create mode 100644 resources/recipes/popscience.recipe
diff --git a/resources/images/news/popscience.png b/resources/images/news/popscience.png
new file mode 100644
index 0000000000000000000000000000000000000000..ff33483b10421b209f599cdec809544db83988e7
GIT binary patch
literal 737
zcmeAS@N?(olHy`uVBq!ia0vp^f*{Pn1|+R>-G2cowj^(N7l!{JxM1({$v_d#0*}aI
zAngIhZYQ(tK!Rljj_E)ete>H;(O;c`fr-o0#WBRQAzgQ^}I3sDr
zEs}bFIvJBwqaPUcE+q
zV_I6|?mA{M9-)bsPx{?cF*TW?b4hbr!70&y>`8k(&R^ALSkjdC{oLLFhrH{D3|BB5
z++3S2wAxB`X6MG1oAo9tny1q&@~&|Ix*cct+12Xkr1dM_v-vukMc+6dzaS%8aB<~-
z=Ec3i^}QNBjW4-xew9kfkg$1qYW;6M&)2@rRhM{I2d_LeUt4?Qz5d(f$?xZtJl$R_
zp2C&PnR-(tx17;{-JfYe-yEll+Y&;Tzl{0B8`A%!{a>lP4cGLk-+K5BuW$Vs&S3Fg
zT!6XO<1zQSS6My6Z@NtOH*fT?TKb;xfT(T9{rZV+(#L;v%+Ol=!usZ%v&UTa{Iba|
z=6$0cTsM!&|4!7|_{V)Q5%-p#FZ6$B&$E}avc3PbNrOeP)aqX%sp|x9+Zopj21+HEtq-c=jMqw+y!ipc6y(Dxk6agNLl^gnQ!yIcW&&+*;e!DAoGgf-;b?p
zVA#*#rWWaIlH~$Q9;zj-5hW>!C8<`)MX5lF!N|bSK-a)h*U%!w$iT|b)XKzM*TBrm
rz(8f@sU{Q+x%nxXX_Y7%jI0cS$Pl98PswvoB4+S(^>bP0l+XkK_>3?|
literal 0
HcmV?d00001
diff --git a/resources/recipes/popscience.recipe b/resources/recipes/popscience.recipe
new file mode 100644
index 0000000000..a1ea91a6ae
--- /dev/null
+++ b/resources/recipes/popscience.recipe
@@ -0,0 +1,59 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1282101454(BasicNewsRecipe):
+ title = 'Popular Science'
+ language = 'en'
+ __author__ = 'TonytheBookworm'
+ description = 'Popular Science'
+ publisher = 'Popular Science'
+ category = 'gadgets,science'
+ oldest_article = 7 # change this if you want more current articles. I like to go a week in the past.
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ remove_javascript = True
+
+ masthead_url = 'http://www.raytheon.com/newsroom/rtnwcm/groups/Public/documents/masthead/rtn08_popscidec_masthead.jpg'
+
+ remove_tags = [dict(name='div', attrs={'id':['toolbar','main_supplements']}),
+ dict(name='span', attrs={'class':['comments']}),
+ dict(name='div', attrs={'class':['relatedinfo related-right','node_navigation','content2']}),
+ dict(name='ul', attrs={'class':['item-list clear-block']})]
+ feeds = [
+
+ ('Gadgets', 'http://www.popsci.com/full-feed/gadgets'),
+ ('Cars', 'http://www.popsci.com/full-feed/cars'),
+ ('Science', 'http://www.popsci.com/full-feed/science'),
+ ('Technology', 'http://www.popsci.com/full-feed/technology'),
+ ('DIY', 'http://www.popsci.com/full-feed/diy'),
+
+ ]
+
+
+ # The following will get rid of the Gallery: links when found
+
+ def preprocess_html(self, soup):
+ #print 'SOUP IS: ', soup
+ weblinks = soup.findAll(['head','h2'])
+ if weblinks:
+ for link in weblinks:
+ if re.search('(Gallery)(:)', str(link)):
+ link.parent.extract()
+ return soup
+ #-----------------------------------------------------------------
+
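The Gallery filter in the recipe above comes down to a single regex test per heading tag. A minimal sketch of the same check outside the recipe framework (the sample markup is invented for illustration):

    import re

    # Invented example of the kind of heading the recipe strips out
    sample = '<h2><a href="/gallery/42">Gallery: The Week In Photos</a></h2>'

    # The same test preprocess_html applies to every <head>/<h2> tag:
    # 'Gallery' immediately followed by ':' marks a photo-gallery link
    if re.search('(Gallery)(:)', sample):
        print('parent of this tag would be extracted')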
From 3b9e43e79ec323d2d7d4fd0b7b9900cec0d971ff Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 18 Sep 2010 21:00:28 -0600
Subject: [PATCH 2/7] EPUB metadata: Don't read timestamp value from epubs as I
am sick of closing bugs about adding books and having the Date not be today.
Does not affect reading of metadata from OPF, so it should still be possible
to restore date when adding from a previously saved to disk folder (as long
as the OPF was saved)
---
src/calibre/ebooks/metadata/epub.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/calibre/ebooks/metadata/epub.py b/src/calibre/ebooks/metadata/epub.py
index 041a1ee603..df9a394258 100644
--- a/src/calibre/ebooks/metadata/epub.py
+++ b/src/calibre/ebooks/metadata/epub.py
@@ -176,6 +176,7 @@ def get_metadata(stream, extract_cover=True):
except:
import traceback
traceback.print_exc()
+ mi.timestamp = None
return mi
def get_quick_metadata(stream):
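The effect of the one-line change above: get_metadata() for an EPUB now always returns mi.timestamp as None, so the library falls back to the moment of addition for the Date column. A rough sketch of the expected behaviour (the file name is hypothetical):

    from calibre.ebooks.metadata.epub import get_metadata

    with open('example.epub', 'rb') as stream:  # hypothetical file
        mi = get_metadata(stream)
    # Whatever timestamp the OPF inside the EPUB carries is discarded;
    # reading metadata from a bare OPF file is unaffected.
    assert mi.timestamp is None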
From dec27fbaa1e9544675d6d10bda566d83fd7a85f2 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 19 Sep 2010 13:02:02 +0800
Subject: [PATCH 3/7] new dehyphenation algorithm, using the document as a
dictionary
---
src/calibre/ebooks/conversion/preprocess.py | 58 +++++++++++++++++++--
src/calibre/ebooks/conversion/utils.py | 13 +++--
2 files changed, 62 insertions(+), 9 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 03a0047927..a1e28b2554 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -106,6 +106,50 @@ def line_length(format, raw, percent):
return lengths[index]
+class Dehyphenator(object):
+ '''
+ Analyzes words to determine whether hyphens should be retained/removed. Uses the document
+ itself as a dictionary. This method handles all languages along with uncommon, made-up, and
+ scientific words. The primary disadvantage is that words appearing only once in the document
+ retain hyphens.
+ '''
+
+ def dehyphenate(self, match):
+ firsthalf = match.group('firstpart')
+ secondhalf = match.group('secondpart')
+ hyphenated = str(firsthalf) + "-" + str(secondhalf)
+ dehyphenated = str(firsthalf) + str(secondhalf)
+ # Add common suffixes to the regex below to increase the likelihood of a match -
+ # don't add suffixes which are also complete words, such as 'able' or 'sex'
+ removesuffixes = re.compile(r"((ed)?ly|(')?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+ lookupword = removesuffixes.sub('', dehyphenated)
+ # remove prefixes if the prefix was not already the point of hyphenation
+ prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
+ removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
+ if prefixes.match(firsthalf) is None:
+ lookupword = removeprefix.sub('', lookupword)
+ booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
+ #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+ match = booklookup.search(self.html)
+ if match:
+ #print "returned dehyphenated word: " + str(dehyphenated)
+ return dehyphenated
+ else:
+ #print "returned hyphenated word: " + str(hyphenated)
+ return hyphenated
+
+ def __call__(self, html, format, length=1):
+ self.html = html
+ if format == 'html':
+ intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+ elif format == 'pdf':
+ intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(<p>|</[iub]>\s*</p>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+ elif format == 'individual_words':
+ intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^"\s>]+)-(?P<secondpart>\w+)\b[^<]*<')
+ html = intextmatch.sub(self.dehyphenate, html)
+ return html
+ # unwrap em/en dashes
+ end_rules.append((re.compile(u'(?<=[–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens
end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting
end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
@@ -350,7 +393,7 @@ class HTMLPreProcessor(object):
# print "The pdf line length returned is " + str(length)
end_rules.append(
# Un wrap using punctuation
- (re.compile(r'(?<=.{%i}([a-z,:)\-IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+ (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
@@ -380,6 +423,11 @@ class HTMLPreProcessor(object):
for rule in rules + end_rules:
html = rule[0].sub(rule[1], html)
+ if is_pdftohtml:
+ # Dehyphenate
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(html,'pdf', length)
+
#dump(html, 'post-preprocess')
# Handle broken XHTML w/ SVG (ugh)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 37fd169cb1..f9178ead0b 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator
from calibre.utils.logging import default_log
class PreProcessor(object):
@@ -132,7 +132,6 @@ class PreProcessor(object):
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
html = re.sub(r"\s*</p>", "</p>\n", html)
html = re.sub(r"\s*<p>\s*", "\n<p>", html)
- #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
# detect chapters/sections to match xpath or splitting logic
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
@@ -174,10 +173,16 @@ class PreProcessor(object):
length = line_length(format, html, getattr(self.extra_opts,
'html_unwrap_factor', 0.4))
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
+ max_length = length * 1.4
+ min_max = str("(?<=.{"+str(length)+"})(?<!.{"+str(max_length)+"})")
- html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
+ html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html)
+ # Dehyphenate
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(html,'html', length)
# Unwrap lines using punctuation and line length
unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
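For reference, the Dehyphenator added above is driven through its __call__(html, format, length) interface. A minimal sketch with an invented input string: 'docu-ment' is rejoined because stripping the 'ment' suffix leaves 'docu', which the class can find elsewhere in the document text:

    from calibre.ebooks.conversion.preprocess import Dehyphenator

    dehyphenator = Dehyphenator()
    html = '<p>It was a great docu-</p><p>ment about documents.</p>'
    print(dehyphenator(html, 'html', length=1))
    # -> '<p>It was a great document about documents.</p>'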
From 053d60331fcfb9f82e141ebc11a625b1acd3e1a4 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 19 Sep 2010 23:07:07 +0800
Subject: [PATCH 4/7] regex optimizations
---
src/calibre/ebooks/conversion/preprocess.py | 2 +-
src/calibre/ebooks/conversion/utils.py | 10 +++++-----
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 16bfb42d1f..7f13cefcaa 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -121,7 +121,7 @@ class Dehyphenator(object):
dehyphenated = str(firsthalf) + str(secondhalf)
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
- removesuffixes = re.compile(r"((ed)?ly|(')?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+ removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
lookupword = removesuffixes.sub('', dehyphenated)
# remove prefixes if the prefix was not already the point of hyphenation
prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index f9178ead0b..6a5eaa4a34 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -114,7 +114,7 @@ class PreProcessor(object):
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
# Get rid of empty span, bold, & italics tags
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>\s*){0,2}\s*</span>\s*", " ", html)
- html = re.sub(r"\s*<[ibu]>\s*(<[ibu]>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
+ html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>\s*){0,2}\s*</span>\s*", " ", html)
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
@@ -139,16 +139,16 @@ class PreProcessor(object):
#
# Start with most typical chapter headings, get more aggressive until one works
if self.html_preprocess_sections < 10:
- chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
+ chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
html = chapdetect.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
- chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+ chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
- chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+ chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
###### Unwrap lines ######
@@ -191,7 +191,7 @@ class PreProcessor(object):
# If still no sections after unwrapping mark split points on lines with no punctuation
if self.html_preprocess_sections < 10:
self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
- chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+ chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
From 980388f2bde3d4cb4b07673cb9e79c951aabd867 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 19 Sep 2010 09:48:39 -0600
Subject: [PATCH 5/7] Le Journal de Montreal by Luciano Furtado. Fixes #405
(New news feed)
---
resources/recipes/le_journal.recipe | 43 +++++++++++++++++++++++++++++
1 file changed, 43 insertions(+)
create mode 100644 resources/recipes/le_journal.recipe
diff --git a/resources/recipes/le_journal.recipe b/resources/recipes/le_journal.recipe
new file mode 100644
index 0000000000..24a7d52164
--- /dev/null
+++ b/resources/recipes/le_journal.recipe
@@ -0,0 +1,43 @@
+__author__ = 'Luciano Furtado (lrfurtado@yahoo.com.br)'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class LeJournalDeMontrealRecipe(BasicNewsRecipe):
+
+ title = u'Le Journal de Montreal'
+ description = u'Le Journal de Montreal'
+ __author__ = 'Luciano Furtado'
+ language = 'fr'
+
+ oldest_article = 7
+ use_embedded_content=0
+ max_articles_per_feed = 15
+
+ remove_tags = [
+ dict(name='ul',attrs={'id':'mainNav'}),
+ dict(name='div',attrs={'id':'boxPolitique'}),
+ dict(name='div',attrs={'id':'boxScoop'}),
+ dict(name='div',attrs={'id':'DossierSpec'}),
+ dict(name='div',attrs={'id':'channelBoxes'}),
+ dict(name='div',attrs={'id':'sectionBoxes'}),
+ dict(name='div',attrs={'id':'header'}),
+ dict(name='div',attrs={'id':'footer'}),
+ dict(name='div',attrs={'id':'navbarCanoe_container'}),
+ dict(name='div',attrs={'id':'popularCanoe'}),
+ dict(name='div',attrs={'id':'textAds'}),
+ dict(name='div',attrs={'id':'24heures'}),
+ dict(name='div',attrs={'class':'bottomBox clear'}),
+ dict(name='div',attrs={'class':'articleControls thin'}),
+ ]
+
+
+ feeds = [
+ (u'Actualites',
+ u'http://www.canoe.com/rss/feed/nouvelles/ljm_actualites.xml'),
+ (u'Arts et spectacle',
+ u'http://www.canoe.com/rss/feed/nouvelles/ljm_arts.xml'),
+ (u'Sports',
+ u'http://www.canoe.com/rss/feed/nouvelles/ljm_sports.xml'),
+ (u'Chroniques',
+ u'http://www.canoe.com/rss/feed/nouvelles/ljm_chroniques.xml'),
+ ]
From 23cd4fd7833180d7036aa77c0c1efcbd09ca6a00 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 19 Sep 2010 10:16:41 -0600
Subject: [PATCH 6/7] Content server: Making serving of large files more
efficient.
---
src/calibre/library/server/content.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py
index 95794a8c1d..aeba8a3218 100644
--- a/src/calibre/library/server/content.py
+++ b/src/calibre/library/server/content.py
@@ -184,7 +184,7 @@ class ContentServer(object):
if path and os.path.exists(path):
updated = fromtimestamp(os.stat(path).st_mtime)
cherrypy.response.headers['Last-Modified'] = self.last_modified(updated)
- return fmt.read()
+ return fmt
# }}}
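Returning the open file object instead of fmt.read() lets CherryPy stream the response in chunks instead of buffering an entire (possibly very large) book in memory before sending it. A generic sketch of the idea, not calibre code:

    def stream_file(fileobj, chunk_size=64 * 1024):
        # Yield fixed-size chunks so memory use stays constant no matter
        # how big the file is; .read() with no argument loads it all at once.
        while True:
            chunk = fileobj.read(chunk_size)
            if not chunk:
                break
            yield chunk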
From ef3fd4df536811ca7b91be06ab10595ae1dc6a4c Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 19 Sep 2010 10:39:45 -0600
Subject: [PATCH 7/7] ...
---
resources/content_server/gui.js | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/resources/content_server/gui.js b/resources/content_server/gui.js
index afc21137e1..bd0743a854 100644
--- a/resources/content_server/gui.js
+++ b/resources/content_server/gui.js
@@ -84,7 +84,10 @@ function render_book(book) {
}
title += ''
title += ''
- if (tags) title += 'Tags=[{0}] '.format(tags);
+ if (tags) {
+ t = tags.split(':&:', 2);
+ title += 'Tags=[{0}] '.format(t[1]);
+ }
custcols = book.attr("custcols").split(',')
for ( i = 0; i < custcols.length; i++) {
if (custcols[i].length > 0) {
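The gui.js change above assumes the server now sends tags as a label and a value joined by ':&:' and displays only the value. A Python sketch of that assumed format (the example string is invented; JavaScript's split limit counts elements while Python's maxsplit counts splits, so 2 there corresponds to 1 here):

    tags = 'Tags:&:Fiction, History'
    t = tags.split(':&:', 1)
    print(t[1])  # -> 'Fiction, History'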