From 41b4a5dd96c5f129a915445e9a715e2d1988fd3f Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 24 Jun 2010 08:30:18 -0600
Subject: [PATCH 1/4] Metadata download: Filter out non-book results. Also
 sort results by availability of covers for the isbn. Fixes #5946 (fix file
 plugin postprocessing and update metadata download sorting)

---
 src/calibre/ebooks/metadata/fetch.py | 102 +++++++++++++++++++++++----
 1 file changed, 90 insertions(+), 12 deletions(-)

diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py
index d12c668e0d..db6ad0278d 100644
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@@ -3,17 +3,18 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal '
 __docformat__ = 'restructuredtext en'
 
-import traceback, sys, textwrap, re
+import traceback, sys, textwrap, re, urllib2
 from threading import Thread
 
-from calibre import prints
+from calibre import prints, browser
 from calibre.utils.config import OptionParser
 from calibre.utils.logging import default_log
 from calibre.customize import Plugin
+from calibre.ebooks.metadata.library_thing import OPENLIBRARY
 
 metadata_config = None
 
-class MetadataSource(Plugin):
+class MetadataSource(Plugin): # {{{
 
     author = 'Kovid Goyal'
 
@@ -130,7 +131,9 @@ class MetadataSource(Plugin):
     def customization_help(self):
         return 'This plugin can only be customized using the GUI'
 
-class GoogleBooks(MetadataSource):
+    # }}}
+
+class GoogleBooks(MetadataSource): # {{{
 
     name = 'Google Books'
     description = _('Downloads metadata from Google Books')
@@ -145,8 +148,9 @@ class GoogleBooks(MetadataSource):
             self.exception = e
             self.tb = traceback.format_exc()
 
+    # }}}
 
-class ISBNDB(MetadataSource):
+class ISBNDB(MetadataSource): # {{{
 
     name = 'IsbnDB'
     description = _('Downloads metadata from isbndb.com')
@@ -181,7 +185,9 @@ class ISBNDB(MetadataSource):
             'and enter your access key below.')
         return '<p>'+ans%('<a href="http://www.isbndb.com">', '</a>')
 
-class Amazon(MetadataSource):
+    # }}}
+
+class Amazon(MetadataSource): # {{{
 
     name = 'Amazon'
     metadata_type = 'social'
@@ -198,7 +204,9 @@ class Amazon(MetadataSource):
             self.exception = e
             self.tb = traceback.format_exc()
 
-class LibraryThing(MetadataSource):
+    # }}}
+
+class LibraryThing(MetadataSource): # {{{
 
     name = 'LibraryThing'
     metadata_type = 'social'
@@ -207,7 +215,6 @@ class LibraryThing(MetadataSource):
     def fetch(self):
         if not self.isbn:
             return
-        from calibre import browser
         from calibre.ebooks.metadata import MetaInformation
         import json
         br = browser()
@@ -228,6 +235,7 @@ class LibraryThing(MetadataSource):
         except Exception, e:
             self.exception = e
             self.tb = traceback.format_exc()
+    # }}}
 
 
 def result_index(source, result):
@@ -268,6 +276,27 @@ class MetadataSources(object):
         for s in self.sources:
             s.join()
 
+def filter_metadata_results(item):
+    keywords = ["audio", "tape", "cassette", "abridged", "playaway"]
+    for keyword in keywords:
+        if item.publisher and keyword in item.publisher.lower():
+            return False
+    return True
+
+class HeadRequest(urllib2.Request):
+    def get_method(self):
+        return "HEAD"
+
+def check_for_covers(items):
+    opener = browser()
+    for item in items:
+        item.has_cover = False
+        try:
+            opener.open(HeadRequest(OPENLIBRARY%item.isbn), timeout=5)
+            item.has_cover = True
+        except:
+            pass # Cover not found
+
 def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
         verbose=0):
     assert not(title is None and author is None and publisher is None and \
@@ -285,10 +314,59 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
     for fetcher in fetchers[1:]:
         merge_results(results, fetcher.results)
 
-    results = sorted(results, cmp=lambda x, y : cmp(
-        (x.comments.strip() if x.comments else ''),
-        (y.comments.strip() if y.comments else '')
-        ), reverse=True)
+    results = list(filter(filter_metadata_results, results))
+
+    check_for_covers(results)
+
+    words = ("the", "a", "an", "of", "and")
+    prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
+    trailing_paren_pat = re.compile(r'\(.*\)$')
+    whitespace_pat = re.compile(r'\s+')
+
+    def sort_func(x, y):
+        def cleanup_title(s):
+            s = s.strip().lower()
+            s = prefix_pat.sub(' ', s)
+            s = trailing_paren_pat.sub('', s)
+            s = whitespace_pat.sub(' ', s)
+            return s.strip()
+
+        t = cleanup_title(title)
+        x_title = cleanup_title(x.title)
+        y_title = cleanup_title(y.title)
+
+        # prefer titles that start with the search title
+        tx = cmp(t, x_title)
+        ty = cmp(t, y_title)
+        result = 0 if abs(tx) == abs(ty) else abs(tx) - abs(ty)
+
+        # then prefer titles that have a cover image
+        if result == 0:
+            result = -cmp(x.has_cover, y.has_cover)
+
+        # then prefer titles with the longest comment, within 10%
+        if result == 0:
+            cx = len(x.comments.strip() if x.comments else '')
+            cy = len(y.comments.strip() if y.comments else '')
+            t = (cx + cy) / 20
+            result = cy - cx
+            if abs(result) < t:
+                result = 0
+
+        return result
+
+    results = sorted(results, cmp=sort_func)
+
+    # if for some reason there is no comment in the top selection, go looking for one
+    if len(results) > 1:
+        if not results[0].comments or len(results[0].comments) == 0:
+            for r in results[1:]:
+                if title.lower() == r.title[:len(title)].lower() and r.comments and len(r.comments):
+                    results[0].comments = r.comments
+                    break
+
+    # for r in results:
+    #     print "{0:14.14} {1:30.30} {2:20.20} {3:6} {4}".format(r.isbn, r.title, r.publisher, len(r.comments if r.comments else ''), r.has_cover)
 
     return results, [(x.name, x.exception, x.tb) for x in
             fetchers]
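The sort introduced by this patch composes three comparisons: an exact match against the cleaned search title, then cover availability on openlibrary.org, then comment length with a ten-percent dead band. Below is a minimal standalone sketch of the filter-then-sort pipeline, written in the codebase's Python 2 idiom (cmp-style comparators); the Result class and the sample records are hypothetical stand-ins for the MetaInformation objects the fetchers actually return, while filter_metadata_results, cleanup_title and the comparator mirror the hunks above.

    import re

    class Result(object):
        # Hypothetical stand-in for the fetchers' MetaInformation objects
        def __init__(self, title, publisher, has_cover, comments):
            self.title, self.publisher = title, publisher
            self.has_cover, self.comments = has_cover, comments

    def filter_metadata_results(item):
        # Drop results whose publisher marks them as audio editions
        for keyword in ("audio", "tape", "cassette", "abridged", "playaway"):
            if item.publisher and keyword in item.publisher.lower():
                return False
        return True

    words = ("the", "a", "an", "of", "and")
    prefix_pat = re.compile(r'^(%s)\s+' % ("|".join(words)))
    trailing_paren_pat = re.compile(r'\(.*\)$')
    whitespace_pat = re.compile(r'\s+')

    def cleanup_title(s):
        # Lowercase, drop a leading article and a trailing "(...)", squeeze spaces
        s = prefix_pat.sub(' ', s.strip().lower())
        s = trailing_paren_pat.sub('', s)
        return whitespace_pat.sub(' ', s).strip()

    def make_sort_func(title):
        t = cleanup_title(title)
        def sort_func(x, y):
            # 1) an exact cleaned-title match beats any non-match
            #    (abs(cmp(...)) is 0 or 1 under CPython 2)
            tx, ty = cmp(t, cleanup_title(x.title)), cmp(t, cleanup_title(y.title))
            result = 0 if abs(tx) == abs(ty) else abs(tx) - abs(ty)
            # 2) break ties with cover availability
            if result == 0:
                result = -cmp(x.has_cover, y.has_cover)
            # 3) break remaining ties with comment length, within 10%
            if result == 0:
                cx = len(x.comments.strip() if x.comments else '')
                cy = len(y.comments.strip() if y.comments else '')
                result = cy - cx
                if abs(result) < (cx + cy) / 20:
                    result = 0
            return result
        return sort_func

    results = [
        Result('Dune (Audio Cassette)', 'Books on Tape', False, ''),
        Result('The Dune Encyclopedia', 'Berkley', False, 'Companion volume.'),
        Result('Dune', 'Ace Books', True, 'The classic desert-planet epic.'),
    ]
    results = list(filter(filter_metadata_results, results))
    results.sort(cmp=make_sort_func('Dune'))
    print [r.title for r in results]

Run under Python 2 this prints ['Dune', 'The Dune Encyclopedia']: the audio cassette edition is filtered out, and the exact title match with a cover sorts first.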

From bb5ab06f3b9e7791b9793c64c6e486b950e3b441 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 24 Jun 2010 11:56:54 -0600
Subject: [PATCH 2/4] Fix #5951 (unable to retrieve news item)

---
 resources/recipes/national_post.recipe | 39 ++++++++------------------
 1 file changed, 11 insertions(+), 28 deletions(-)

diff --git a/resources/recipes/national_post.recipe b/resources/recipes/national_post.recipe
index 4fe188934c..00eb918d02 100644
--- a/resources/recipes/national_post.recipe
+++ b/resources/recipes/national_post.recipe
@@ -7,18 +7,18 @@ class NYTimes(BasicNewsRecipe):
     __author__ = 'Krittika Goyal'
     description = 'Canadian national newspaper'
     timefmt = ' [%d %b, %Y]'
-    needs_subscription = False
     language = 'en_CA'
+    needs_subscription = False
 
     no_stylesheets = True
     #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
-    #remove_tags_after = dict(name='td', attrs={'class':'newptool1'})
+    remove_tags_after = dict(name='div', attrs={'class':'npStoryTools npWidth1-6 npRight npTxtStrong'})
    remove_tags = [
       dict(name='iframe'),
-      dict(name='div', attrs={'class':'story-tools'}),
+      dict(name='div', attrs={'class':['story-tools', 'npStoryTools npWidth1-6 npRight npTxtStrong']}),
       #dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}),
       #dict(name='form', attrs={'onsubmit':''}),
-      #dict(name='table', attrs={'cellspacing':'0'}),
+      dict(name='ul', attrs={'class':'npTxtAlt npGroup npTxtCentre npStoryShare npTxtStrong npTxtDim'}),
    ]
 
 #    def preprocess_html(self, soup):
@@ -37,7 +37,7 @@ class NYTimes(BasicNewsRecipe):
 
     def parse_index(self):
         soup = self.nejm_get_index()
-        div = soup.find(id='LegoText4')
+        div = soup.find(id='npContentMain')
 
         current_section = None
         current_articles = []
@@ -50,7 +50,7 @@ class NYTimes(BasicNewsRecipe):
                 current_section = self.tag_to_string(x)
                 current_articles = []
                 self.log('\tFound section:', current_section)
-            if current_section is not None and x.name == 'h3':
+            if current_section is not None and x.name == 'h5':
                 # Article found
                 title = self.tag_to_string(x)
                 a = x.find('a', href=lambda x: x and 'story' in x)
@@ -59,8 +59,8 @@ class NYTimes(BasicNewsRecipe):
                 url = a.get('href', False)
                 if not url or not title:
                     continue
-                if url.startswith('story'):
-                    url = 'http://www.nationalpost.com/todays-paper/'+url
+                #if url.startswith('story'):
+                url = 'http://www.nationalpost.com/todays-paper/'+url
                 self.log('\t\tFound article:', title)
                 self.log('\t\t\t', url)
                 current_articles.append({'title': title, 'url':url,
@@ -70,28 +70,11 @@ class NYTimes(BasicNewsRecipe):
             feeds.append((current_section, current_articles))
         return feeds
 
-
     def preprocess_html(self, soup):
-        story = soup.find(name='div', attrs={'class':'triline'})
-        page2_link = soup.find('p','pagenav')
-        if page2_link:
-            atag = page2_link.find('a',href=True)
-            if atag:
-                page2_url = atag['href']
-                if page2_url.startswith('story'):
-                    page2_url = 'http://www.nationalpost.com/todays-paper/'+page2_url
-                elif page2_url.startswith( '/todays-paper/story.html'):
-                    page2_url = 'http://www.nationalpost.com/'+page2_url
-                page2_soup = self.index_to_soup(page2_url)
-                if page2_soup:
-                    page2_content = page2_soup.find('div','story-content')
-                    if page2_content:
-                        full_story = BeautifulSoup('<div></div>')
-                        full_story.insert(0,story)
-                        full_story.insert(1,page2_content)
-                        story = full_story
+        story = soup.find(name='div', attrs={'id':'npContentMain'})
+        ##td = heading.findParent(name='td')
+        ##td.extract()
         soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
         body = soup.find(name='body')
         body.insert(0, story)
         return soup
-
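The parse_index changes above track a nationalpost.com redesign: the index now sits under div#npContentMain and headlines moved from h3 to h5 tags. Below is a minimal Python 2 sketch of the section/article walk the method performs, run against invented markup; treating h1 as the section marker is an assumption (the real section test lives in context lines outside the hunk), and calibre's tag_to_string is approximated with findAll(text=True). The import is calibre's bundled BeautifulSoup 3; outside calibre, `from BeautifulSoup import BeautifulSoup` from the BeautifulSoup 3 package behaves the same.

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    # Invented markup shaped like the redesigned front page
    html = '''
    <div id="npContentMain">
      <h1>News</h1>
      <h5><a href="story.html?id=1">First article</a></h5>
      <h5><a href="story.html?id=2">Second article</a></h5>
      <h1>Arts</h1>
      <h5><a href="story.html?id=3">Third article</a></h5>
    </div>
    '''

    soup = BeautifulSoup(html)
    div = soup.find(id='npContentMain')

    feeds, current_section, current_articles = [], None, []
    for x in div.findAll(['h1', 'h5']):
        text = ''.join(x.findAll(text=True)).strip()
        if x.name == 'h1':                 # assumed section marker
            if current_section and current_articles:
                feeds.append((current_section, current_articles))
            current_section, current_articles = text, []
        elif current_section is not None:  # an h5 under a section is an article
            a = x.find('a', href=lambda h: h and 'story' in h)
            if a is None:
                continue
            # urls are now always made absolute, as in the hunk above
            current_articles.append({
                'title': text,
                'url': 'http://www.nationalpost.com/todays-paper/' + a['href'],
                'date': '', 'description': ''})
    if current_section and current_articles:
        feeds.append((current_section, current_articles))
    print feeds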
From 985e65d3864fa6a8575c9bb0f76ea8089eab72fc Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 24 Jun 2010 12:06:34 -0600
Subject: [PATCH 3/4] Metadata download: Make cover check multithreaded

---
 src/calibre/ebooks/metadata/fetch.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py
index e7883d3757..0fd671f86a 100644
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@@ -287,15 +287,19 @@ class HeadRequest(urllib2.Request):
     def get_method(self):
         return "HEAD"
 
-def check_for_covers(items):
+def do_cover_check(item):
     opener = browser()
-    for item in items:
-        item.has_cover = False
-        try:
-            opener.open(HeadRequest(OPENLIBRARY%item.isbn), timeout=5)
-            item.has_cover = True
-        except:
-            pass # Cover not found
+    item.has_cover = False
+    try:
+        opener.open(HeadRequest(OPENLIBRARY%item.isbn), timeout=5)
+        item.has_cover = True
+    except:
+        pass # Cover not found
+
+def check_for_covers(items):
+    threads = [Thread(target=do_cover_check, args=(item,)) for item in items]
+    for t in threads: t.start()
+    for t in threads: t.join()
 
 def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
         verbose=0):
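This turns the serial cover probes into one short-lived thread per result, so the total wait shrinks from the sum of the HEAD requests to roughly the slowest one, each bounded by its 5 second timeout. Below is a minimal offline sketch of the fan-out/join pattern in Python 2; Item is a hypothetical stand-in for a metadata result and the OpenLibrary probe is stubbed out, so only the threading mirrors the patch.

    from threading import Thread

    class Item(object):
        # Hypothetical stand-in for a metadata result
        def __init__(self, isbn):
            self.isbn = isbn
            self.has_cover = False

    def do_cover_check(item):
        # The real code HEADs OPENLIBRARY % item.isbn with timeout=5;
        # stubbed here so the sketch runs without network access.
        item.has_cover = item.isbn.endswith('2')

    items = [Item('9780441013593'), Item('9780441013602')]
    threads = [Thread(target=do_cover_check, args=(item,)) for item in items]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # wait for every probe; the slowest one bounds the total
    print [(i.isbn, i.has_cover) for i in items]

A thread per item is reasonable here because a search yields only a handful of results; a bounded pool would be the safer choice for larger batches.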

From e7eb5b69657de4d051bd1900a27f16f501afb5b3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 24 Jun 2010 12:07:33 -0600
Subject: [PATCH 4/4] Fix #5937 ("New Scientist" recipe problems)

---
 resources/recipes/new_scientist.recipe | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/resources/recipes/new_scientist.recipe b/resources/recipes/new_scientist.recipe
index 1727a926ed..b40be458bc 100644
--- a/resources/recipes/new_scientist.recipe
+++ b/resources/recipes/new_scientist.recipe
@@ -32,15 +32,16 @@ class NewScientist(BasicNewsRecipe):
                         }
 
     preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL|re.IGNORECASE),lambda match: '')]
-    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','nsblgposts','hldgalcols']})]
+    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})]
     remove_tags = [
                      dict(name='div' , attrs={'class':['hldBd','adline','pnl','infotext' ]})
-                    ,dict(name='div' , attrs={'id'   :['compnl','artIssueInfo','artTools']})
+                    ,dict(name='div' , attrs={'id'   :['compnl','artIssueInfo','artTools','comments','blgsocial']})
                     ,dict(name='p'   , attrs={'class':['marker','infotext'               ]})
                     ,dict(name='meta' , attrs={'name' :'description'                     })
+                    ,dict(name='a'   , attrs={'rel'  :'tag'                              })
                   ]
-    remove_tags_after = dict(attrs={'class':'nbpcopy'})
+    remove_tags_after = dict(attrs={'class':['nbpcopy','comments']})
     remove_attributes = ['height','width']
 
     feeds = [