Merge from trunk

2025-07-09 03:04:10 -04:00 · 2010-06-24 19:10:24 +01:00 · 2010-06-24 19:10:24 +01:00 · 90be73fe5b
commit 90be73fe5b
parent 058ddc1274 e7eb5b6965
5 changed files with 117 additions and 46 deletions
--- a/resources/recipes/national_post.recipe
+++ b/resources/recipes/national_post.recipe
@ -7,18 +7,18 @@ class NYTimes(BasicNewsRecipe):
    __author__  = 'Krittika Goyal'
    description = 'Canadian national newspaper'
    timefmt = ' [%d %b, %Y]'
    needs_subscription = False
    language = 'en_CA'
    needs_subscription = False
    no_stylesheets = True
    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
-    #remove_tags_after  = dict(name='td', attrs={'class':'newptool1'})
+    remove_tags_after  = dict(name='div', attrs={'class':'npStoryTools npWidth1-6 npRight npTxtStrong'})
    remove_tags = [
       dict(name='iframe'),
-       dict(name='div', attrs={'class':'story-tools'}),
+       dict(name='div', attrs={'class':['story-tools', 'npStoryTools npWidth1-6 npRight npTxtStrong']}),
       #dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}),
       #dict(name='form', attrs={'onsubmit':''}),
-       #dict(name='table', attrs={'cellspacing':'0'}),
+       dict(name='ul', attrs={'class':'npTxtAlt npGroup npTxtCentre npStoryShare npTxtStrong npTxtDim'}),
    ]
   # def preprocess_html(self, soup):
@ -37,7 +37,7 @@ class NYTimes(BasicNewsRecipe):
    def parse_index(self):
            soup = self.nejm_get_index()
-            div = soup.find(id='LegoText4')
+            div = soup.find(id='npContentMain')
            current_section = None
            current_articles = []
@ -50,7 +50,7 @@ class NYTimes(BasicNewsRecipe):
                    current_section = self.tag_to_string(x)
                    current_articles = []
                    self.log('\tFound section:', current_section)
-                if current_section is not None and x.name == 'h3':
+                if current_section is not None and x.name == 'h5':
                    # Article found
                    title = self.tag_to_string(x)
                    a = x.find('a', href=lambda x: x and 'story' in x)
@ -59,8 +59,8 @@ class NYTimes(BasicNewsRecipe):
                    url = a.get('href', False)
                    if not url or not title:
                        continue
-                    if url.startswith('story'):
+                    #if url.startswith('story'):
-                         url = 'http://www.nationalpost.com/todays-paper/'+url
+                    url = 'http://www.nationalpost.com/todays-paper/'+url
                    self.log('\t\tFound article:', title)
                    self.log('\t\t\t', url)
                    current_articles.append({'title': title, 'url':url,
@ -70,28 +70,11 @@ class NYTimes(BasicNewsRecipe):
                feeds.append((current_section, current_articles))
            return feeds
    def preprocess_html(self, soup):
-        story = soup.find(name='div', attrs={'class':'triline'})
+        story = soup.find(name='div', attrs={'id':'npContentMain'})
-        page2_link = soup.find('p','pagenav')
+        ##td = heading.findParent(name='td')
-        if page2_link:
+        ##td.extract()
            atag = page2_link.find('a',href=True)
            if atag:
                page2_url = atag['href']
                if page2_url.startswith('story'):
                         page2_url = 'http://www.nationalpost.com/todays-paper/'+page2_url
                elif page2_url.startswith( '/todays-paper/story.html'):
                    page2_url = 'http://www.nationalpost.com/'+page2_url
                page2_soup = self.index_to_soup(page2_url)
                if page2_soup:
                    page2_content = page2_soup.find('div','story-content')
                    if page2_content:
                        full_story = BeautifulSoup('<div></div>')
                        full_story.insert(0,story)
                        full_story.insert(1,page2_content)
                        story = full_story
        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        body = soup.find(name='body')
        body.insert(0, story)
        return soup
--- a/resources/recipes/new_scientist.recipe
+++ b/resources/recipes/new_scientist.recipe
@ -32,15 +32,16 @@ class NewScientist(BasicNewsRecipe):
                        }
    preprocess_regexps = [(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')]
-    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','nsblgposts','hldgalcols']})]
+    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})]
    remove_tags = [
                     dict(name='div'  , attrs={'class':['hldBd','adline','pnl','infotext' ]})
-                    ,dict(name='div'  , attrs={'id'   :['compnl','artIssueInfo','artTools']})
+                    ,dict(name='div'  , attrs={'id'   :['compnl','artIssueInfo','artTools','comments','blgsocial']})
                    ,dict(name='p'    , attrs={'class':['marker','infotext'               ]})
                    ,dict(name='meta' , attrs={'name' :'description'                       })
                    ,dict(name='a'    , attrs={'rel'  :'tag'                                })
                  ]
-    remove_tags_after = dict(attrs={'class':'nbpcopy'})
+    remove_tags_after = dict(attrs={'class':['nbpcopy','comments']})
    remove_attributes = ['height','width']
    feeds          = [
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@ -3,17 +3,18 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
-import traceback, sys, textwrap, re
+import traceback, sys, textwrap, re, urllib2
 from threading import Thread
-from calibre import prints
+from calibre import prints, browser
 from calibre.utils.config import OptionParser
 from calibre.utils.logging import default_log
 from calibre.customize import Plugin
 from calibre.ebooks.metadata.library_thing import OPENLIBRARY
 metadata_config = None
-class MetadataSource(Plugin):
+class MetadataSource(Plugin): # {{{
    author = 'Kovid Goyal'
@ -130,7 +131,9 @@ class MetadataSource(Plugin):
    def customization_help(self):
        return 'This plugin can only be customized using the GUI'
-class GoogleBooks(MetadataSource):
+    # }}}
 class GoogleBooks(MetadataSource): # {{{
    name = 'Google Books'
    description = _('Downloads metadata from Google Books')
@ -145,8 +148,9 @@ class GoogleBooks(MetadataSource):
            self.exception = e
            self.tb = traceback.format_exc()
    # }}}
-class ISBNDB(MetadataSource):
+class ISBNDB(MetadataSource): # {{{
    name = 'IsbnDB'
    description = _('Downloads metadata from isbndb.com')
@ -181,7 +185,9 @@ class ISBNDB(MetadataSource):
                'and enter your access key below.')
        return '<p>'+ans%('<a href="http://www.isbndb.com">', '</a>')
-class Amazon(MetadataSource):
+    # }}}
 class Amazon(MetadataSource): # {{{
    name = 'Amazon'
    metadata_type = 'social'
@ -198,7 +204,9 @@ class Amazon(MetadataSource):
            self.exception = e
            self.tb = traceback.format_exc()
-class LibraryThing(MetadataSource):
+    # }}}
 class LibraryThing(MetadataSource): # {{{
    name = 'LibraryThing'
    metadata_type = 'social'
@ -207,7 +215,6 @@ class LibraryThing(MetadataSource):
    def fetch(self):
        if not self.isbn:
            return
        from calibre import browser
        from calibre.ebooks.metadata import MetaInformation
        import json
        br = browser()
@ -228,6 +235,7 @@ class LibraryThing(MetadataSource):
        except Exception, e:
            self.exception = e
            self.tb = traceback.format_exc()
    # }}}
 def result_index(source, result):
@ -268,6 +276,31 @@ class MetadataSources(object):
        for s in self.sources:
            s.join()
 def filter_metadata_results(item):
    '''
    Predicate deciding whether a metadata result should be kept.

    Returns False when ``item.publisher`` contains (case insensitively)
    any of the words audio, tape, cassette, abridged or playaway, and
    True otherwise. A result with an empty or missing publisher is
    always kept.
    '''
    publisher = item.publisher
    if not publisher:
        # Nothing to match against
        return True
    publisher = publisher.lower()
    unwanted = ("audio", "tape", "cassette", "abridged", "playaway")
    return not any(word in publisher for word in unwanted)
 class HeadRequest(urllib2.Request):
    '''
    A :class:`urllib2.Request` that always reports HEAD as its HTTP
    method.
    '''

    def get_method(self):
        # Override the base class implementation, which picks the method
        # itself
        return 'HEAD'
 def do_cover_check(item):
    '''
    Set ``item.has_cover`` to True if a HEAD request for the URL
    ``OPENLIBRARY % item.isbn`` succeeds and to False otherwise.

    This is a best effort check: any error while creating the browser or
    making the request (made with ``timeout=5``) simply leaves
    ``item.has_cover`` as False. Nothing is returned.
    '''
    # Set the default first, so that the attribute exists whatever
    # happens below
    item.has_cover = False
    try:
        opener = browser()
        opener.open(HeadRequest(OPENLIBRARY%item.isbn), timeout=5)
    except Exception:
        # Cover not found (or not reachable). Exception is caught rather
        # than using a bare except so that KeyboardInterrupt and
        # SystemExit are not swallowed.
        return
    item.has_cover = True
 def check_for_covers(items):
    '''
    Run :func:`do_cover_check` on every item in ``items``, using one
    thread per item, and return only once all the threads have finished.
    '''
    workers = []
    for item in items:
        workers.append(Thread(target=do_cover_check, args=(item,)))
    # Start every worker before waiting on any of them, so that the
    # checks run concurrently
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
 def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
           verbose=0):
    assert not(title is None and author is None and publisher is None and \
@ -285,10 +318,60 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
    for fetcher in fetchers[1:]:
        merge_results(results, fetcher.results)
-    results = sorted(results, cmp=lambda x, y : cmp(
+    results = list(filter(filter_metadata_results, results))
-            (x.comments.strip() if x.comments else ''),
+
-            (y.comments.strip() if y.comments else '')
+    check_for_covers(results)
-                                                  ), reverse=True)
+
    words = ("the", "a", "an", "of", "and")
    prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
    trailing_paren_pat = re.compile(r'\(.*\)$')
    whitespace_pat = re.compile(r'\s+')
    def sort_func(x, y):
        '''
        cmp-style comparator used to order the merged results.

        The following criteria are applied in turn, each one only being
        used if the previous ones found no difference:

          1. whether the cleaned up title is identical to the cleaned up
             search title
          2. whether the result has a cover (``has_cover``)
          3. the length of the comments, longest first, where lengths
             that differ by less than 10% of their mean count as equal

        Returns a negative number if x should be placed before y, a
        positive number if it should be placed after y and 0 if no
        difference was found.
        '''

        def cleanup_title(s):
            # A title can be missing (the search title defaults to None),
            # treat that as the empty string instead of failing with an
            # AttributeError
            s = (s or '').strip().lower()
            s = prefix_pat.sub(' ', s)
            s = trailing_paren_pat.sub('', s)
            s = whitespace_pat.sub(' ', s)
            return s.strip()

        t = cleanup_title(title)
        x_title = cleanup_title(x.title)
        y_title = cleanup_title(y.title)

        # prefer titles that are identical to the search title (after
        # cleanup): abs(cmp()) is 0 for an exact match and 1 otherwise
        tx = abs(cmp(t, x_title))
        ty = abs(cmp(t, y_title))
        result = tx - ty

        # then prefer titles that have a cover image
        if result == 0:
            result = -cmp(x.has_cover, y.has_cover)

        # then prefer titles with the longest comment, treating lengths
        # that are within 10% of each other as equal
        if result == 0:
            cx = len(x.comments.strip() if x.comments else '')
            cy = len(y.comments.strip() if y.comments else '')
            threshold = (cx + cy) / 20
            result = cy - cx
            if abs(result) < threshold:
                result = 0

        return result
    results = sorted(results, cmp=sort_func)
    # if for some reason there is no comment in the top selection, go looking for one
    if len(results) > 1:
        if not results[0].comments or len(results[0].comments) == 0:
            for r in results[1:]:
                if title.lower() == r.title[:len(title)].lower() and r.comments and len(r.comments):
                    results[0].comments = r.comments
                    break
 #   for r in results:
 #       print "{0:14.14} {1:30.30} {2:20.20} {3:6} {4}".format(r.isbn, r.title, r.publisher, len(r.comments if r.comments else ''), r.has_cover)
    return results, [(x.name, x.exception, x.tb) for x in fetchers]
--- a/src/calibre/gui2/dialogs/config/add_save.ui
+++ b/src/calibre/gui2/dialogs/config/add_save.ui
@ -181,14 +181,14 @@ Title match ignores leading indefinite articles (&quot;the&quot;, &quot;a&quot;,
    <item>
     <widget class="QCheckBox" name="preserve_user_collections">
      <property name="text">
-       <string>Preserve user collections.</string>
+       <string>Preserve device collections.</string>
      </property>
     </widget>
    </item>
    <item>
     <widget class="QLabel" name="label_41">
      <property name="text">
-       <string>If checked, collections will not be deleted even if a book with changed metadata is resent and the collection is not in the book's metadata. In addition, editing collections on the device view will be enabled.</string>
+       <string>If checked, collections will not be deleted even if a book with changed metadata is resent and the collection is not in the book's metadata. In addition, editing collections in the device view will be enabled. If unchecked, collections will always reflect only the metadata in the calibre library.</string>
      </property>
      <property name="wordWrap">
       <bool>true</bool>
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@ -325,6 +325,10 @@ Post any output you see in a help message on the `Forum <http://www.mobileread.c
 |app| is not starting on OS X?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 One common cause of failures on OS X is the use of accessibility technologies that are incompatible with the graphics toolkit |app| uses.
 Try turning off VoiceOver if you have it on. Also go to System Preferences->System->Universal Access and turn off the setting for enabling
 access for assistive devices in all the tabs.
 You can obtain debug output about why |app| is not starting by running `Console.app`. Debug output will
 be printed to it. If the debug output contains a line that looks like::
@ -334,9 +338,9 @@ then the problem is probably a corrupted font cache. You can clear the cache by
 `instructions <http://www.macworld.com/article/139383/2009/03/fontcacheclear.html>`_. If that doesn't
 solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like.
 My antivirus program claims |app| is a virus/trojan?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Your antivirus program is wrong. |app| is a completely open source product. You can actually browse the source code yourself (or hire someone to do it for you) to verify that it is not a virus. Please report the false identification to whatever company you buy your antivirus software from. If the antivirus program is preventing you from downloading/installing |app|, disable it temporarily, install |app| and then re-enable it.
 How do I use purchased EPUB books with |app|?