diff --git a/resources/recipes/national_post.recipe b/resources/recipes/national_post.recipe
index 4fe188934c..00eb918d02 100644
--- a/resources/recipes/national_post.recipe
+++ b/resources/recipes/national_post.recipe
@@ -7,18 +7,18 @@ class NYTimes(BasicNewsRecipe):
     __author__ = 'Krittika Goyal'
     description = 'Canadian national newspaper'
     timefmt = ' [%d %b, %Y]'
-    needs_subscription = False
     language = 'en_CA'

+    needs_subscription = False
     no_stylesheets = True
     #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
-    #remove_tags_after = dict(name='td', attrs={'class':'newptool1'})
+    remove_tags_after = dict(name='div', attrs={'class':'npStoryTools npWidth1-6 npRight npTxtStrong'})
     remove_tags = [
        dict(name='iframe'),
-       dict(name='div', attrs={'class':'story-tools'}),
+       dict(name='div', attrs={'class':['story-tools', 'npStoryTools npWidth1-6 npRight npTxtStrong']}),
       #dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}),
       #dict(name='form', attrs={'onsubmit':''}),
-      #dict(name='table', attrs={'cellspacing':'0'}),
+      dict(name='ul', attrs={'class':'npTxtAlt npGroup npTxtCentre npStoryShare npTxtStrong npTxtDim'}),
     ]

    # def preprocess_html(self, soup):
@@ -37,7 +37,7 @@ class NYTimes(BasicNewsRecipe):
     def parse_index(self):
         soup = self.nejm_get_index()

-        div = soup.find(id='LegoText4')
+        div = soup.find(id='npContentMain')

         current_section = None
         current_articles = []
@@ -50,7 +50,7 @@ class NYTimes(BasicNewsRecipe):
                 current_section = self.tag_to_string(x)
                 current_articles = []
                 self.log('\tFound section:', current_section)
-            if current_section is not None and x.name == 'h3':
+            if current_section is not None and x.name == 'h5':
                 # Article found
                 title = self.tag_to_string(x)
                 a = x.find('a', href=lambda x: x and 'story' in x)
@@ -59,8 +59,8 @@ class NYTimes(BasicNewsRecipe):
                 url = a.get('href', False)
                 if not url or not title:
                     continue
-                if url.startswith('story'):
-                    url = 'http://www.nationalpost.com/todays-paper/'+url
+                #if url.startswith('story'):
+                url = 'http://www.nationalpost.com/todays-paper/'+url
                 self.log('\t\tFound article:', title)
                 self.log('\t\t\t', url)
                 current_articles.append({'title': title, 'url':url,
@@ -70,28 +70,11 @@ class NYTimes(BasicNewsRecipe):
             feeds.append((current_section, current_articles))
         return feeds

-    def preprocess_html(self, soup):
-        story = soup.find(name='div', attrs={'class':'triline'})
-        page2_link = soup.find('p','pagenav')
-        if page2_link:
-            atag = page2_link.find('a',href=True)
-            if atag:
-                page2_url = atag['href']
-                if page2_url.startswith('story'):
-                    page2_url = 'http://www.nationalpost.com/todays-paper/'+page2_url
-                elif page2_url.startswith( '/todays-paper/story.html'):
-                    page2_url = 'http://www.nationalpost.com/'+page2_url
-                page2_soup = self.index_to_soup(page2_url)
-                if page2_soup:
-                    page2_content = page2_soup.find('div','story-content')
-                    if page2_content:
-                        full_story = BeautifulSoup('')
-                        full_story.insert(0,story)
-                        full_story.insert(1,page2_content)
-                        story = full_story
+        story = soup.find(name='div', attrs={'id':'npContentMain'})
+        ##td = heading.findParent(name='td')
+        ##td.extract()

         soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
         body = soup.find(name='body')
         body.insert(0, story)
         return soup
-
diff --git a/resources/recipes/new_scientist.recipe b/resources/recipes/new_scientist.recipe
index 1727a926ed..b40be458bc 100644
--- a/resources/recipes/new_scientist.recipe
+++ b/resources/recipes/new_scientist.recipe
@@ -32,15 +32,16 @@ class NewScientist(BasicNewsRecipe):
                          }

    preprocess_regexps = [(re.compile(r'.*?', re.DOTALL|re.IGNORECASE),lambda match: '')]
-    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','nsblgposts','hldgalcols']})]
+    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})]

    remove_tags = [
                     dict(name='div' , attrs={'class':['hldBd','adline','pnl','infotext' ]})
-                    ,dict(name='div' , attrs={'id'   :['compnl','artIssueInfo','artTools']})
+                    ,dict(name='div' , attrs={'id'   :['compnl','artIssueInfo','artTools','comments','blgsocial']})
                    ,dict(name='p'    , attrs={'class':['marker','infotext' ]})
                    ,dict(name='meta' , attrs={'name' :'description' })
+                    ,dict(name='a'   , attrs={'rel'  :'tag' })
                  ]
-    remove_tags_after = dict(attrs={'class':'nbpcopy'})
+    remove_tags_after = dict(attrs={'class':['nbpcopy','comments']})
    remove_attributes = ['height','width']

    feeds = [
diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py
index d12c668e0d..0fd671f86a 100644
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@@ -3,17 +3,18 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import traceback, sys, textwrap, re
+import traceback, sys, textwrap, re, urllib2
 from threading import Thread

-from calibre import prints
+from calibre import prints, browser
 from calibre.utils.config import OptionParser
 from calibre.utils.logging import default_log
 from calibre.customize import Plugin
+from calibre.ebooks.metadata.library_thing import OPENLIBRARY

 metadata_config = None

-class MetadataSource(Plugin):
+class MetadataSource(Plugin): # {{{

     author = 'Kovid Goyal'

@@ -130,7 +131,9 @@ class MetadataSource(Plugin):
     def customization_help(self):
         return 'This plugin can only be customized using the GUI'

-class GoogleBooks(MetadataSource):
+    # }}}
+
+class GoogleBooks(MetadataSource): # {{{

     name = 'Google Books'
     description = _('Downloads metadata from Google Books')
@@ -145,8 +148,9 @@ class GoogleBooks(MetadataSource):
             self.exception = e
             self.tb = traceback.format_exc()

+    # }}}

-class ISBNDB(MetadataSource):
+class ISBNDB(MetadataSource): # {{{

     name = 'IsbnDB'
     description = _('Downloads metadata from isbndb.com')
@@ -181,7 +185,9 @@ class ISBNDB(MetadataSource):
                 'and enter your access key below.')
         return '<p>'+ans%('<a href="http://www.isbndb.com">', '</a>')

-class Amazon(MetadataSource):
+    # }}}
+
+class Amazon(MetadataSource): # {{{

     name = 'Amazon'
     metadata_type = 'social'
@@ -198,7 +204,9 @@ class Amazon(MetadataSource):
             self.exception = e
             self.tb = traceback.format_exc()

-class LibraryThing(MetadataSource):
+    # }}}
+
+class LibraryThing(MetadataSource): # {{{

     name = 'LibraryThing'
     metadata_type = 'social'
@@ -207,7 +215,6 @@ class LibraryThing(MetadataSource):
     def fetch(self):
         if not self.isbn:
             return
-        from calibre import browser
         from calibre.ebooks.metadata import MetaInformation
         import json
         br = browser()
@@ -228,6 +235,7 @@ class LibraryThing(MetadataSource):
         except Exception, e:
             self.exception = e
             self.tb = traceback.format_exc()
+    # }}}


 def result_index(source, result):
@@ -268,6 +276,31 @@ class MetadataSources(object):
         for s in self.sources:
             s.join()

+def filter_metadata_results(item):
+    keywords = ["audio", "tape", "cassette", "abridged", "playaway"]
+    for keyword in keywords:
+        if item.publisher and keyword in item.publisher.lower():
+            return False
+    return True
+
+class HeadRequest(urllib2.Request):
+    def get_method(self):
+        return "HEAD"
+
+def do_cover_check(item):
+    opener = browser()
+    item.has_cover = False
+    try:
+        opener.open(HeadRequest(OPENLIBRARY%item.isbn), timeout=5)
+        item.has_cover = True
+    except:
+        pass # Cover not found
+
+def check_for_covers(items):
+    threads = [Thread(target=do_cover_check, args=(item,)) for item in items]
+    for t in threads: t.start()
+    for t in threads: t.join()
+
 def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
            verbose=0):
     assert not(title is None and author is None and publisher is None and \
@@ -285,10 +318,60 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
     for fetcher in fetchers[1:]:
         merge_results(results, fetcher.results)

-    results = sorted(results, cmp=lambda x, y : cmp(
-        (x.comments.strip() if x.comments else ''),
-        (y.comments.strip() if y.comments else '')
-        ), reverse=True)
+    results = list(filter(filter_metadata_results, results))
+
+    check_for_covers(results)
+
+    words = ("the", "a", "an", "of", "and")
+    prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
+    trailing_paren_pat = re.compile(r'\(.*\)$')
+    whitespace_pat = re.compile(r'\s+')
+
+    def sort_func(x, y):
+
+        def cleanup_title(s):
+            s = s.strip().lower()
+            s = prefix_pat.sub(' ', s)
+            s = trailing_paren_pat.sub('', s)
+            s = whitespace_pat.sub(' ', s)
+            return s.strip()
+
+        t = cleanup_title(title)
+        x_title = cleanup_title(x.title)
+        y_title = cleanup_title(y.title)
+
+        # prefer titles that start with the search title
+        tx = cmp(t, x_title)
+        ty = cmp(t, y_title)
+        result = 0 if abs(tx) == abs(ty) else abs(tx) - abs(ty)
+
+        # then prefer titles that have a cover image
+        if result == 0:
+            result = -cmp(x.has_cover, y.has_cover)
+
+        # then prefer titles with the longest comment, with in 10%
+        if result == 0:
+            cx = len(x.comments.strip() if x.comments else '')
+            cy = len(y.comments.strip() if y.comments else '')
+            t = (cx + cy) / 20
+            result = cy - cx
+            if abs(result) < t:
+                result = 0
+
+        return result
+
+    results = sorted(results, cmp=sort_func)
+
+    # if for some reason there is no comment in the top selection, go looking for one
+    if len(results) > 1:
+        if not results[0].comments or len(results[0].comments) == 0:
+            for r in results[1:]:
+                if title.lower() == r.title[:len(title)].lower() and r.comments and len(r.comments):
+                    results[0].comments = r.comments
+                    break
+
+    # for r in results:
+    #     print "{0:14.14} {1:30.30} {2:20.20} {3:6} {4}".format(r.isbn, r.title, r.publisher, len(r.comments if r.comments else ''), r.has_cover)

     return results, [(x.name, x.exception, x.tb) for x in fetchers]
diff --git a/src/calibre/gui2/dialogs/config/add_save.ui b/src/calibre/gui2/dialogs/config/add_save.ui
index a29c0fd2e6..64a8137aa1 100644
--- a/src/calibre/gui2/dialogs/config/add_save.ui
+++ b/src/calibre/gui2/dialogs/config/add_save.ui
@@ -181,14 +181,14 @@ Title match ignores leading indefinite articles ("the", "a",
-          Preserve user collections.
+          Preserve device collections.
-          If checked, collections will not be deleted even if a book with changed metadata is resent and the collection is not in the book's metadata. In addition, editing collections on the device view will be enabled.
+          If checked, collections will not be deleted even if a book with changed metadata is resent and the collection is not in the book's metadata. In addition, editing collections in the device view will be enabled.
           If unchecked, collections will be always reflect only the metadata in the calibre library.
           true
diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst
index 74da4f4782..4c89ed48c0 100644
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -325,6 +325,10 @@ Post any output you see in a help message on the `Forum
+
+System->Universal Access and turn off the setting for enabling
+access for assistive devices in all the tabs.
+
 You can obtain debug output about why |app| is not starting by running
 `Console.app`. Debug output will be printed to it. If the debug output
 contains a line that looks like::
@@ -334,9 +338,9 @@ then the problem is probably a corrupted font cache. You can clear the cache by
 `instructions `_. If that doesn't solve it, look for a corrupted font file on
 your system, in ~/Library/Fonts or the like.

-
 My antivirus program claims |app| is a virus/trojan?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 Your antivirus program is wrong. |app| is a completely open source product. You can actually browse the source code yourself (or hire someone to do it for you) to verify that it is not a virus. Please report the false identification to whatever company you buy your antivirus software from. If the antivirus program is preventing you from downloading/installing |app|, disable it temporarily, install |app| and then re-enable it.

 How do I use purchased EPUB books with |app|?
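
Note: the result-ranking logic added to src/calibre/ebooks/metadata/fetch.py above can be hard to read inside the diff. The standalone sketch below is not part of the patch; it approximates the comparator under stated assumptions. The Result class, the make_sort_func wrapper, the cmp shim and the sample data are hypothetical, and functools.cmp_to_key stands in for the Python 2 sorted(cmp=...) call used in the patch so the snippet runs on Python 3.

# Standalone sketch, not calibre code: approximates the comparator added in fetch.py above.
import re
from functools import cmp_to_key


class Result(object):
    # Hypothetical stand-in for calibre's metadata result objects.
    def __init__(self, title, has_cover=False, comments=''):
        self.title = title
        self.has_cover = has_cover
        self.comments = comments


def cmp(a, b):
    # Python 2 style three-way comparison, used by the patch via the builtin cmp().
    return (a > b) - (a < b)


def make_sort_func(title):
    words = ("the", "a", "an", "of", "and")
    prefix_pat = re.compile(r'^(%s)\s+' % ("|".join(words)))
    trailing_paren_pat = re.compile(r'\(.*\)$')
    whitespace_pat = re.compile(r'\s+')

    def cleanup_title(s):
        # Normalise a title: lowercase, drop leading articles and a trailing "(...)".
        s = s.strip().lower()
        s = prefix_pat.sub(' ', s)
        s = trailing_paren_pat.sub('', s)
        s = whitespace_pat.sub(' ', s)
        return s.strip()

    t = cleanup_title(title)

    def sort_func(x, y):
        x_title = cleanup_title(x.title)
        y_title = cleanup_title(y.title)

        # 1. prefer results whose cleaned title matches the cleaned search title
        tx, ty = cmp(t, x_title), cmp(t, y_title)
        result = 0 if abs(tx) == abs(ty) else abs(tx) - abs(ty)

        # 2. then prefer results that have a cover image
        if result == 0:
            result = -cmp(x.has_cover, y.has_cover)

        # 3. then prefer the longer comments, ignoring differences under roughly 10%
        if result == 0:
            cx = len(x.comments.strip() if x.comments else '')
            cy = len(y.comments.strip() if y.comments else '')
            threshold = (cx + cy) // 20
            result = cy - cx
            if abs(result) < threshold:
                result = 0
        return result

    return sort_func


if __name__ == '__main__':
    candidates = [
        Result('The Example Book (Audio Cassette)', has_cover=False),
        Result('Example Book', has_cover=True, comments='A descriptive blurb.'),
    ]
    ranked = sorted(candidates, key=cmp_to_key(make_sort_func('Example Book')))
    print([r.title for r in ranked])  # the result with a cover ranks first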