Merge from trunk

This commit is contained in:
Charles Haley 2010-06-24 19:10:24 +01:00
commit 90be73fe5b
5 changed files with 117 additions and 46 deletions

View File

@ -7,18 +7,18 @@ class NYTimes(BasicNewsRecipe):
__author__ = 'Krittika Goyal' __author__ = 'Krittika Goyal'
description = 'Canadian national newspaper' description = 'Canadian national newspaper'
timefmt = ' [%d %b, %Y]' timefmt = ' [%d %b, %Y]'
needs_subscription = False
language = 'en_CA' language = 'en_CA'
needs_subscription = False
no_stylesheets = True no_stylesheets = True
#remove_tags_before = dict(name='h1', attrs={'class':'heading'}) #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
#remove_tags_after = dict(name='td', attrs={'class':'newptool1'}) remove_tags_after = dict(name='div', attrs={'class':'npStoryTools npWidth1-6 npRight npTxtStrong'})
remove_tags = [ remove_tags = [
dict(name='iframe'), dict(name='iframe'),
dict(name='div', attrs={'class':'story-tools'}), dict(name='div', attrs={'class':['story-tools', 'npStoryTools npWidth1-6 npRight npTxtStrong']}),
#dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}), #dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}),
#dict(name='form', attrs={'onsubmit':''}), #dict(name='form', attrs={'onsubmit':''}),
#dict(name='table', attrs={'cellspacing':'0'}), dict(name='ul', attrs={'class':'npTxtAlt npGroup npTxtCentre npStoryShare npTxtStrong npTxtDim'}),
] ]
# def preprocess_html(self, soup): # def preprocess_html(self, soup):
@ -37,7 +37,7 @@ class NYTimes(BasicNewsRecipe):
def parse_index(self): def parse_index(self):
soup = self.nejm_get_index() soup = self.nejm_get_index()
div = soup.find(id='LegoText4') div = soup.find(id='npContentMain')
current_section = None current_section = None
current_articles = [] current_articles = []
@ -50,7 +50,7 @@ class NYTimes(BasicNewsRecipe):
current_section = self.tag_to_string(x) current_section = self.tag_to_string(x)
current_articles = [] current_articles = []
self.log('\tFound section:', current_section) self.log('\tFound section:', current_section)
if current_section is not None and x.name == 'h3': if current_section is not None and x.name == 'h5':
# Article found # Article found
title = self.tag_to_string(x) title = self.tag_to_string(x)
a = x.find('a', href=lambda x: x and 'story' in x) a = x.find('a', href=lambda x: x and 'story' in x)
@ -59,8 +59,8 @@ class NYTimes(BasicNewsRecipe):
url = a.get('href', False) url = a.get('href', False)
if not url or not title: if not url or not title:
continue continue
if url.startswith('story'): #if url.startswith('story'):
url = 'http://www.nationalpost.com/todays-paper/'+url url = 'http://www.nationalpost.com/todays-paper/'+url
self.log('\t\tFound article:', title) self.log('\t\tFound article:', title)
self.log('\t\t\t', url) self.log('\t\t\t', url)
current_articles.append({'title': title, 'url':url, current_articles.append({'title': title, 'url':url,
@ -70,28 +70,11 @@ class NYTimes(BasicNewsRecipe):
feeds.append((current_section, current_articles)) feeds.append((current_section, current_articles))
return feeds return feeds
def preprocess_html(self, soup): def preprocess_html(self, soup):
story = soup.find(name='div', attrs={'class':'triline'}) story = soup.find(name='div', attrs={'id':'npContentMain'})
page2_link = soup.find('p','pagenav') ##td = heading.findParent(name='td')
if page2_link: ##td.extract()
atag = page2_link.find('a',href=True)
if atag:
page2_url = atag['href']
if page2_url.startswith('story'):
page2_url = 'http://www.nationalpost.com/todays-paper/'+page2_url
elif page2_url.startswith( '/todays-paper/story.html'):
page2_url = 'http://www.nationalpost.com/'+page2_url
page2_soup = self.index_to_soup(page2_url)
if page2_soup:
page2_content = page2_soup.find('div','story-content')
if page2_content:
full_story = BeautifulSoup('<div></div>')
full_story.insert(0,story)
full_story.insert(1,page2_content)
story = full_story
soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>') soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
body = soup.find(name='body') body = soup.find(name='body')
body.insert(0, story) body.insert(0, story)
return soup return soup

View File

@ -32,15 +32,16 @@ class NewScientist(BasicNewsRecipe):
} }
preprocess_regexps = [(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')] preprocess_regexps = [(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')]
keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','nsblgposts','hldgalcols']})] keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})]
remove_tags = [ remove_tags = [
dict(name='div' , attrs={'class':['hldBd','adline','pnl','infotext' ]}) dict(name='div' , attrs={'class':['hldBd','adline','pnl','infotext' ]})
,dict(name='div' , attrs={'id' :['compnl','artIssueInfo','artTools']}) ,dict(name='div' , attrs={'id' :['compnl','artIssueInfo','artTools','comments','blgsocial']})
,dict(name='p' , attrs={'class':['marker','infotext' ]}) ,dict(name='p' , attrs={'class':['marker','infotext' ]})
,dict(name='meta' , attrs={'name' :'description' }) ,dict(name='meta' , attrs={'name' :'description' })
,dict(name='a' , attrs={'rel' :'tag' })
] ]
remove_tags_after = dict(attrs={'class':'nbpcopy'}) remove_tags_after = dict(attrs={'class':['nbpcopy','comments']})
remove_attributes = ['height','width'] remove_attributes = ['height','width']
feeds = [ feeds = [

View File

@ -3,17 +3,18 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import traceback, sys, textwrap, re import traceback, sys, textwrap, re, urllib2
from threading import Thread from threading import Thread
from calibre import prints from calibre import prints, browser
from calibre.utils.config import OptionParser from calibre.utils.config import OptionParser
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
from calibre.customize import Plugin from calibre.customize import Plugin
from calibre.ebooks.metadata.library_thing import OPENLIBRARY
metadata_config = None metadata_config = None
class MetadataSource(Plugin): class MetadataSource(Plugin): # {{{
author = 'Kovid Goyal' author = 'Kovid Goyal'
@ -130,7 +131,9 @@ class MetadataSource(Plugin):
def customization_help(self): def customization_help(self):
return 'This plugin can only be customized using the GUI' return 'This plugin can only be customized using the GUI'
class GoogleBooks(MetadataSource): # }}}
class GoogleBooks(MetadataSource): # {{{
name = 'Google Books' name = 'Google Books'
description = _('Downloads metadata from Google Books') description = _('Downloads metadata from Google Books')
@ -145,8 +148,9 @@ class GoogleBooks(MetadataSource):
self.exception = e self.exception = e
self.tb = traceback.format_exc() self.tb = traceback.format_exc()
# }}}
class ISBNDB(MetadataSource): class ISBNDB(MetadataSource): # {{{
name = 'IsbnDB' name = 'IsbnDB'
description = _('Downloads metadata from isbndb.com') description = _('Downloads metadata from isbndb.com')
@ -181,7 +185,9 @@ class ISBNDB(MetadataSource):
'and enter your access key below.') 'and enter your access key below.')
return '<p>'+ans%('<a href="http://www.isbndb.com">', '</a>') return '<p>'+ans%('<a href="http://www.isbndb.com">', '</a>')
class Amazon(MetadataSource): # }}}
class Amazon(MetadataSource): # {{{
name = 'Amazon' name = 'Amazon'
metadata_type = 'social' metadata_type = 'social'
@ -198,7 +204,9 @@ class Amazon(MetadataSource):
self.exception = e self.exception = e
self.tb = traceback.format_exc() self.tb = traceback.format_exc()
class LibraryThing(MetadataSource): # }}}
class LibraryThing(MetadataSource): # {{{
name = 'LibraryThing' name = 'LibraryThing'
metadata_type = 'social' metadata_type = 'social'
@ -207,7 +215,6 @@ class LibraryThing(MetadataSource):
def fetch(self): def fetch(self):
if not self.isbn: if not self.isbn:
return return
from calibre import browser
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
import json import json
br = browser() br = browser()
@ -228,6 +235,7 @@ class LibraryThing(MetadataSource):
except Exception, e: except Exception, e:
self.exception = e self.exception = e
self.tb = traceback.format_exc() self.tb = traceback.format_exc()
# }}}
def result_index(source, result): def result_index(source, result):
@ -268,6 +276,31 @@ class MetadataSources(object):
for s in self.sources: for s in self.sources:
s.join() s.join()
def filter_metadata_results(item):
    """Return False for results whose publisher looks like an audio edition.

    ``item`` is any object with a ``publisher`` attribute. The result is
    False when the lower-cased publisher contains one of the audio-related
    keywords below, and True otherwise (including when the publisher is
    empty or None). Intended for use as a ``filter()`` predicate.
    """
    keywords = ("audio", "tape", "cassette", "abridged", "playaway")
    publisher = item.publisher
    if not publisher:
        # Nothing to inspect, so the result is kept.
        return True
    # Lower-case once instead of once per keyword.
    publisher = publisher.lower()
    return not any(keyword in publisher for keyword in keywords)
# urllib2.Request subclass whose HTTP method is always HEAD, so opening it
# asks only for the response headers rather than the body.
class HeadRequest(urllib2.Request):
# Override the method name urllib2 queries when sending the request.
def get_method(self):
return "HEAD"
def do_cover_check(item):
    """Set ``item.has_cover`` according to whether a cover lookup succeeds.

    Sends a HEAD request (5 second timeout) to the ``OPENLIBRARY`` URL
    template filled in with ``item.isbn``. ``item.has_cover`` ends up True
    only if that request completes without raising; in every other case it
    is False. Nothing is returned; the result is stored on ``item``.
    """
    # Set the default first so the attribute exists on every exit path.
    item.has_cover = False
    if not item.isbn:
        # Without an ISBN there is nothing meaningful to substitute into
        # the URL template, so skip the network round trip.
        return
    opener = browser()
    try:
        opener.open(HeadRequest(OPENLIBRARY%item.isbn), timeout=5)
    except Exception:
        # Best effort: any failure (HTTP error, timeout, ...) means
        # "no cover". Unlike a bare except, this lets KeyboardInterrupt
        # and SystemExit propagate.
        return
    item.has_cover = True
def check_for_covers(items):
    """Run ``do_cover_check`` on every item, one thread per item.

    All threads are created, then all are started, and the call returns
    only after every thread has been joined.
    """
    workers = []
    for entry in items:
        workers.append(Thread(target=do_cover_check, args=(entry,)))
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None, def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
verbose=0): verbose=0):
assert not(title is None and author is None and publisher is None and \ assert not(title is None and author is None and publisher is None and \
@ -285,10 +318,60 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
for fetcher in fetchers[1:]: for fetcher in fetchers[1:]:
merge_results(results, fetcher.results) merge_results(results, fetcher.results)
results = sorted(results, cmp=lambda x, y : cmp( results = list(filter(filter_metadata_results, results))
(x.comments.strip() if x.comments else ''),
(y.comments.strip() if y.comments else '') check_for_covers(results)
), reverse=True)
words = ("the", "a", "an", "of", "and")
prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
trailing_paren_pat = re.compile(r'\(.*\)$')
whitespace_pat = re.compile(r'\s+')
def sort_func(x, y):
    """cmp-style comparator that orders two search results, best first.

    Returns a negative number if ``x`` should sort before ``y``, a positive
    number if after, and 0 if they rank equally. Criteria, in order:

    1. a result whose cleaned-up title equals the cleaned-up search title
       beats one whose title does not;
    2. a result with ``has_cover`` set beats one without;
    3. the result with the longer comments wins, unless the two lengths
       differ by less than 10% of their average, which counts as a tie.

    Reads ``title``, ``prefix_pat``, ``trailing_paren_pat`` and
    ``whitespace_pat`` from the enclosing scope.
    """
    def cleanup_title(s):
        # The search may be run without a title, and a result may lack
        # one; treat None as an empty string instead of raising.
        s = (s or '').strip().lower()
        s = prefix_pat.sub(' ', s)
        s = trailing_paren_pat.sub('', s)
        s = whitespace_pat.sub(' ', s)
        return s.strip()

    wanted = cleanup_title(title)
    x_title = cleanup_title(x.title)
    y_title = cleanup_title(y.title)

    # prefer titles that match the search title exactly after cleanup:
    # abs(cmp(...)) is 0 for a match and 1 for any mismatch
    tx = cmp(wanted, x_title)
    ty = cmp(wanted, y_title)
    result = 0 if abs(tx) == abs(ty) else abs(tx) - abs(ty)

    # then prefer titles that have a cover image
    if result == 0:
        result = -cmp(x.has_cover, y.has_cover)

    # then prefer titles with the longest comment, within 10%
    if result == 0:
        cx = len(x.comments.strip() if x.comments else '')
        cy = len(y.comments.strip() if y.comments else '')
        # 10% of the average length: ((cx + cy) / 2) / 10
        threshold = (cx + cy) // 20
        result = cy - cx
        if abs(result) < threshold:
            result = 0
    return result
results = sorted(results, cmp=sort_func)
# if for some reason there is no comment in the top selection, go looking for one
if len(results) > 1:
if not results[0].comments or len(results[0].comments) == 0:
for r in results[1:]:
if title.lower() == r.title[:len(title)].lower() and r.comments and len(r.comments):
results[0].comments = r.comments
break
# for r in results:
# print "{0:14.14} {1:30.30} {2:20.20} {3:6} {4}".format(r.isbn, r.title, r.publisher, len(r.comments if r.comments else ''), r.has_cover)
return results, [(x.name, x.exception, x.tb) for x in fetchers] return results, [(x.name, x.exception, x.tb) for x in fetchers]

View File

@ -181,14 +181,14 @@ Title match ignores leading indefinite articles (&quot;the&quot;, &quot;a&quot;,
<item> <item>
<widget class="QCheckBox" name="preserve_user_collections"> <widget class="QCheckBox" name="preserve_user_collections">
<property name="text"> <property name="text">
<string>Preserve user collections.</string> <string>Preserve device collections.</string>
</property> </property>
</widget> </widget>
</item> </item>
<item> <item>
<widget class="QLabel" name="label_41"> <widget class="QLabel" name="label_41">
<property name="text"> <property name="text">
<string>If checked, collections will not be deleted even if a book with changed metadata is resent and the collection is not in the book's metadata. In addition, editing collections on the device view will be enabled.</string> <string>If checked, collections will not be deleted even if a book with changed metadata is resent and the collection is not in the book's metadata. In addition, editing collections in the device view will be enabled. If unchecked, collections will always reflect only the metadata in the calibre library.</string>
</property> </property>
<property name="wordWrap"> <property name="wordWrap">
<bool>true</bool> <bool>true</bool>

View File

@ -325,6 +325,10 @@ Post any output you see in a help message on the `Forum <http://www.mobileread.c
|app| is not starting on OS X? |app| is not starting on OS X?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
One common cause of failures on OS X is the use of accessibility technologies that are incompatible with the graphics toolkit |app| uses.
Try turning off VoiceOver if you have it on. Also go to System Preferences->System->Universal Access and turn off the setting for enabling
access for assistive devices in all the tabs.
You can obtain debug output about why |app| is not starting by running `Console.app`. Debug output will You can obtain debug output about why |app| is not starting by running `Console.app`. Debug output will
be printed to it. If the debug output contains a line that looks like:: be printed to it. If the debug output contains a line that looks like::
@ -334,9 +338,9 @@ then the problem is probably a corrupted font cache. You can clear the cache by
`instructions <http://www.macworld.com/article/139383/2009/03/fontcacheclear.html>`_. If that doesn't `instructions <http://www.macworld.com/article/139383/2009/03/fontcacheclear.html>`_. If that doesn't
solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like. solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like.
My antivirus program claims |app| is a virus/trojan? My antivirus program claims |app| is a virus/trojan?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Your antivirus program is wrong. |app| is a completely open source product. You can actually browse the source code yourself (or hire someone to do it for you) to verify that it is not a virus. Please report the false identification to whatever company you buy your antivirus software from. If the antivirus program is preventing you from downloading/installing |app|, disable it temporarily, install |app| and then re-enable it. Your antivirus program is wrong. |app| is a completely open source product. You can actually browse the source code yourself (or hire someone to do it for you) to verify that it is not a virus. Please report the false identification to whatever company you buy your antivirus software from. If the antivirus program is preventing you from downloading/installing |app|, disable it temporarily, install |app| and then re-enable it.
How do I use purchased EPUB books with |app|? How do I use purchased EPUB books with |app|?