merge from trunk

This commit is contained in:
ldolse 2010-09-29 07:18:30 +08:00
commit 81027bcff9
11 changed files with 178 additions and 93 deletions

View File

@@ -9,6 +9,8 @@ __docformat__ = 'restructuredtext en'
 import datetime
+from calibre.web.feeds.news import BasicNewsRecipe
+
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     now = datetime.datetime.now()
     title = 'The AJC'

View File

@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
+
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Nealz Nuze'
     language = 'en'

View File

@@ -1,5 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
+import re

 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Popular Science'

View File

@ -1,6 +1,5 @@
#!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
''' '''
telegraph.co.uk telegraph.co.uk
''' '''
@@ -8,14 +7,16 @@ telegraph.co.uk
 from calibre.web.feeds.news import BasicNewsRecipe

 class TelegraphUK(BasicNewsRecipe):
-    title = u'Telegraph.co.uk'
+    title = 'Telegraph.co.uk'
     __author__ = 'Darko Miletic and Sujata Raman'
     description = 'News from United Kingdom'
-    oldest_article = 7
+    oldest_article = 2
+    category = 'news, politics, UK'
+    publisher = 'Telegraph Media Group ltd.'
     max_articles_per_feed = 100
     no_stylesheets = True
-    language = 'en'
+    language = 'en_GB'
+    remove_empty_feeds = True
     use_embedded_content = False

     extra_css = '''
@@ -27,13 +28,20 @@ class TelegraphUK(BasicNewsRecipe):
        .imageExtras{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
        '''

+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
     keep_only_tags = [
-                       dict(name='div', attrs={'class':'storyHead'})
-                      ,dict(name='div', attrs={'class':'story' })
+                       dict(name='div', attrs={'class':['storyHead','byline']})
+                      ,dict(name='div', attrs={'id':'mainBodyArea' })
+                      #,dict(name='div', attrs={'class':['slideshowHD gutterUnder',"twoThirds gutter","caption" ] })
                      ]

-    remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide']})
-                   #,dict(name='div', attrs={'class':['toolshideoneQuarter']})
+    remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide','related_links_video']})
+                   ,dict(name='ul' , attrs={'class':['shareThis shareBottom']})
                    ,dict(name='span', attrs={'class':['num','placeComment']})
                    ]
@@ -51,24 +59,7 @@ class TelegraphUK(BasicNewsRecipe):
                  ]

     def get_article_url(self, article):
-        url = article.get('link', None)
+        url = article.get('guid', None)
         if 'picture-galleries' in url or 'pictures' in url or 'picturegalleries' in url :
             url = None
         return url
-
-    def postprocess_html(self,soup,first):
-        for bylineTag in soup.findAll(name='div', attrs={'class':'byline'}):
-            for pTag in bylineTag.findAll(name='p'):
-                if getattr(pTag.contents[0],"Comments",True):
-                    pTag.extract()
-        return soup
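
Note: returning None from get_article_url is how a recipe tells the feeds framework to drop an item, so the filter above silently skips picture-gallery links. An equivalent, slightly more compact form of the new method (a sketch, not part of this commit):

    def get_article_url(self, article):
        url = article.get('guid', None)
        # Returning None drops the article; skip the three gallery URL patterns.
        if url and any(p in url for p in ('picture-galleries', 'pictures', 'picturegalleries')):
            return None
        return url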

View File

@@ -469,14 +469,14 @@ from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
         LibraryThing
 from calibre.ebooks.metadata.douban import DoubanBooks
 from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
-        LibraryThingCovers
+        LibraryThingCovers, DoubanCovers
 from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
 from calibre.ebooks.epub.fix.unmanifested import Unmanifested
 from calibre.ebooks.epub.fix.epubcheck import Epubcheck

 plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
         LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
-        Epubcheck, OpenLibraryCovers, LibraryThingCovers]
+        Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers]

 plugins += [
     ComicInput,
     EPUBInput,
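
Note: this registration list is the whole wiring step; the cover-download machinery discovers its plugins by filtering this registry by type, so importing DoubanCovers and appending it here (plus the default-disabled entry in the next file) is all the integration needed. Schematically (illustrative helper, not calibre's API):

    # Illustrative only: plugin discovery is a type filter over the registry.
    def plugins_of_type(plugins, base_class):
        return [p for p in plugins if isinstance(p, base_class)]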

View File

@@ -120,7 +120,7 @@ def enable_plugin(plugin_or_name):
     config['enabled_plugins'] = ep

 default_disabled_plugins = set([
-    'Douban Books',
+    'Douban Books', 'Douban.com covers',
     ])

 def is_disabled(plugin):
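
Note: a name in default_disabled_plugins stays off until something calls enable_plugin for it, which is exactly what the wizard change at the end of this commit does for Chinese locales. A minimal model of the lookup (names and signature are mine; the real is_disabled takes a plugin object and reads the enabled/disabled sets from config):

    default_disabled_plugins = set(['Douban Books', 'Douban.com covers'])

    def is_disabled_by_name(name, enabled=frozenset(), disabled=frozenset()):
        # Disabled by default unless the user has explicitly enabled it.
        if name in enabled:
            return False
        return name in disabled or name in default_disabled_plugins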

View File

@@ -150,11 +150,9 @@ class DocAnalysis(object):
         # Find the biggest bucket
         maxValue = 0
-        peakPosition = 0
         for i in range(0,len(h)):
             if h[i] > maxValue:
                 maxValue = h[i]
-                peakPosition = i

         if maxValue < percent:
             #print "Line lengths are too variable. Not unwrapping."
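
Note: with the unused peakPosition gone, the loop only tracks the largest bucket, which could shrink to a builtin (equivalent sketch, assuming h is a list of bucket counts):

    maxValue = max(h) if h else 0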
@@ -533,12 +531,12 @@ class HTMLPreProcessor(object):
             html = self.smarten_punctuation(html)

         unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
-        if unsupported_unicode_chars != []:
+        if unsupported_unicode_chars:
             from calibre.ebooks.unidecode.unidecoder import Unidecoder
             unidecoder = Unidecoder()
             for char in unsupported_unicode_chars:
                 asciichar = unidecoder.decode(char)
-                html = re.sub(u'%s' % char, asciichar, html)
+                html = html.replace(char, asciichar)

         return html
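
Note: the switch from re.sub to str.replace is a correctness fix, not just style. The old code interpolated the raw character into a regex pattern, so a regex metacharacter among the unsupported characters would raise or match the wrong text; plain substring replacement needs no escaping. A quick standalone illustration (the chosen character is mine):

    import re

    char, asciichar = u'*', '*'    # a metacharacter standing in for an unsupported char
    html = u'rated ** by reviewers'
    try:
        re.sub(u'%s' % char, asciichar, html)      # old approach
    except re.error as e:
        print('re.sub chokes on the bare pattern: %s' % e)
    print(html.replace(char, asciichar))           # new approach: always safe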

View File

@@ -129,6 +129,7 @@ class PreProcessor(object):
         #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
+        blanks_between_paragraphs = False
         if len(lines) > 1:
             self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
             if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
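
Note: initialising blanks_between_paragraphs before the if-block matters because the flag is presumably read after it regardless of whether the document had more than one line; without the default, the single-line path would raise NameError. In miniature:

    lines = ['only one line']
    blanks_between_paragraphs = False      # the new default
    if len(lines) > 1:
        blanks_between_paragraphs = True   # stand-in for the real ratio heuristic
    print(blanks_between_paragraphs)       # safe even when the branch is skipped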

View File

@@ -9,6 +9,7 @@ import traceback, socket, re, sys
 from functools import partial
 from threading import Thread, Event
 from Queue import Queue, Empty
+from lxml import etree

 import mechanize
@@ -216,6 +217,68 @@ def download_covers(mi, result_queue, max_covers=50, timeout=5.): # {{{
 # }}}

+class DoubanCovers(CoverDownload): # {{{
+    'Download covers from Douban.com'
+
+    DOUBAN_ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
+    CALIBRE_DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
+    name = 'Douban.com covers'
+    description = _('Download covers from Douban.com')
+    author = 'Li Fanxi'
+
+    def get_cover_url(self, isbn, br, timeout=5.):
+        try:
+            url = self.DOUBAN_ISBN_URL + isbn + "?apikey=" + self.CALIBRE_DOUBAN_API_KEY
+            src = br.open(url, timeout=timeout).read()
+        except Exception, err:
+            if isinstance(getattr(err, 'args', [None])[0], socket.timeout):
+                err = Exception(_('Douban.com API timed out. Try again later.'))
+            raise err
+        else:
+            feed = etree.fromstring(src)
+            NAMESPACES = {
+                'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
+                'atom' : 'http://www.w3.org/2005/Atom',
+                'db': 'http://www.douban.com/xmlns/'
+            }
+            XPath = partial(etree.XPath, namespaces=NAMESPACES)
+            entries = XPath('//atom:entry')(feed)
+            if len(entries) < 1:
+                return None
+            try:
+                cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
+                u = cover_url(entries[0])[0].replace('/spic/', '/lpic/');
+                # If URL contains "book-default", the book doesn't have a cover
+                if u.find('book-default') != -1:
+                    return None
+            except:
+                return None
+            return u
+
+    def has_cover(self, mi, ans, timeout=5.):
+        if not mi.isbn:
+            return False
+        br = browser()
+        try:
+            if self.get_cover_url(mi.isbn, br, timeout=timeout) != None:
+                self.debug('cover for', mi.isbn, 'found')
+                ans.set()
+        except Exception, e:
+            self.debug(e)
+
+    def get_covers(self, mi, result_queue, abort, timeout=5.):
+        if not mi.isbn:
+            return
+        br = browser()
+        try:
+            url = self.get_cover_url(mi.isbn, br, timeout=timeout)
+            cover_data = br.open_novisit(url).read()
+            result_queue.put((True, cover_data, 'jpg', self.name))
+        except Exception, e:
+            result_queue.put((False, self.exception_to_string(e),
+                traceback.format_exc(), self.name))
+    # }}}
+
 def download_cover(mi, timeout=5.): # {{{
     results = Queue()
     download_covers(mi, results, max_covers=1, timeout=timeout)
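
Note: the lxml idiom in get_cover_url, partial(etree.XPath, namespaces=NAMESPACES), pre-binds the namespace map so every expression stays short, and the compiled XPath objects are reusable callables. A standalone illustration of the same pattern (the feed snippet is made up):

    from functools import partial
    from lxml import etree

    NAMESPACES = {'atom': 'http://www.w3.org/2005/Atom'}
    XPath = partial(etree.XPath, namespaces=NAMESPACES)

    # A made-up one-entry Atom feed standing in for the Douban API response.
    src = b'''<feed xmlns="http://www.w3.org/2005/Atom">
      <entry><link rel="image" href="http://example.com/spic/s123.jpg"/></entry>
    </feed>'''

    feed = etree.fromstring(src)
    entries = XPath('//atom:entry')(feed)
    if entries:
        href = XPath("descendant::atom:link[@rel='image']/attribute::href")(entries[0])[0]
        # Same small-to-large trick as the plugin: swap the size component.
        print(href.replace('/spic/', '/lpic/'))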

View File

@@ -584,12 +584,42 @@ class LibraryPage(QWizardPage, LibraryUI):
         qt_app.load_translations()
         self.emit(SIGNAL('retranslate()'))
         self.init_languages()
+        try:
+            if prefs['language'].lower().startswith('zh'):
+                from calibre.customize.ui import enable_plugin
+                for name in ('Douban Books', 'Douban.com covers'):
+                    enable_plugin(name)
+        except:
+            pass
+
+    def is_library_dir_suitable(self, x):
+        return LibraryDatabase2.exists_at(x) or not os.listdir(x)
+
+    def validatePage(self):
+        newloc = unicode(self.location.text())
+        if not self.is_library_dir_suitable(newloc):
+            self.show_library_dir_error(newloc)
+            return False
+        return True

     def change(self):
-        dir = choose_dir(self, 'database location dialog',
+        x = choose_dir(self, 'database location dialog',
                 _('Select location for books'))
-        if dir:
-            self.location.setText(dir)
+        if x:
+            if self.is_library_dir_suitable(x):
+                self.location.setText(x)
+            else:
+                self.show_library_dir_error(x)
+
+    def show_library_dir_error(self, x):
+        if not isinstance(x, unicode):
+            try:
+                x = x.decode(filesystem_encoding)
+            except:
+                x = unicode(repr(x))
+        error_dialog(self, _('Bad location'),
+                _('You must choose an empty folder for '
+                    'the calibre library. %s is not empty.')%x, show=True)

     def initializePage(self):
         lp = prefs['library_path']
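
Note: the new suitability rule accepts a folder only if it already holds a calibre library or is completely empty, and validatePage enforces it even when the user types a path instead of browsing. The same rule outside Qt (the metadata.db marker is an assumption standing in for LibraryDatabase2.exists_at):

    import os

    def exists_at(path):
        # Assumed stand-in: a calibre library is marked by its metadata.db file.
        return os.path.exists(os.path.join(path, 'metadata.db'))

    def is_library_dir_suitable(x):
        # Suitable == already a library, or an empty directory.
        return exists_at(x) or not os.listdir(x)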