Merge from trunk

This commit is contained in:
Charles Haley 2011-06-24 09:02:39 +01:00
commit 64cff177e9
4 changed files with 243 additions and 85 deletions

View File

@ -14,7 +14,7 @@ class LeTemps(BasicNewsRecipe):
title = u'Le Temps' title = u'Le Temps'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
__author__ = 'Sujata Raman' __author__ = 'Kovid Goyal'
description = 'French news. Needs a subscription from http://www.letemps.ch' description = 'French news. Needs a subscription from http://www.letemps.ch'
no_stylesheets = True no_stylesheets = True
remove_javascript = True remove_javascript = True
@ -27,6 +27,7 @@ class LeTemps(BasicNewsRecipe):
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open('http://www.letemps.ch/login') br.open('http://www.letemps.ch/login')
br.select_form(nr=1)
br['username'] = self.username br['username'] = self.username
br['password'] = self.password br['password'] = self.password
raw = br.submit().read() raw = br.submit().read()

View File

@ -1,96 +1,235 @@
#!/usr/bin/env python #!/usr/bin/env python
from __future__ import with_statement from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>' __copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
'2008, Anatoly Shipitsin <norguhtar at gmail.com>'
'''Read meta information from fb2 files''' '''Read meta information from fb2 files'''
import os import os
import datetime
from functools import partial
from base64 import b64decode from base64 import b64decode
from lxml import etree from lxml import etree
from calibre.ebooks.metadata import MetaInformation from calibre.utils.date import parse_date
from calibre import guess_all_extensions, prints, force_unicode
from calibre.ebooks.metadata import MetaInformation, check_isbn
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre import guess_all_extensions
XLINK_NS = 'http://www.w3.org/1999/xlink'
def XLINK(name):
return '{%s}%s' % (XLINK_NS, name)
NAMESPACES = {
'fb2' : 'http://www.gribuser.ru/xml/fictionbook/2.0',
'xlink' : 'http://www.w3.org/1999/xlink' }
XPath = partial(etree.XPath, namespaces=NAMESPACES)
tostring = partial(etree.tostring, method='text', encoding=unicode)
def get_metadata(stream):
    """ Return fb2 metadata as a L{MetaInformation} object """

    root = _get_fbroot(stream)

    book_title = _parse_book_title(root)
    authors = _parse_authors(root)

    # fallback for book_title: derive it from the stream's file name when
    # the document carries no <book-title> at all
    if book_title:
        book_title = unicode(book_title)
    else:
        book_title = force_unicode(os.path.splitext(
            os.path.basename(getattr(stream, 'name',
                _('Unknown'))))[0])
    mi = MetaInformation(book_title, authors)

    # Each optional field is parsed on a best-effort basis: a malformed
    # section must not prevent the rest of the metadata from being read,
    # so every parser runs under its own exception guard.
    for parse in (_parse_cover, _parse_comments, _parse_tags, _parse_series,
            _parse_isbn, _parse_publisher, _parse_pubdate, _parse_timestamp,
            _parse_language):
        try:
            parse(root, mi)
        except:
            pass

    #_parse_uuid(root, mi)

    #if DEBUG:
    #   prints(mi)
    return mi
def _parse_authors(root):
    """Return the list of author display names.

    Authors are taken from a single section only, in priority order
    <title-info>, <src-title-info>, <document-info>, so the result stays
    consistent; the later sections are only fallbacks.
    """
    authors = []
    for author_sec in ['title-info', 'src-title-info', 'document-info']:
        for au in XPath('//fb2:%s/fb2:author'%author_sec)(root):
            author = _parse_author(au)
            if author:
                authors.append(author)
        # Stop at the first section that yielded at least one author.
        # (The original tested the inner loop variable `author`, which is
        # unbound — NameError — when a section has no <author> elements.)
        if authors:
            break

    # if no author so far
    if not authors:
        authors.append(_('Unknown'))

    return authors
def _parse_author(elm_author):
    """ Return the display name built from a single <author> element """
    part = lambda tag: XPath('normalize-space(fb2:%s/text())' % tag)(elm_author)

    # assemble "first middle last", skipping whichever parts are absent
    pieces = (part('first-name'), part('middle-name'), part('last-name'))
    author = ' '.join(p for p in pieces if p).strip()

    # fallback to nickname when no real name parts were present
    if not author:
        nickname = part('nickname')
        if nickname:
            author = nickname

    return author
def _parse_book_title(root):
    # <title-info> has priority (it is actually mandatory); the others are
    # backup solutions (sequence matters, and differs from the fb2 doc)
    sources = ('//fb2:title-info/fb2:book-title/text()',
               '//fb2:publish-info/fb2:book-title/text()',
               '//fb2:src-title-info/fb2:book-title/text()')
    return XPath('normalize-space(%s)' % '|'.join(sources))(root)
def _parse_cover(root, mi):
    """Set mi.cover_data from the <coverpage> image, when one is present."""
    # the xlink:href attribute is of the form "#<binary id>"; pickup from
    # <title-info>, if not exists it fallbacks to <src-title-info>
    imgid = XPath('substring-after(string(//fb2:coverpage/fb2:image/@xlink:href), "#")')(root)
    if imgid:
        try:
            _parse_cover_data(root, imgid, mi)
        except Exception:
            # the cover is optional; a broken <binary> must not abort
            # metadata parsing (bare except would also trap KeyboardInterrupt)
            pass
def _parse_cover_data(root, imgid, mi):
    # locate the <binary> element that carries the base64 encoded cover
    binaries = XPath('//fb2:binary[@id="%s"]'%imgid)(root)
    if not binaries:
        return
    elm = binaries[0]
    mimetype = elm.get('content-type', 'image/jpeg')
    extensions = guess_all_extensions(mimetype)
    if not extensions:
        prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid) )
        return
    encoded = elm.text
    if encoded:
        # strip the leading dot from the extension before storing
        mi.cover_data = (extensions[0][1:], b64decode(encoded))
def _parse_tags(root, mi):
    # genres are read from a single section only to stay consistent:
    # <title-info> first, with <src-title-info> as the fallback
    for section in ('title-info', 'src-title-info'):
        # -- i18n Translations-- ?
        genres = XPath('//fb2:%s/fb2:genre/text()' % section)(root)
        if genres:
            mi.tags = [unicode(g) for g in genres]
            break
def _parse_series(root, mi):
    """Fill mi.series / mi.series_index from the first <sequence> element.

    calibre supports only one series, so the first sequence is used, in
    preferred order <title-info> then <publish-info> (<src-title-info> is
    deliberately ignored).
    """
    xp_ti = '//fb2:title-info/fb2:sequence[1]'
    xp_pi = '//fb2:publish-info/fb2:sequence[1]'

    elms_sequence = XPath('%s|%s' % (xp_ti, xp_pi))(root)
    if elms_sequence:
        mi.series = elms_sequence[0].get('name', None)
        if mi.series:
            # @number is a string in the XML, but series_index is numeric;
            # convert, ignoring absent or malformed values
            number = elms_sequence[0].get('number', None)
            try:
                mi.series_index = float(number)
            except (TypeError, ValueError):
                mi.series_index = None
def _parse_isbn(root, mi):
    """Set mi.isbn from <publish-info>, after validation with check_isbn."""
    isbn = XPath('normalize-space(//fb2:publish-info/fb2:isbn/text())')(root)
    # some people try to put several isbn in this field, but it is not
    # allowed; try to stick to the 1-st one in this case
    if ',' in isbn:
        isbn = isbn[:isbn.index(',')]
    if check_isbn(isbn):
        mi.isbn = isbn
def _parse_comments(root, mi):
    # the annotation comes from a single section only: <title-info>,
    # falling back to <src-title-info>
    for section in ('title-info', 'src-title-info'):
        annotations = XPath('//fb2:%s/fb2:annotation' % section)(root)
        if annotations:
            mi.comments = tostring(annotations[0])
            # TODO: tags i18n, xslt?
            break
def _parse_publisher(root, mi):
    """Copy the publisher name from <publish-info>, when present."""
    name = XPath('string(//fb2:publish-info/fb2:publisher/text())')(root)
    if name:
        mi.publisher = name
def _parse_pubdate(root, mi):
    """Derive mi.pubdate from the <year> in <publish-info>."""
    year = XPath('number(//fb2:publish-info/fb2:year/text())')(root)
    # XPath number() yields NaN when the element is missing or non-numeric,
    # and NaN.is_integer() is False, so this also guards against that case
    if year.is_integer():
        # only year is available, so use 1-st of Jan
        mi.pubdate = datetime.date(int(year), 1, 1)
def _parse_timestamp(root, mi):
    #<date value="1996-12-03">03.12.1996</date>
    # prefer the machine readable @value; fall back to the element text
    xp = ('//fb2:document-info/fb2:date/@value|'
          '//fb2:document-info/fb2:date/text()')
    docdate = XPath('string(%s)' % xp)(root)
    if docdate:
        mi.timestamp = parse_date(docdate)
def _parse_language(root, mi):
    """Record the <lang> code from <title-info> on the metadata object."""
    lang = XPath('string(//fb2:title-info/fb2:lang/text())')(root)
    if lang:
        mi.language = lang
        mi.languages = [lang]
def _parse_uuid(root, mi):
    """Set mi.uuid from the document id in <document-info>.

    NOTE(review): currently unused — the call in get_metadata is commented out.
    """
    # the original xpath wrote //document-info without the fb2: namespace
    # prefix, so it could never match the namespaced fb2 element
    uuid = XPath('normalize-space(//fb2:document-info/fb2:id/text())')(root)
    if uuid:
        mi.uuid = uuid
def _get_fbroot(stream):
    """Read the whole stream and return the parsed fb2 root element.

    A recovering parser is used so that mildly broken XML still yields a
    usable tree; network access during parsing is disabled.
    """
    parser = etree.XMLParser(recover=True, no_network=True)
    raw = stream.read()
    # normalize to unicode and strip any embedded encoding declarations so
    # lxml does not complain about them
    raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
    root = etree.fromstring(raw, parser=parser)
    return root
for au in XPath('//fb2:author')(root):
fname = lname = author = None
fe = XPath('descendant::fb2:first-name')(au)
if fe:
fname = tostring(fe[0])
author = fname
le = XPath('descendant::fb2:last-name')(au)
if le:
lname = tostring(le[0])
if author:
author += ' '+lname
else:
author = lname
if author:
authors.append(author)
if len(authors) == 1 and author is not None:
if lname:
author_sort = lname
if fname:
if author_sort: author_sort += ', '+fname
else: author_sort = fname
title = os.path.splitext(os.path.basename(getattr(stream, 'name',
_('Unknown'))))[0]
for x in XPath('//fb2:book-title')(root):
title = tostring(x)
break
comments = ''
for x in XPath('//fb2:annotation')(root):
comments += tostring(x)
if not comments:
comments = None
tags = list(map(tostring, XPath('//fb2:genre')(root)))
cp = XPath('//fb2:coverpage')(root)
cdata = None
if cp:
cimage = XPath('descendant::fb2:image[@xlink:href]')(cp[0])
if cimage:
id = cimage[0].get(XLINK('href')).replace('#', '')
binary = XPath('//fb2:binary[@id="%s"]'%id)(root)
if binary:
mt = binary[0].get('content-type', 'image/jpeg')
exts = guess_all_extensions(mt)
if not exts:
exts = ['.jpg']
cdata = (exts[0][1:], b64decode(tostring(binary[0])))
series = None
series_index = 1.0
for x in XPath('//fb2:sequence')(root):
series = x.get('name', None)
if series is not None:
series_index = x.get('number', 1.0)
break
mi = MetaInformation(title, authors)
mi.comments = comments
mi.author_sort = author_sort
if tags:
mi.tags = tags
mi.series = series
mi.series_index = series_index
if cdata:
mi.cover_data = cdata
return mi

View File

@ -591,8 +591,10 @@ class BooksView(QTableView): # {{{
fmt = prefs['output_format'] fmt = prefs['output_format']
def url_for_id(i): def url_for_id(i):
ans = db.format(i, fmt, index_is_id=True, as_path=True, try:
preserve_filename=True) ans = db.format_path(i, fmt, index_is_id=True)
except:
ans = None
if ans is None: if ans is None:
fmts = db.formats(i, index_is_id=True) fmts = db.formats(i, index_is_id=True)
if fmts: if fmts:
@ -600,13 +602,15 @@ class BooksView(QTableView): # {{{
else: else:
fmts = [] fmts = []
for f in fmts: for f in fmts:
ans = db.format(i, f, index_is_id=True, as_path=True, try:
preserve_filename=True) ans = db.format_path(i, f, index_is_id=True)
except:
ans = None
if ans is None: if ans is None:
ans = db.abspath(i, index_is_id=True) ans = db.abspath(i, index_is_id=True)
return QUrl.fromLocalFile(ans) return QUrl.fromLocalFile(ans)
md.setUrls([url_for_id(i) for i in selected[:25]]) md.setUrls([url_for_id(i) for i in selected])
drag = QDrag(self) drag = QDrag(self)
col = self.selectionModel().currentIndex().column() col = self.selectionModel().currentIndex().column()
md.column_name = self.column_map[col] md.column_name = self.column_map[col]

View File

@ -1144,6 +1144,20 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
break break
return sha.hexdigest() return sha.hexdigest()
def format_path(self, index, fmt, index_is_id=False):
    '''
    This method is intended to be used only in those rare situations, like
    Drag'n Drop, when you absolutely need the path to the original file.
    Otherwise, use format(..., as_path=True).

    Note that a networked backend will always return None.
    '''
    path = self.format_abspath(index, fmt, index_is_id=index_is_id)
    if path is not None:
        return path
    # no file on disk for this format: report it via the usual exception
    id_ = index if index_is_id else self.id(index)
    raise NoSuchFormat('Record %d has no format: %s'%(id_, fmt))
def format_abspath(self, index, format, index_is_id=False): def format_abspath(self, index, format, index_is_id=False):
''' '''
Return absolute path to the ebook file of format `format` Return absolute path to the ebook file of format `format`