mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-04 03:27:00 -05:00 
			
		
		
		
	FB2 Metadata Reader: Fix cover extraction, author_sort extraction, support multiple authors. Improved Irish Times recipe
This commit is contained in:
		
							parent
							
								
									49ab346619
								
							
						
					
					
						commit
						c30fb96ebf
					
				@ -5,50 +5,82 @@ __copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
'''Read meta information from fb2 files'''
 | 
					'''Read meta information from fb2 files'''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import mimetypes
 | 
					import mimetypes, os
 | 
				
			||||||
from base64 import b64decode
 | 
					from base64 import b64decode
 | 
				
			||||||
 | 
					from lxml import etree
 | 
				
			||||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
 | 
					 | 
				
			||||||
from calibre.ebooks.metadata import MetaInformation
 | 
					from calibre.ebooks.metadata import MetaInformation
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					XLINK_NS     = 'http://www.w3.org/1999/xlink'
 | 
				
			||||||
 | 
					def XLINK(name):
 | 
				
			||||||
 | 
					    return '{%s}%s' % (XLINK_NS, name)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_metadata(stream):
 | 
					def get_metadata(stream):
 | 
				
			||||||
    """ Return metadata as a L{MetaInfo} object """
 | 
					    """ Return metadata as a L{MetaInfo} object """
 | 
				
			||||||
    soup =  BeautifulStoneSoup(stream.read())
 | 
					    XPath = lambda x : etree.XPath(x,
 | 
				
			||||||
    firstname = soup.find("first-name").contents[0]
 | 
					            namespaces={'fb2':'http://www.gribuser.ru/xml/fictionbook/2.0',
 | 
				
			||||||
    lastname = soup.find("last-name").contents[0]
 | 
					                'xlink':XLINK_NS})
 | 
				
			||||||
    author= [firstname+" "+lastname]
 | 
					    tostring = lambda x : etree.tostring(x, method='text',
 | 
				
			||||||
    title = soup.find("book-title").string
 | 
					            encoding=unicode).strip()
 | 
				
			||||||
    comments = soup.find("annotation")
 | 
					    root = etree.fromstring(stream.read())
 | 
				
			||||||
    tags = soup.findAll('genre')
 | 
					    authors, author_sort = [], None
 | 
				
			||||||
    tags = [t.contents[0] for t in tags]
 | 
					    for au in XPath('//fb2:author')(root):
 | 
				
			||||||
    cp = soup.find('coverpage')
 | 
					        fname = lname = author = None
 | 
				
			||||||
 | 
					        fe = XPath('descendant::fb2:first-name')(au)
 | 
				
			||||||
 | 
					        if fe:
 | 
				
			||||||
 | 
					            fname = tostring(fe[0])
 | 
				
			||||||
 | 
					            author = fname
 | 
				
			||||||
 | 
					        le = XPath('descendant::fb2:last-name')(au)
 | 
				
			||||||
 | 
					        if le:
 | 
				
			||||||
 | 
					            lname = tostring(le[0])
 | 
				
			||||||
 | 
					            author += ' '+lname
 | 
				
			||||||
 | 
					        if author:
 | 
				
			||||||
 | 
					            authors.append(author)
 | 
				
			||||||
 | 
					        if len(authors) == 1 and author is not None:
 | 
				
			||||||
 | 
					            if lname:
 | 
				
			||||||
 | 
					                author_sort = lname
 | 
				
			||||||
 | 
					            if fname:
 | 
				
			||||||
 | 
					                if author_sort: author_sort += ', '+fname
 | 
				
			||||||
 | 
					                else: author_sort = fname
 | 
				
			||||||
 | 
					    title = os.path.splitext(os.path.basename(getattr(stream, 'name',
 | 
				
			||||||
 | 
					        _('Unknown'))))[0]
 | 
				
			||||||
 | 
					    for x in XPath('//fb2:book-title')(root):
 | 
				
			||||||
 | 
					        title = tostring(x)
 | 
				
			||||||
 | 
					    comments = ''
 | 
				
			||||||
 | 
					    for x in XPath('//fb2:annotation')(root):
 | 
				
			||||||
 | 
					        comments += tostring(x)
 | 
				
			||||||
 | 
					    if not comments:
 | 
				
			||||||
 | 
					        comments = None
 | 
				
			||||||
 | 
					    tags = list(map(tostring, XPath('//fb2:genre')(root)))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cp = XPath('//fb2:coverpage')(root)
 | 
				
			||||||
    cdata = None
 | 
					    cdata = None
 | 
				
			||||||
    if cp:
 | 
					    if cp:
 | 
				
			||||||
        cimage = cp.find('image', attrs={'l:href':True})
 | 
					        cimage = XPath('descendant::fb2:image[@xlink:href]')(cp[0])
 | 
				
			||||||
        if cimage:
 | 
					        if cimage:
 | 
				
			||||||
            id = cimage['l:href'].replace('#', '')
 | 
					            id = cimage[0].get(XLINK('href')).replace('#', '')
 | 
				
			||||||
            binary = soup.find('binary', id=id, attrs={'content-type':True})
 | 
					            binary = XPath('//fb2:binary[@id="%s"]'%id)(root)
 | 
				
			||||||
            if binary:
 | 
					            if binary:
 | 
				
			||||||
                mt = binary['content-type']
 | 
					                mt = binary[0].get('content-type', 'image/jpeg')
 | 
				
			||||||
                exts = mimetypes.guess_all_extensions(mt)
 | 
					                exts = mimetypes.guess_all_extensions(mt)
 | 
				
			||||||
                if not exts:
 | 
					                if not exts:
 | 
				
			||||||
                    exts = ['.jpg']
 | 
					                    exts = ['.jpg']
 | 
				
			||||||
                cdata = (exts[0][1:], b64decode(binary.string.strip()))
 | 
					                cdata = (exts[0][1:], b64decode(tostring(binary[0])))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if comments:
 | 
					    series = None
 | 
				
			||||||
        comments = u''.join(comments.findAll(text=True))
 | 
					    series_index = 1.0
 | 
				
			||||||
    series = soup.find("sequence")
 | 
					    for x in XPath('//fb2:sequence')(root):
 | 
				
			||||||
    mi = MetaInformation(title, author)
 | 
					        series = x.get('name', None)
 | 
				
			||||||
 | 
					        if series is not None:
 | 
				
			||||||
 | 
					            series_index = x.get('number', 1.0)
 | 
				
			||||||
 | 
					            break
 | 
				
			||||||
 | 
					    mi = MetaInformation(title, authors)
 | 
				
			||||||
    mi.comments = comments
 | 
					    mi.comments = comments
 | 
				
			||||||
    mi.author_sort = lastname+'; '+firstname
 | 
					    mi.author_sort = author_sort
 | 
				
			||||||
    if tags:
 | 
					    if tags:
 | 
				
			||||||
        mi.tags = tags
 | 
					        mi.tags = tags
 | 
				
			||||||
    if series:
 | 
					    mi.series = series
 | 
				
			||||||
        mi.series = series.get('name', None)
 | 
					    mi.series_index = series_index
 | 
				
			||||||
        try:
 | 
					 | 
				
			||||||
            mi.series_index = float(series.get('number', None))
 | 
					 | 
				
			||||||
        except (TypeError, ValueError):
 | 
					 | 
				
			||||||
            pass
 | 
					 | 
				
			||||||
    if cdata:
 | 
					    if cdata:
 | 
				
			||||||
        mi.cover_data = cdata
 | 
					        mi.cover_data = cdata
 | 
				
			||||||
    return mi
 | 
					    return mi
 | 
				
			||||||
 | 
				
			|||||||
@ -1,17 +1,20 @@
 | 
				
			|||||||
__license__   = 'GPL v3'
 | 
					__license__   = 'GPL v3'
 | 
				
			||||||
__copyright__ = '2008, Derry FitzGerald'
 | 
					__copyright__ = '2008, Derry FitzGerald. 2009 Modified by Ray Kinsella'
 | 
				
			||||||
'''
 | 
					'''
 | 
				
			||||||
irishtimes.com
 | 
					irishtimes.com
 | 
				
			||||||
'''
 | 
					'''
 | 
				
			||||||
 | 
					import re
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from calibre.web.feeds.news import BasicNewsRecipe
 | 
					from calibre.web.feeds.news import BasicNewsRecipe
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class IrishTimes(BasicNewsRecipe):
 | 
					class IrishTimes(BasicNewsRecipe):
 | 
				
			||||||
    title          = u'The Irish Times'
 | 
					    title          = u'The Irish Times'
 | 
				
			||||||
    __author__     = 'Derry FitzGerald'
 | 
					    __author__     = 'Derry FitzGerald and Ray Kinsella'
 | 
				
			||||||
    language = _('English')
 | 
					    language = _('English')
 | 
				
			||||||
    no_stylesheets = True
 | 
					    no_stylesheets = True
 | 
				
			||||||
 | 
					    simultaneous_downloads= 1
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
 | 
					    r 			   = re.compile('.*(?P<url>http:\/\/www.irishtimes.com\/.*\.html).*')
 | 
				
			||||||
    remove_tags    = [dict(name='div', attrs={'class':'footer'})]
 | 
					    remove_tags    = [dict(name='div', attrs={'class':'footer'})]
 | 
				
			||||||
    extra_css      = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt  }' 
 | 
					    extra_css      = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt  }' 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -24,14 +27,13 @@ class IrishTimes(BasicNewsRecipe):
 | 
				
			|||||||
                      ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
 | 
					                      ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
 | 
				
			||||||
                      ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
 | 
					                      ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
 | 
				
			||||||
                      ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
 | 
					                      ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
 | 
				
			||||||
                      ('Health', 'http://www.irishtimes.com/feeds/rss/newspaper/health.rss'),
 | 
					 | 
				
			||||||
                      ('Education and Parenting', 'http://www.irishtimes.com/feeds/rss/newspaper/education.rss'),
 | 
					 | 
				
			||||||
                      ('Science Today', 'http://www.irishtimes.com/feeds/rss/newspaper/sciencetoday.rss'),
 | 
					 | 
				
			||||||
                      ('The Ticket', 'http://www.irishtimes.com/feeds/rss/newspaper/theticket.rss'),
 | 
					 | 
				
			||||||
                      ('Weekend', 'http://www.irishtimes.com/feeds/rss/newspaper/weekend.rss'),
 | 
					 | 
				
			||||||
                      ('News Features', 'http://www.irishtimes.com/feeds/rss/newspaper/newsfeatures.rss'),
 | 
					 | 
				
			||||||
                      ('Magazine', 'http://www.irishtimes.com/feeds/rss/newspaper/magazine.rss'),
 | 
					 | 
				
			||||||
                    ]
 | 
					                    ]
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def print_version(self, url):
 | 
					    def print_version(self, url):
 | 
				
			||||||
        return url.replace('.html', '_pf.html')
 | 
					        return url.replace('.html', '_pf.html')
 | 
				
			||||||
 | 
							
 | 
				
			||||||
 | 
					    def get_article_url(self, article):
 | 
				
			||||||
 | 
					        m = self.r.match(article.get('description',  None))
 | 
				
			||||||
 | 
					        print m.group('url')
 | 
				
			||||||
 | 
					        return m.group('url')
 | 
				
			||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user