mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-03 19:17:02 -05:00 
			
		
		
		
	Implementing parsing of manifest, spine and toc in OPFReader
This commit is contained in:
		
							parent
							
								
									c1b6d4c136
								
							
						
					
					
						commit
						bea1ea2ef5
					
				@ -14,22 +14,91 @@
 | 
				
			|||||||
##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
					##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
'''Read/Write metadata from Open Packaging Format (.opf) files.'''
 | 
					'''Read/Write metadata from Open Packaging Format (.opf) files.'''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import sys, re
 | 
					import sys, re, os
 | 
				
			||||||
 | 
					from urllib import unquote
 | 
				
			||||||
 | 
					from urlparse import urlparse
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from libprs500.ebooks.metadata import MetaInformation
 | 
					from libprs500.ebooks.metadata import MetaInformation
 | 
				
			||||||
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
 | 
					from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup, NavigableString
 | 
				
			||||||
from libprs500.ebooks.lrf import entity_to_unicode
 | 
					from libprs500.ebooks.lrf import entity_to_unicode
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class ManifestItem(object):
					    '''
					    A single <item> entry of the OPF <manifest>.

					    Attributes set by the constructor:
					      id         -- value of the ``id`` attribute, '' when absent
					      href       -- path component of the unquoted ``href`` attribute,
					                    made absolute against `cwd` and normalised;
					                    '' when the attribute is absent or has no path
					      media_type -- value of the ``media-type`` attribute, '' when absent
					    '''

					    def __init__(self, item, cwd):
					        '''
					        @param item: Tag-like object supporting ``has_key`` and ``item[name]``
					        @param cwd:  Directory against which a relative href is resolved
					        '''
					        self.id = item['id'] if item.has_key('id') else ''
					        self.media_type = item['media-type'] if item.has_key('media-type') else ''

					        href = ''
					        if item.has_key('href'):
					            # Index 2 of the urlparse result is the path: scheme, host,
					            # query and fragment are deliberately discarded.
					            href = urlparse(unquote(item['href']))[2]
					        # Only resolve a real path. Joining '' with cwd and normalising it
					        # would make href point at the directory itself.
					        if href:
					            if not os.path.isabs(href):
					                href = os.path.join(cwd, href)
					            href = os.path.normpath(href)
					        self.href = href

					    def __unicode__(self):
					        '''Render the item as an OPF <item/> element.'''
					        return u'<item id="%s" href="%s" media-type="%s" />'%(self.id, self.href, self.media_type)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Manifest(list):
					    '''
					    List of L{ManifestItem} objects, one per <item> found inside the
					    <manifest> element of an OPF document.
					    '''

					    def __init__(self, soup, dir):
					        '''
					        @param soup: Parsed OPF document (must provide ``find``)
					        @param dir:  Directory passed to each ManifestItem for resolving hrefs
					        '''
					        node = soup.find('manifest')
					        if node is None:
					            # No <manifest> element: stay an empty list.
					            return
					        self.extend([ManifestItem(tag, dir) for tag in node.findAll('item')])

					    def item(self, id):
					        '''
					        Return the first entry whose ``id`` attribute equals `id`,
					        or None when there is no such entry.
					        '''
					        for entry in self:
					            if entry.id != id:
					                continue
					            return entry
					        return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Spine(list):
					    '''
					    List of the ``idref`` values of the <itemref> elements found inside
					    the <spine> element of an OPF document, in document order.
					    '''

					    def __init__(self, soup, manifest):
					        '''
					        @param soup:     Parsed OPF document (must provide ``find``)
					        @param manifest: Object whose ``item(id)`` method is used by L{items}
					        '''
					        self.manifest = manifest
					        node = soup.find('spine')
					        if node is None:
					            # No <spine> element: stay an empty list.
					            return
					        # <itemref> elements without an idref attribute are ignored.
					        self.extend([ref['idref'] for ref in node.findAll('itemref')
					                     if ref.has_key('idref')])

					    def items(self):
					        '''
					        Lazily yield ``self.manifest.item(idref)`` for every idref
					        stored in this list.
					        '''
					        return (self.manifest.item(idref) for idref in self)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class TOC(list):
					    '''
					    Table of contents of an OPF package, stored as a list of
					    ``(href, text)`` tuples, one per <a href="..."> element of the TOC
					    document.

					    The TOC document is the target of the <guide> <reference> whose type
					    is "toc". When that lookup fails, the first manifest item whose href
					    contains "toc" (case-insensitively) is used instead.

					    ``self.toc`` holds the path of the TOC document, or None when the
					    package does not identify one (the list is then empty).
					    '''

					    def __init__(self, opfreader, cwd):
					        '''
					        @param opfreader: Object providing ``soup`` (parsed OPF) and ``manifest``
					        @param cwd:       Directory against which relative hrefs are resolved
					        @raise IOError:   If the identified TOC document cannot be read
					        '''
					        self.toc = None
					        toc = None
					        try:
					            toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
					        except (AttributeError, TypeError, KeyError):
					            # AttributeError: no <guide>; TypeError: no toc <reference>;
					            # KeyError: <reference> without href. Fall back to the manifest.
					            for item in opfreader.manifest:
					                if 'toc' in item.href.lower():
					                    toc = item.href
					                    break
					        if toc is None:
					            # Nothing identifies a TOC document: leave the list empty
					            # rather than failing on an unbound name.
					            return

					        # Keep only the path component of the (unquoted) reference.
					        toc = urlparse(unquote(toc))[2]
					        if not os.path.isabs(toc):
					            toc = os.path.join(cwd, toc)
					        self.toc = toc

					        # try/finally instead of ``with`` so the file is always closed
					        # without requiring a __future__ import on older interpreters.
					        stream = open(toc, 'rb')
					        try:
					            raw = stream.read()
					        finally:
					            stream.close()

					        soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.HTML_ENTITIES)
					        for a in soup.findAll('a'):
					            if not a.has_key('href'):
					                continue
					            href = urlparse(unquote(a['href']))[2]
					            # NOTE(review): links are resolved against cwd, not against the
					            # directory of the TOC document -- confirm this is intended.
					            if not os.path.isabs(href):
					                href = os.path.join(cwd, href)
					            txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
					            self.append((href, txt))
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class OPFReader(MetaInformation):
 | 
					class OPFReader(MetaInformation):
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    ENTITY_PATTERN = re.compile(r'&(\S+);')
 | 
					    ENTITY_PATTERN = re.compile(r'&(\S+);')
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    def __init__(self, stream):
 | 
					    def __init__(self, stream, dir=os.getcwd()):
 | 
				
			||||||
        self.default_title = stream.name if hasattr(stream, 'name') else 'Unknown' 
 | 
					        self.default_title = stream.name if hasattr(stream, 'name') else 'Unknown' 
 | 
				
			||||||
        if hasattr(stream, 'seek'):
 | 
					        if hasattr(stream, 'seek'):
 | 
				
			||||||
            stream.seek(0)
 | 
					            stream.seek(0)
 | 
				
			||||||
        self.soup = BeautifulStoneSoup(stream.read())
 | 
					        self.soup = BeautifulStoneSoup(stream.read())
 | 
				
			||||||
        self.series = self.series_index = self.rating = None
 | 
					        self.series = self.series_index = self.rating = None
 | 
				
			||||||
 | 
					        self.manifest = Manifest(self.soup, dir)
 | 
				
			||||||
 | 
					        self.spine = Spine(self.soup, self.manifest)
 | 
				
			||||||
 | 
					        self.toc = TOC(self, dir)
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
    @apply
 | 
					    @apply
 | 
				
			||||||
    def title():
 | 
					    def title():
 | 
				
			||||||
@ -153,6 +222,7 @@ class OPFReader(MetaInformation):
 | 
				
			|||||||
    
 | 
					    
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
def main(args=sys.argv):
    '''
    Command line entry point: parse the OPF file named by ``args[1]``.

    @param args: Argument vector; ``args[0]`` is the program name
    @return: 0 on success, 1 when no OPF file was named
    '''
    if len(args) < 2:
        prog = args[0] if args else 'opf'
        sys.stderr.write('Usage: %s file.opf\n' % prog)
        return 1
    stream = open(args[1], 'rb')
    try:
        # Constructing the reader parses the metadata, manifest, spine and TOC.
        OPFReader(stream)
    finally:
        # Release the file handle even when parsing fails.
        stream.close()
    return 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == '__main__':
 | 
					if __name__ == '__main__':
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user