mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Refactored OPF creation code. Implemented Table of Contents support in feeds2disk.
This commit is contained in:
parent
748c184ccb
commit
6982652f92
6
Makefile
6
Makefile
@ -1,6 +1,6 @@
|
|||||||
PYTHON = python
|
PYTHON = python
|
||||||
|
|
||||||
all : gui2 translations
|
all : gui2 translations resources
|
||||||
|
|
||||||
clean :
|
clean :
|
||||||
cd src/libprs500/gui2 && ${PYTHON} make.py clean
|
cd src/libprs500/gui2 && ${PYTHON} make.py clean
|
||||||
@ -14,3 +14,7 @@ test : gui2
|
|||||||
translations :
|
translations :
|
||||||
cd src/libprs500 && ${PYTHON} translations/__init__.py
|
cd src/libprs500 && ${PYTHON} translations/__init__.py
|
||||||
|
|
||||||
|
resources:
|
||||||
|
${PYTHON} resources.py
|
||||||
|
|
||||||
|
|
||||||
|
39
resources.py
Normal file
39
resources.py
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
|
||||||
|
## This program is free software; you can redistribute it and/or modify
|
||||||
|
## it under the terms of the GNU General Public License as published by
|
||||||
|
## the Free Software Foundation; either version 2 of the License, or
|
||||||
|
## (at your option) any later version.
|
||||||
|
##
|
||||||
|
## This program is distributed in the hope that it will be useful,
|
||||||
|
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
## GNU General Public License for more details.
|
||||||
|
##
|
||||||
|
## You should have received a copy of the GNU General Public License along
|
||||||
|
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
'''
|
||||||
|
Compile resource files.
|
||||||
|
'''
|
||||||
|
import os, sys
|
||||||
|
sys.path.insert(1, os.path.join(os.getcwd(), 'src'))
|
||||||
|
from libprs500 import __appname__
|
||||||
|
|
||||||
|
RESOURCES = dict(
|
||||||
|
opf_template = '%p/ebooks/metadata/opf.xml',
|
||||||
|
ncx_template = '%p/ebooks/metadata/ncx.xml',
|
||||||
|
)
|
||||||
|
|
||||||
|
def main(args=sys.argv):
    '''
    Compile every file listed in RESOURCES into the generated module
    src/<appname>/resources.py.

    Each entry is written as a module level assignment of the form
    C{key = repr(contents of the file)} followed by a blank line.

    @param args: Command line arguments. Currently unused.
    @type args: list of strings
    @return: 0 on success. The script entry point passes this to sys.exit.
    '''
    package_dir = os.path.join('src', __appname__)
    chunks = []
    # Iterate in sorted order so that the generated module is identical from
    # run to run instead of depending on dict ordering.
    for key, value in sorted(RESOURCES.items()):
        path = value.replace('%p', package_dir)
        source = open(path, 'rb')
        try:
            raw = source.read()
        finally:
            # Close explicitly rather than relying on garbage collection
            source.close()
        chunks.append(key + ' = ' + repr(raw) + '\n\n')

    # os.path.join avoids the doubled separator the hand built path contained
    target = open(os.path.join(package_dir, 'resources.py'), 'wb')
    try:
        target.write(''.join(chunks))
    finally:
        target.close()
    return 0
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.exit(main())
|
@ -60,6 +60,8 @@ def update_css(ncss, ocss):
|
|||||||
def munge_paths(basepath, url):
|
def munge_paths(basepath, url):
|
||||||
purl = urlparse(unquote(url),)
|
purl = urlparse(unquote(url),)
|
||||||
path, fragment = purl[2], purl[5]
|
path, fragment = purl[2], purl[5]
|
||||||
|
if path:
|
||||||
|
path = path.replace('/', os.sep)
|
||||||
if not path:
|
if not path:
|
||||||
path = basepath
|
path = basepath
|
||||||
elif not os.path.isabs(path):
|
elif not os.path.isabs(path):
|
||||||
@ -223,7 +225,6 @@ class HTMLConverter(object):
|
|||||||
self.extra_toc_entries = [] #: TOC entries gleaned from semantic information
|
self.extra_toc_entries = [] #: TOC entries gleaned from semantic information
|
||||||
self.image_memory = []
|
self.image_memory = []
|
||||||
self.id_counter = 0
|
self.id_counter = 0
|
||||||
self.toc_from_metadata = False #: If True means that the toc has been populated from metadata
|
|
||||||
self.unused_target_blocks = [] #: Used to remove extra TextBlocks
|
self.unused_target_blocks = [] #: Used to remove extra TextBlocks
|
||||||
self.link_level = 0 #: Current link level
|
self.link_level = 0 #: Current link level
|
||||||
self.memory = [] #: Used to ensure that duplicate CSS unhandled erros are not reported
|
self.memory = [] #: Used to ensure that duplicate CSS unhandled erros are not reported
|
||||||
@ -543,7 +544,7 @@ class HTMLConverter(object):
|
|||||||
|
|
||||||
path, fragment = munge_paths(self.target_prefix, tag['href'])
|
path, fragment = munge_paths(self.target_prefix, tag['href'])
|
||||||
return {'para':para, 'text':text, 'path':os.path.abspath(path),
|
return {'para':para, 'text':text, 'path':os.path.abspath(path),
|
||||||
'fragment':fragment, 'in toc': (self.link_level == 0 and not self.toc_from_metadata)}
|
'fragment':fragment, 'in toc': (self.link_level == 0 and not self.use_spine)}
|
||||||
|
|
||||||
|
|
||||||
def get_text(self, tag, limit=None):
|
def get_text(self, tag, limit=None):
|
||||||
@ -637,13 +638,12 @@ class HTMLConverter(object):
|
|||||||
return outside_links
|
return outside_links
|
||||||
|
|
||||||
def create_toc(self, toc):
|
def create_toc(self, toc):
|
||||||
for (path, fragment, txt) in toc:
|
for item in toc.top_level_items():
|
||||||
ascii_text = txt.encode('ascii', 'ignore') # Bug in SONY LRF renderer
|
ascii_text = item.text.encode('ascii', 'ignore') # Bug in SONY LRF renderer
|
||||||
self.toc_from_metadata = True
|
if not item.fragment and item.abspath in self.tops:
|
||||||
if not fragment and path in self.tops:
|
self.book.addTocEntry(ascii_text, self.tops[item.abspath])
|
||||||
self.book.addTocEntry(ascii_text, self.tops[path])
|
|
||||||
else:
|
else:
|
||||||
url = path+fragment
|
url = item.abspath+item.fragment
|
||||||
if url in self.targets:
|
if url in self.targets:
|
||||||
self.book.addTocEntry(ascii_text, self.targets[url])
|
self.book.addTocEntry(ascii_text, self.targets[url])
|
||||||
|
|
||||||
@ -1846,6 +1846,7 @@ def try_opf(path, options, logger):
|
|||||||
options.cover = None
|
options.cover = None
|
||||||
cover = opf.cover
|
cover = opf.cover
|
||||||
if cover:
|
if cover:
|
||||||
|
cover = cover.replace('/', os.sep)
|
||||||
if not os.path.isabs(cover):
|
if not os.path.isabs(cover):
|
||||||
cover = os.path.join(dirpath, cover)
|
cover = os.path.join(dirpath, cover)
|
||||||
if os.access(cover, os.R_OK):
|
if os.access(cover, os.R_OK):
|
||||||
|
@ -65,7 +65,7 @@ class LRFConverter(object):
|
|||||||
def create_metadata(self):
|
def create_metadata(self):
|
||||||
self.logger.info('Reading metadata...')
|
self.logger.info('Reading metadata...')
|
||||||
mi = get_metadata(self.lrf)
|
mi = get_metadata(self.lrf)
|
||||||
self.opf = OPFCreator(mi)
|
self.opf = OPFCreator(self.output_dir, mi)
|
||||||
|
|
||||||
def create_page_styles(self):
|
def create_page_styles(self):
|
||||||
self.page_css = ''
|
self.page_css = ''
|
||||||
|
@ -45,12 +45,13 @@ class MetaInformation(object):
|
|||||||
ans = MetaInformation(mi.title, mi.authors)
|
ans = MetaInformation(mi.title, mi.authors)
|
||||||
for attr in ('author_sort', 'title_sort', 'comments', 'category',
|
for attr in ('author_sort', 'title_sort', 'comments', 'category',
|
||||||
'publisher', 'series', 'series_index', 'rating',
|
'publisher', 'series', 'series_index', 'rating',
|
||||||
'isbn', 'tags', 'cover_data', 'libprs_id'):
|
'isbn', 'tags', 'cover_data', 'application_id',
|
||||||
|
'manifest', 'spine', 'toc', 'cover'):
|
||||||
if hasattr(mi, attr):
|
if hasattr(mi, attr):
|
||||||
setattr(ans, attr, getattr(mi, attr))
|
setattr(ans, attr, getattr(mi, attr))
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, title, authors):
|
def __init__(self, title, authors=['Unknown']):
|
||||||
'''
|
'''
|
||||||
@param title: title or "Unknown" or a MetaInformation object
|
@param title: title or "Unknown" or a MetaInformation object
|
||||||
@param authors: List of strings or []
|
@param authors: List of strings or []
|
||||||
@ -76,8 +77,11 @@ class MetaInformation(object):
|
|||||||
self.isbn = None if not mi else mi.isbn
|
self.isbn = None if not mi else mi.isbn
|
||||||
self.tags = [] if not mi else mi.tags
|
self.tags = [] if not mi else mi.tags
|
||||||
self.cover_data = mi.cover_data if (mi and hasattr(mi, 'cover_data')) else (None, None)
|
self.cover_data = mi.cover_data if (mi and hasattr(mi, 'cover_data')) else (None, None)
|
||||||
self.libprs_id = mi.libprs_id if (mi and hasattr(mi, 'libprs_id')) else None
|
self.application_id = mi.application_id if (mi and hasattr(mi, 'application_id')) else None
|
||||||
|
self.manifest = getattr(mi, 'manifest', None)
|
||||||
|
self.toc = getattr(mi, 'toc', None)
|
||||||
|
self.spine = getattr(mi, 'spine', None)
|
||||||
|
self.cover = getattr(mi, 'cover', None)
|
||||||
|
|
||||||
def smart_update(self, mi):
|
def smart_update(self, mi):
|
||||||
'''
|
'''
|
||||||
@ -92,7 +96,7 @@ class MetaInformation(object):
|
|||||||
|
|
||||||
for attr in ('author_sort', 'title_sort', 'comments', 'category',
|
for attr in ('author_sort', 'title_sort', 'comments', 'category',
|
||||||
'publisher', 'series', 'series_index', 'rating',
|
'publisher', 'series', 'series_index', 'rating',
|
||||||
'isbn', 'libprs_id'):
|
'isbn', 'application_id', 'manifest', 'spine', 'toc', 'cover'):
|
||||||
if hasattr(mi, attr):
|
if hasattr(mi, attr):
|
||||||
val = getattr(mi, attr)
|
val = getattr(mi, attr)
|
||||||
if val is not None:
|
if val is not None:
|
||||||
|
@ -51,7 +51,7 @@ def metadata_from_formats(formats):
|
|||||||
ext = path_to_ext(path)
|
ext = path_to_ext(path)
|
||||||
stream = open(path, 'rb')
|
stream = open(path, 'rb')
|
||||||
mi.smart_update(get_metadata(stream, stream_type=ext, use_libprs_metadata=True))
|
mi.smart_update(get_metadata(stream, stream_type=ext, use_libprs_metadata=True))
|
||||||
if getattr(mi, 'libprs_id', None) is not None:
|
if getattr(mi, 'application_id', None) is not None:
|
||||||
return mi
|
return mi
|
||||||
|
|
||||||
return mi
|
return mi
|
||||||
@ -69,7 +69,7 @@ def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False):
|
|||||||
if os.access(c, os.R_OK):
|
if os.access(c, os.R_OK):
|
||||||
opf = opf_metadata(os.path.abspath(c))
|
opf = opf_metadata(os.path.abspath(c))
|
||||||
|
|
||||||
if use_libprs_metadata and getattr(opf, 'libprs_id', None) is not None:
|
if use_libprs_metadata and getattr(opf, 'application_id', None) is not None:
|
||||||
return opf
|
return opf
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -147,7 +147,7 @@ def opf_metadata(opfpath):
|
|||||||
f = open(opfpath, 'rb')
|
f = open(opfpath, 'rb')
|
||||||
opf = OPFReader(f, os.path.dirname(opfpath))
|
opf = OPFReader(f, os.path.dirname(opfpath))
|
||||||
try:
|
try:
|
||||||
if opf.libprs_id is not None:
|
if opf.application_id is not None:
|
||||||
mi = MetaInformation(opf, None)
|
mi = MetaInformation(opf, None)
|
||||||
if hasattr(opf, 'cover') and opf.cover:
|
if hasattr(opf, 'cover') and opf.cover:
|
||||||
cpath = os.path.join(os.path.dirname(opfpath), opf.cover)
|
cpath = os.path.join(os.path.dirname(opfpath), opf.cover)
|
||||||
|
27
src/libprs500/ebooks/metadata/ncx.xml
Normal file
27
src/libprs500/ebooks/metadata/ncx.xml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
<ncx version="2005-1"
|
||||||
|
xml:lang="en"
|
||||||
|
xmlns="http://www.daisy.org/z3986/2005/ncx/"
|
||||||
|
xmlns:py="http://genshi.edgewall.org/"
|
||||||
|
>
|
||||||
|
<head>
|
||||||
|
<meta name="dtb:uid" content="${uid}"/>
|
||||||
|
<meta name="dtb:depth" content="${toc.depth()}"/>
|
||||||
|
<meta name="dtb:generator" content="${__appname__}"/>
|
||||||
|
<meta name="dtb:totalPageCount" content="0"/>
|
||||||
|
<meta name="dtb:maxPageNumber" content="0"/>
|
||||||
|
</head>
|
||||||
|
<docTitle><text>Table of Contents</text></docTitle>
|
||||||
|
|
||||||
|
<py:def function="navpoint(np, level)">
|
||||||
|
${'%*s'%(4*level,'')}<navPoint playOrder="${str(np.play_order)}">
|
||||||
|
${'%*s'%(4*level,'')}<navLabel>
|
||||||
|
${'%*s'%(4*level,'')}<text>${np.text}</text>
|
||||||
|
${'%*s'%(4*level,'')}</navLabel>
|
||||||
|
${'%*s'%(4*level,'')}<content src="${str(np.href)+(('#' + str(np.fragment)) if np.fragment else '')}" />
|
||||||
|
<py:for each="np2 in np">${navpoint(np2, level+1)}</py:for>
|
||||||
|
${'%*s'%(4*level,'')}</navPoint>
|
||||||
|
</py:def>
|
||||||
|
<navMap>
|
||||||
|
<py:for each="np in toc">${navpoint(np, 0)}</py:for>
|
||||||
|
</navMap>
|
||||||
|
</ncx>
|
@ -12,18 +12,21 @@
|
|||||||
## You should have received a copy of the GNU General Public License along
|
## You should have received a copy of the GNU General Public License along
|
||||||
## with this program; if not, write to the Free Software Foundation, Inc.,
|
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
import uuid
|
||||||
'''Read/Write metadata from Open Packaging Format (.opf) files.'''
|
'''Read/Write metadata from Open Packaging Format (.opf) files.'''
|
||||||
|
|
||||||
import sys, re, os, glob
|
import sys, re, os, mimetypes
|
||||||
from urllib import unquote
|
from urllib import unquote
|
||||||
from urlparse import urlparse
|
from urlparse import urlparse
|
||||||
import xml.dom.minidom as dom
|
import xml.dom.minidom as dom
|
||||||
from itertools import repeat
|
from itertools import repeat
|
||||||
|
|
||||||
|
from libprs500 import __appname__
|
||||||
from libprs500.ebooks.metadata import MetaInformation
|
from libprs500.ebooks.metadata import MetaInformation
|
||||||
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
|
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||||
from libprs500.ebooks.lrf import entity_to_unicode
|
from libprs500.ebooks.lrf import entity_to_unicode
|
||||||
from libprs500.ebooks.metadata import get_parser
|
from libprs500.ebooks.metadata import get_parser
|
||||||
|
from libprs500.ebooks.metadata.toc import TOC
|
||||||
|
|
||||||
class ManifestItem(object):
|
class ManifestItem(object):
|
||||||
def __init__(self, item, cwd):
|
def __init__(self, item, cwd):
|
||||||
@ -41,6 +44,14 @@ class ManifestItem(object):
|
|||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
return u'<item id="%s" href="%s" media-type="%s" />'%(self.id, self.href, self.media_type)
|
return u'<item id="%s" href="%s" media-type="%s" />'%(self.id, self.href, self.media_type)
|
||||||
|
|
||||||
|
def __getitem__(self, index):
|
||||||
|
if index == 0:
|
||||||
|
return self.href
|
||||||
|
if index == 1:
|
||||||
|
return self.media_type
|
||||||
|
raise IndexError('%d out of bounds.'%index)
|
||||||
|
|
||||||
|
|
||||||
class Manifest(list):
|
class Manifest(list):
|
||||||
|
|
||||||
def __init__(self, soup, dir):
|
def __init__(self, soup, dir):
|
||||||
@ -82,84 +93,10 @@ class Spine(object):
|
|||||||
for i in self.linear_ids + self.nonlinear_ids:
|
for i in self.linear_ids + self.nonlinear_ids:
|
||||||
yield self.manifest.item(i)
|
yield self.manifest.item(i)
|
||||||
|
|
||||||
class TOC(list):
|
def __iter__(self):
|
||||||
|
for i in self.linear_ids + self.nonlinear_ids:
|
||||||
|
yield i
|
||||||
|
|
||||||
def __init__(self, opfreader, cwd):
|
|
||||||
self.toc = None
|
|
||||||
toc = opfreader.soup.find('spine', toc=True)
|
|
||||||
if toc is not None:
|
|
||||||
toc = toc['toc']
|
|
||||||
if toc is None:
|
|
||||||
try:
|
|
||||||
toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
|
|
||||||
except:
|
|
||||||
for item in opfreader.manifest:
|
|
||||||
if 'toc' in item.href.lower():
|
|
||||||
toc = item.href
|
|
||||||
break
|
|
||||||
|
|
||||||
if toc is not None:
|
|
||||||
if toc.lower() != 'ncx':
|
|
||||||
toc = urlparse(unquote(toc))[2]
|
|
||||||
if not os.path.isabs(toc):
|
|
||||||
toc = os.path.join(cwd, toc)
|
|
||||||
try:
|
|
||||||
if not os.path.exists(toc):
|
|
||||||
bn = os.path.basename(toc)
|
|
||||||
bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
|
|
||||||
toc = os.path.join(os.path.dirname(toc), bn)
|
|
||||||
|
|
||||||
self.read_html_toc(toc, cwd)
|
|
||||||
self.toc = toc
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
cwd = os.path.abspath(cwd)
|
|
||||||
m = glob.glob(os.path.join(cwd, '*.ncx'))
|
|
||||||
if m:
|
|
||||||
toc = m[0]
|
|
||||||
try:
|
|
||||||
self.read_ncx_toc(toc)
|
|
||||||
self.toc = toc
|
|
||||||
except:
|
|
||||||
raise
|
|
||||||
pass
|
|
||||||
|
|
||||||
def read_ncx_toc(self, toc):
|
|
||||||
bdir = os.path.dirname(toc)
|
|
||||||
soup = BeautifulStoneSoup(open(toc, 'rb').read(),
|
|
||||||
convertEntities=BeautifulSoup.HTML_ENTITIES)
|
|
||||||
elems = soup.findAll('navpoint')
|
|
||||||
elems.sort(cmp=lambda x, y: cmp(int(x['playorder']), int(y['playorder'])))
|
|
||||||
|
|
||||||
for elem in elems:
|
|
||||||
txt = u''
|
|
||||||
for nl in elem.findAll('navlabel'):
|
|
||||||
for text in nl.findAll('text'):
|
|
||||||
txt += ''.join([unicode(s) for s in text.findAll(text=True)])
|
|
||||||
|
|
||||||
content = elem.find('content')
|
|
||||||
if content is None or not content.has_key('src') or not txt:
|
|
||||||
continue
|
|
||||||
|
|
||||||
purl = urlparse(unquote(content['src']))
|
|
||||||
href, fragment = purl[2], purl[5]
|
|
||||||
if not os.path.isabs(href):
|
|
||||||
href = os.path.join(bdir, href)
|
|
||||||
self.append((href, fragment, txt))
|
|
||||||
|
|
||||||
|
|
||||||
def read_html_toc(self, toc, cwd):
|
|
||||||
soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
|
|
||||||
for a in soup.findAll('a'):
|
|
||||||
if not a.has_key('href'):
|
|
||||||
continue
|
|
||||||
purl = urlparse(unquote(a['href']))
|
|
||||||
href, fragment = purl[2], purl[5]
|
|
||||||
if not os.path.isabs(href):
|
|
||||||
href = os.path.join(cwd, href)
|
|
||||||
txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
|
|
||||||
self.append((href, fragment, txt))
|
|
||||||
|
|
||||||
|
|
||||||
class standard_field(object):
|
class standard_field(object):
|
||||||
@ -178,21 +115,21 @@ class OPF(MetaInformation):
|
|||||||
MIMETYPE = 'application/oebps-package+xml'
|
MIMETYPE = 'application/oebps-package+xml'
|
||||||
ENTITY_PATTERN = re.compile(r'&(\S+?);')
|
ENTITY_PATTERN = re.compile(r'&(\S+?);')
|
||||||
|
|
||||||
uid = standard_field('uid')
|
uid = standard_field('uid')
|
||||||
libprs_id = standard_field('libprs_id')
|
application_id = standard_field('application_id')
|
||||||
title = standard_field('title')
|
title = standard_field('title')
|
||||||
authors = standard_field('authors')
|
authors = standard_field('authors')
|
||||||
title_sort = standard_field('title_sort')
|
title_sort = standard_field('title_sort')
|
||||||
author_sort = standard_field('author_sort')
|
author_sort = standard_field('author_sort')
|
||||||
comments = standard_field('comments')
|
comments = standard_field('comments')
|
||||||
category = standard_field('category')
|
category = standard_field('category')
|
||||||
publisher = standard_field('publisher')
|
publisher = standard_field('publisher')
|
||||||
isbn = standard_field('isbn')
|
isbn = standard_field('isbn')
|
||||||
cover = standard_field('cover')
|
cover = standard_field('cover')
|
||||||
series = standard_field('series')
|
series = standard_field('series')
|
||||||
series_index = standard_field('series_index')
|
series_index = standard_field('series_index')
|
||||||
rating = standard_field('rating')
|
rating = standard_field('rating')
|
||||||
tags = standard_field('tags')
|
tags = standard_field('tags')
|
||||||
|
|
||||||
HEADER = '''\
|
HEADER = '''\
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
@ -207,14 +144,14 @@ class OPF(MetaInformation):
|
|||||||
if not hasattr(self, 'soup'):
|
if not hasattr(self, 'soup'):
|
||||||
self.soup = BeautifulStoneSoup(u'''\
|
self.soup = BeautifulStoneSoup(u'''\
|
||||||
%s
|
%s
|
||||||
<package unique-identifier="libprs_id">
|
<package unique-identifier="%s_id">
|
||||||
<metadata>
|
<metadata>
|
||||||
<dc-metadata
|
<dc-metadata
|
||||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/" />
|
xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/" />
|
||||||
</metadata>
|
</metadata>
|
||||||
</package>
|
</package>
|
||||||
'''%self.HEADER)
|
'''%(__appname__, self.HEADER))
|
||||||
|
|
||||||
def _commit(self, doc):
|
def _commit(self, doc):
|
||||||
self.soup = BeautifulStoneSoup(doc.toxml('utf-8'), fromEncoding='utf-8')
|
self.soup = BeautifulStoneSoup(doc.toxml('utf-8'), fromEncoding='utf-8')
|
||||||
@ -403,15 +340,15 @@ class OPF(MetaInformation):
|
|||||||
self._set_metadata_element('dc:identifier', isbn, [('scheme', 'ISBN')],
|
self._set_metadata_element('dc:identifier', isbn, [('scheme', 'ISBN')],
|
||||||
replace=True)
|
replace=True)
|
||||||
|
|
||||||
def get_libprs_id(self):
|
def get_application_id(self):
|
||||||
for item in self.soup.package.metadata.findAll('dc:identifier'):
|
for item in self.soup.package.metadata.findAll('dc:identifier'):
|
||||||
if item.has_key('scheme') and item['scheme'] == 'libprs':
|
if item.has_key('scheme') and item['scheme'] == __appname__:
|
||||||
return str(item.string).strip()
|
return str(item.string).strip()
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def set_libprs_id(self, val):
|
def set_application_id(self, val):
|
||||||
if val:
|
if val:
|
||||||
self._set_metadata_element('dc:identifier', str(val), [('scheme', 'libprs'), ('id', 'libprs_id')],
|
self._set_metadata_element('dc:identifier', str(val), [('scheme', __appname__), ('id', __appname__+'_id')],
|
||||||
replace=True)
|
replace=True)
|
||||||
|
|
||||||
def get_cover(self):
|
def get_cover(self):
|
||||||
@ -564,61 +501,72 @@ class OPFReader(OPF):
|
|||||||
stream.close()
|
stream.close()
|
||||||
self.manifest = Manifest(self.soup, dir)
|
self.manifest = Manifest(self.soup, dir)
|
||||||
self.spine = Spine(self.soup, self.manifest)
|
self.spine = Spine(self.soup, self.manifest)
|
||||||
self.toc = TOC(self, dir)
|
self.toc = TOC()
|
||||||
|
self.toc.read_from_opf(self)
|
||||||
self.cover_data = (None, None)
|
self.cover_data = (None, None)
|
||||||
|
|
||||||
class OPFCreator(OPF):
|
class OPFCreator(MetaInformation):
|
||||||
|
|
||||||
def __init__(self, mi):
|
def __init__(self, base_path, *args, **kwargs):
|
||||||
self.title = mi.title
|
'''
|
||||||
self.authors = mi.authors
|
Initialize.
|
||||||
if mi.category:
|
@param base_path: An absolute path to the directory in which this OPF file
|
||||||
self.category = mi.category
|
will eventually be. This is used by the L{create_manifest} method
|
||||||
if mi.comments:
|
to convert paths to files into relative paths.
|
||||||
self.comments = mi.comments
|
'''
|
||||||
if mi.publisher:
|
MetaInformation.__init__(self, *args, **kwargs)
|
||||||
self.publisher = mi.publisher
|
self.base_path = os.path.abspath(base_path)
|
||||||
if mi.rating:
|
if self.application_id is None:
|
||||||
self.rating = mi.rating
|
self.application_id = str(uuid.uuid4())
|
||||||
if mi.series:
|
self.toc = None
|
||||||
self.series = mi.series
|
if isinstance(self.manifest, Manifest):
|
||||||
if mi.series_index:
|
manifest = []
|
||||||
self.series_index = mi.series_index
|
for path, mt in self.manifest:
|
||||||
if mi.tags:
|
if not path.startswith(self.base_path):
|
||||||
self.tags = mi.tags
|
raise ValueError('Inavlid manifest item %s for base path %s'%(path, self.base_path))
|
||||||
if mi.isbn:
|
path = path[len(self.base_path)+1:]
|
||||||
self.isbn = mi.isbn
|
manifest.append((path, mt))
|
||||||
self.cover_data = mi.cover_data
|
self.manifest = manifest
|
||||||
if hasattr(mi, 'libprs_id'):
|
|
||||||
self.libprs_id = mi.libprs_id
|
|
||||||
if hasattr(mi, 'uid'):
|
|
||||||
self.uid = mi.uid
|
|
||||||
|
|
||||||
def create_manifest(self, entries):
|
def create_manifest(self, entries):
|
||||||
'''
|
'''
|
||||||
Create <manifest>
|
Create <manifest>
|
||||||
@param entries: List of (URL, mime-type)
|
@param entries: List of (path, mime-type)
|
||||||
|
@param base_path: It is used to convert each path into a path relative to itself
|
||||||
@type entries: list of 2-tuples
|
@type entries: list of 2-tuples
|
||||||
'''
|
'''
|
||||||
doc = dom.parseString(self.soup.__str__('UTF-8').strip())
|
rentries = []
|
||||||
package = doc.documentElement
|
base_path = self.base_path
|
||||||
manifest = doc.createElement('manifest')
|
mimetypes.init()
|
||||||
package.appendChild(manifest)
|
for href, mt in entries:
|
||||||
package.appendChild(doc.createTextNode('\n'))
|
href = os.path.abspath(href)
|
||||||
|
if not href.startswith(base_path):
|
||||||
|
raise ValueError('OPF should only refer to files below it. %s is above %s'%(href, base_path))
|
||||||
|
href = href[len(base_path)+1:].replace(os.sep, '/')
|
||||||
|
if not mt:
|
||||||
|
mt = mimetypes.guess_type(href)[0]
|
||||||
|
if not mt:
|
||||||
|
mt = ''
|
||||||
|
rentries.append((href, mt))
|
||||||
|
|
||||||
self.href_map = {}
|
self.manifest = rentries
|
||||||
|
|
||||||
for href, media_type in entries:
|
def create_manifest_from_files_in(self, files_and_dirs):
|
||||||
item = doc.createElement('item')
|
entries = []
|
||||||
item.setAttribute('href', href)
|
|
||||||
item.setAttribute('media-type', media_type)
|
|
||||||
self.href_map[href] = str(hash(href))
|
|
||||||
item.setAttribute('id', self.href_map[href])
|
|
||||||
manifest.appendChild(item)
|
|
||||||
manifest.appendChild(doc.createTextNode('\n'))
|
|
||||||
|
|
||||||
self._commit(doc)
|
def dodir(dir):
|
||||||
|
for root, dirs, files in os.walk(dir):
|
||||||
|
for name in files:
|
||||||
|
path = os.path.join(root, name)
|
||||||
|
entries.append((path, None))
|
||||||
|
|
||||||
|
for i in files_and_dirs:
|
||||||
|
if os.path.isdir(i):
|
||||||
|
dodir(i)
|
||||||
|
else:
|
||||||
|
entries.append((i, None))
|
||||||
|
|
||||||
|
self.create_manifest(entries)
|
||||||
|
|
||||||
def create_spine(self, entries):
|
def create_spine(self, entries):
|
||||||
'''
|
'''
|
||||||
@ -626,19 +574,43 @@ class OPFCreator(OPF):
|
|||||||
@param: List of paths
|
@param: List of paths
|
||||||
@type param: list of strings
|
@type param: list of strings
|
||||||
'''
|
'''
|
||||||
doc = dom.parseString(self.soup.__str__('UTF-8').strip())
|
self.spine = []
|
||||||
package = doc.documentElement
|
|
||||||
spine = doc.createElement('spine')
|
|
||||||
package.appendChild(spine)
|
|
||||||
package.appendChild(doc.createTextNode('\n'))
|
|
||||||
|
|
||||||
for href in entries:
|
for path in entries:
|
||||||
itemref = doc.createElement('itemref')
|
if not os.path.isabs(path):
|
||||||
itemref.setAttribute('idref', self.href_map[href])
|
path = os.path.join(self.base_path, path)
|
||||||
spine.appendChild(itemref)
|
if not path.startswith(self.base_path):
|
||||||
spine.appendChild(doc.createTextNode('\n'))
|
raise ValueError('Invalid entry %s for base path %s'%(path, self.base_path))
|
||||||
|
href = path[len(self.base_path)+1:]
|
||||||
|
in_manifest = False
|
||||||
|
for i, m in enumerate(self.manifest):
|
||||||
|
if m[0] == href:
|
||||||
|
in_manifest = True
|
||||||
|
break
|
||||||
|
if not in_manifest:
|
||||||
|
raise ValueError('%s is not in the manifest. (%s)'%(href, path))
|
||||||
|
self.spine.append(i)
|
||||||
|
|
||||||
self._commit(doc)
|
|
||||||
|
|
||||||
|
def set_toc(self, toc):
    '''
    Remember the Table of Contents that belongs to this OPF.

    Callers are asked to run L{create_spine} before this method. The
    method itself only stores the reference and performs no checks.

    @param toc: A Table of Contents
    @type toc: L{TOC}
    '''
    self.toc = toc
|
||||||
|
|
||||||
|
def render(self, opf_stream, ncx_stream=None):
    '''
    Serialize this object as an OPF document and, optionally, its table of
    contents.

    The OPF is produced by rendering the C{opf_template} resource with
    Genshi, passing this object to the template as C{mi}.

    @param opf_stream: Object with a C{write} method that receives the
                       rendered OPF.
    @param ncx_stream: If not None and a toc has been set, the toc is asked
                       to render itself into this stream together with the
                       application id.
    '''
    from libprs500.resources import opf_template
    from genshi.template import MarkupTemplate
    markup = MarkupTemplate(opf_template).generate(__appname__=__appname__, mi=self)
    opf_stream.write(markup.render('xml'))

    toc = getattr(self, 'toc', None)
    if toc is None or ncx_stream is None:
        return
    toc.render(ncx_stream, self.application_id)
|
||||||
|
|
||||||
def option_parser():
|
def option_parser():
|
||||||
return get_parser('opf')
|
return get_parser('opf')
|
||||||
@ -649,7 +621,7 @@ def main(args=sys.argv):
|
|||||||
if len(args) != 2:
|
if len(args) != 2:
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
return 1
|
return 1
|
||||||
mi = OPFReader(open(args[1], 'rb'))
|
mi = MetaInformation(OPFReader(open(args[1], 'rb')))
|
||||||
if opts.title is not None:
|
if opts.title is not None:
|
||||||
mi.title = opts.title.replace('&', '&').replace('<', '<').replace('>', '>')
|
mi.title = opts.title.replace('&', '&').replace('<', '<').replace('>', '>')
|
||||||
if opts.authors is not None:
|
if opts.authors is not None:
|
||||||
@ -660,7 +632,8 @@ def main(args=sys.argv):
|
|||||||
if opts.comment is not None:
|
if opts.comment is not None:
|
||||||
mi.comments = opts.comment.replace('&', '&').replace('<', '<').replace('>', '>')
|
mi.comments = opts.comment.replace('&', '&').replace('<', '<').replace('>', '>')
|
||||||
print mi
|
print mi
|
||||||
mi.write(open(args[1], 'wb'))
|
mo = OPFCreator(os.getcwd(), mi)
|
||||||
|
mo.render(open(args[1], 'wb'))
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
36
src/libprs500/ebooks/metadata/opf.xml
Normal file
36
src/libprs500/ebooks/metadata/opf.xml
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<package version="2.0"
|
||||||
|
xmlns:opf="http://www.idpf.org/2007/opf"
|
||||||
|
xmlns:py="http://genshi.edgewall.org/"
|
||||||
|
unique-identifier="${__appname__}_id"
|
||||||
|
|
||||||
|
>
|
||||||
|
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||||
|
<dc:title py:with="attrs={'files-as':mi.title_sort}" py:attrs="attrs">${mi.title}</dc:title>
|
||||||
|
<dc:creator opf:role="aut" py:for="i, author in enumerate(mi.authors)" py:with="attrs={'file-as':mi.author_sort if i==0 else None}" py:attrs="attrs">${author}</dc:creator>
|
||||||
|
<dc:identifier scheme="${__appname__}" id="${__appname__}_id">${mi.application_id}</dc:identifier>
|
||||||
|
|
||||||
|
<dc:type py:if="mi.category">${mi.category}</dc:type>
|
||||||
|
<dc:description py:if="mi.comments">${mi.comments}</dc:description>
|
||||||
|
<dc:publisher py:if="mi.publisher">${mi.publisher}</dc:publisher>
|
||||||
|
<dc:identifier opf:scheme="ISBN" py:if="mi.isbn">${mi.isbn}</dc:identifier>
|
||||||
|
<series py:if="mi.series">${mi.series}</series>
|
||||||
|
<series-index py:if="mi.series_index is not None">${mi.series_index}</series-index>
|
||||||
|
<rating py:if="mi.rating is not None">${mi.rating}</rating>
|
||||||
|
<dc:subject py:if="mi.tags is not None" py:for="tag in mi.tags">${tag}</dc:subject>
|
||||||
|
</metadata>
|
||||||
|
|
||||||
|
<guide>
|
||||||
|
<reference py:if="mi.cover" type="cover" href="${mi.cover}" />
|
||||||
|
</guide>
|
||||||
|
|
||||||
|
<manifest>
|
||||||
|
<py:for each="i, m in enumerate(mi.manifest)">
|
||||||
|
<item id="${str(i)}" href="${m[0]}" media-type="${m[1]}" />
|
||||||
|
</py:for>
|
||||||
|
</manifest>
|
||||||
|
|
||||||
|
<spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
|
||||||
|
<itemref py:for="idref in mi.spine" idref="${str(idref)}" />
|
||||||
|
</spine>
|
||||||
|
</package>
|
154
src/libprs500/ebooks/metadata/toc.py
Normal file
154
src/libprs500/ebooks/metadata/toc.py
Normal file
@ -0,0 +1,154 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
|
||||||
|
## This program is free software; you can redistribute it and/or modify
|
||||||
|
## it under the terms of the GNU General Public License as published by
|
||||||
|
## the Free Software Foundation; either version 2 of the License, or
|
||||||
|
## (at your option) any later version.
|
||||||
|
##
|
||||||
|
## This program is distributed in the hope that it will be useful,
|
||||||
|
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
## GNU General Public License for more details.
|
||||||
|
##
|
||||||
|
## You should have received a copy of the GNU General Public License along
|
||||||
|
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
import os, glob
|
||||||
|
from urlparse import urlparse
|
||||||
|
from urllib import unquote
|
||||||
|
|
||||||
|
from libprs500 import __appname__
|
||||||
|
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
|
||||||
|
|
||||||
|
class NCXSoup(BeautifulStoneSoup):
    '''
    BeautifulStoneSoup parser pre-configured for reading NCX files.
    '''

    # Declares navpoint as a nestable tag for the parser (BeautifulSoup
    # NESTABLE_TAGS convention), so navpoints can contain other navpoints.
    NESTABLE_TAGS = {'navpoint': []}

    def __init__(self, raw):
        '''
        Parse `raw`, converting HTML entities and treating <meta> and
        <content> as self-closing tags.
        '''
        parser_options = dict(
            convertEntities=BeautifulSoup.HTML_ENTITIES,
            selfClosingTags=['meta', 'content'],
        )
        BeautifulStoneSoup.__init__(self, raw, **parser_options)
|
||||||
|
|
||||||
|
class TOC(list):
    '''
    A table of contents entry that is also the list of its child entries.

    Every entry stores the target `href` and `fragment`, a display `text`,
    a reference to its `parent` entry, a `play_order` and the `base_path`
    against which relative hrefs are resolved. A TOC created without
    arguments acts as the root container of a tree.
    '''

    def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=1,
                 base_path=None):
        '''
        @param href: Path of the file this entry points to ('/' separated).
        @param fragment: Fragment (anchor) inside the file or None.
        @param text: Display text of the entry or None.
        @param parent: The TOC this entry belongs to or None.
        @param play_order: Play order value of the entry.
        @param base_path: Directory used to resolve relative hrefs. If None,
                          the current working directory at call time is used.
        '''
        # Resolve the default here rather than in the signature: a default of
        # os.getcwd() would be frozen to whatever directory was current when
        # this module was first imported.
        if base_path is None:
            base_path = os.getcwd()
        self.href = href
        self.fragment = fragment
        self.text = text
        self.parent = parent
        self.base_path = base_path
        self.play_order = play_order

    def add_item(self, href, fragment, text):
        '''
        Append a new child entry that shares this entry's base_path.

        @return: The newly created child TOC.
        '''
        self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
                        base_path=self.base_path))
        return self[-1]

    def top_level_items(self):
        '''
        Iterate over the direct children that have a display text.
        '''
        for item in self:
            if item.text is not None:
                yield item

    def depth(self):
        '''
        Return the number of levels in the tree rooted at this entry
        (1 for an entry without children).
        '''
        return 1 + max([child.depth() for child in self] or [0])

    @property
    def abspath(self):
        '''
        Return the file this toc entry points to as an absolute path to a
        file on the system, or None if this entry has no href.
        '''
        if self.href is None:
            return None
        path = self.href.replace('/', os.sep)
        if not os.path.isabs(path):
            path = os.path.join(self.base_path, path)
        return path

    def read_from_opf(self, opfreader):
        '''
        Locate the table of contents referenced by an OPF file and read it.

        The TOC is looked up, in order, in the toc attribute of <spine>, in a
        <reference type="toc"> inside <guide> and finally in any manifest item
        whose href contains "toc". A value of "ncx" means: read the first
        *.ncx file found in base_path. Anything else is treated as the path of
        an HTML TOC. Reading the HTML TOC is best effort, failures are ignored.

        @param opfreader: Object providing `soup` (the parsed OPF) and
                          `manifest` (items with an `href` attribute).
        '''
        toc = opfreader.soup.find('spine', toc=True)
        if toc is not None:
            toc = toc['toc']
        if toc is None:
            try:
                toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
            except Exception:
                # No usable <guide> entry, fall back to guessing from the manifest
                for item in opfreader.manifest:
                    if 'toc' in item.href.lower():
                        toc = item.href
                        break

        if toc is None:
            return

        if toc.lower() != 'ncx':
            toc = urlparse(unquote(toc))[2]
            toc = toc.replace('/', os.sep)
            if not os.path.isabs(toc):
                toc = os.path.join(self.base_path, toc)
            try:
                if not os.path.exists(toc):
                    bn = os.path.basename(toc)
                    bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
                    toc = os.path.join(os.path.dirname(toc), bn)

                # read_html_toc() takes only the path. Passing base_path as
                # well raised a TypeError that was silently swallowed below,
                # so HTML TOCs were never actually read.
                self.read_html_toc(toc)
            except Exception:
                # Deliberately best effort: a missing or broken HTML TOC
                # must not abort reading the OPF.
                pass
        else:
            cwd = os.path.abspath(self.base_path)
            m = glob.glob(os.path.join(cwd, '*.ncx'))
            if m:
                self.read_ncx_toc(m[0])

    def read_ncx_toc(self, toc):
        '''
        Populate this TOC from the NCX file at path `toc`.

        base_path is set to the directory containing the NCX file. Nested
        navpoints become nested TOC entries.
        '''
        self.base_path = os.path.dirname(toc)
        f = open(toc, 'rb')
        try:
            soup = NCXSoup(f.read())
        finally:
            f.close()

        def process_navpoint(np, dest):
            # playOrder is kept exactly as found in the file
            play_order = np.get('playOrder', 1)
            href = fragment = text = None
            nl = np.find('navlabel')
            if nl is not None:
                text = u''
                for txt in nl.findAll('text'):
                    text += ''.join([unicode(s) for s in txt.findAll(text=True)])
                # Look inside the navpoint being processed, not inside the
                # top level navpoint of the enclosing loop.
                content = np.find('content')
                if content is None or not content.has_key('src') or not text:
                    return

                purl = urlparse(unquote(content['src']))
                href, fragment = purl[2], purl[5]
            nd = dest.add_item(href, fragment, text)
            nd.play_order = play_order

            for c in np:
                if getattr(c, 'name', None) == 'navpoint':
                    process_navpoint(c, nd)

        nm = soup.find('navmap')
        if nm is None:
            # Not a usable NCX document, leave the TOC empty
            return
        for elem in nm:
            if getattr(elem, 'name', None) == 'navpoint':
                process_navpoint(elem, self)

    def read_html_toc(self, toc):
        '''
        Populate this TOC from the HTML file at path `toc`: every <a> tag
        that has an href becomes a direct child entry.

        base_path is set to the directory containing the HTML file.
        '''
        self.base_path = os.path.dirname(toc)
        f = open(toc, 'rb')
        try:
            soup = BeautifulSoup(f.read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
        finally:
            f.close()
        for a in soup.findAll('a'):
            if not a.has_key('href'):
                continue
            purl = urlparse(unquote(a['href']))
            href, fragment = purl[2], purl[5]
            txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
            self.add_item(href, fragment, txt)

    def render(self, stream, uid):
        '''
        Render this TOC with the NCX template and write the result to `stream`.

        @param stream: Object with a write() method.
        @param uid: Identifier passed to the template as `uid`.
        '''
        from libprs500.resources import ncx_template
        from genshi.template import MarkupTemplate
        doctype = ('ncx', "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd")
        template = MarkupTemplate(ncx_template)
        raw = template.generate(uid=uid, toc=self, __appname__=__appname__)
        raw = raw.render(doctype=doctype)
        stream.write(raw)
|
@ -186,11 +186,11 @@ class MobiReader(object):
|
|||||||
|
|
||||||
if self.book_header.exth is not None:
|
if self.book_header.exth is not None:
|
||||||
opf = self.create_opf(htmlfile)
|
opf = self.create_opf(htmlfile)
|
||||||
opf.write(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))
|
opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))
|
||||||
|
|
||||||
def create_opf(self, htmlfile):
|
def create_opf(self, htmlfile):
|
||||||
mi = self.book_header.exth.mi
|
mi = self.book_header.exth.mi
|
||||||
opf = OPFCreator(mi)
|
opf = OPFCreator(os.path.dirname(htmlfile), mi)
|
||||||
if hasattr(self.book_header.exth, 'cover_offset'):
|
if hasattr(self.book_header.exth, 'cover_offset'):
|
||||||
opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
|
opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
|
||||||
manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')]
|
manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')]
|
||||||
|
@ -1340,7 +1340,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
|
|||||||
mi.rating = self.rating(idx, index_is_id=index_is_id)
|
mi.rating = self.rating(idx, index_is_id=index_is_id)
|
||||||
mi.isbn = self.isbn(idx, index_is_id=index_is_id)
|
mi.isbn = self.isbn(idx, index_is_id=index_is_id)
|
||||||
id = idx if index_is_id else self.id(idx)
|
id = idx if index_is_id else self.id(idx)
|
||||||
mi.libprs_id = id
|
mi.application_id = id
|
||||||
return mi
|
return mi
|
||||||
|
|
||||||
def vacuum(self):
|
def vacuum(self):
|
||||||
@ -1382,7 +1382,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
|
|||||||
name += '_'+id
|
name += '_'+id
|
||||||
base = dir if single_dir else tpath
|
base = dir if single_dir else tpath
|
||||||
|
|
||||||
mi = OPFCreator(self.get_metadata(idx, index_is_id=index_is_id))
|
mi = OPFCreator(base, self.get_metadata(idx, index_is_id=index_is_id))
|
||||||
cover = self.cover(idx, index_is_id=index_is_id)
|
cover = self.cover(idx, index_is_id=index_is_id)
|
||||||
if cover is not None:
|
if cover is not None:
|
||||||
cname = name + '.jpg'
|
cname = name + '.jpg'
|
||||||
@ -1390,7 +1390,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
|
|||||||
open(cpath, 'wb').write(cover)
|
open(cpath, 'wb').write(cover)
|
||||||
mi.cover = cname
|
mi.cover = cname
|
||||||
f = open(os.path.join(base, name+'.opf'), 'wb')
|
f = open(os.path.join(base, name+'.opf'), 'wb')
|
||||||
mi.write(f)
|
mi.render(f)
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
for fmt in self.formats(idx, index_is_id=index_is_id).split(','):
|
for fmt in self.formats(idx, index_is_id=index_is_id).split(','):
|
||||||
|
@ -44,6 +44,7 @@ entry_points = {
|
|||||||
'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main',
|
'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main',
|
||||||
'web2disk = libprs500.web.fetch.simple:main',
|
'web2disk = libprs500.web.fetch.simple:main',
|
||||||
'feeds2disk = libprs500.web.feeds.main:main',
|
'feeds2disk = libprs500.web.feeds.main:main',
|
||||||
|
'feeds2lrf = libprs500.ebooks.lrf.feeds.convert_from:main',
|
||||||
'web2lrf = libprs500.ebooks.lrf.web.convert_from:main',
|
'web2lrf = libprs500.ebooks.lrf.web.convert_from:main',
|
||||||
'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main',
|
'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main',
|
||||||
'mobi2lrf = libprs500.ebooks.lrf.mobi.convert_from:main',
|
'mobi2lrf = libprs500.ebooks.lrf.mobi.convert_from:main',
|
||||||
|
@ -201,6 +201,7 @@ class ProgressBar:
|
|||||||
self.term.BOL + self.term.UP + self.term.CLEAR_EOL +
|
self.term.BOL + self.term.UP + self.term.CLEAR_EOL +
|
||||||
(self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) +
|
(self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) +
|
||||||
self.term.CLEAR_EOL + msg)
|
self.term.CLEAR_EOL + msg)
|
||||||
|
sys.stdout.flush()
|
||||||
|
|
||||||
def clear(self):
|
def clear(self):
|
||||||
if not self.cleared:
|
if not self.cleared:
|
||||||
|
@ -17,12 +17,13 @@
|
|||||||
The backend to parse feeds and create HTML that can then be converted
|
The backend to parse feeds and create HTML that can then be converted
|
||||||
to an ebook.
|
to an ebook.
|
||||||
'''
|
'''
|
||||||
import logging, os, cStringIO, time, itertools, traceback
|
import logging, os, cStringIO, time, traceback
|
||||||
import urlparse
|
import urlparse
|
||||||
|
|
||||||
from libprs500 import browser, __appname__
|
from libprs500 import browser, __appname__
|
||||||
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
|
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
|
||||||
from libprs500.ebooks.metadata.opf import OPFCreator
|
from libprs500.ebooks.metadata.opf import OPFCreator
|
||||||
|
from libprs500.ebooks.metadata.toc import TOC
|
||||||
from libprs500.ebooks.metadata import MetaInformation
|
from libprs500.ebooks.metadata import MetaInformation
|
||||||
from libprs500.web.feeds import feed_from_xml, templates
|
from libprs500.web.feeds import feed_from_xml, templates
|
||||||
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
|
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
|
||||||
@ -94,6 +95,9 @@ class BasicNewsRecipe(object):
|
|||||||
#: using cp1252. If None, try to detect the encoding.
|
#: using cp1252. If None, try to detect the encoding.
|
||||||
encoding = None
|
encoding = None
|
||||||
|
|
||||||
|
#: Specify any extra CSS that should be added to downloaded HTML files
|
||||||
|
extra_css = None
|
||||||
|
|
||||||
#: List of regular expressions that determines which links to follow
|
#: List of regular expressions that determines which links to follow
|
||||||
#: If empty, it is ignored.
|
#: If empty, it is ignored.
|
||||||
#: Only one of L{match_regexps} or L{filter_regexps} should be defined
|
#: Only one of L{match_regexps} or L{filter_regexps} should be defined
|
||||||
@ -276,8 +280,9 @@ class BasicNewsRecipe(object):
|
|||||||
|
|
||||||
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
|
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
|
||||||
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
|
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
|
||||||
'preprocess_html', 'remove_tags_after', 'postprocess_html'):
|
'preprocess_html', 'remove_tags_after'):
|
||||||
setattr(self.web2disk_options, extra, getattr(self, extra))
|
setattr(self.web2disk_options, extra, getattr(self, extra))
|
||||||
|
self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]
|
||||||
|
|
||||||
if self.delay > 0:
|
if self.delay > 0:
|
||||||
self.simultaneous_downloads = 1
|
self.simultaneous_downloads = 1
|
||||||
@ -288,6 +293,14 @@ class BasicNewsRecipe(object):
|
|||||||
self.failed_downloads = []
|
self.failed_downloads = []
|
||||||
self.partial_failures = []
|
self.partial_failures = []
|
||||||
|
|
||||||
|
def _postprocess_html(self, soup):
|
||||||
|
if self.extra_css is not None:
|
||||||
|
head = soup.find('head')
|
||||||
|
if head:
|
||||||
|
style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
|
||||||
|
head.insert(len(head.contents), style)
|
||||||
|
return soup
|
||||||
|
|
||||||
def download(self):
|
def download(self):
|
||||||
'''
|
'''
|
||||||
Download and pre-process all articles from the feeds in this recipe.
|
Download and pre-process all articles from the feeds in this recipe.
|
||||||
@ -297,6 +310,7 @@ class BasicNewsRecipe(object):
|
|||||||
@rtype: string
|
@rtype: string
|
||||||
'''
|
'''
|
||||||
self.report_progress(0, _('Trying to download cover...'))
|
self.report_progress(0, _('Trying to download cover...'))
|
||||||
|
|
||||||
self.download_cover()
|
self.download_cover()
|
||||||
res = self.build_index()
|
res = self.build_index()
|
||||||
self.cleanup()
|
self.cleanup()
|
||||||
@ -362,7 +376,7 @@ class BasicNewsRecipe(object):
|
|||||||
fetcher.current_dir = dir
|
fetcher.current_dir = dir
|
||||||
fetcher.show_progress = False
|
fetcher.show_progress = False
|
||||||
res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
|
res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
|
||||||
if not res:
|
if not res or not os.path.exists(res):
|
||||||
raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
|
raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
|
||||||
return res, path, failures
|
return res, path, failures
|
||||||
|
|
||||||
@ -446,28 +460,44 @@ class BasicNewsRecipe(object):
|
|||||||
if dir is None:
|
if dir is None:
|
||||||
dir = self.output_dir
|
dir = self.output_dir
|
||||||
mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
|
mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
|
||||||
opf = OPFCreator(mi)
|
|
||||||
opf_path = os.path.join(dir, 'index.opf')
|
opf_path = os.path.join(dir, 'index.opf')
|
||||||
|
ncx_path = os.path.join(dir, 'index.ncx')
|
||||||
|
opf = OPFCreator(dir, mi)
|
||||||
|
|
||||||
|
|
||||||
|
manifest = ['feed_%d'%i for i in range(len(feeds))]
|
||||||
|
manifest.append('index.html')
|
||||||
cpath = getattr(self, 'cover_path', None)
|
cpath = getattr(self, 'cover_path', None)
|
||||||
if cpath is not None and os.access(cpath, os.R_OK):
|
if cpath is not None and os.access(cpath, os.R_OK):
|
||||||
opf.cover = cpath
|
opf.cover = cpath
|
||||||
|
manifest.append(cpath)
|
||||||
|
opf.create_manifest_from_files_in(manifest)
|
||||||
|
|
||||||
entries = ['index.html']
|
entries = ['index.html']
|
||||||
|
toc = TOC(base_path=dir)
|
||||||
for i, f in enumerate(feeds):
|
for i, f in enumerate(feeds):
|
||||||
entries.append('feed_%d/index.html'%i)
|
entries.append('feed_%d/index.html'%i)
|
||||||
|
feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
|
||||||
for j, a in enumerate(f):
|
for j, a in enumerate(f):
|
||||||
if getattr(a, 'downloaded', False):
|
if getattr(a, 'downloaded', False):
|
||||||
adir = 'feed_%d/article_%d/'%(i, j)
|
adir = 'feed_%d/article_%d/'%(i, j)
|
||||||
entries.append('%sindex.html'%adir)
|
entries.append('%sindex.html'%adir)
|
||||||
|
feed.add_item('%sindex.html'%adir, None, a.title if a.title else 'Untitled article')
|
||||||
for sp in a.sub_pages:
|
for sp in a.sub_pages:
|
||||||
prefix = os.path.commonprefix([opf_path, sp])
|
prefix = os.path.commonprefix([opf_path, sp])
|
||||||
relp = sp[len(prefix):]
|
relp = sp[len(prefix):]
|
||||||
entries.append(relp.replace(os.sep, '/'))
|
entries.append(relp.replace(os.sep, '/'))
|
||||||
|
|
||||||
opf.create_manifest(itertools.izip(entries, itertools.repeat('text/html')))
|
|
||||||
opf.create_spine(entries)
|
opf.create_spine(entries)
|
||||||
opf.write(open(opf_path, 'wb'))
|
opf.set_toc(toc)
|
||||||
|
|
||||||
|
for i, f in enumerate(feeds):
|
||||||
|
|
||||||
|
for j, a in enumerate(f):
|
||||||
|
if getattr(a, 'downloaded', False):
|
||||||
|
adir = 'feed_%d/article_%d/'%(i, j)
|
||||||
|
|
||||||
|
opf.render(open(opf_path, 'wb'), open(ncx_path, 'wb'))
|
||||||
|
|
||||||
|
|
||||||
def article_downloaded(self, request, result):
|
def article_downloaded(self, request, result):
|
||||||
@ -516,7 +546,7 @@ class BasicNewsRecipe(object):
|
|||||||
title, url = None, obj
|
title, url = None, obj
|
||||||
else:
|
else:
|
||||||
title, url = obj
|
title, url = obj
|
||||||
self.report_progress(0, _('Fetching feed %s...'%(title if title else url)))
|
self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url))
|
||||||
parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
|
parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
|
||||||
title=title,
|
title=title,
|
||||||
oldest_article=self.oldest_article,
|
oldest_article=self.oldest_article,
|
||||||
|
@ -41,7 +41,7 @@ class Newsweek(BasicNewsRecipe):
|
|||||||
'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
|
'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
|
||||||
]
|
]
|
||||||
|
|
||||||
extra_css = '#content { font:serif,120%; }'
|
extra_css = '#content { font:serif 1.2em; }'
|
||||||
keep_only_tags = [dict(name='div', id='content')]
|
keep_only_tags = [dict(name='div', id='content')]
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
@ -55,8 +55,8 @@ class Newsweek(BasicNewsRecipe):
|
|||||||
match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
|
match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
|
||||||
|
|
||||||
# For testing
|
# For testing
|
||||||
#feeds = feeds[:2]
|
#feeds = feeds[3:5]
|
||||||
#max_articles_per_feed = 1
|
#max_articles_per_feed = 2
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -57,16 +57,17 @@ class NavBarTemplate(Template):
|
|||||||
<body>
|
<body>
|
||||||
<div class="navbar" style="text-align:center">
|
<div class="navbar" style="text-align:center">
|
||||||
<hr py:if="bottom" />
|
<hr py:if="bottom" />
|
||||||
<a href="../index.html#article_${str(art)}">Up one level</a>
|
<py:if test="art != num - 1">
|
||||||
|
| <a href="../article_${str(art+1)}/index.html">Next</a>
|
||||||
|
</py:if>
|
||||||
|
| <a href="../index.html#article_${str(art)}">Up one level</a>
|
||||||
<py:if test="two_levels">
|
<py:if test="two_levels">
|
||||||
| <a href="../../index.html#_${str(feed)}">Up two levels</a>
|
| <a href="../../index.html#_${str(feed)}">Up two levels</a>
|
||||||
</py:if>
|
</py:if>
|
||||||
<py:if test="art != 0">
|
<py:if test="art != 0">
|
||||||
| <a href="../article_${str(art-1)}/index.html">Previous</a>
|
| <a href="../article_${str(art-1)}/index.html">Previous</a>
|
||||||
</py:if>
|
</py:if>
|
||||||
<py:if test="art != num - 1">
|
|
|
||||||
| <a href="../article_${str(art+1)}/index.html">Next</a>
|
|
||||||
</py:if>
|
|
||||||
<hr py:if="not bottom" />
|
<hr py:if="not bottom" />
|
||||||
</div>
|
</div>
|
||||||
</body>
|
</body>
|
||||||
|
@ -38,9 +38,9 @@ def basename(url):
|
|||||||
|
|
||||||
def save_soup(soup, target):
|
def save_soup(soup, target):
|
||||||
nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
|
nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
|
||||||
for meta in soup.find('meta', content=True):
|
meta = soup.find('meta', content=True)
|
||||||
if 'charset' in meta['content']:
|
if meta and 'charset' in meta['content']:
|
||||||
meta.replaceWith(nm)
|
meta.replaceWith(nm)
|
||||||
f = codecs.open(target, 'w', 'utf-8')
|
f = codecs.open(target, 'w', 'utf-8')
|
||||||
f.write(unicode(soup))
|
f.write(unicode(soup))
|
||||||
f.close()
|
f.close()
|
||||||
@ -85,7 +85,7 @@ class RecursiveFetcher(object):
|
|||||||
self.remove_tags_after = getattr(options, 'remove_tags_after', None)
|
self.remove_tags_after = getattr(options, 'remove_tags_after', None)
|
||||||
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
|
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
|
||||||
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
|
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
|
||||||
self.postprocess_html_ext= getattr(options, 'postprocess_html', lambda soup: soup)
|
self.postprocess_html_ext= getattr(options, 'postprocess_html', [])
|
||||||
self.download_stylesheets = not options.no_stylesheets
|
self.download_stylesheets = not options.no_stylesheets
|
||||||
self.show_progress = True
|
self.show_progress = True
|
||||||
self.failed_links = []
|
self.failed_links = []
|
||||||
@ -336,7 +336,9 @@ class RecursiveFetcher(object):
|
|||||||
self.process_return_links(soup, iurl)
|
self.process_return_links(soup, iurl)
|
||||||
self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
|
self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
|
||||||
|
|
||||||
save_soup(self.postprocess_html_ext(soup), res)
|
for func in self.postprocess_html_ext:
|
||||||
|
soup = func(soup)
|
||||||
|
save_soup(soup, res)
|
||||||
|
|
||||||
self.localize_link(tag, 'href', res)
|
self.localize_link(tag, 'href', res)
|
||||||
except Exception, err:
|
except Exception, err:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user