Refactored OPF creation code. Implemented Table of Contents support in feeds2disk.

This commit is contained in:
Kovid Goyal 2008-03-14 19:25:48 +00:00
parent 748c184ccb
commit 6982652f92
18 changed files with 482 additions and 209 deletions

View File

@ -1,6 +1,6 @@
PYTHON = python PYTHON = python
all : gui2 translations all : gui2 translations resources
clean : clean :
cd src/libprs500/gui2 && ${PYTHON} make.py clean cd src/libprs500/gui2 && ${PYTHON} make.py clean
@ -14,3 +14,7 @@ test : gui2
translations : translations :
cd src/libprs500 && ${PYTHON} translations/__init__.py cd src/libprs500 && ${PYTHON} translations/__init__.py
resources:
${PYTHON} resources.py

39
resources.py Normal file
View File

@ -0,0 +1,39 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Compile resource files.
'''
import os, sys
sys.path.insert(1, os.path.join(os.getcwd(), 'src'))
from libprs500 import __appname__
# Map of resource name -> source template path.  The '%p' placeholder is
# expanded by main() below to the package source directory (src/<appname>).
RESOURCES = dict(
    opf_template = '%p/ebooks/metadata/opf.xml',
    ncx_template = '%p/ebooks/metadata/ncx.xml',
)
def main(args=sys.argv):
    '''
    Compile every file listed in RESOURCES into a single python module
    written to src/<appname>/resources.py, one ``name = <repr of bytes>``
    assignment per resource.
    '''
    data = ''
    for key, value in RESOURCES.items():
        path = value.replace('%p', 'src'+os.sep+__appname__)
        f = open(path, 'rb')
        try:
            # repr() of the raw bytes yields a valid python literal
            raw = repr(f.read())
        finally:
            f.close()
        data += key + ' = ' + raw + '\n\n'
    # os.path.join instead of manual concatenation: the original built
    # 'src' + os.sep + appname + os.sep + '/resources.py', which contains
    # a stray '/' (double separator on POSIX, broken path on Windows).
    dest = os.path.join('src', __appname__, 'resources.py')
    out = open(dest, 'wb')
    try:
        out.write(data)
    finally:
        out.close()
    return 0

if __name__ == '__main__':
    sys.exit(main())

View File

@ -60,6 +60,8 @@ def update_css(ncss, ocss):
def munge_paths(basepath, url): def munge_paths(basepath, url):
purl = urlparse(unquote(url),) purl = urlparse(unquote(url),)
path, fragment = purl[2], purl[5] path, fragment = purl[2], purl[5]
if path:
path = path.replace('/', os.sep)
if not path: if not path:
path = basepath path = basepath
elif not os.path.isabs(path): elif not os.path.isabs(path):
@ -223,7 +225,6 @@ class HTMLConverter(object):
self.extra_toc_entries = [] #: TOC entries gleaned from semantic information self.extra_toc_entries = [] #: TOC entries gleaned from semantic information
self.image_memory = [] self.image_memory = []
self.id_counter = 0 self.id_counter = 0
self.toc_from_metadata = False #: If True means that the toc has been populated from metadata
self.unused_target_blocks = [] #: Used to remove extra TextBlocks self.unused_target_blocks = [] #: Used to remove extra TextBlocks
self.link_level = 0 #: Current link level self.link_level = 0 #: Current link level
self.memory = [] #: Used to ensure that duplicate CSS unhandled errors are not reported self.memory = [] #: Used to ensure that duplicate CSS unhandled errors are not reported
@ -543,7 +544,7 @@ class HTMLConverter(object):
path, fragment = munge_paths(self.target_prefix, tag['href']) path, fragment = munge_paths(self.target_prefix, tag['href'])
return {'para':para, 'text':text, 'path':os.path.abspath(path), return {'para':para, 'text':text, 'path':os.path.abspath(path),
'fragment':fragment, 'in toc': (self.link_level == 0 and not self.toc_from_metadata)} 'fragment':fragment, 'in toc': (self.link_level == 0 and not self.use_spine)}
def get_text(self, tag, limit=None): def get_text(self, tag, limit=None):
@ -637,13 +638,12 @@ class HTMLConverter(object):
return outside_links return outside_links
def create_toc(self, toc): def create_toc(self, toc):
for (path, fragment, txt) in toc: for item in toc.top_level_items():
ascii_text = txt.encode('ascii', 'ignore') # Bug in SONY LRF renderer ascii_text = item.text.encode('ascii', 'ignore') # Bug in SONY LRF renderer
self.toc_from_metadata = True if not item.fragment and item.abspath in self.tops:
if not fragment and path in self.tops: self.book.addTocEntry(ascii_text, self.tops[item.abspath])
self.book.addTocEntry(ascii_text, self.tops[path])
else: else:
url = path+fragment url = item.abspath+item.fragment
if url in self.targets: if url in self.targets:
self.book.addTocEntry(ascii_text, self.targets[url]) self.book.addTocEntry(ascii_text, self.targets[url])
@ -1846,6 +1846,7 @@ def try_opf(path, options, logger):
options.cover = None options.cover = None
cover = opf.cover cover = opf.cover
if cover: if cover:
cover = cover.replace('/', os.sep)
if not os.path.isabs(cover): if not os.path.isabs(cover):
cover = os.path.join(dirpath, cover) cover = os.path.join(dirpath, cover)
if os.access(cover, os.R_OK): if os.access(cover, os.R_OK):

View File

@ -65,7 +65,7 @@ class LRFConverter(object):
def create_metadata(self): def create_metadata(self):
self.logger.info('Reading metadata...') self.logger.info('Reading metadata...')
mi = get_metadata(self.lrf) mi = get_metadata(self.lrf)
self.opf = OPFCreator(mi) self.opf = OPFCreator(self.output_dir, mi)
def create_page_styles(self): def create_page_styles(self):
self.page_css = '' self.page_css = ''

View File

@ -45,12 +45,13 @@ class MetaInformation(object):
ans = MetaInformation(mi.title, mi.authors) ans = MetaInformation(mi.title, mi.authors)
for attr in ('author_sort', 'title_sort', 'comments', 'category', for attr in ('author_sort', 'title_sort', 'comments', 'category',
'publisher', 'series', 'series_index', 'rating', 'publisher', 'series', 'series_index', 'rating',
'isbn', 'tags', 'cover_data', 'libprs_id'): 'isbn', 'tags', 'cover_data', 'application_id',
'manifest', 'spine', 'toc', 'cover'):
if hasattr(mi, attr): if hasattr(mi, attr):
setattr(ans, attr, getattr(mi, attr)) setattr(ans, attr, getattr(mi, attr))
def __init__(self, title, authors): def __init__(self, title, authors=['Unknown']):
''' '''
@param title: title or "Unknown" or a MetaInformation object @param title: title or "Unknown" or a MetaInformation object
@param authors: List of strings or [] @param authors: List of strings or []
@ -76,8 +77,11 @@ class MetaInformation(object):
self.isbn = None if not mi else mi.isbn self.isbn = None if not mi else mi.isbn
self.tags = [] if not mi else mi.tags self.tags = [] if not mi else mi.tags
self.cover_data = mi.cover_data if (mi and hasattr(mi, 'cover_data')) else (None, None) self.cover_data = mi.cover_data if (mi and hasattr(mi, 'cover_data')) else (None, None)
self.libprs_id = mi.libprs_id if (mi and hasattr(mi, 'libprs_id')) else None self.application_id = mi.application_id if (mi and hasattr(mi, 'application_id')) else None
self.manifest = getattr(mi, 'manifest', None)
self.toc = getattr(mi, 'toc', None)
self.spine = getattr(mi, 'spine', None)
self.cover = getattr(mi, 'cover', None)
def smart_update(self, mi): def smart_update(self, mi):
''' '''
@ -92,7 +96,7 @@ class MetaInformation(object):
for attr in ('author_sort', 'title_sort', 'comments', 'category', for attr in ('author_sort', 'title_sort', 'comments', 'category',
'publisher', 'series', 'series_index', 'rating', 'publisher', 'series', 'series_index', 'rating',
'isbn', 'libprs_id'): 'isbn', 'application_id', 'manifest', 'spine', 'toc', 'cover'):
if hasattr(mi, attr): if hasattr(mi, attr):
val = getattr(mi, attr) val = getattr(mi, attr)
if val is not None: if val is not None:

View File

@ -51,7 +51,7 @@ def metadata_from_formats(formats):
ext = path_to_ext(path) ext = path_to_ext(path)
stream = open(path, 'rb') stream = open(path, 'rb')
mi.smart_update(get_metadata(stream, stream_type=ext, use_libprs_metadata=True)) mi.smart_update(get_metadata(stream, stream_type=ext, use_libprs_metadata=True))
if getattr(mi, 'libprs_id', None) is not None: if getattr(mi, 'application_id', None) is not None:
return mi return mi
return mi return mi
@ -69,7 +69,7 @@ def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False):
if os.access(c, os.R_OK): if os.access(c, os.R_OK):
opf = opf_metadata(os.path.abspath(c)) opf = opf_metadata(os.path.abspath(c))
if use_libprs_metadata and getattr(opf, 'libprs_id', None) is not None: if use_libprs_metadata and getattr(opf, 'application_id', None) is not None:
return opf return opf
try: try:
@ -147,7 +147,7 @@ def opf_metadata(opfpath):
f = open(opfpath, 'rb') f = open(opfpath, 'rb')
opf = OPFReader(f, os.path.dirname(opfpath)) opf = OPFReader(f, os.path.dirname(opfpath))
try: try:
if opf.libprs_id is not None: if opf.application_id is not None:
mi = MetaInformation(opf, None) mi = MetaInformation(opf, None)
if hasattr(opf, 'cover') and opf.cover: if hasattr(opf, 'cover') and opf.cover:
cpath = os.path.join(os.path.dirname(opfpath), opf.cover) cpath = os.path.join(os.path.dirname(opfpath), opf.cover)

View File

@ -0,0 +1,27 @@
<ncx version="2005-1"
xml:lang="en"
xmlns="http://www.daisy.org/z3986/2005/ncx/"
xmlns:py="http://genshi.edgewall.org/"
>
<head>
<meta name="dtb:uid" content="${uid}"/>
<meta name="dtb:depth" content="${toc.depth()}"/>
<meta name="dtb:generator" content="${__appname__}"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle><text>Table of Contents</text></docTitle>
<py:def function="navpoint(np, level)">
${'%*s'%(4*level,'')}<navPoint playOrder="${str(np.play_order)}">
${'%*s'%(4*level,'')}<navLabel>
${'%*s'%(4*level,'')}<text>${np.text}</text>
${'%*s'%(4*level,'')}</navLabel>
${'%*s'%(4*level,'')}<content src="${str(np.href)+(('#' + str(np.fragment)) if np.fragment else '')}" />
<py:for each="np2 in np">${navpoint(np2, level+1)}</py:for>
${'%*s'%(4*level,'')}</navPoint>
</py:def>
<navMap>
<py:for each="np in toc">${navpoint(np, 0)}</py:for>
</navMap>
</ncx>

View File

@ -12,18 +12,21 @@
## You should have received a copy of the GNU General Public License along ## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc., ## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import uuid
'''Read/Write metadata from Open Packaging Format (.opf) files.''' '''Read/Write metadata from Open Packaging Format (.opf) files.'''
import sys, re, os, glob import sys, re, os, mimetypes
from urllib import unquote from urllib import unquote
from urlparse import urlparse from urlparse import urlparse
import xml.dom.minidom as dom import xml.dom.minidom as dom
from itertools import repeat from itertools import repeat
from libprs500 import __appname__
from libprs500.ebooks.metadata import MetaInformation from libprs500.ebooks.metadata import MetaInformation
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from libprs500.ebooks.lrf import entity_to_unicode from libprs500.ebooks.lrf import entity_to_unicode
from libprs500.ebooks.metadata import get_parser from libprs500.ebooks.metadata import get_parser
from libprs500.ebooks.metadata.toc import TOC
class ManifestItem(object): class ManifestItem(object):
def __init__(self, item, cwd): def __init__(self, item, cwd):
@ -41,6 +44,14 @@ class ManifestItem(object):
def __unicode__(self): def __unicode__(self):
return u'<item id="%s" href="%s" media-type="%s" />'%(self.id, self.href, self.media_type) return u'<item id="%s" href="%s" media-type="%s" />'%(self.id, self.href, self.media_type)
def __getitem__(self, index):
if index == 0:
return self.href
if index == 1:
return self.media_type
raise IndexError('%d out of bounds.'%index)
class Manifest(list): class Manifest(list):
def __init__(self, soup, dir): def __init__(self, soup, dir):
@ -82,84 +93,10 @@ class Spine(object):
for i in self.linear_ids + self.nonlinear_ids: for i in self.linear_ids + self.nonlinear_ids:
yield self.manifest.item(i) yield self.manifest.item(i)
class TOC(list): def __iter__(self):
for i in self.linear_ids + self.nonlinear_ids:
yield i
def __init__(self, opfreader, cwd):
self.toc = None
toc = opfreader.soup.find('spine', toc=True)
if toc is not None:
toc = toc['toc']
if toc is None:
try:
toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
except:
for item in opfreader.manifest:
if 'toc' in item.href.lower():
toc = item.href
break
if toc is not None:
if toc.lower() != 'ncx':
toc = urlparse(unquote(toc))[2]
if not os.path.isabs(toc):
toc = os.path.join(cwd, toc)
try:
if not os.path.exists(toc):
bn = os.path.basename(toc)
bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
toc = os.path.join(os.path.dirname(toc), bn)
self.read_html_toc(toc, cwd)
self.toc = toc
except:
pass
else:
cwd = os.path.abspath(cwd)
m = glob.glob(os.path.join(cwd, '*.ncx'))
if m:
toc = m[0]
try:
self.read_ncx_toc(toc)
self.toc = toc
except:
raise
pass
def read_ncx_toc(self, toc):
bdir = os.path.dirname(toc)
soup = BeautifulStoneSoup(open(toc, 'rb').read(),
convertEntities=BeautifulSoup.HTML_ENTITIES)
elems = soup.findAll('navpoint')
elems.sort(cmp=lambda x, y: cmp(int(x['playorder']), int(y['playorder'])))
for elem in elems:
txt = u''
for nl in elem.findAll('navlabel'):
for text in nl.findAll('text'):
txt += ''.join([unicode(s) for s in text.findAll(text=True)])
content = elem.find('content')
if content is None or not content.has_key('src') or not txt:
continue
purl = urlparse(unquote(content['src']))
href, fragment = purl[2], purl[5]
if not os.path.isabs(href):
href = os.path.join(bdir, href)
self.append((href, fragment, txt))
def read_html_toc(self, toc, cwd):
soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
for a in soup.findAll('a'):
if not a.has_key('href'):
continue
purl = urlparse(unquote(a['href']))
href, fragment = purl[2], purl[5]
if not os.path.isabs(href):
href = os.path.join(cwd, href)
txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
self.append((href, fragment, txt))
class standard_field(object): class standard_field(object):
@ -178,21 +115,21 @@ class OPF(MetaInformation):
MIMETYPE = 'application/oebps-package+xml' MIMETYPE = 'application/oebps-package+xml'
ENTITY_PATTERN = re.compile(r'&(\S+?);') ENTITY_PATTERN = re.compile(r'&(\S+?);')
uid = standard_field('uid') uid = standard_field('uid')
libprs_id = standard_field('libprs_id') application_id = standard_field('application_id')
title = standard_field('title') title = standard_field('title')
authors = standard_field('authors') authors = standard_field('authors')
title_sort = standard_field('title_sort') title_sort = standard_field('title_sort')
author_sort = standard_field('author_sort') author_sort = standard_field('author_sort')
comments = standard_field('comments') comments = standard_field('comments')
category = standard_field('category') category = standard_field('category')
publisher = standard_field('publisher') publisher = standard_field('publisher')
isbn = standard_field('isbn') isbn = standard_field('isbn')
cover = standard_field('cover') cover = standard_field('cover')
series = standard_field('series') series = standard_field('series')
series_index = standard_field('series_index') series_index = standard_field('series_index')
rating = standard_field('rating') rating = standard_field('rating')
tags = standard_field('tags') tags = standard_field('tags')
HEADER = '''\ HEADER = '''\
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
@ -207,14 +144,14 @@ class OPF(MetaInformation):
if not hasattr(self, 'soup'): if not hasattr(self, 'soup'):
self.soup = BeautifulStoneSoup(u'''\ self.soup = BeautifulStoneSoup(u'''\
%s %s
<package unique-identifier="libprs_id"> <package unique-identifier="%s_id">
<metadata> <metadata>
<dc-metadata <dc-metadata
xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/" /> xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/" />
</metadata> </metadata>
</package> </package>
'''%self.HEADER) '''%(__appname__, self.HEADER))
def _commit(self, doc): def _commit(self, doc):
self.soup = BeautifulStoneSoup(doc.toxml('utf-8'), fromEncoding='utf-8') self.soup = BeautifulStoneSoup(doc.toxml('utf-8'), fromEncoding='utf-8')
@ -403,15 +340,15 @@ class OPF(MetaInformation):
self._set_metadata_element('dc:identifier', isbn, [('scheme', 'ISBN')], self._set_metadata_element('dc:identifier', isbn, [('scheme', 'ISBN')],
replace=True) replace=True)
def get_libprs_id(self): def get_application_id(self):
for item in self.soup.package.metadata.findAll('dc:identifier'): for item in self.soup.package.metadata.findAll('dc:identifier'):
if item.has_key('scheme') and item['scheme'] == 'libprs': if item.has_key('scheme') and item['scheme'] == __appname__:
return str(item.string).strip() return str(item.string).strip()
return None return None
def set_libprs_id(self, val): def set_application_id(self, val):
if val: if val:
self._set_metadata_element('dc:identifier', str(val), [('scheme', 'libprs'), ('id', 'libprs_id')], self._set_metadata_element('dc:identifier', str(val), [('scheme', __appname__), ('id', __appname__+'_id')],
replace=True) replace=True)
def get_cover(self): def get_cover(self):
@ -564,61 +501,72 @@ class OPFReader(OPF):
stream.close() stream.close()
self.manifest = Manifest(self.soup, dir) self.manifest = Manifest(self.soup, dir)
self.spine = Spine(self.soup, self.manifest) self.spine = Spine(self.soup, self.manifest)
self.toc = TOC(self, dir) self.toc = TOC()
self.toc.read_from_opf(self)
self.cover_data = (None, None) self.cover_data = (None, None)
class OPFCreator(OPF): class OPFCreator(MetaInformation):
def __init__(self, mi): def __init__(self, base_path, *args, **kwargs):
self.title = mi.title '''
self.authors = mi.authors Initialize.
if mi.category: @param base_path: An absolute path to the directory in which this OPF file
self.category = mi.category will eventually be. This is used by the L{create_manifest} method
if mi.comments: to convert paths to files into relative paths.
self.comments = mi.comments '''
if mi.publisher: MetaInformation.__init__(self, *args, **kwargs)
self.publisher = mi.publisher self.base_path = os.path.abspath(base_path)
if mi.rating: if self.application_id is None:
self.rating = mi.rating self.application_id = str(uuid.uuid4())
if mi.series: self.toc = None
self.series = mi.series if isinstance(self.manifest, Manifest):
if mi.series_index: manifest = []
self.series_index = mi.series_index for path, mt in self.manifest:
if mi.tags: if not path.startswith(self.base_path):
self.tags = mi.tags raise ValueError('Invalid manifest item %s for base path %s'%(path, self.base_path))
if mi.isbn: path = path[len(self.base_path)+1:]
self.isbn = mi.isbn manifest.append((path, mt))
self.cover_data = mi.cover_data self.manifest = manifest
if hasattr(mi, 'libprs_id'):
self.libprs_id = mi.libprs_id
if hasattr(mi, 'uid'):
self.uid = mi.uid
def create_manifest(self, entries): def create_manifest(self, entries):
''' '''
Create <manifest> Create <manifest>
@param entries: List of (URL, mime-type) @param entries: List of (path, mime-type)
@param base_path: It is used to convert each path into a path relative to itself
@type entries: list of 2-tuples @type entries: list of 2-tuples
''' '''
doc = dom.parseString(self.soup.__str__('UTF-8').strip()) rentries = []
package = doc.documentElement base_path = self.base_path
manifest = doc.createElement('manifest') mimetypes.init()
package.appendChild(manifest) for href, mt in entries:
package.appendChild(doc.createTextNode('\n')) href = os.path.abspath(href)
if not href.startswith(base_path):
raise ValueError('OPF should only refer to files below it. %s is above %s'%(href, base_path))
href = href[len(base_path)+1:].replace(os.sep, '/')
if not mt:
mt = mimetypes.guess_type(href)[0]
if not mt:
mt = ''
rentries.append((href, mt))
self.href_map = {} self.manifest = rentries
for href, media_type in entries: def create_manifest_from_files_in(self, files_and_dirs):
item = doc.createElement('item') entries = []
item.setAttribute('href', href)
item.setAttribute('media-type', media_type)
self.href_map[href] = str(hash(href))
item.setAttribute('id', self.href_map[href])
manifest.appendChild(item)
manifest.appendChild(doc.createTextNode('\n'))
self._commit(doc) def dodir(dir):
for root, dirs, files in os.walk(dir):
for name in files:
path = os.path.join(root, name)
entries.append((path, None))
for i in files_and_dirs:
if os.path.isdir(i):
dodir(i)
else:
entries.append((i, None))
self.create_manifest(entries)
def create_spine(self, entries): def create_spine(self, entries):
''' '''
@ -626,19 +574,43 @@ class OPFCreator(OPF):
@param: List of paths @param: List of paths
@type param: list of strings @type param: list of strings
''' '''
doc = dom.parseString(self.soup.__str__('UTF-8').strip()) self.spine = []
package = doc.documentElement
spine = doc.createElement('spine')
package.appendChild(spine)
package.appendChild(doc.createTextNode('\n'))
for href in entries: for path in entries:
itemref = doc.createElement('itemref') if not os.path.isabs(path):
itemref.setAttribute('idref', self.href_map[href]) path = os.path.join(self.base_path, path)
spine.appendChild(itemref) if not path.startswith(self.base_path):
spine.appendChild(doc.createTextNode('\n')) raise ValueError('Invalid entry %s for base path %s'%(path, self.base_path))
href = path[len(self.base_path)+1:]
in_manifest = False
for i, m in enumerate(self.manifest):
if m[0] == href:
in_manifest = True
break
if not in_manifest:
raise ValueError('%s is not in the manifest. (%s)'%(href, path))
self.spine.append(i)
self._commit(doc)
def set_toc(self, toc):
'''
Set the toc. You must call L{create_spine} before calling this
method.
@param toc: A Table of Contents
@type toc: L{TOC}
'''
self.toc = toc
def render(self, opf_stream, ncx_stream=None):
from libprs500.resources import opf_template
from genshi.template import MarkupTemplate
template = MarkupTemplate(opf_template)
opf = template.generate(__appname__=__appname__, mi=self).render('xml')
opf_stream.write(opf)
toc = getattr(self, 'toc', None)
if toc is not None and ncx_stream is not None:
toc.render(ncx_stream, self.application_id)
def option_parser(): def option_parser():
return get_parser('opf') return get_parser('opf')
@ -649,7 +621,7 @@ def main(args=sys.argv):
if len(args) != 2: if len(args) != 2:
parser.print_help() parser.print_help()
return 1 return 1
mi = OPFReader(open(args[1], 'rb')) mi = MetaInformation(OPFReader(open(args[1], 'rb')))
if opts.title is not None: if opts.title is not None:
mi.title = opts.title.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;') mi.title = opts.title.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
if opts.authors is not None: if opts.authors is not None:
@ -660,7 +632,8 @@ def main(args=sys.argv):
if opts.comment is not None: if opts.comment is not None:
mi.comments = opts.comment.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;') mi.comments = opts.comment.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
print mi print mi
mi.write(open(args[1], 'wb')) mo = OPFCreator(os.getcwd(), mi)
mo.render(open(args[1], 'wb'))
return 0 return 0
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -0,0 +1,36 @@
<?xml version="1.0" encoding="UTF-8"?>
<package version="2.0"
xmlns:opf="http://www.idpf.org/2007/opf"
xmlns:py="http://genshi.edgewall.org/"
unique-identifier="${__appname__}_id"
>
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title py:with="attrs={'file-as':mi.title_sort}" py:attrs="attrs">${mi.title}</dc:title>
<dc:creator opf:role="aut" py:for="i, author in enumerate(mi.authors)" py:with="attrs={'file-as':mi.author_sort if i==0 else None}" py:attrs="attrs">${author}</dc:creator>
<dc:identifier scheme="${__appname__}" id="${__appname__}_id">${mi.application_id}</dc:identifier>
<dc:type py:if="mi.category">${mi.category}</dc:type>
<dc:description py:if="mi.comments">${mi.comments}</dc:description>
<dc:publisher py:if="mi.publisher">${mi.publisher}</dc:publisher>
<dc:identifier opf:scheme="ISBN" py:if="mi.isbn">${mi.isbn}</dc:identifier>
<series py:if="mi.series">${mi.series}</series>
<series-index py:if="mi.series_index is not None">${mi.series_index}</series-index>
<rating py:if="mi.rating is not None">${mi.rating}</rating>
<dc:subject py:if="mi.tags is not None" py:for="tag in mi.tags">${tag}</dc:subject>
</metadata>
<guide>
<reference py:if="mi.cover" type="cover" href="${mi.cover}" />
</guide>
<manifest>
<py:for each="i, m in enumerate(mi.manifest)">
<item id="${str(i)}" href="${m[0]}" media-type="${m[1]}" />
</py:for>
</manifest>
<spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
<itemref py:for="idref in mi.spine" idref="${str(idref)}" />
</spine>
</package>

View File

@ -0,0 +1,154 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import os, glob
from urlparse import urlparse
from urllib import unquote
from libprs500 import __appname__
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
class NCXSoup(BeautifulStoneSoup):
    '''
    BeautifulStoneSoup specialized for NCX documents: <navPoint> elements
    may nest arbitrarily, and <meta>/<content> are treated as self-closing.
    '''

    NESTABLE_TAGS = {'navpoint': []}

    def __init__(self, raw):
        BeautifulStoneSoup.__init__(
            self, raw,
            selfClosingTags=['meta', 'content'],
            convertEntities=BeautifulSoup.HTML_ENTITIES)
class TOC(list):

    '''
    A Table of Contents entry.  Each entry is itself a list of its child
    entries, so a TOC instance is simultaneously one node and the subtree
    rooted at that node.
    '''

    def __init__(self, href=None, fragment=None, text=None, parent=None,
                 play_order=1, base_path=None):
        '''
        @param href: Path (possibly relative to C{base_path}) of the file
                     this entry points to.
        @param fragment: Fragment (anchor) within C{href}, if any.
        @param text: Label of this entry.  Unlabelled entries are skipped
                     by L{top_level_items}.
        @param parent: Parent TOC entry, or None for the root.
        @param play_order: NCX play order of this entry.
        @param base_path: Directory against which relative hrefs are
                          resolved.  Defaults to the current working
                          directory at call time (the original default
                          C{os.getcwd()} was frozen at import time).
        '''
        self.href = href
        self.fragment = fragment
        self.text = text
        self.parent = parent
        self.base_path = os.getcwd() if base_path is None else base_path
        self.play_order = play_order

    def add_item(self, href, fragment, text):
        '''Append a child entry to this entry and return the new child.'''
        self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
                        base_path=self.base_path))
        return self[-1]

    def top_level_items(self):
        '''Iterate over the direct children that have a label.'''
        for item in self:
            if item.text is not None:
                yield item

    def depth(self):
        '''Return the depth of the deepest branch of this (sub)tree, >= 1.'''
        return 1 + max([child.depth() for child in self] + [0])

    # Plain @property instead of the Python-2-only @apply/property() idiom.
    @property
    def abspath(self):
        '''Absolute filesystem path of the file this entry points to.'''
        path = self.href.replace('/', os.sep)
        if not os.path.isabs(path):
            path = os.path.join(self.base_path, path)
        return path

    def read_from_opf(self, opfreader):
        '''
        Populate this TOC from the OPF wrapped by C{opfreader}: first look
        for a spine toc attribute, then a guide reference of type "toc",
        then any manifest item with "toc" in its href; an href of "ncx"
        means look for a *.ncx file next to the OPF instead.
        '''
        toc = opfreader.soup.find('spine', toc=True)
        if toc is not None:
            toc = toc['toc']
        if toc is None:
            try:
                toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
            except:
                # Best effort: fall back to scanning the manifest
                for item in opfreader.manifest:
                    if 'toc' in item.href.lower():
                        toc = item.href
                        break
        if toc is not None:
            if toc.lower() != 'ncx':
                toc = urlparse(unquote(toc))[2]
                toc = toc.replace('/', os.sep)
                if not os.path.isabs(toc):
                    toc = os.path.join(self.base_path, toc)
                try:
                    if not os.path.exists(toc):
                        bn = os.path.basename(toc)
                        bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
                        toc = os.path.join(os.path.dirname(toc), bn)
                    # read_html_toc takes only the toc path; the original
                    # passed a spurious second argument (TypeError).
                    self.read_html_toc(toc)
                except:
                    pass # A TOC is optional; ignore unreadable ones
            else:
                cwd = os.path.abspath(self.base_path)
                m = glob.glob(os.path.join(cwd, '*.ncx'))
                if m:
                    toc = m[0]
                    self.read_ncx_toc(toc)

    def read_ncx_toc(self, toc):
        '''Populate this TOC from the NCX file at path C{toc}.'''
        self.base_path = os.path.dirname(toc)
        soup = NCXSoup(open(toc, 'rb').read())

        def process_navpoint(np, dest):
            # BeautifulStoneSoup lowercases attribute names, so the NCX
            # playOrder attribute is exposed as 'playorder' (the original
            # looked up 'playOrder' and always got the default).
            try:
                play_order = int(np.get('playorder', 1))
            except ValueError:
                play_order = 1
            href = fragment = text = None
            nl = np.find('navlabel')
            if nl is not None:
                text = u''
                for txt in nl.findAll('text'):
                    text += ''.join([unicode(s) for s in txt.findAll(text=True)])
            # Was elem.find(...)/not txt: 'elem' and 'txt' leak in from
            # enclosing scopes, breaking nested navpoints and raising
            # NameError when the navlabel is missing.
            content = np.find('content')
            if content is None or not content.has_key('src') or not text:
                return
            purl = urlparse(unquote(content['src']))
            href, fragment = purl[2], purl[5]
            nd = dest.add_item(href, fragment, text)
            nd.play_order = play_order
            for c in np:
                if getattr(c, 'name', None) == 'navpoint':
                    process_navpoint(c, nd)

        nm = soup.find('navmap')
        for elem in nm:
            if getattr(elem, 'name', None) == 'navpoint':
                process_navpoint(elem, self)

    def read_html_toc(self, toc):
        '''Populate this TOC from the links in the HTML file at path C{toc}.'''
        self.base_path = os.path.dirname(toc)
        soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
        for a in soup.findAll('a'):
            if not a.has_key('href'):
                continue
            purl = urlparse(unquote(a['href']))
            href, fragment = purl[2], purl[5]
            txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
            self.add_item(href, fragment, txt)

    def render(self, stream, uid):
        '''
        Write this TOC as an NCX document to C{stream}.
        @param uid: Value for the dtb:uid meta element.
        '''
        from libprs500.resources import ncx_template
        from genshi.template import MarkupTemplate
        doctype = ('ncx', "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd")
        template = MarkupTemplate(ncx_template)
        raw = template.generate(uid=uid, toc=self, __appname__=__appname__)
        raw = raw.render(doctype=doctype)
        stream.write(raw)

View File

@ -186,11 +186,11 @@ class MobiReader(object):
if self.book_header.exth is not None: if self.book_header.exth is not None:
opf = self.create_opf(htmlfile) opf = self.create_opf(htmlfile)
opf.write(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb')) opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))
def create_opf(self, htmlfile): def create_opf(self, htmlfile):
mi = self.book_header.exth.mi mi = self.book_header.exth.mi
opf = OPFCreator(mi) opf = OPFCreator(os.path.dirname(htmlfile), mi)
if hasattr(self.book_header.exth, 'cover_offset'): if hasattr(self.book_header.exth, 'cover_offset'):
opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1) opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')] manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')]

View File

@ -1340,7 +1340,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
mi.rating = self.rating(idx, index_is_id=index_is_id) mi.rating = self.rating(idx, index_is_id=index_is_id)
mi.isbn = self.isbn(idx, index_is_id=index_is_id) mi.isbn = self.isbn(idx, index_is_id=index_is_id)
id = idx if index_is_id else self.id(idx) id = idx if index_is_id else self.id(idx)
mi.libprs_id = id mi.application_id = id
return mi return mi
def vacuum(self): def vacuum(self):
@ -1382,7 +1382,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
name += '_'+id name += '_'+id
base = dir if single_dir else tpath base = dir if single_dir else tpath
mi = OPFCreator(self.get_metadata(idx, index_is_id=index_is_id)) mi = OPFCreator(base, self.get_metadata(idx, index_is_id=index_is_id))
cover = self.cover(idx, index_is_id=index_is_id) cover = self.cover(idx, index_is_id=index_is_id)
if cover is not None: if cover is not None:
cname = name + '.jpg' cname = name + '.jpg'
@ -1390,7 +1390,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
open(cpath, 'wb').write(cover) open(cpath, 'wb').write(cover)
mi.cover = cname mi.cover = cname
f = open(os.path.join(base, name+'.opf'), 'wb') f = open(os.path.join(base, name+'.opf'), 'wb')
mi.write(f) mi.render(f)
f.close() f.close()
for fmt in self.formats(idx, index_is_id=index_is_id).split(','): for fmt in self.formats(idx, index_is_id=index_is_id).split(','):

View File

@ -44,6 +44,7 @@ entry_points = {
'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main', 'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main',
'web2disk = libprs500.web.fetch.simple:main', 'web2disk = libprs500.web.fetch.simple:main',
'feeds2disk = libprs500.web.feeds.main:main', 'feeds2disk = libprs500.web.feeds.main:main',
'feeds2lrf = libprs500.ebooks.lrf.feeds.convert_from:main',
'web2lrf = libprs500.ebooks.lrf.web.convert_from:main', 'web2lrf = libprs500.ebooks.lrf.web.convert_from:main',
'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main', 'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main',
'mobi2lrf = libprs500.ebooks.lrf.mobi.convert_from:main', 'mobi2lrf = libprs500.ebooks.lrf.mobi.convert_from:main',

View File

@ -201,6 +201,7 @@ class ProgressBar:
self.term.BOL + self.term.UP + self.term.CLEAR_EOL + self.term.BOL + self.term.UP + self.term.CLEAR_EOL +
(self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) + (self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) +
self.term.CLEAR_EOL + msg) self.term.CLEAR_EOL + msg)
sys.stdout.flush()
def clear(self): def clear(self):
if not self.cleared: if not self.cleared:

View File

@ -17,12 +17,13 @@
The backend to parse feeds and create HTML that can then be converted The backend to parse feeds and create HTML that can then be converted
to an ebook. to an ebook.
''' '''
import logging, os, cStringIO, time, itertools, traceback import logging, os, cStringIO, time, traceback
import urlparse import urlparse
from libprs500 import browser, __appname__ from libprs500 import browser, __appname__
from libprs500.ebooks.BeautifulSoup import BeautifulSoup from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.metadata.opf import OPFCreator from libprs500.ebooks.metadata.opf import OPFCreator
from libprs500.ebooks.metadata.toc import TOC
from libprs500.ebooks.metadata import MetaInformation from libprs500.ebooks.metadata import MetaInformation
from libprs500.web.feeds import feed_from_xml, templates from libprs500.web.feeds import feed_from_xml, templates
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
@ -94,6 +95,9 @@ class BasicNewsRecipe(object):
#: using cp1252. If None, try to detect the encoding. #: using cp1252. If None, try to detect the encoding.
encoding = None encoding = None
#: Specify any extra CSS that should be addded to downloaded HTML files
extra_css = None
#: List of regular expressions that determines which links to follow #: List of regular expressions that determines which links to follow
#: If empty, it is ignored. #: If empty, it is ignored.
#: Only one of L{match_regexps} or L{filter_regexps} should be defined #: Only one of L{match_regexps} or L{filter_regexps} should be defined
@ -276,8 +280,9 @@ class BasicNewsRecipe(object):
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0] self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps', for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
'preprocess_html', 'remove_tags_after', 'postprocess_html'): 'preprocess_html', 'remove_tags_after'):
setattr(self.web2disk_options, extra, getattr(self, extra)) setattr(self.web2disk_options, extra, getattr(self, extra))
self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]
if self.delay > 0: if self.delay > 0:
self.simultaneous_downloads = 1 self.simultaneous_downloads = 1
@ -288,6 +293,14 @@ class BasicNewsRecipe(object):
self.failed_downloads = [] self.failed_downloads = []
self.partial_failures = [] self.partial_failures = []
def _postprocess_html(self, soup):
if self.extra_css is not None:
head = soup.find('head')
if head:
style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
head.insert(len(head.contents), style)
return soup
def download(self): def download(self):
''' '''
Download and pre-process all articles from the feeds in this recipe. Download and pre-process all articles from the feeds in this recipe.
@ -297,6 +310,7 @@ class BasicNewsRecipe(object):
@rtype: string @rtype: string
''' '''
self.report_progress(0, _('Trying to download cover...')) self.report_progress(0, _('Trying to download cover...'))
self.download_cover() self.download_cover()
res = self.build_index() res = self.build_index()
self.cleanup() self.cleanup()
@ -362,7 +376,7 @@ class BasicNewsRecipe(object):
fetcher.current_dir = dir fetcher.current_dir = dir
fetcher.show_progress = False fetcher.show_progress = False
res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
if not res: if not res or not os.path.exists(res):
raise Exception(_('Could not fetch article. Run with --debug to see the reason')) raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
return res, path, failures return res, path, failures
@ -446,28 +460,44 @@ class BasicNewsRecipe(object):
if dir is None: if dir is None:
dir = self.output_dir dir = self.output_dir
mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__]) mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
opf = OPFCreator(mi)
opf_path = os.path.join(dir, 'index.opf') opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
opf = OPFCreator(dir, mi)
manifest = ['feed_%d'%i for i in range(len(feeds))]
manifest.append('index.html')
cpath = getattr(self, 'cover_path', None) cpath = getattr(self, 'cover_path', None)
if cpath is not None and os.access(cpath, os.R_OK): if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath opf.cover = cpath
manifest.append(cpath)
opf.create_manifest_from_files_in(manifest)
entries = ['index.html'] entries = ['index.html']
toc = TOC(base_path=dir)
for i, f in enumerate(feeds): for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i) entries.append('feed_%d/index.html'%i)
feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
for j, a in enumerate(f): for j, a in enumerate(f):
if getattr(a, 'downloaded', False): if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(i, j) adir = 'feed_%d/article_%d/'%(i, j)
entries.append('%sindex.html'%adir) entries.append('%sindex.html'%adir)
feed.add_item('%sindex.html'%adir, None, a.title if a.title else 'Untitled article')
for sp in a.sub_pages: for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp]) prefix = os.path.commonprefix([opf_path, sp])
relp = sp[len(prefix):] relp = sp[len(prefix):]
entries.append(relp.replace(os.sep, '/')) entries.append(relp.replace(os.sep, '/'))
opf.create_manifest(itertools.izip(entries, itertools.repeat('text/html')))
opf.create_spine(entries) opf.create_spine(entries)
opf.write(open(opf_path, 'wb')) opf.set_toc(toc)
for i, f in enumerate(feeds):
for j, a in enumerate(f):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(i, j)
opf.render(open(opf_path, 'wb'), open(ncx_path, 'wb'))
def article_downloaded(self, request, result): def article_downloaded(self, request, result):
@ -516,7 +546,7 @@ class BasicNewsRecipe(object):
title, url = None, obj title, url = None, obj
else: else:
title, url = obj title, url = obj
self.report_progress(0, _('Fetching feed %s...'%(title if title else url))) self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url))
parsed_feeds.append(feed_from_xml(self.browser.open(url).read(), parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
title=title, title=title,
oldest_article=self.oldest_article, oldest_article=self.oldest_article,

View File

@ -41,7 +41,7 @@ class Newsweek(BasicNewsRecipe):
'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen', 'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
] ]
extra_css = '#content { font:serif,120%; }' extra_css = '#content { font:serif 1.2em; }'
keep_only_tags = [dict(name='div', id='content')] keep_only_tags = [dict(name='div', id='content')]
remove_tags = [ remove_tags = [
@ -55,8 +55,8 @@ class Newsweek(BasicNewsRecipe):
match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+'] match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
# For testing # For testing
#feeds = feeds[:2] #feeds = feeds[3:5]
#max_articles_per_feed = 1 #max_articles_per_feed = 2

View File

@ -57,16 +57,17 @@ class NavBarTemplate(Template):
<body> <body>
<div class="navbar" style="text-align:center"> <div class="navbar" style="text-align:center">
<hr py:if="bottom" /> <hr py:if="bottom" />
<a href="../index.html#article_${str(art)}">Up one level</a> <py:if test="art != num - 1">
| <a href="../article_${str(art+1)}/index.html">Next</a>
</py:if>
| <a href="../index.html#article_${str(art)}">Up one level</a>
<py:if test="two_levels"> <py:if test="two_levels">
| <a href="../../index.html#_${str(feed)}">Up two levels</a> | <a href="../../index.html#_${str(feed)}">Up two levels</a>
</py:if> </py:if>
<py:if test="art != 0"> <py:if test="art != 0">
| <a href="../article_${str(art-1)}/index.html">Previous</a> | <a href="../article_${str(art-1)}/index.html">Previous</a>
</py:if> </py:if>
<py:if test="art != num - 1"> |
| <a href="../article_${str(art+1)}/index.html">Next</a>
</py:if>
<hr py:if="not bottom" /> <hr py:if="not bottom" />
</div> </div>
</body> </body>

View File

@ -38,9 +38,9 @@ def basename(url):
def save_soup(soup, target): def save_soup(soup, target):
nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />') nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
for meta in soup.find('meta', content=True): meta = soup.find('meta', content=True)
if 'charset' in meta['content']: if meta and 'charset' in meta['content']:
meta.replaceWith(nm) meta.replaceWith(nm)
f = codecs.open(target, 'w', 'utf-8') f = codecs.open(target, 'w', 'utf-8')
f.write(unicode(soup)) f.write(unicode(soup))
f.close() f.close()
@ -85,7 +85,7 @@ class RecursiveFetcher(object):
self.remove_tags_after = getattr(options, 'remove_tags_after', None) self.remove_tags_after = getattr(options, 'remove_tags_after', None)
self.keep_only_tags = getattr(options, 'keep_only_tags', []) self.keep_only_tags = getattr(options, 'keep_only_tags', [])
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup) self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
self.postprocess_html_ext= getattr(options, 'postprocess_html', lambda soup: soup) self.postprocess_html_ext= getattr(options, 'postprocess_html', [])
self.download_stylesheets = not options.no_stylesheets self.download_stylesheets = not options.no_stylesheets
self.show_progress = True self.show_progress = True
self.failed_links = [] self.failed_links = []
@ -336,7 +336,9 @@ class RecursiveFetcher(object):
self.process_return_links(soup, iurl) self.process_return_links(soup, iurl)
self.logger.debug('Recursion limit reached. Skipping links in %s', iurl) self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
save_soup(self.postprocess_html_ext(soup), res) for func in self.postprocess_html_ext:
soup = func(soup)
save_soup(soup, res)
self.localize_link(tag, 'href', res) self.localize_link(tag, 'href', res)
except Exception, err: except Exception, err: