Refactored OPF creation code. Implemented Table of Contents support in feeds2disk.

This commit is contained in:
Kovid Goyal 2008-03-14 19:25:48 +00:00
parent 748c184ccb
commit 6982652f92
18 changed files with 482 additions and 209 deletions

View File

@ -1,6 +1,6 @@
PYTHON = python
all : gui2 translations
all : gui2 translations resources
clean :
cd src/libprs500/gui2 && ${PYTHON} make.py clean
@ -14,3 +14,7 @@ test : gui2
translations :
cd src/libprs500 && ${PYTHON} translations/__init__.py
resources:
${PYTHON} resources.py

39
resources.py Normal file
View File

@ -0,0 +1,39 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Compile resource files.
'''
import os, sys
sys.path.insert(1, os.path.join(os.getcwd(), 'src'))
from libprs500 import __appname__
RESOURCES = dict(
opf_template = '%p/ebooks/metadata/opf.xml',
ncx_template = '%p/ebooks/metadata/ncx.xml',
)
def main(args=sys.argv):
    '''
    Compile the template files listed in RESOURCES into a generated
    python module at src/<appname>/resources.py, with one module-level
    variable per resource holding the raw file contents as a repr()'d
    byte string.

    @return: 0 on success (exit code for sys.exit).
    '''
    data = ''
    for key, value in RESOURCES.items():
        # '%p' is a placeholder for the package source directory
        path = value.replace('%p', os.path.join('src', __appname__))
        bytes = repr(open(path, 'rb').read())
        data += key + ' = ' + bytes + '\n\n'
    # FIX: original built the path as 'src'+os.sep+__appname__+os.sep+'/resources.py',
    # concatenating os.sep with a literal '/' and yielding e.g. 'src/libprs500//resources.py'.
    open(os.path.join('src', __appname__, 'resources.py'), 'wb').write(data)
    return 0

if __name__ == '__main__':
    sys.exit(main())

View File

@ -60,6 +60,8 @@ def update_css(ncss, ocss):
def munge_paths(basepath, url):
purl = urlparse(unquote(url),)
path, fragment = purl[2], purl[5]
if path:
path = path.replace('/', os.sep)
if not path:
path = basepath
elif not os.path.isabs(path):
@ -223,7 +225,6 @@ class HTMLConverter(object):
self.extra_toc_entries = [] #: TOC entries gleaned from semantic information
self.image_memory = []
self.id_counter = 0
self.toc_from_metadata = False #: If True means that the toc has been populated from metadata
self.unused_target_blocks = [] #: Used to remove extra TextBlocks
self.link_level = 0 #: Current link level
self.memory = [] #: Used to ensure that duplicate CSS unhandled erros are not reported
@ -543,7 +544,7 @@ class HTMLConverter(object):
path, fragment = munge_paths(self.target_prefix, tag['href'])
return {'para':para, 'text':text, 'path':os.path.abspath(path),
'fragment':fragment, 'in toc': (self.link_level == 0 and not self.toc_from_metadata)}
'fragment':fragment, 'in toc': (self.link_level == 0 and not self.use_spine)}
def get_text(self, tag, limit=None):
@ -637,13 +638,12 @@ class HTMLConverter(object):
return outside_links
def create_toc(self, toc):
for (path, fragment, txt) in toc:
ascii_text = txt.encode('ascii', 'ignore') # Bug in SONY LRF renderer
self.toc_from_metadata = True
if not fragment and path in self.tops:
self.book.addTocEntry(ascii_text, self.tops[path])
for item in toc.top_level_items():
ascii_text = item.text.encode('ascii', 'ignore') # Bug in SONY LRF renderer
if not item.fragment and item.abspath in self.tops:
self.book.addTocEntry(ascii_text, self.tops[item.abspath])
else:
url = path+fragment
url = item.abspath+item.fragment
if url in self.targets:
self.book.addTocEntry(ascii_text, self.targets[url])
@ -1846,6 +1846,7 @@ def try_opf(path, options, logger):
options.cover = None
cover = opf.cover
if cover:
cover = cover.replace('/', os.sep)
if not os.path.isabs(cover):
cover = os.path.join(dirpath, cover)
if os.access(cover, os.R_OK):

View File

@ -65,7 +65,7 @@ class LRFConverter(object):
def create_metadata(self):
self.logger.info('Reading metadata...')
mi = get_metadata(self.lrf)
self.opf = OPFCreator(mi)
self.opf = OPFCreator(self.output_dir, mi)
def create_page_styles(self):
self.page_css = ''

View File

@ -45,12 +45,13 @@ class MetaInformation(object):
ans = MetaInformation(mi.title, mi.authors)
for attr in ('author_sort', 'title_sort', 'comments', 'category',
'publisher', 'series', 'series_index', 'rating',
'isbn', 'tags', 'cover_data', 'libprs_id'):
'isbn', 'tags', 'cover_data', 'application_id',
'manifest', 'spine', 'toc', 'cover'):
if hasattr(mi, attr):
setattr(ans, attr, getattr(mi, attr))
def __init__(self, title, authors):
def __init__(self, title, authors=['Unknown']):
'''
@param title: title or "Unknown" or a MetaInformation object
@param authors: List of strings or []
@ -76,8 +77,11 @@ class MetaInformation(object):
self.isbn = None if not mi else mi.isbn
self.tags = [] if not mi else mi.tags
self.cover_data = mi.cover_data if (mi and hasattr(mi, 'cover_data')) else (None, None)
self.libprs_id = mi.libprs_id if (mi and hasattr(mi, 'libprs_id')) else None
self.application_id = mi.application_id if (mi and hasattr(mi, 'application_id')) else None
self.manifest = getattr(mi, 'manifest', None)
self.toc = getattr(mi, 'toc', None)
self.spine = getattr(mi, 'spine', None)
self.cover = getattr(mi, 'cover', None)
def smart_update(self, mi):
'''
@ -92,7 +96,7 @@ class MetaInformation(object):
for attr in ('author_sort', 'title_sort', 'comments', 'category',
'publisher', 'series', 'series_index', 'rating',
'isbn', 'libprs_id'):
'isbn', 'application_id', 'manifest', 'spine', 'toc', 'cover'):
if hasattr(mi, attr):
val = getattr(mi, attr)
if val is not None:

View File

@ -51,7 +51,7 @@ def metadata_from_formats(formats):
ext = path_to_ext(path)
stream = open(path, 'rb')
mi.smart_update(get_metadata(stream, stream_type=ext, use_libprs_metadata=True))
if getattr(mi, 'libprs_id', None) is not None:
if getattr(mi, 'application_id', None) is not None:
return mi
return mi
@ -69,7 +69,7 @@ def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False):
if os.access(c, os.R_OK):
opf = opf_metadata(os.path.abspath(c))
if use_libprs_metadata and getattr(opf, 'libprs_id', None) is not None:
if use_libprs_metadata and getattr(opf, 'application_id', None) is not None:
return opf
try:
@ -147,7 +147,7 @@ def opf_metadata(opfpath):
f = open(opfpath, 'rb')
opf = OPFReader(f, os.path.dirname(opfpath))
try:
if opf.libprs_id is not None:
if opf.application_id is not None:
mi = MetaInformation(opf, None)
if hasattr(opf, 'cover') and opf.cover:
cpath = os.path.join(os.path.dirname(opfpath), opf.cover)

View File

@ -0,0 +1,27 @@
<ncx version="2005-1"
    xml:lang="en"
    xmlns="http://www.daisy.org/z3986/2005/ncx/"
    xmlns:py="http://genshi.edgewall.org/"
>
<head>
<meta name="dtb:uid" content="${uid}"/>
<meta name="dtb:depth" content="${toc.depth()}"/>
<meta name="dtb:generator" content="${__appname__}"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle><text>Table of Contents</text></docTitle>
<py:def function="navpoint(np, level)">
${'%*s'%(4*level,'')}<navPoint id="np_${str(np.play_order)}" playOrder="${str(np.play_order)}">
${'%*s'%(4*level,'')}<navLabel>
${'%*s'%(4*level,'')}<text>${np.text}</text>
${'%*s'%(4*level,'')}</navLabel>
${'%*s'%(4*level,'')}<content src="${str(np.href)+(('#' + str(np.fragment)) if np.fragment else '')}" />
<py:for each="np2 in np">${navpoint(np2, level+1)}</py:for>
${'%*s'%(4*level,'')}</navPoint>
</py:def>
<navMap>
<py:for each="np in toc">${navpoint(np, 0)}</py:for>
</navMap>
</ncx>

View File

@ -12,18 +12,21 @@
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import uuid
'''Read/Write metadata from Open Packaging Format (.opf) files.'''
import sys, re, os, glob
import sys, re, os, mimetypes
from urllib import unquote
from urlparse import urlparse
import xml.dom.minidom as dom
from itertools import repeat
from libprs500 import __appname__
from libprs500.ebooks.metadata import MetaInformation
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from libprs500.ebooks.lrf import entity_to_unicode
from libprs500.ebooks.metadata import get_parser
from libprs500.ebooks.metadata.toc import TOC
class ManifestItem(object):
def __init__(self, item, cwd):
@ -41,6 +44,14 @@ class ManifestItem(object):
def __unicode__(self):
return u'<item id="%s" href="%s" media-type="%s" />'%(self.id, self.href, self.media_type)
def __getitem__(self, index):
if index == 0:
return self.href
if index == 1:
return self.media_type
raise IndexError('%d out of bounds.'%index)
class Manifest(list):
def __init__(self, soup, dir):
@ -82,84 +93,10 @@ class Spine(object):
for i in self.linear_ids + self.nonlinear_ids:
yield self.manifest.item(i)
class TOC(list):
def __iter__(self):
for i in self.linear_ids + self.nonlinear_ids:
yield i
def __init__(self, opfreader, cwd):
self.toc = None
toc = opfreader.soup.find('spine', toc=True)
if toc is not None:
toc = toc['toc']
if toc is None:
try:
toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
except:
for item in opfreader.manifest:
if 'toc' in item.href.lower():
toc = item.href
break
if toc is not None:
if toc.lower() != 'ncx':
toc = urlparse(unquote(toc))[2]
if not os.path.isabs(toc):
toc = os.path.join(cwd, toc)
try:
if not os.path.exists(toc):
bn = os.path.basename(toc)
bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
toc = os.path.join(os.path.dirname(toc), bn)
self.read_html_toc(toc, cwd)
self.toc = toc
except:
pass
else:
cwd = os.path.abspath(cwd)
m = glob.glob(os.path.join(cwd, '*.ncx'))
if m:
toc = m[0]
try:
self.read_ncx_toc(toc)
self.toc = toc
except:
raise
pass
def read_ncx_toc(self, toc):
bdir = os.path.dirname(toc)
soup = BeautifulStoneSoup(open(toc, 'rb').read(),
convertEntities=BeautifulSoup.HTML_ENTITIES)
elems = soup.findAll('navpoint')
elems.sort(cmp=lambda x, y: cmp(int(x['playorder']), int(y['playorder'])))
for elem in elems:
txt = u''
for nl in elem.findAll('navlabel'):
for text in nl.findAll('text'):
txt += ''.join([unicode(s) for s in text.findAll(text=True)])
content = elem.find('content')
if content is None or not content.has_key('src') or not txt:
continue
purl = urlparse(unquote(content['src']))
href, fragment = purl[2], purl[5]
if not os.path.isabs(href):
href = os.path.join(bdir, href)
self.append((href, fragment, txt))
def read_html_toc(self, toc, cwd):
soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
for a in soup.findAll('a'):
if not a.has_key('href'):
continue
purl = urlparse(unquote(a['href']))
href, fragment = purl[2], purl[5]
if not os.path.isabs(href):
href = os.path.join(cwd, href)
txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
self.append((href, fragment, txt))
class standard_field(object):
@ -179,7 +116,7 @@ class OPF(MetaInformation):
ENTITY_PATTERN = re.compile(r'&(\S+?);')
uid = standard_field('uid')
libprs_id = standard_field('libprs_id')
application_id = standard_field('application_id')
title = standard_field('title')
authors = standard_field('authors')
title_sort = standard_field('title_sort')
@ -207,14 +144,14 @@ class OPF(MetaInformation):
if not hasattr(self, 'soup'):
self.soup = BeautifulStoneSoup(u'''\
%s
<package unique-identifier="libprs_id">
<package unique-identifier="%s_id">
<metadata>
<dc-metadata
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/" />
</metadata>
</package>
'''%self.HEADER)
'''%(__appname__, self.HEADER))
def _commit(self, doc):
self.soup = BeautifulStoneSoup(doc.toxml('utf-8'), fromEncoding='utf-8')
@ -403,15 +340,15 @@ class OPF(MetaInformation):
self._set_metadata_element('dc:identifier', isbn, [('scheme', 'ISBN')],
replace=True)
def get_libprs_id(self):
def get_application_id(self):
for item in self.soup.package.metadata.findAll('dc:identifier'):
if item.has_key('scheme') and item['scheme'] == 'libprs':
if item.has_key('scheme') and item['scheme'] == __appname__:
return str(item.string).strip()
return None
def set_libprs_id(self, val):
def set_application_id(self, val):
if val:
self._set_metadata_element('dc:identifier', str(val), [('scheme', 'libprs'), ('id', 'libprs_id')],
self._set_metadata_element('dc:identifier', str(val), [('scheme', __appname__), ('id', __appname__+'_id')],
replace=True)
def get_cover(self):
@ -564,61 +501,72 @@ class OPFReader(OPF):
stream.close()
self.manifest = Manifest(self.soup, dir)
self.spine = Spine(self.soup, self.manifest)
self.toc = TOC(self, dir)
self.toc = TOC()
self.toc.read_from_opf(self)
self.cover_data = (None, None)
class OPFCreator(OPF):
class OPFCreator(MetaInformation):
def __init__(self, mi):
self.title = mi.title
self.authors = mi.authors
if mi.category:
self.category = mi.category
if mi.comments:
self.comments = mi.comments
if mi.publisher:
self.publisher = mi.publisher
if mi.rating:
self.rating = mi.rating
if mi.series:
self.series = mi.series
if mi.series_index:
self.series_index = mi.series_index
if mi.tags:
self.tags = mi.tags
if mi.isbn:
self.isbn = mi.isbn
self.cover_data = mi.cover_data
if hasattr(mi, 'libprs_id'):
self.libprs_id = mi.libprs_id
if hasattr(mi, 'uid'):
self.uid = mi.uid
def __init__(self, base_path, *args, **kwargs):
'''
Initialize.
@param base_path: An absolute path to the directory in which this OPF file
will eventually be. This is used by the L{create_manifest} method
to convert paths to files into relative paths.
'''
MetaInformation.__init__(self, *args, **kwargs)
self.base_path = os.path.abspath(base_path)
if self.application_id is None:
self.application_id = str(uuid.uuid4())
self.toc = None
if isinstance(self.manifest, Manifest):
manifest = []
for path, mt in self.manifest:
if not path.startswith(self.base_path):
raise ValueError('Inavlid manifest item %s for base path %s'%(path, self.base_path))
path = path[len(self.base_path)+1:]
manifest.append((path, mt))
self.manifest = manifest
def create_manifest(self, entries):
'''
Create <manifest>
@param entries: List of (URL, mime-type)
@param entries: List of (path, mime-type)
@param base_path: It is used to convert each path into a path relative to itself
@type entries: list of 2-tuples
'''
doc = dom.parseString(self.soup.__str__('UTF-8').strip())
package = doc.documentElement
manifest = doc.createElement('manifest')
package.appendChild(manifest)
package.appendChild(doc.createTextNode('\n'))
rentries = []
base_path = self.base_path
mimetypes.init()
for href, mt in entries:
href = os.path.abspath(href)
if not href.startswith(base_path):
raise ValueError('OPF should only refer to files below it. %s is above %s'%(href, base_path))
href = href[len(base_path)+1:].replace(os.sep, '/')
if not mt:
mt = mimetypes.guess_type(href)[0]
if not mt:
mt = ''
rentries.append((href, mt))
self.href_map = {}
self.manifest = rentries
for href, media_type in entries:
item = doc.createElement('item')
item.setAttribute('href', href)
item.setAttribute('media-type', media_type)
self.href_map[href] = str(hash(href))
item.setAttribute('id', self.href_map[href])
manifest.appendChild(item)
manifest.appendChild(doc.createTextNode('\n'))
def create_manifest_from_files_in(self, files_and_dirs):
entries = []
self._commit(doc)
def dodir(dir):
for root, dirs, files in os.walk(dir):
for name in files:
path = os.path.join(root, name)
entries.append((path, None))
for i in files_and_dirs:
if os.path.isdir(i):
dodir(i)
else:
entries.append((i, None))
self.create_manifest(entries)
def create_spine(self, entries):
'''
@ -626,19 +574,43 @@ class OPFCreator(OPF):
@param: List of paths
@type param: list of strings
'''
doc = dom.parseString(self.soup.__str__('UTF-8').strip())
package = doc.documentElement
spine = doc.createElement('spine')
package.appendChild(spine)
package.appendChild(doc.createTextNode('\n'))
self.spine = []
for href in entries:
itemref = doc.createElement('itemref')
itemref.setAttribute('idref', self.href_map[href])
spine.appendChild(itemref)
spine.appendChild(doc.createTextNode('\n'))
for path in entries:
if not os.path.isabs(path):
path = os.path.join(self.base_path, path)
if not path.startswith(self.base_path):
raise ValueError('Invalid entry %s for base path %s'%(path, self.base_path))
href = path[len(self.base_path)+1:]
in_manifest = False
for i, m in enumerate(self.manifest):
if m[0] == href:
in_manifest = True
break
if not in_manifest:
raise ValueError('%s is not in the manifest. (%s)'%(href, path))
self.spine.append(i)
self._commit(doc)
def set_toc(self, toc):
'''
Set the toc. You must call L{create_spine} before calling this
method.
@param toc: A Table of Contents
@type toc: L{TOC}
'''
self.toc = toc
def render(self, opf_stream, ncx_stream=None):
from libprs500.resources import opf_template
from genshi.template import MarkupTemplate
template = MarkupTemplate(opf_template)
opf = template.generate(__appname__=__appname__, mi=self).render('xml')
opf_stream.write(opf)
toc = getattr(self, 'toc', None)
if toc is not None and ncx_stream is not None:
toc.render(ncx_stream, self.application_id)
def option_parser():
return get_parser('opf')
@ -649,7 +621,7 @@ def main(args=sys.argv):
if len(args) != 2:
parser.print_help()
return 1
mi = OPFReader(open(args[1], 'rb'))
mi = MetaInformation(OPFReader(open(args[1], 'rb')))
if opts.title is not None:
mi.title = opts.title.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
if opts.authors is not None:
@ -660,7 +632,8 @@ def main(args=sys.argv):
if opts.comment is not None:
mi.comments = opts.comment.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
print mi
mi.write(open(args[1], 'wb'))
mo = OPFCreator(os.getcwd(), mi)
mo.render(open(args[1], 'wb'))
return 0
if __name__ == '__main__':

View File

@ -0,0 +1,36 @@
<?xml version="1.0" encoding="UTF-8"?>
<package version="2.0"
    xmlns:opf="http://www.idpf.org/2007/opf"
    xmlns:py="http://genshi.edgewall.org/"
    unique-identifier="${__appname__}_id"
>
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title py:with="attrs={'file-as':mi.title_sort}" py:attrs="attrs">${mi.title}</dc:title>
<dc:creator opf:role="aut" py:for="i, author in enumerate(mi.authors)" py:with="attrs={'file-as':mi.author_sort if i==0 else None}" py:attrs="attrs">${author}</dc:creator>
<dc:identifier scheme="${__appname__}" id="${__appname__}_id">${mi.application_id}</dc:identifier>
<dc:type py:if="mi.category">${mi.category}</dc:type>
<dc:description py:if="mi.comments">${mi.comments}</dc:description>
<dc:publisher py:if="mi.publisher">${mi.publisher}</dc:publisher>
<dc:identifier opf:scheme="ISBN" py:if="mi.isbn">${mi.isbn}</dc:identifier>
<series py:if="mi.series">${mi.series}</series>
<series-index py:if="mi.series_index is not None">${mi.series_index}</series-index>
<rating py:if="mi.rating is not None">${mi.rating}</rating>
<dc:subject py:if="mi.tags is not None" py:for="tag in mi.tags">${tag}</dc:subject>
</metadata>
<guide>
<reference py:if="mi.cover" type="cover" href="${mi.cover}" />
</guide>
<manifest>
<py:for each="i, m in enumerate(mi.manifest)">
<item id="${str(i)}" href="${m[0]}" media-type="${m[1]}" />
</py:for>
</manifest>
<spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
<itemref py:for="idref in mi.spine" idref="${str(idref)}" />
</spine>
</package>

View File

@ -0,0 +1,154 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import os, glob
from urlparse import urlparse
from urllib import unquote
from libprs500 import __appname__
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
class NCXSoup(BeautifulStoneSoup):
    '''
    BeautifulStoneSoup subclass configured for parsing NCX table of
    contents files: navPoint elements nest arbitrarily, and meta/content
    elements are self closing.
    '''

    NESTABLE_TAGS = {'navpoint': []}

    def __init__(self, raw):
        kwargs = dict(
            convertEntities=BeautifulSoup.HTML_ENTITIES,
            selfClosingTags=['meta', 'content'],
        )
        BeautifulStoneSoup.__init__(self, raw, **kwargs)
class TOC(list):
    '''
    A tree of Table of Contents entries. A TOC node is itself a list of
    its child entries, so the root node (usually created with no href)
    acts purely as a container.
    '''

    def __init__(self, href=None, fragment=None, text=None, parent=None,
                 play_order=1, base_path=None):
        '''
        @param href: Path (possibly relative) to the file this entry points to.
        @param fragment: Optional fragment (anchor) within that file.
        @param text: Human readable label for this entry.
        @param parent: The containing TOC node (None for the root).
        @param play_order: Reading order index used when rendering NCX.
        @param base_path: Directory against which relative hrefs are
            resolved. Defaults to the current working directory at
            construction time (the original evaluated os.getcwd() once at
            import time, which silently froze the value).
        '''
        self.href = href
        self.fragment = fragment
        self.text = text
        self.parent = parent
        self.base_path = os.getcwd() if base_path is None else base_path
        self.play_order = play_order

    def add_item(self, href, fragment, text):
        '''Append a child entry (inheriting base_path) and return it.'''
        self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
                        base_path=self.base_path))
        return self[-1]

    def top_level_items(self):
        '''Iterate over direct children that carry a label.'''
        for item in self:
            if item.text is not None:
                yield item

    def depth(self):
        '''Depth of the tree rooted at this node; a leaf has depth 1.'''
        depth = 1
        for obj in self:
            c = obj.depth()
            if c > depth - 1:
                depth = c + 1
        return depth

    def _get_abspath(self):
        path = self.href.replace('/', os.sep)
        if not os.path.isabs(path):
            path = os.path.join(self.base_path, path)
        return path

    # Plain property() instead of the @apply trick: apply() is a
    # deprecated builtin and the idiom obscures that this is a read-only
    # computed attribute.
    abspath = property(fget=_get_abspath,
        doc='The file this toc entry points to as an absolute path on the system.')

    def read_from_opf(self, opfreader):
        '''
        Populate this TOC from an OPFReader: prefer the spine's toc
        attribute, then a guide reference of type "toc", then any manifest
        item whose href mentions "toc". An HTML toc is parsed directly; the
        special value "ncx" triggers a search for a .ncx file in base_path.
        '''
        toc = opfreader.soup.find('spine', toc=True)
        if toc is not None:
            toc = toc['toc']
        if toc is None:
            try:
                toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
            except:
                for item in opfreader.manifest:
                    if 'toc' in item.href.lower():
                        toc = item.href
                        break
        if toc is not None:
            if toc.lower() != 'ncx':
                toc = urlparse(unquote(toc))[2]
                toc = toc.replace('/', os.sep)
                if not os.path.isabs(toc):
                    toc = os.path.join(self.base_path, toc)
                try:
                    if not os.path.exists(toc):
                        # Bug in BAEN OPF files
                        bn = os.path.basename(toc).replace('_top.htm', '_toc.htm')
                        toc = os.path.join(os.path.dirname(toc), bn)
                    # FIX: read_html_toc takes only the path; the original
                    # passed self.base_path as a second positional argument,
                    # which raised TypeError on every HTML toc.
                    self.read_html_toc(toc)
                except:
                    pass
            else:
                cwd = os.path.abspath(self.base_path)
                m = glob.glob(os.path.join(cwd, '*.ncx'))
                if m:
                    toc = m[0]
                    self.read_ncx_toc(toc)

    def read_ncx_toc(self, toc):
        '''Populate this TOC from the NCX file at path C{toc}.'''
        self.base_path = os.path.dirname(toc)
        soup = NCXSoup(open(toc, 'rb').read())

        def process_navpoint(np, dest):
            # NOTE(review): BeautifulStoneSoup may lowercase attribute
            # names, so check both spellings; also coerce to int so
            # playOrder sorts/renders numerically.
            play_order = np.get('playorder', np.get('playOrder', 1))
            try:
                play_order = int(play_order)
            except (TypeError, ValueError):
                play_order = 1
            href = fragment = text = None
            nl = np.find('navlabel')
            if nl is not None:
                text = u''
                for txt in nl.findAll('text'):
                    text += u''.join([unicode(s) for s in txt.findAll(text=True)])
            # FIX: the original read elem.find('content') — the outer loop
            # variable — so every nested navpoint got the top-level
            # navpoint's content. It also tested the inner loop variable
            # txt instead of the accumulated text.
            content = np.find('content')
            if content is None or not content.has_key('src') or not text:
                return
            purl = urlparse(unquote(content['src']))
            href, fragment = purl[2], purl[5]
            nd = dest.add_item(href, fragment, text)
            nd.play_order = play_order
            for c in np:
                if getattr(c, 'name', None) == 'navpoint':
                    process_navpoint(c, nd)

        nm = soup.find('navmap')
        for elem in nm:
            if getattr(elem, 'name', None) == 'navpoint':
                process_navpoint(elem, self)

    def read_html_toc(self, toc):
        '''Populate this TOC from the links in the HTML file at path C{toc}.'''
        self.base_path = os.path.dirname(toc)
        soup = BeautifulSoup(open(toc, 'rb').read(),
                             convertEntities=BeautifulSoup.HTML_ENTITIES)
        for a in soup.findAll('a'):
            if not a.has_key('href'):
                continue
            purl = urlparse(unquote(a['href']))
            href, fragment = purl[2], purl[5]
            txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
            self.add_item(href, fragment, txt)

    def render(self, stream, uid):
        '''
        Write this TOC as an NCX document to C{stream}.
        @param uid: Unique identifier placed in the dtb:uid meta element.
        '''
        from libprs500.resources import ncx_template
        from genshi.template import MarkupTemplate
        doctype = ('ncx', "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd")
        template = MarkupTemplate(ncx_template)
        raw = template.generate(uid=uid, toc=self, __appname__=__appname__)
        raw = raw.render(doctype=doctype)
        stream.write(raw)

View File

@ -186,11 +186,11 @@ class MobiReader(object):
if self.book_header.exth is not None:
opf = self.create_opf(htmlfile)
opf.write(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))
opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))
def create_opf(self, htmlfile):
mi = self.book_header.exth.mi
opf = OPFCreator(mi)
opf = OPFCreator(os.path.dirname(htmlfile), mi)
if hasattr(self.book_header.exth, 'cover_offset'):
opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')]

View File

@ -1340,7 +1340,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
mi.rating = self.rating(idx, index_is_id=index_is_id)
mi.isbn = self.isbn(idx, index_is_id=index_is_id)
id = idx if index_is_id else self.id(idx)
mi.libprs_id = id
mi.application_id = id
return mi
def vacuum(self):
@ -1382,7 +1382,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
name += '_'+id
base = dir if single_dir else tpath
mi = OPFCreator(self.get_metadata(idx, index_is_id=index_is_id))
mi = OPFCreator(base, self.get_metadata(idx, index_is_id=index_is_id))
cover = self.cover(idx, index_is_id=index_is_id)
if cover is not None:
cname = name + '.jpg'
@ -1390,7 +1390,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
open(cpath, 'wb').write(cover)
mi.cover = cname
f = open(os.path.join(base, name+'.opf'), 'wb')
mi.write(f)
mi.render(f)
f.close()
for fmt in self.formats(idx, index_is_id=index_is_id).split(','):

View File

@ -44,6 +44,7 @@ entry_points = {
'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main',
'web2disk = libprs500.web.fetch.simple:main',
'feeds2disk = libprs500.web.feeds.main:main',
'feeds2lrf = libprs500.ebooks.lrf.feeds.convert_from:main',
'web2lrf = libprs500.ebooks.lrf.web.convert_from:main',
'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main',
'mobi2lrf = libprs500.ebooks.lrf.mobi.convert_from:main',

View File

@ -201,6 +201,7 @@ class ProgressBar:
self.term.BOL + self.term.UP + self.term.CLEAR_EOL +
(self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) +
self.term.CLEAR_EOL + msg)
sys.stdout.flush()
def clear(self):
if not self.cleared:

View File

@ -17,12 +17,13 @@
The backend to parse feeds and create HTML that can then be converted
to an ebook.
'''
import logging, os, cStringIO, time, itertools, traceback
import logging, os, cStringIO, time, traceback
import urlparse
from libprs500 import browser, __appname__
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.metadata.opf import OPFCreator
from libprs500.ebooks.metadata.toc import TOC
from libprs500.ebooks.metadata import MetaInformation
from libprs500.web.feeds import feed_from_xml, templates
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
@ -94,6 +95,9 @@ class BasicNewsRecipe(object):
#: using cp1252. If None, try to detect the encoding.
encoding = None
#: Specify any extra CSS that should be addded to downloaded HTML files
extra_css = None
#: List of regular expressions that determines which links to follow
#: If empty, it is ignored.
#: Only one of L{match_regexps} or L{filter_regexps} should be defined
@ -276,8 +280,9 @@ class BasicNewsRecipe(object):
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
'preprocess_html', 'remove_tags_after', 'postprocess_html'):
'preprocess_html', 'remove_tags_after'):
setattr(self.web2disk_options, extra, getattr(self, extra))
self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]
if self.delay > 0:
self.simultaneous_downloads = 1
@ -288,6 +293,14 @@ class BasicNewsRecipe(object):
self.failed_downloads = []
self.partial_failures = []
def _postprocess_html(self, soup):
if self.extra_css is not None:
head = soup.find('head')
if head:
style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
head.insert(len(head.contents), style)
return soup
def download(self):
'''
Download and pre-process all articles from the feeds in this recipe.
@ -297,6 +310,7 @@ class BasicNewsRecipe(object):
@rtype: string
'''
self.report_progress(0, _('Trying to download cover...'))
self.download_cover()
res = self.build_index()
self.cleanup()
@ -362,7 +376,7 @@ class BasicNewsRecipe(object):
fetcher.current_dir = dir
fetcher.show_progress = False
res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
if not res:
if not res or not os.path.exists(res):
raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
return res, path, failures
@ -446,28 +460,44 @@ class BasicNewsRecipe(object):
if dir is None:
dir = self.output_dir
mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
opf = OPFCreator(mi)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
opf = OPFCreator(dir, mi)
manifest = ['feed_%d'%i for i in range(len(feeds))]
manifest.append('index.html')
cpath = getattr(self, 'cover_path', None)
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
opf.create_manifest_from_files_in(manifest)
entries = ['index.html']
toc = TOC(base_path=dir)
for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i)
feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
for j, a in enumerate(f):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(i, j)
entries.append('%sindex.html'%adir)
feed.add_item('%sindex.html'%adir, None, a.title if a.title else 'Untitled article')
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
relp = sp[len(prefix):]
entries.append(relp.replace(os.sep, '/'))
opf.create_manifest(itertools.izip(entries, itertools.repeat('text/html')))
opf.create_spine(entries)
opf.write(open(opf_path, 'wb'))
opf.set_toc(toc)
for i, f in enumerate(feeds):
for j, a in enumerate(f):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(i, j)
opf.render(open(opf_path, 'wb'), open(ncx_path, 'wb'))
def article_downloaded(self, request, result):
@ -516,7 +546,7 @@ class BasicNewsRecipe(object):
title, url = None, obj
else:
title, url = obj
self.report_progress(0, _('Fetching feed %s...'%(title if title else url)))
self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url))
parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
title=title,
oldest_article=self.oldest_article,

View File

@ -41,7 +41,7 @@ class Newsweek(BasicNewsRecipe):
'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
]
extra_css = '#content { font:serif,120%; }'
extra_css = '#content { font:serif 1.2em; }'
keep_only_tags = [dict(name='div', id='content')]
remove_tags = [
@ -55,8 +55,8 @@ class Newsweek(BasicNewsRecipe):
match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
# For testing
#feeds = feeds[:2]
#max_articles_per_feed = 1
#feeds = feeds[3:5]
#max_articles_per_feed = 2

View File

@ -57,16 +57,17 @@ class NavBarTemplate(Template):
<body>
<div class="navbar" style="text-align:center">
<hr py:if="bottom" />
<a href="../index.html#article_${str(art)}">Up one level</a>
<py:if test="art != num - 1">
| <a href="../article_${str(art+1)}/index.html">Next</a>
</py:if>
| <a href="../index.html#article_${str(art)}">Up one level</a>
<py:if test="two_levels">
| <a href="../../index.html#_${str(feed)}">Up two levels</a>
</py:if>
<py:if test="art != 0">
| <a href="../article_${str(art-1)}/index.html">Previous</a>
</py:if>
<py:if test="art != num - 1">
| <a href="../article_${str(art+1)}/index.html">Next</a>
</py:if>
|
<hr py:if="not bottom" />
</div>
</body>

View File

@ -38,8 +38,8 @@ def basename(url):
def save_soup(soup, target):
nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
for meta in soup.find('meta', content=True):
if 'charset' in meta['content']:
meta = soup.find('meta', content=True)
if meta and 'charset' in meta['content']:
meta.replaceWith(nm)
f = codecs.open(target, 'w', 'utf-8')
f.write(unicode(soup))
@ -85,7 +85,7 @@ class RecursiveFetcher(object):
self.remove_tags_after = getattr(options, 'remove_tags_after', None)
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
self.postprocess_html_ext= getattr(options, 'postprocess_html', lambda soup: soup)
self.postprocess_html_ext= getattr(options, 'postprocess_html', [])
self.download_stylesheets = not options.no_stylesheets
self.show_progress = True
self.failed_links = []
@ -336,7 +336,9 @@ class RecursiveFetcher(object):
self.process_return_links(soup, iurl)
self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
save_soup(self.postprocess_html_ext(soup), res)
for func in self.postprocess_html_ext:
soup = func(soup)
save_soup(soup, res)
self.localize_link(tag, 'href', res)
except Exception, err: