diff --git a/Makefile b/Makefile
index c3514fb0de..4b920c6a39 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
PYTHON = python
-all : gui2 translations
+all : gui2 translations resources
clean :
cd src/libprs500/gui2 && ${PYTHON} make.py clean
@@ -13,4 +13,8 @@ test : gui2
translations :
cd src/libprs500 && ${PYTHON} translations/__init__.py
+
+resources:
+ ${PYTHON} resources.py
+
diff --git a/resources.py b/resources.py
new file mode 100644
index 0000000000..cf5cf58253
--- /dev/null
+++ b/resources.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Compile resource files.
+'''
+import os, sys
+sys.path.insert(1, os.path.join(os.getcwd(), 'src'))
+from libprs500 import __appname__
+
+RESOURCES = dict(
+ opf_template = '%p/ebooks/metadata/opf.xml',
+ ncx_template = '%p/ebooks/metadata/ncx.xml',
+ )
+
+def main(args=sys.argv):
+ data = ''
+ for key, value in RESOURCES.items():
+ path = value.replace('%p', 'src'+os.sep+__appname__)
+ bytes = repr(open(path, 'rb').read())
+ data += key + ' = ' + bytes + '\n\n'
+ open('src'+os.sep+__appname__+os.sep+'resources.py', 'wb').write(data)
+ return 0
+
+if __name__ == '__main__':
+ sys.exit(main())
\ No newline at end of file
diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py
index eb4149c521..6ec3f06c53 100644
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -60,6 +60,8 @@ def update_css(ncss, ocss):
def munge_paths(basepath, url):
purl = urlparse(unquote(url),)
path, fragment = purl[2], purl[5]
+ if path:
+ path = path.replace('/', os.sep)
if not path:
path = basepath
elif not os.path.isabs(path):
@@ -223,7 +225,6 @@ class HTMLConverter(object):
self.extra_toc_entries = [] #: TOC entries gleaned from semantic information
self.image_memory = []
self.id_counter = 0
- self.toc_from_metadata = False #: If True means that the toc has been populated from metadata
self.unused_target_blocks = [] #: Used to remove extra TextBlocks
self.link_level = 0 #: Current link level
self.memory = [] #: Used to ensure that duplicate CSS unhandled erros are not reported
@@ -543,7 +544,7 @@ class HTMLConverter(object):
path, fragment = munge_paths(self.target_prefix, tag['href'])
return {'para':para, 'text':text, 'path':os.path.abspath(path),
- 'fragment':fragment, 'in toc': (self.link_level == 0 and not self.toc_from_metadata)}
+ 'fragment':fragment, 'in toc': (self.link_level == 0 and not self.use_spine)}
def get_text(self, tag, limit=None):
@@ -637,13 +638,12 @@ class HTMLConverter(object):
return outside_links
def create_toc(self, toc):
- for (path, fragment, txt) in toc:
- ascii_text = txt.encode('ascii', 'ignore') # Bug in SONY LRF renderer
- self.toc_from_metadata = True
- if not fragment and path in self.tops:
- self.book.addTocEntry(ascii_text, self.tops[path])
+ for item in toc.top_level_items():
+ ascii_text = item.text.encode('ascii', 'ignore') # Bug in SONY LRF renderer
+ if not item.fragment and item.abspath in self.tops:
+ self.book.addTocEntry(ascii_text, self.tops[item.abspath])
else:
- url = path+fragment
+ url = item.abspath+item.fragment
if url in self.targets:
self.book.addTocEntry(ascii_text, self.targets[url])
@@ -1846,6 +1846,7 @@ def try_opf(path, options, logger):
options.cover = None
cover = opf.cover
if cover:
+ cover = cover.replace('/', os.sep)
if not os.path.isabs(cover):
cover = os.path.join(dirpath, cover)
if os.access(cover, os.R_OK):
diff --git a/src/libprs500/ebooks/lrf/html/convert_to.py b/src/libprs500/ebooks/lrf/html/convert_to.py
index 0e42a4d5b7..242b43d0df 100644
--- a/src/libprs500/ebooks/lrf/html/convert_to.py
+++ b/src/libprs500/ebooks/lrf/html/convert_to.py
@@ -65,7 +65,7 @@ class LRFConverter(object):
def create_metadata(self):
self.logger.info('Reading metadata...')
mi = get_metadata(self.lrf)
- self.opf = OPFCreator(mi)
+ self.opf = OPFCreator(self.output_dir, mi)
def create_page_styles(self):
self.page_css = ''
@@ -126,4 +126,4 @@ def main(args=sys.argv):
if __name__ == '__main__':
- sys.exit(main())
\ No newline at end of file
+ sys.exit(main())
diff --git a/src/libprs500/ebooks/metadata/__init__.py b/src/libprs500/ebooks/metadata/__init__.py
index dbd1886f68..544bb6c3d0 100644
--- a/src/libprs500/ebooks/metadata/__init__.py
+++ b/src/libprs500/ebooks/metadata/__init__.py
@@ -45,12 +45,13 @@ class MetaInformation(object):
ans = MetaInformation(mi.title, mi.authors)
for attr in ('author_sort', 'title_sort', 'comments', 'category',
'publisher', 'series', 'series_index', 'rating',
- 'isbn', 'tags', 'cover_data', 'libprs_id'):
+ 'isbn', 'tags', 'cover_data', 'application_id',
+ 'manifest', 'spine', 'toc', 'cover'):
if hasattr(mi, attr):
setattr(ans, attr, getattr(mi, attr))
- def __init__(self, title, authors):
+ def __init__(self, title, authors=['Unknown']):
'''
@param title: title or "Unknown" or a MetaInformation object
@param authors: List of strings or []
@@ -76,8 +77,11 @@ class MetaInformation(object):
self.isbn = None if not mi else mi.isbn
self.tags = [] if not mi else mi.tags
self.cover_data = mi.cover_data if (mi and hasattr(mi, 'cover_data')) else (None, None)
- self.libprs_id = mi.libprs_id if (mi and hasattr(mi, 'libprs_id')) else None
-
+ self.application_id = mi.application_id if (mi and hasattr(mi, 'application_id')) else None
+ self.manifest = getattr(mi, 'manifest', None)
+ self.toc = getattr(mi, 'toc', None)
+ self.spine = getattr(mi, 'spine', None)
+ self.cover = getattr(mi, 'cover', None)
def smart_update(self, mi):
'''
@@ -92,7 +96,7 @@ class MetaInformation(object):
for attr in ('author_sort', 'title_sort', 'comments', 'category',
'publisher', 'series', 'series_index', 'rating',
- 'isbn', 'libprs_id'):
+ 'isbn', 'application_id', 'manifest', 'spine', 'toc', 'cover'):
if hasattr(mi, attr):
val = getattr(mi, attr)
if val is not None:
@@ -117,4 +121,4 @@ class MetaInformation(object):
return ans.strip()
def __nonzero__(self):
- return bool(self.title or self.author or self.comments or self.category)
\ No newline at end of file
+ return bool(self.title or self.author or self.comments or self.category)
diff --git a/src/libprs500/ebooks/metadata/meta.py b/src/libprs500/ebooks/metadata/meta.py
index 8e2f3e5524..ed78f39a14 100644
--- a/src/libprs500/ebooks/metadata/meta.py
+++ b/src/libprs500/ebooks/metadata/meta.py
@@ -51,7 +51,7 @@ def metadata_from_formats(formats):
ext = path_to_ext(path)
stream = open(path, 'rb')
mi.smart_update(get_metadata(stream, stream_type=ext, use_libprs_metadata=True))
- if getattr(mi, 'libprs_id', None) is not None:
+ if getattr(mi, 'application_id', None) is not None:
return mi
return mi
@@ -69,7 +69,7 @@ def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False):
if os.access(c, os.R_OK):
opf = opf_metadata(os.path.abspath(c))
- if use_libprs_metadata and getattr(opf, 'libprs_id', None) is not None:
+ if use_libprs_metadata and getattr(opf, 'application_id', None) is not None:
return opf
try:
@@ -147,7 +147,7 @@ def opf_metadata(opfpath):
f = open(opfpath, 'rb')
opf = OPFReader(f, os.path.dirname(opfpath))
try:
- if opf.libprs_id is not None:
+ if opf.application_id is not None:
mi = MetaInformation(opf, None)
if hasattr(opf, 'cover') and opf.cover:
cpath = os.path.join(os.path.dirname(opfpath), opf.cover)
diff --git a/src/libprs500/ebooks/metadata/ncx.xml b/src/libprs500/ebooks/metadata/ncx.xml
new file mode 100644
index 0000000000..7bcb9ac479
--- /dev/null
+++ b/src/libprs500/ebooks/metadata/ncx.xml
@@ -0,0 +1,27 @@
+
+
+
+
+
+
+
+
+ Table of Contents
+
+
+ ${'%*s'%(4*level,'')}
+ ${'%*s'%(4*level,'')}
+ ${'%*s'%(4*level,'')}${np.text}
+ ${'%*s'%(4*level,'')}
+ ${'%*s'%(4*level,'')}
+ ${navpoint(np2, level+1)}
+ ${'%*s'%(4*level,'')}
+
+
+ ${navpoint(np, 0)}
+
+
\ No newline at end of file
diff --git a/src/libprs500/ebooks/metadata/opf.py b/src/libprs500/ebooks/metadata/opf.py
index 833f8ae51f..c1d88706da 100644
--- a/src/libprs500/ebooks/metadata/opf.py
+++ b/src/libprs500/ebooks/metadata/opf.py
@@ -12,18 +12,21 @@
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+import uuid
'''Read/Write metadata from Open Packaging Format (.opf) files.'''
-import sys, re, os, glob
+import sys, re, os, mimetypes
from urllib import unquote
from urlparse import urlparse
import xml.dom.minidom as dom
from itertools import repeat
+from libprs500 import __appname__
from libprs500.ebooks.metadata import MetaInformation
-from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
+from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from libprs500.ebooks.lrf import entity_to_unicode
from libprs500.ebooks.metadata import get_parser
+from libprs500.ebooks.metadata.toc import TOC
class ManifestItem(object):
def __init__(self, item, cwd):
@@ -40,6 +43,14 @@ class ManifestItem(object):
def __unicode__(self):
return u' '%(self.id, self.href, self.media_type)
+
+ def __getitem__(self, index):
+ if index == 0:
+ return self.href
+ if index == 1:
+ return self.media_type
+ raise IndexError('%d out of bounds.'%index)
+
class Manifest(list):
@@ -81,85 +92,11 @@ class Spine(object):
def items(self):
for i in self.linear_ids + self.nonlinear_ids:
yield self.manifest.item(i)
+
+ def __iter__(self):
+ for i in self.linear_ids + self.nonlinear_ids:
+ yield i
-class TOC(list):
-
- def __init__(self, opfreader, cwd):
- self.toc = None
- toc = opfreader.soup.find('spine', toc=True)
- if toc is not None:
- toc = toc['toc']
- if toc is None:
- try:
- toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
- except:
- for item in opfreader.manifest:
- if 'toc' in item.href.lower():
- toc = item.href
- break
-
- if toc is not None:
- if toc.lower() != 'ncx':
- toc = urlparse(unquote(toc))[2]
- if not os.path.isabs(toc):
- toc = os.path.join(cwd, toc)
- try:
- if not os.path.exists(toc):
- bn = os.path.basename(toc)
- bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
- toc = os.path.join(os.path.dirname(toc), bn)
-
- self.read_html_toc(toc, cwd)
- self.toc = toc
- except:
- pass
- else:
- cwd = os.path.abspath(cwd)
- m = glob.glob(os.path.join(cwd, '*.ncx'))
- if m:
- toc = m[0]
- try:
- self.read_ncx_toc(toc)
- self.toc = toc
- except:
- raise
- pass
-
- def read_ncx_toc(self, toc):
- bdir = os.path.dirname(toc)
- soup = BeautifulStoneSoup(open(toc, 'rb').read(),
- convertEntities=BeautifulSoup.HTML_ENTITIES)
- elems = soup.findAll('navpoint')
- elems.sort(cmp=lambda x, y: cmp(int(x['playorder']), int(y['playorder'])))
-
- for elem in elems:
- txt = u''
- for nl in elem.findAll('navlabel'):
- for text in nl.findAll('text'):
- txt += ''.join([unicode(s) for s in text.findAll(text=True)])
-
- content = elem.find('content')
- if content is None or not content.has_key('src') or not txt:
- continue
-
- purl = urlparse(unquote(content['src']))
- href, fragment = purl[2], purl[5]
- if not os.path.isabs(href):
- href = os.path.join(bdir, href)
- self.append((href, fragment, txt))
-
-
- def read_html_toc(self, toc, cwd):
- soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
- for a in soup.findAll('a'):
- if not a.has_key('href'):
- continue
- purl = urlparse(unquote(a['href']))
- href, fragment = purl[2], purl[5]
- if not os.path.isabs(href):
- href = os.path.join(cwd, href)
- txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
- self.append((href, fragment, txt))
class standard_field(object):
@@ -178,21 +115,21 @@ class OPF(MetaInformation):
MIMETYPE = 'application/oebps-package+xml'
ENTITY_PATTERN = re.compile(r'&(\S+?);')
- uid = standard_field('uid')
- libprs_id = standard_field('libprs_id')
- title = standard_field('title')
- authors = standard_field('authors')
- title_sort = standard_field('title_sort')
- author_sort = standard_field('author_sort')
- comments = standard_field('comments')
- category = standard_field('category')
- publisher = standard_field('publisher')
- isbn = standard_field('isbn')
- cover = standard_field('cover')
- series = standard_field('series')
- series_index = standard_field('series_index')
- rating = standard_field('rating')
- tags = standard_field('tags')
+ uid = standard_field('uid')
+ application_id = standard_field('application_id')
+ title = standard_field('title')
+ authors = standard_field('authors')
+ title_sort = standard_field('title_sort')
+ author_sort = standard_field('author_sort')
+ comments = standard_field('comments')
+ category = standard_field('category')
+ publisher = standard_field('publisher')
+ isbn = standard_field('isbn')
+ cover = standard_field('cover')
+ series = standard_field('series')
+ series_index = standard_field('series_index')
+ rating = standard_field('rating')
+ tags = standard_field('tags')
HEADER = '''\
@@ -207,14 +144,14 @@ class OPF(MetaInformation):
if not hasattr(self, 'soup'):
self.soup = BeautifulStoneSoup(u'''\
%s
-
+
-'''%self.HEADER)
+'''%(__appname__, self.HEADER))
def _commit(self, doc):
self.soup = BeautifulStoneSoup(doc.toxml('utf-8'), fromEncoding='utf-8')
@@ -403,15 +340,15 @@ class OPF(MetaInformation):
self._set_metadata_element('dc:identifier', isbn, [('scheme', 'ISBN')],
replace=True)
- def get_libprs_id(self):
+ def get_application_id(self):
for item in self.soup.package.metadata.findAll('dc:identifier'):
- if item.has_key('scheme') and item['scheme'] == 'libprs':
+ if item.has_key('scheme') and item['scheme'] == __appname__:
return str(item.string).strip()
return None
- def set_libprs_id(self, val):
+ def set_application_id(self, val):
if val:
- self._set_metadata_element('dc:identifier', str(val), [('scheme', 'libprs'), ('id', 'libprs_id')],
+ self._set_metadata_element('dc:identifier', str(val), [('scheme', __appname__), ('id', __appname__+'_id')],
replace=True)
def get_cover(self):
@@ -564,61 +501,72 @@ class OPFReader(OPF):
stream.close()
self.manifest = Manifest(self.soup, dir)
self.spine = Spine(self.soup, self.manifest)
- self.toc = TOC(self, dir)
+ self.toc = TOC()
+ self.toc.read_from_opf(self)
self.cover_data = (None, None)
-class OPFCreator(OPF):
+class OPFCreator(MetaInformation):
+
+ def __init__(self, base_path, *args, **kwargs):
+ '''
+ Initialize.
+ @param base_path: An absolute path to the directory in which this OPF file
+ will eventually be. This is used by the L{create_manifest} method
+ to convert paths to files into relative paths.
+ '''
+ MetaInformation.__init__(self, *args, **kwargs)
+ self.base_path = os.path.abspath(base_path)
+ if self.application_id is None:
+ self.application_id = str(uuid.uuid4())
+ self.toc = None
+ if isinstance(self.manifest, Manifest):
+ manifest = []
+ for path, mt in self.manifest:
+ if not path.startswith(self.base_path):
+ raise ValueError('Invalid manifest item %s for base path %s'%(path, self.base_path))
+ path = path[len(self.base_path)+1:]
+ manifest.append((path, mt))
+ self.manifest = manifest
- def __init__(self, mi):
- self.title = mi.title
- self.authors = mi.authors
- if mi.category:
- self.category = mi.category
- if mi.comments:
- self.comments = mi.comments
- if mi.publisher:
- self.publisher = mi.publisher
- if mi.rating:
- self.rating = mi.rating
- if mi.series:
- self.series = mi.series
- if mi.series_index:
- self.series_index = mi.series_index
- if mi.tags:
- self.tags = mi.tags
- if mi.isbn:
- self.isbn = mi.isbn
- self.cover_data = mi.cover_data
- if hasattr(mi, 'libprs_id'):
- self.libprs_id = mi.libprs_id
- if hasattr(mi, 'uid'):
- self.uid = mi.uid
-
def create_manifest(self, entries):
'''
Create
- @param entries: List of (URL, mime-type)
+ @param entries: List of (path, mime-type)
+ @param base_path: It is used to convert each path into a path relative to itself
@type entries: list of 2-tuples
'''
- doc = dom.parseString(self.soup.__str__('UTF-8').strip())
- package = doc.documentElement
- manifest = doc.createElement('manifest')
- package.appendChild(manifest)
- package.appendChild(doc.createTextNode('\n'))
-
- self.href_map = {}
-
- for href, media_type in entries:
- item = doc.createElement('item')
- item.setAttribute('href', href)
- item.setAttribute('media-type', media_type)
- self.href_map[href] = str(hash(href))
- item.setAttribute('id', self.href_map[href])
- manifest.appendChild(item)
- manifest.appendChild(doc.createTextNode('\n'))
-
- self._commit(doc)
+ rentries = []
+ base_path = self.base_path
+ mimetypes.init()
+ for href, mt in entries:
+ href = os.path.abspath(href)
+ if not href.startswith(base_path):
+ raise ValueError('OPF should only refer to files below it. %s is above %s'%(href, base_path))
+ href = href[len(base_path)+1:].replace(os.sep, '/')
+ if not mt:
+ mt = mimetypes.guess_type(href)[0]
+ if not mt:
+ mt = ''
+ rentries.append((href, mt))
+ self.manifest = rentries
+
+ def create_manifest_from_files_in(self, files_and_dirs):
+ entries = []
+
+ def dodir(dir):
+ for root, dirs, files in os.walk(dir):
+ for name in files:
+ path = os.path.join(root, name)
+ entries.append((path, None))
+
+ for i in files_and_dirs:
+ if os.path.isdir(i):
+ dodir(i)
+ else:
+ entries.append((i, None))
+
+ self.create_manifest(entries)
def create_spine(self, entries):
'''
@@ -626,19 +574,43 @@ class OPFCreator(OPF):
@param: List of paths
@type param: list of strings
'''
- doc = dom.parseString(self.soup.__str__('UTF-8').strip())
- package = doc.documentElement
- spine = doc.createElement('spine')
- package.appendChild(spine)
- package.appendChild(doc.createTextNode('\n'))
+ self.spine = []
- for href in entries:
- itemref = doc.createElement('itemref')
- itemref.setAttribute('idref', self.href_map[href])
- spine.appendChild(itemref)
- spine.appendChild(doc.createTextNode('\n'))
+ for path in entries:
+ if not os.path.isabs(path):
+ path = os.path.join(self.base_path, path)
+ if not path.startswith(self.base_path):
+ raise ValueError('Invalid entry %s for base path %s'%(path, self.base_path))
+ href = path[len(self.base_path)+1:]
+ in_manifest = False
+ for i, m in enumerate(self.manifest):
+ if m[0] == href:
+ in_manifest = True
+ break
+ if not in_manifest:
+ raise ValueError('%s is not in the manifest. (%s)'%(href, path))
+ self.spine.append(i)
+
- self._commit(doc)
+
+ def set_toc(self, toc):
+ '''
+ Set the toc. You must call L{create_spine} before calling this
+ method.
+ @param toc: A Table of Contents
+ @type toc: L{TOC}
+ '''
+ self.toc = toc
+
+ def render(self, opf_stream, ncx_stream=None):
+ from libprs500.resources import opf_template
+ from genshi.template import MarkupTemplate
+ template = MarkupTemplate(opf_template)
+ opf = template.generate(__appname__=__appname__, mi=self).render('xml')
+ opf_stream.write(opf)
+ toc = getattr(self, 'toc', None)
+ if toc is not None and ncx_stream is not None:
+ toc.render(ncx_stream, self.application_id)
def option_parser():
return get_parser('opf')
@@ -649,7 +621,7 @@ def main(args=sys.argv):
if len(args) != 2:
parser.print_help()
return 1
- mi = OPFReader(open(args[1], 'rb'))
+ mi = MetaInformation(OPFReader(open(args[1], 'rb')))
if opts.title is not None:
mi.title = opts.title.replace('&', '&').replace('<', '<').replace('>', '>')
if opts.authors is not None:
@@ -660,7 +632,8 @@ def main(args=sys.argv):
if opts.comment is not None:
mi.comments = opts.comment.replace('&', '&').replace('<', '<').replace('>', '>')
print mi
- mi.write(open(args[1], 'wb'))
+ mo = OPFCreator(os.getcwd(), mi)
+ mo.render(open(args[1], 'wb'))
return 0
if __name__ == '__main__':
diff --git a/src/libprs500/ebooks/metadata/opf.xml b/src/libprs500/ebooks/metadata/opf.xml
new file mode 100644
index 0000000000..a847bae2c8
--- /dev/null
+++ b/src/libprs500/ebooks/metadata/opf.xml
@@ -0,0 +1,36 @@
+
+
+
+ ${mi.title}
+ ${author}
+ ${mi.application_id}
+
+ ${mi.category}
+ ${mi.comments}
+ ${mi.publisher}
+ ${mi.isbn}
+ ${mi.series}
+ ${mi.series_index}
+ ${mi.rating}
+ ${tag}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/libprs500/ebooks/metadata/toc.py b/src/libprs500/ebooks/metadata/toc.py
new file mode 100644
index 0000000000..89aaadbe11
--- /dev/null
+++ b/src/libprs500/ebooks/metadata/toc.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+import os, glob
+from urlparse import urlparse
+from urllib import unquote
+
+from libprs500 import __appname__
+from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
+
+class NCXSoup(BeautifulStoneSoup):
+
+ NESTABLE_TAGS = {'navpoint':[]}
+
+ def __init__(self, raw):
+ BeautifulStoneSoup.__init__(self, raw,
+ convertEntities=BeautifulSoup.HTML_ENTITIES,
+ selfClosingTags=['meta', 'content'])
+
+class TOC(list):
+
+ def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=1,
+ base_path=os.getcwd()):
+ self.href = href
+ self.fragment = fragment
+ self.text = text
+ self.parent = parent
+ self.base_path = base_path
+ self.play_order = play_order
+
+ def add_item(self, href, fragment, text):
+ self.append(TOC(href=href, fragment=fragment, text=text, parent=self, base_path=self.base_path))
+ return self[-1]
+
+ def top_level_items(self):
+ for item in self:
+ if item.text is not None:
+ yield item
+
+ def depth(self):
+ depth = 1
+ for obj in self:
+ c = obj.depth()
+ if c > depth - 1:
+ depth = c + 1
+ return depth
+
+ @apply
+ def abspath():
+ doc='Return the file this toc entry points to as a absolute path to a file on the system.'
+ def fget(self):
+ path = self.href.replace('/', os.sep)
+ if not os.path.isabs(path):
+ path = os.path.join(self.base_path, path)
+ return path
+ return property(fget=fget, doc=doc)
+
+ def read_from_opf(self, opfreader):
+ toc = opfreader.soup.find('spine', toc=True)
+ if toc is not None:
+ toc = toc['toc']
+ if toc is None:
+ try:
+ toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
+ except:
+ for item in opfreader.manifest:
+ if 'toc' in item.href.lower():
+ toc = item.href
+ break
+
+ if toc is not None:
+ if toc.lower() != 'ncx':
+ toc = urlparse(unquote(toc))[2]
+ toc = toc.replace('/', os.sep)
+ if not os.path.isabs(toc):
+ toc = os.path.join(self.base_path, toc)
+ try:
+ if not os.path.exists(toc):
+ bn = os.path.basename(toc)
+ bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
+ toc = os.path.join(os.path.dirname(toc), bn)
+
+ self.read_html_toc(toc, self.base_path)
+ except:
+ pass
+ else:
+ cwd = os.path.abspath(self.base_path)
+ m = glob.glob(os.path.join(cwd, '*.ncx'))
+ if m:
+ toc = m[0]
+ self.read_ncx_toc(toc)
+
+ def read_ncx_toc(self, toc):
+ self.base_path = os.path.dirname(toc)
+ soup = NCXSoup(open(toc, 'rb').read())
+
+ def process_navpoint(np, dest):
+ play_order = np.get('playOrder', 1)
+ href = fragment = text = None
+ nl = np.find('navlabel')
+ if nl is not None:
+ text = u''
+ for txt in nl.findAll('text'):
+ text += ''.join([unicode(s) for s in txt.findAll(text=True)])
+ content = np.find('content')
+ if content is None or not content.has_key('src') or not text:
+ return
+
+ purl = urlparse(unquote(content['src']))
+ href, fragment = purl[2], purl[5]
+ nd = dest.add_item(href, fragment, text)
+ nd.play_order = play_order
+
+ for c in np:
+ if getattr(c, 'name', None) == 'navpoint':
+ process_navpoint(c, nd)
+
+ nm = soup.find('navmap')
+ for elem in nm:
+ if getattr(elem, 'name', None) == 'navpoint':
+ process_navpoint(elem, self)
+
+
+ def read_html_toc(self, toc):
+ self.base_path = os.path.dirname(toc)
+ soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
+ for a in soup.findAll('a'):
+ if not a.has_key('href'):
+ continue
+ purl = urlparse(unquote(a['href']))
+ href, fragment = purl[2], purl[5]
+ txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
+ self.add_item(href, fragment, txt)
+
+ def render(self, stream, uid):
+ from libprs500.resources import ncx_template
+ from genshi.template import MarkupTemplate
+ doctype = ('ncx', "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd")
+ template = MarkupTemplate(ncx_template)
+ raw = template.generate(uid=uid, toc=self, __appname__=__appname__)
+ raw = raw.render(doctype=doctype)
+ stream.write(raw)
\ No newline at end of file
diff --git a/src/libprs500/ebooks/mobi/reader.py b/src/libprs500/ebooks/mobi/reader.py
index 71a0c3f026..c89daa1ae8 100644
--- a/src/libprs500/ebooks/mobi/reader.py
+++ b/src/libprs500/ebooks/mobi/reader.py
@@ -186,11 +186,11 @@ class MobiReader(object):
if self.book_header.exth is not None:
opf = self.create_opf(htmlfile)
- opf.write(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))
+ opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))
def create_opf(self, htmlfile):
mi = self.book_header.exth.mi
- opf = OPFCreator(mi)
+ opf = OPFCreator(os.path.dirname(htmlfile), mi)
if hasattr(self.book_header.exth, 'cover_offset'):
opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')]
@@ -333,4 +333,4 @@ def main(args=sys.argv):
return 0
if __name__ == '__main__':
- sys.exit(main())
\ No newline at end of file
+ sys.exit(main())
diff --git a/src/libprs500/library/database.py b/src/libprs500/library/database.py
index e7d67dba91..6d8c3f4be8 100644
--- a/src/libprs500/library/database.py
+++ b/src/libprs500/library/database.py
@@ -1340,7 +1340,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
mi.rating = self.rating(idx, index_is_id=index_is_id)
mi.isbn = self.isbn(idx, index_is_id=index_is_id)
id = idx if index_is_id else self.id(idx)
- mi.libprs_id = id
+ mi.application_id = id
return mi
def vacuum(self):
@@ -1382,7 +1382,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
name += '_'+id
base = dir if single_dir else tpath
- mi = OPFCreator(self.get_metadata(idx, index_is_id=index_is_id))
+ mi = OPFCreator(base, self.get_metadata(idx, index_is_id=index_is_id))
cover = self.cover(idx, index_is_id=index_is_id)
if cover is not None:
cname = name + '.jpg'
@@ -1390,7 +1390,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
open(cpath, 'wb').write(cover)
mi.cover = cname
f = open(os.path.join(base, name+'.opf'), 'wb')
- mi.write(f)
+ mi.render(f)
f.close()
for fmt in self.formats(idx, index_is_id=index_is_id).split(','):
diff --git a/src/libprs500/linux.py b/src/libprs500/linux.py
index 0d06482aee..de3846f4a5 100644
--- a/src/libprs500/linux.py
+++ b/src/libprs500/linux.py
@@ -44,6 +44,7 @@ entry_points = {
'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main',
'web2disk = libprs500.web.fetch.simple:main',
'feeds2disk = libprs500.web.feeds.main:main',
+ 'feeds2lrf = libprs500.ebooks.lrf.feeds.convert_from:main',
'web2lrf = libprs500.ebooks.lrf.web.convert_from:main',
'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main',
'mobi2lrf = libprs500.ebooks.lrf.mobi.convert_from:main',
diff --git a/src/libprs500/terminfo.py b/src/libprs500/terminfo.py
index fca163d988..2114f8ad7f 100644
--- a/src/libprs500/terminfo.py
+++ b/src/libprs500/terminfo.py
@@ -201,6 +201,7 @@ class ProgressBar:
self.term.BOL + self.term.UP + self.term.CLEAR_EOL +
(self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) +
self.term.CLEAR_EOL + msg)
+ sys.stdout.flush()
def clear(self):
if not self.cleared:
diff --git a/src/libprs500/web/feeds/news.py b/src/libprs500/web/feeds/news.py
index 46c5549598..98e2405c72 100644
--- a/src/libprs500/web/feeds/news.py
+++ b/src/libprs500/web/feeds/news.py
@@ -17,12 +17,13 @@
The backend to parse feeds and create HTML that can then be converted
to an ebook.
'''
-import logging, os, cStringIO, time, itertools, traceback
+import logging, os, cStringIO, time, traceback
import urlparse
from libprs500 import browser, __appname__
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.metadata.opf import OPFCreator
+from libprs500.ebooks.metadata.toc import TOC
from libprs500.ebooks.metadata import MetaInformation
from libprs500.web.feeds import feed_from_xml, templates
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
@@ -94,6 +95,9 @@ class BasicNewsRecipe(object):
#: using cp1252. If None, try to detect the encoding.
encoding = None
+ #: Specify any extra CSS that should be added to downloaded HTML files
+ extra_css = None
+
#: List of regular expressions that determines which links to follow
#: If empty, it is ignored.
#: Only one of L{match_regexps} or L{filter_regexps} should be defined
@@ -276,8 +280,9 @@ class BasicNewsRecipe(object):
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
- 'preprocess_html', 'remove_tags_after', 'postprocess_html'):
+ 'preprocess_html', 'remove_tags_after'):
setattr(self.web2disk_options, extra, getattr(self, extra))
+ self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]
if self.delay > 0:
self.simultaneous_downloads = 1
@@ -288,6 +293,14 @@ class BasicNewsRecipe(object):
self.failed_downloads = []
self.partial_failures = []
+ def _postprocess_html(self, soup):
+ if self.extra_css is not None:
+ head = soup.find('head')
+ if head:
+ style = BeautifulSoup(u''%self.extra_css).find('style')
+ head.insert(len(head.contents), style)
+ return soup
+
def download(self):
'''
Download and pre-process all articles from the feeds in this recipe.
@@ -297,6 +310,7 @@ class BasicNewsRecipe(object):
@rtype: string
'''
self.report_progress(0, _('Trying to download cover...'))
+
self.download_cover()
res = self.build_index()
self.cleanup()
@@ -362,7 +376,7 @@ class BasicNewsRecipe(object):
fetcher.current_dir = dir
fetcher.show_progress = False
res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
- if not res:
+ if not res or not os.path.exists(res):
raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
return res, path, failures
@@ -446,28 +460,44 @@ class BasicNewsRecipe(object):
if dir is None:
dir = self.output_dir
mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
- opf = OPFCreator(mi)
opf_path = os.path.join(dir, 'index.opf')
+ ncx_path = os.path.join(dir, 'index.ncx')
+ opf = OPFCreator(dir, mi)
+
+ manifest = ['feed_%d'%i for i in range(len(feeds))]
+ manifest.append('index.html')
cpath = getattr(self, 'cover_path', None)
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
+ manifest.append(cpath)
+ opf.create_manifest_from_files_in(manifest)
entries = ['index.html']
+ toc = TOC(base_path=dir)
for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i)
+ feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
for j, a in enumerate(f):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(i, j)
entries.append('%sindex.html'%adir)
+ feed.add_item('%sindex.html'%adir, None, a.title if a.title else 'Untitled article')
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
relp = sp[len(prefix):]
entries.append(relp.replace(os.sep, '/'))
- opf.create_manifest(itertools.izip(entries, itertools.repeat('text/html')))
opf.create_spine(entries)
- opf.write(open(opf_path, 'wb'))
+ opf.set_toc(toc)
+
+ for i, f in enumerate(feeds):
+
+ for j, a in enumerate(f):
+ if getattr(a, 'downloaded', False):
+ adir = 'feed_%d/article_%d/'%(i, j)
+
+ opf.render(open(opf_path, 'wb'), open(ncx_path, 'wb'))
def article_downloaded(self, request, result):
@@ -516,7 +546,7 @@ class BasicNewsRecipe(object):
title, url = None, obj
else:
title, url = obj
- self.report_progress(0, _('Fetching feed %s...'%(title if title else url)))
+ self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url))
parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
title=title,
oldest_article=self.oldest_article,
diff --git a/src/libprs500/web/feeds/recipes/newsweek.py b/src/libprs500/web/feeds/recipes/newsweek.py
index 88ca183b08..0313e52f33 100644
--- a/src/libprs500/web/feeds/recipes/newsweek.py
+++ b/src/libprs500/web/feeds/recipes/newsweek.py
@@ -33,15 +33,15 @@ class Newsweek(BasicNewsRecipe):
('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
'http://feeds.newsweek.com/newsweek/Columnists/ChristopherDickey',
- 'http://feeds.newsweek.com/newsweek/Columnists/FareedZakaria',
+ 'http://feeds.newsweek.com/newsweek/Columnists/FareedZakaria',
('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
('Society', 'http://feeds.newsweek.com/newsweek/society'),
('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
- 'http://feeds.newsweek.com/newsweek/columnists/GeorgeFWill',
+ 'http://feeds.newsweek.com/newsweek/columnists/GeorgeFWill',
'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
]
- extra_css = '#content { font:serif,120%; }'
+ extra_css = '#content { font:serif 1.2em; }'
keep_only_tags = [dict(name='div', id='content')]
remove_tags = [
@@ -55,8 +55,8 @@ class Newsweek(BasicNewsRecipe):
match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
# For testing
- #feeds = feeds[:2]
- #max_articles_per_feed = 1
+ #feeds = feeds[3:5]
+ #max_articles_per_feed = 2
@@ -91,4 +91,4 @@ class Newsweek(BasicNewsRecipe):
img = soup.find(alt='Cover')
if img is not None and img.has_key('src'):
small = img['src']
- return small.replace('coversmall', 'coverlarge')
\ No newline at end of file
+ return small.replace('coversmall', 'coverlarge')
diff --git a/src/libprs500/web/feeds/templates.py b/src/libprs500/web/feeds/templates.py
index dd12a1b2ff..1d1becbb51 100644
--- a/src/libprs500/web/feeds/templates.py
+++ b/src/libprs500/web/feeds/templates.py
@@ -57,16 +57,17 @@ class NavBarTemplate(Template):
@@ -159,4 +160,4 @@ class FeedTemplate(Template):
''')
def generate(self, feed):
- return Template.generate(self, feed=feed)
\ No newline at end of file
+ return Template.generate(self, feed=feed)
diff --git a/src/libprs500/web/fetch/simple.py b/src/libprs500/web/fetch/simple.py
index 644f5bc241..b6622631e9 100644
--- a/src/libprs500/web/fetch/simple.py
+++ b/src/libprs500/web/fetch/simple.py
@@ -38,9 +38,9 @@ def basename(url):
def save_soup(soup, target):
nm = Tag(soup, '')
- for meta in soup.find('meta', content=True):
- if 'charset' in meta['content']:
- meta.replaceWith(nm)
+ meta = soup.find('meta', content=True)
+ if meta and 'charset' in meta['content']:
+ meta.replaceWith(nm)
f = codecs.open(target, 'w', 'utf-8')
f.write(unicode(soup))
f.close()
@@ -85,7 +85,7 @@ class RecursiveFetcher(object):
self.remove_tags_after = getattr(options, 'remove_tags_after', None)
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
- self.postprocess_html_ext= getattr(options, 'postprocess_html', lambda soup: soup)
+ self.postprocess_html_ext= getattr(options, 'postprocess_html', [])
self.download_stylesheets = not options.no_stylesheets
self.show_progress = True
self.failed_links = []
@@ -336,7 +336,9 @@ class RecursiveFetcher(object):
self.process_return_links(soup, iurl)
self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
- save_soup(self.postprocess_html_ext(soup), res)
+ for func in self.postprocess_html_ext:
+ soup = func(soup)
+ save_soup(soup, res)
self.localize_link(tag, 'href', res)
except Exception, err: