From 6982652f923b8c0b1bfe9e69ba816733ab9fba21 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 14 Mar 2008 19:25:48 +0000 Subject: [PATCH] Refactored OPF creation code. Implemented Table of Contents support in feeds2disk. --- Makefile | 6 +- resources.py | 39 +++ src/libprs500/ebooks/lrf/html/convert_from.py | 17 +- src/libprs500/ebooks/lrf/html/convert_to.py | 4 +- src/libprs500/ebooks/metadata/__init__.py | 16 +- src/libprs500/ebooks/metadata/meta.py | 6 +- src/libprs500/ebooks/metadata/ncx.xml | 27 ++ src/libprs500/ebooks/metadata/opf.py | 293 ++++++++---------- src/libprs500/ebooks/metadata/opf.xml | 36 +++ src/libprs500/ebooks/metadata/toc.py | 154 +++++++++ src/libprs500/ebooks/mobi/reader.py | 6 +- src/libprs500/library/database.py | 6 +- src/libprs500/linux.py | 1 + src/libprs500/terminfo.py | 1 + src/libprs500/web/feeds/news.py | 44 ++- src/libprs500/web/feeds/recipes/newsweek.py | 12 +- src/libprs500/web/feeds/templates.py | 11 +- src/libprs500/web/fetch/simple.py | 12 +- 18 files changed, 482 insertions(+), 209 deletions(-) create mode 100644 resources.py create mode 100644 src/libprs500/ebooks/metadata/ncx.xml create mode 100644 src/libprs500/ebooks/metadata/opf.xml create mode 100644 src/libprs500/ebooks/metadata/toc.py diff --git a/Makefile b/Makefile index c3514fb0de..4b920c6a39 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ PYTHON = python -all : gui2 translations +all : gui2 translations resources clean : cd src/libprs500/gui2 && ${PYTHON} make.py clean @@ -13,4 +13,8 @@ test : gui2 translations : cd src/libprs500 && ${PYTHON} translations/__init__.py + +resources: + ${PYTHON} resources.py + diff --git a/resources.py b/resources.py new file mode 100644 index 0000000000..cf5cf58253 --- /dev/null +++ b/resources.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python + +## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +''' +Compile resource files. +''' +import os, sys +sys.path.insert(1, os.path.join(os.getcwd(), 'src')) +from libprs500 import __appname__ + +RESOURCES = dict( + opf_template = '%p/ebooks/metadata/opf.xml', + ncx_template = '%p/ebooks/metadata/ncx.xml', + ) + +def main(args=sys.argv): + data = '' + for key, value in RESOURCES.items(): + path = value.replace('%p', 'src'+os.sep+__appname__) + bytes = repr(open(path, 'rb').read()) + data += key + ' = ' + bytes + '\n\n' + open('src'+os.sep+__appname__+os.sep+'/resources.py', 'wb').write(data) + return 0 + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index eb4149c521..6ec3f06c53 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -60,6 +60,8 @@ def update_css(ncss, ocss): def munge_paths(basepath, url): purl = urlparse(unquote(url),) path, fragment = purl[2], purl[5] + if path: + path = path.replace('/', os.sep) if not path: path = basepath elif not os.path.isabs(path): @@ -223,7 +225,6 @@ class HTMLConverter(object): self.extra_toc_entries = [] #: TOC entries gleaned from semantic information self.image_memory = [] self.id_counter = 0 - self.toc_from_metadata = False #: If True means that the toc has been populated from metadata self.unused_target_blocks = [] #: Used to remove extra TextBlocks self.link_level = 0 #: Current link level self.memory = [] #: Used to ensure that duplicate CSS unhandled erros are not reported @@ -543,7 +544,7 @@ class HTMLConverter(object): path, fragment = munge_paths(self.target_prefix, tag['href']) return {'para':para, 'text':text, 'path':os.path.abspath(path), - 'fragment':fragment, 'in toc': (self.link_level == 0 and not self.toc_from_metadata)} + 'fragment':fragment, 'in toc': (self.link_level == 0 and not self.use_spine)} def get_text(self, tag, limit=None): @@ -637,13 +638,12 @@ class HTMLConverter(object): return outside_links def create_toc(self, toc): - for (path, fragment, txt) in toc: - ascii_text = txt.encode('ascii', 'ignore') # Bug in SONY LRF renderer - self.toc_from_metadata = True - if not fragment and path in self.tops: - self.book.addTocEntry(ascii_text, self.tops[path]) + for item in toc.top_level_items(): + ascii_text = item.text.encode('ascii', 'ignore') # Bug in SONY LRF renderer + if not item.fragment and item.abspath in self.tops: + self.book.addTocEntry(ascii_text, self.tops[item.abspath]) else: - url = path+fragment + url = item.abspath+item.fragment if url in self.targets: self.book.addTocEntry(ascii_text, self.targets[url]) @@ -1846,6 +1846,7 @@ def try_opf(path, options, logger): options.cover = None cover = opf.cover if cover: + cover = cover.replace('/', os.sep) if not os.path.isabs(cover): cover = os.path.join(dirpath, cover) if os.access(cover, os.R_OK): diff --git a/src/libprs500/ebooks/lrf/html/convert_to.py b/src/libprs500/ebooks/lrf/html/convert_to.py index 0e42a4d5b7..242b43d0df 100644 --- a/src/libprs500/ebooks/lrf/html/convert_to.py +++ b/src/libprs500/ebooks/lrf/html/convert_to.py @@ -65,7 +65,7 @@ class LRFConverter(object): def create_metadata(self): self.logger.info('Reading metadata...') mi = get_metadata(self.lrf) - self.opf = OPFCreator(mi) + self.opf = OPFCreator(self.output_dir, mi) def create_page_styles(self): self.page_css = '' @@ -126,4 +126,4 @@ def main(args=sys.argv): if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file + sys.exit(main()) diff --git a/src/libprs500/ebooks/metadata/__init__.py b/src/libprs500/ebooks/metadata/__init__.py index dbd1886f68..544bb6c3d0 100644 --- a/src/libprs500/ebooks/metadata/__init__.py +++ b/src/libprs500/ebooks/metadata/__init__.py @@ -45,12 +45,13 @@ class MetaInformation(object): ans = MetaInformation(mi.title, mi.authors) for attr in ('author_sort', 'title_sort', 'comments', 'category', 'publisher', 'series', 'series_index', 'rating', - 'isbn', 'tags', 'cover_data', 'libprs_id'): + 'isbn', 'tags', 'cover_data', 'application_id', + 'manifest', 'spine', 'toc', 'cover'): if hasattr(mi, attr): setattr(ans, attr, getattr(mi, attr)) - def __init__(self, title, authors): + def __init__(self, title, authors=['Unknown']): ''' @param title: title or "Unknown" or a MetaInformation object @param authors: List of strings or [] @@ -76,8 +77,11 @@ class MetaInformation(object): self.isbn = None if not mi else mi.isbn self.tags = [] if not mi else mi.tags self.cover_data = mi.cover_data if (mi and hasattr(mi, 'cover_data')) else (None, None) - self.libprs_id = mi.libprs_id if (mi and hasattr(mi, 'libprs_id')) else None - + self.application_id = mi.application_id if (mi and hasattr(mi, 'application_id')) else None + self.manifest = getattr(mi, 'manifest', None) + self.toc = getattr(mi, 'toc', None) + self.spine = getattr(mi, 'spine', None) + self.cover = getattr(mi, 'cover', None) def smart_update(self, mi): ''' @@ -92,7 +96,7 @@ class MetaInformation(object): for attr in ('author_sort', 'title_sort', 'comments', 'category', 'publisher', 'series', 'series_index', 'rating', - 'isbn', 'libprs_id'): + 'isbn', 'application_id', 'manifest', 'spine', 'toc', 'cover'): if hasattr(mi, attr): val = getattr(mi, attr) if val is not None: @@ -117,4 +121,4 @@ class MetaInformation(object): return ans.strip() def __nonzero__(self): - return bool(self.title or self.author or self.comments or self.category) \ No newline at end of file + return bool(self.title or self.author or self.comments or self.category) diff --git a/src/libprs500/ebooks/metadata/meta.py b/src/libprs500/ebooks/metadata/meta.py index 8e2f3e5524..ed78f39a14 100644 --- a/src/libprs500/ebooks/metadata/meta.py +++ b/src/libprs500/ebooks/metadata/meta.py @@ -51,7 +51,7 @@ def metadata_from_formats(formats): ext = path_to_ext(path) stream = open(path, 'rb') mi.smart_update(get_metadata(stream, stream_type=ext, use_libprs_metadata=True)) - if getattr(mi, 'libprs_id', None) is not None: + if getattr(mi, 'application_id', None) is not None: return mi return mi @@ -69,7 +69,7 @@ def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False): if os.access(c, os.R_OK): opf = opf_metadata(os.path.abspath(c)) - if use_libprs_metadata and getattr(opf, 'libprs_id', None) is not None: + if use_libprs_metadata and getattr(opf, 'application_id', None) is not None: return opf try: @@ -147,7 +147,7 @@ def opf_metadata(opfpath): f = open(opfpath, 'rb') opf = OPFReader(f, os.path.dirname(opfpath)) try: - if opf.libprs_id is not None: + if opf.application_id is not None: mi = MetaInformation(opf, None) if hasattr(opf, 'cover') and opf.cover: cpath = os.path.join(os.path.dirname(opfpath), opf.cover) diff --git a/src/libprs500/ebooks/metadata/ncx.xml b/src/libprs500/ebooks/metadata/ncx.xml new file mode 100644 index 0000000000..7bcb9ac479 --- /dev/null +++ b/src/libprs500/ebooks/metadata/ncx.xml @@ -0,0 +1,27 @@ + + + + + + + + + Table of Contents + + + ${'%*s'%(4*level,'')} + ${'%*s'%(4*level,'')} + ${'%*s'%(4*level,'')}${np.text} + ${'%*s'%(4*level,'')} + ${'%*s'%(4*level,'')} + ${navpoint(np2, level+1)} + ${'%*s'%(4*level,'')} + + + ${navpoint(np, 0)} + + \ No newline at end of file diff --git a/src/libprs500/ebooks/metadata/opf.py b/src/libprs500/ebooks/metadata/opf.py index 833f8ae51f..c1d88706da 100644 --- a/src/libprs500/ebooks/metadata/opf.py +++ b/src/libprs500/ebooks/metadata/opf.py @@ -12,18 +12,21 @@ ## You should have received a copy of the GNU General Public License along ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +import uuid '''Read/Write metadata from Open Packaging Format (.opf) files.''' -import sys, re, os, glob +import sys, re, os, mimetypes from urllib import unquote from urlparse import urlparse import xml.dom.minidom as dom from itertools import repeat +from libprs500 import __appname__ from libprs500.ebooks.metadata import MetaInformation -from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup +from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup from libprs500.ebooks.lrf import entity_to_unicode from libprs500.ebooks.metadata import get_parser +from libprs500.ebooks.metadata.toc import TOC class ManifestItem(object): def __init__(self, item, cwd): @@ -40,6 +43,14 @@ class ManifestItem(object): def __unicode__(self): return u''%(self.id, self.href, self.media_type) + + def __getitem__(self, index): + if index == 0: + return self.href + if index == 1: + return self.media_type + raise IndexError('%d out of bounds.'%index) + class Manifest(list): @@ -81,85 +92,11 @@ class Spine(object): def items(self): for i in self.linear_ids + self.nonlinear_ids: yield self.manifest.item(i) + + def __iter__(self): + for i in self.linear_ids + self.nonlinear_ids: + yield i -class TOC(list): - - def __init__(self, opfreader, cwd): - self.toc = None - toc = opfreader.soup.find('spine', toc=True) - if toc is not None: - toc = toc['toc'] - if toc is None: - try: - toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href'] - except: - for item in opfreader.manifest: - if 'toc' in item.href.lower(): - toc = item.href - break - - if toc is not None: - if toc.lower() != 'ncx': - toc = urlparse(unquote(toc))[2] - if not os.path.isabs(toc): - toc = os.path.join(cwd, toc) - try: - if not os.path.exists(toc): - bn = os.path.basename(toc) - bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files - toc = os.path.join(os.path.dirname(toc), bn) - - self.read_html_toc(toc, cwd) - self.toc = toc - except: - pass - else: - cwd = os.path.abspath(cwd) - m = glob.glob(os.path.join(cwd, '*.ncx')) - if m: - toc = m[0] - try: - self.read_ncx_toc(toc) - self.toc = toc - except: - raise - pass - - def read_ncx_toc(self, toc): - bdir = os.path.dirname(toc) - soup = BeautifulStoneSoup(open(toc, 'rb').read(), - convertEntities=BeautifulSoup.HTML_ENTITIES) - elems = soup.findAll('navpoint') - elems.sort(cmp=lambda x, y: cmp(int(x['playorder']), int(y['playorder']))) - - for elem in elems: - txt = u'' - for nl in elem.findAll('navlabel'): - for text in nl.findAll('text'): - txt += ''.join([unicode(s) for s in text.findAll(text=True)]) - - content = elem.find('content') - if content is None or not content.has_key('src') or not txt: - continue - - purl = urlparse(unquote(content['src'])) - href, fragment = purl[2], purl[5] - if not os.path.isabs(href): - href = os.path.join(bdir, href) - self.append((href, fragment, txt)) - - - def read_html_toc(self, toc, cwd): - soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES) - for a in soup.findAll('a'): - if not a.has_key('href'): - continue - purl = urlparse(unquote(a['href'])) - href, fragment = purl[2], purl[5] - if not os.path.isabs(href): - href = os.path.join(cwd, href) - txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)]) - self.append((href, fragment, txt)) class standard_field(object): @@ -178,21 +115,21 @@ class OPF(MetaInformation): MIMETYPE = 'application/oebps-package+xml' ENTITY_PATTERN = re.compile(r'&(\S+?);') - uid = standard_field('uid') - libprs_id = standard_field('libprs_id') - title = standard_field('title') - authors = standard_field('authors') - title_sort = standard_field('title_sort') - author_sort = standard_field('author_sort') - comments = standard_field('comments') - category = standard_field('category') - publisher = standard_field('publisher') - isbn = standard_field('isbn') - cover = standard_field('cover') - series = standard_field('series') - series_index = standard_field('series_index') - rating = standard_field('rating') - tags = standard_field('tags') + uid = standard_field('uid') + application_id = standard_field('application_id') + title = standard_field('title') + authors = standard_field('authors') + title_sort = standard_field('title_sort') + author_sort = standard_field('author_sort') + comments = standard_field('comments') + category = standard_field('category') + publisher = standard_field('publisher') + isbn = standard_field('isbn') + cover = standard_field('cover') + series = standard_field('series') + series_index = standard_field('series_index') + rating = standard_field('rating') + tags = standard_field('tags') HEADER = '''\ @@ -207,14 +144,14 @@ class OPF(MetaInformation): if not hasattr(self, 'soup'): self.soup = BeautifulStoneSoup(u'''\ %s - + -'''%self.HEADER) +'''%(__appname__, self.HEADER)) def _commit(self, doc): self.soup = BeautifulStoneSoup(doc.toxml('utf-8'), fromEncoding='utf-8') @@ -403,15 +340,15 @@ class OPF(MetaInformation): self._set_metadata_element('dc:identifier', isbn, [('scheme', 'ISBN')], replace=True) - def get_libprs_id(self): + def get_application_id(self): for item in self.soup.package.metadata.findAll('dc:identifier'): - if item.has_key('scheme') and item['scheme'] == 'libprs': + if item.has_key('scheme') and item['scheme'] == __appname__: return str(item.string).strip() return None - def set_libprs_id(self, val): + def set_application_id(self, val): if val: - self._set_metadata_element('dc:identifier', str(val), [('scheme', 'libprs'), ('id', 'libprs_id')], + self._set_metadata_element('dc:identifier', str(val), [('scheme', __appname__), ('id', __appname__+'_id')], replace=True) def get_cover(self): @@ -564,61 +501,72 @@ class OPFReader(OPF): stream.close() self.manifest = Manifest(self.soup, dir) self.spine = Spine(self.soup, self.manifest) - self.toc = TOC(self, dir) + self.toc = TOC() + self.toc.read_from_opf(self) self.cover_data = (None, None) -class OPFCreator(OPF): +class OPFCreator(MetaInformation): + + def __init__(self, base_path, *args, **kwargs): + ''' + Initialize. + @param base_path: An absolute path to the directory in which this OPF file + will eventually be. This is used by the L{create_manifest} method + to convert paths to files into relative paths. + ''' + MetaInformation.__init__(self, *args, **kwargs) + self.base_path = os.path.abspath(base_path) + if self.application_id is None: + self.application_id = str(uuid.uuid4()) + self.toc = None + if isinstance(self.manifest, Manifest): + manifest = [] + for path, mt in self.manifest: + if not path.startswith(self.base_path): + raise ValueError('Inavlid manifest item %s for base path %s'%(path, self.base_path)) + path = path[len(self.base_path)+1:] + manifest.append((path, mt)) + self.manifest = manifest - def __init__(self, mi): - self.title = mi.title - self.authors = mi.authors - if mi.category: - self.category = mi.category - if mi.comments: - self.comments = mi.comments - if mi.publisher: - self.publisher = mi.publisher - if mi.rating: - self.rating = mi.rating - if mi.series: - self.series = mi.series - if mi.series_index: - self.series_index = mi.series_index - if mi.tags: - self.tags = mi.tags - if mi.isbn: - self.isbn = mi.isbn - self.cover_data = mi.cover_data - if hasattr(mi, 'libprs_id'): - self.libprs_id = mi.libprs_id - if hasattr(mi, 'uid'): - self.uid = mi.uid - def create_manifest(self, entries): ''' Create - @param entries: List of (URL, mime-type) + @param entries: List of (path, mime-type) + @param base_path: It is used to convert each path into a path relative to itself @type entries: list of 2-tuples ''' - doc = dom.parseString(self.soup.__str__('UTF-8').strip()) - package = doc.documentElement - manifest = doc.createElement('manifest') - package.appendChild(manifest) - package.appendChild(doc.createTextNode('\n')) - - self.href_map = {} - - for href, media_type in entries: - item = doc.createElement('item') - item.setAttribute('href', href) - item.setAttribute('media-type', media_type) - self.href_map[href] = str(hash(href)) - item.setAttribute('id', self.href_map[href]) - manifest.appendChild(item) - manifest.appendChild(doc.createTextNode('\n')) - - self._commit(doc) + rentries = [] + base_path = self.base_path + mimetypes.init() + for href, mt in entries: + href = os.path.abspath(href) + if not href.startswith(base_path): + raise ValueError('OPF should only refer to files below it. %s is above %s'%(href, base_path)) + href = href[len(base_path)+1:].replace(os.sep, '/') + if not mt: + mt = mimetypes.guess_type(href)[0] + if not mt: + mt = '' + rentries.append((href, mt)) + self.manifest = rentries + + def create_manifest_from_files_in(self, files_and_dirs): + entries = [] + + def dodir(dir): + for root, dirs, files in os.walk(dir): + for name in files: + path = os.path.join(root, name) + entries.append((path, None)) + + for i in files_and_dirs: + if os.path.isdir(i): + dodir(i) + else: + entries.append((i, None)) + + self.create_manifest(entries) def create_spine(self, entries): ''' @@ -626,19 +574,43 @@ class OPFCreator(OPF): @param: List of paths @type param: list of strings ''' - doc = dom.parseString(self.soup.__str__('UTF-8').strip()) - package = doc.documentElement - spine = doc.createElement('spine') - package.appendChild(spine) - package.appendChild(doc.createTextNode('\n')) + self.spine = [] - for href in entries: - itemref = doc.createElement('itemref') - itemref.setAttribute('idref', self.href_map[href]) - spine.appendChild(itemref) - spine.appendChild(doc.createTextNode('\n')) + for path in entries: + if not os.path.isabs(path): + path = os.path.join(self.base_path, path) + if not path.startswith(self.base_path): + raise ValueError('Invalid entry %s for base path %s'%(path, self.base_path)) + href = path[len(self.base_path)+1:] + in_manifest = False + for i, m in enumerate(self.manifest): + if m[0] == href: + in_manifest = True + break + if not in_manifest: + raise ValueError('%s is not in the manifest. (%s)'%(href, path)) + self.spine.append(i) + - self._commit(doc) + + def set_toc(self, toc): + ''' + Set the toc. You must call L{create_spine} before calling this + method. + @param toc: A Table of Contents + @type toc: L{TOC} + ''' + self.toc = toc + + def render(self, opf_stream, ncx_stream=None): + from libprs500.resources import opf_template + from genshi.template import MarkupTemplate + template = MarkupTemplate(opf_template) + opf = template.generate(__appname__=__appname__, mi=self).render('xml') + opf_stream.write(opf) + toc = getattr(self, 'toc', None) + if toc is not None and ncx_stream is not None: + toc.render(ncx_stream, self.application_id) def option_parser(): return get_parser('opf') @@ -649,7 +621,7 @@ def main(args=sys.argv): if len(args) != 2: parser.print_help() return 1 - mi = OPFReader(open(args[1], 'rb')) + mi = MetaInformation(OPFReader(open(args[1], 'rb'))) if opts.title is not None: mi.title = opts.title.replace('&', '&').replace('<', '<').replace('>', '>') if opts.authors is not None: @@ -660,7 +632,8 @@ def main(args=sys.argv): if opts.comment is not None: mi.comments = opts.comment.replace('&', '&').replace('<', '<').replace('>', '>') print mi - mi.write(open(args[1], 'wb')) + mo = OPFCreator(os.getcwd(), mi) + mo.render(open(args[1], 'wb')) return 0 if __name__ == '__main__': diff --git a/src/libprs500/ebooks/metadata/opf.xml b/src/libprs500/ebooks/metadata/opf.xml new file mode 100644 index 0000000000..a847bae2c8 --- /dev/null +++ b/src/libprs500/ebooks/metadata/opf.xml @@ -0,0 +1,36 @@ + + + + ${mi.title} + ${author} + ${mi.application_id} + + ${mi.category} + ${mi.comments} + ${mi.publisher} + ${mi.isbn} + ${mi.series} + ${mi.series_index} + ${mi.rating} + ${tag} + + + + + + + + + + + + + + + + diff --git a/src/libprs500/ebooks/metadata/toc.py b/src/libprs500/ebooks/metadata/toc.py new file mode 100644 index 0000000000..89aaadbe11 --- /dev/null +++ b/src/libprs500/ebooks/metadata/toc.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python +## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +import os, glob +from urlparse import urlparse +from urllib import unquote + +from libprs500 import __appname__ +from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup + +class NCXSoup(BeautifulStoneSoup): + + NESTABLE_TAGS = {'navpoint':[]} + + def __init__(self, raw): + BeautifulStoneSoup.__init__(self, raw, + convertEntities=BeautifulSoup.HTML_ENTITIES, + selfClosingTags=['meta', 'content']) + +class TOC(list): + + def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=1, + base_path=os.getcwd()): + self.href = href + self.fragment = fragment + self.text = text + self.parent = parent + self.base_path = base_path + self.play_order = play_order + + def add_item(self, href, fragment, text): + self.append(TOC(href=href, fragment=fragment, text=text, parent=self, base_path=self.base_path)) + return self[-1] + + def top_level_items(self): + for item in self: + if item.text is not None: + yield item + + def depth(self): + depth = 1 + for obj in self: + c = obj.depth() + if c > depth - 1: + depth = c + 1 + return depth + + @apply + def abspath(): + doc='Return the file this toc entry points to as a absolute path to a file on the system.' + def fget(self): + path = self.href.replace('/', os.sep) + if not os.path.isabs(path): + path = os.path.join(self.base_path, path) + return path + return property(fget=fget, doc=doc) + + def read_from_opf(self, opfreader): + toc = opfreader.soup.find('spine', toc=True) + if toc is not None: + toc = toc['toc'] + if toc is None: + try: + toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href'] + except: + for item in opfreader.manifest: + if 'toc' in item.href.lower(): + toc = item.href + break + + if toc is not None: + if toc.lower() != 'ncx': + toc = urlparse(unquote(toc))[2] + toc = toc.replace('/', os.sep) + if not os.path.isabs(toc): + toc = os.path.join(self.base_path, toc) + try: + if not os.path.exists(toc): + bn = os.path.basename(toc) + bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files + toc = os.path.join(os.path.dirname(toc), bn) + + self.read_html_toc(toc, self.base_path) + except: + pass + else: + cwd = os.path.abspath(self.base_path) + m = glob.glob(os.path.join(cwd, '*.ncx')) + if m: + toc = m[0] + self.read_ncx_toc(toc) + + def read_ncx_toc(self, toc): + self.base_path = os.path.dirname(toc) + soup = NCXSoup(open(toc, 'rb').read()) + + def process_navpoint(np, dest): + play_order = np.get('playOrder', 1) + href = fragment = text = None + nl = np.find('navlabel') + if nl is not None: + text = u'' + for txt in nl.findAll('text'): + text += ''.join([unicode(s) for s in txt.findAll(text=True)]) + content = elem.find('content') + if content is None or not content.has_key('src') or not txt: + return + + purl = urlparse(unquote(content['src'])) + href, fragment = purl[2], purl[5] + nd = dest.add_item(href, fragment, text) + nd.play_order = play_order + + for c in np: + if getattr(c, 'name', None) == 'navpoint': + process_navpoint(c, nd) + + nm = soup.find('navmap') + for elem in nm: + if getattr(elem, 'name', None) == 'navpoint': + process_navpoint(elem, self) + + + def read_html_toc(self, toc): + self.base_path = os.path.dirname(toc) + soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES) + for a in soup.findAll('a'): + if not a.has_key('href'): + continue + purl = urlparse(unquote(a['href'])) + href, fragment = purl[2], purl[5] + txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)]) + self.add_item(href, fragment, txt) + + def render(self, stream, uid): + from libprs500.resources import ncx_template + from genshi.template import MarkupTemplate + doctype = ('ncx', "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd") + template = MarkupTemplate(ncx_template) + raw = template.generate(uid=uid, toc=self, __appname__=__appname__) + raw = raw.render(doctype=doctype) + stream.write(raw) \ No newline at end of file diff --git a/src/libprs500/ebooks/mobi/reader.py b/src/libprs500/ebooks/mobi/reader.py index 71a0c3f026..c89daa1ae8 100644 --- a/src/libprs500/ebooks/mobi/reader.py +++ b/src/libprs500/ebooks/mobi/reader.py @@ -186,11 +186,11 @@ class MobiReader(object): if self.book_header.exth is not None: opf = self.create_opf(htmlfile) - opf.write(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb')) + opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb')) def create_opf(self, htmlfile): mi = self.book_header.exth.mi - opf = OPFCreator(mi) + opf = OPFCreator(os.path.dirname(htmlfile), mi) if hasattr(self.book_header.exth, 'cover_offset'): opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1) manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')] @@ -333,4 +333,4 @@ def main(args=sys.argv): return 0 if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file + sys.exit(main()) diff --git a/src/libprs500/library/database.py b/src/libprs500/library/database.py index e7d67dba91..6d8c3f4be8 100644 --- a/src/libprs500/library/database.py +++ b/src/libprs500/library/database.py @@ -1340,7 +1340,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE; mi.rating = self.rating(idx, index_is_id=index_is_id) mi.isbn = self.isbn(idx, index_is_id=index_is_id) id = idx if index_is_id else self.id(idx) - mi.libprs_id = id + mi.application_id = id return mi def vacuum(self): @@ -1382,7 +1382,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE; name += '_'+id base = dir if single_dir else tpath - mi = OPFCreator(self.get_metadata(idx, index_is_id=index_is_id)) + mi = OPFCreator(base, self.get_metadata(idx, index_is_id=index_is_id)) cover = self.cover(idx, index_is_id=index_is_id) if cover is not None: cname = name + '.jpg' @@ -1390,7 +1390,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE; open(cpath, 'wb').write(cover) mi.cover = cname f = open(os.path.join(base, name+'.opf'), 'wb') - mi.write(f) + mi.render(f) f.close() for fmt in self.formats(idx, index_is_id=index_is_id).split(','): diff --git a/src/libprs500/linux.py b/src/libprs500/linux.py index 0d06482aee..de3846f4a5 100644 --- a/src/libprs500/linux.py +++ b/src/libprs500/linux.py @@ -44,6 +44,7 @@ entry_points = { 'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main', 'web2disk = libprs500.web.fetch.simple:main', 'feeds2disk = libprs500.web.feeds.main:main', + 'feeds2lrf = libprs500.ebooks.lrf.feeds.convert_from:main', 'web2lrf = libprs500.ebooks.lrf.web.convert_from:main', 'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main', 'mobi2lrf = libprs500.ebooks.lrf.mobi.convert_from:main', diff --git a/src/libprs500/terminfo.py b/src/libprs500/terminfo.py index fca163d988..2114f8ad7f 100644 --- a/src/libprs500/terminfo.py +++ b/src/libprs500/terminfo.py @@ -201,6 +201,7 @@ class ProgressBar: self.term.BOL + self.term.UP + self.term.CLEAR_EOL + (self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) + self.term.CLEAR_EOL + msg) + sys.stdout.flush() def clear(self): if not self.cleared: diff --git a/src/libprs500/web/feeds/news.py b/src/libprs500/web/feeds/news.py index 46c5549598..98e2405c72 100644 --- a/src/libprs500/web/feeds/news.py +++ b/src/libprs500/web/feeds/news.py @@ -17,12 +17,13 @@ The backend to parse feeds and create HTML that can then be converted to an ebook. ''' -import logging, os, cStringIO, time, itertools, traceback +import logging, os, cStringIO, time, traceback import urlparse from libprs500 import browser, __appname__ from libprs500.ebooks.BeautifulSoup import BeautifulSoup from libprs500.ebooks.metadata.opf import OPFCreator +from libprs500.ebooks.metadata.toc import TOC from libprs500.ebooks.metadata import MetaInformation from libprs500.web.feeds import feed_from_xml, templates from libprs500.web.fetch.simple import option_parser as web2disk_option_parser @@ -94,6 +95,9 @@ class BasicNewsRecipe(object): #: using cp1252. If None, try to detect the encoding. encoding = None + #: Specify any extra CSS that should be addded to downloaded HTML files + extra_css = None + #: List of regular expressions that determines which links to follow #: If empty, it is ignored. #: Only one of L{match_regexps} or L{filter_regexps} should be defined @@ -276,8 +280,9 @@ class BasicNewsRecipe(object): self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0] for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps', - 'preprocess_html', 'remove_tags_after', 'postprocess_html'): + 'preprocess_html', 'remove_tags_after'): setattr(self.web2disk_options, extra, getattr(self, extra)) + self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html] if self.delay > 0: self.simultaneous_downloads = 1 @@ -288,6 +293,14 @@ class BasicNewsRecipe(object): self.failed_downloads = [] self.partial_failures = [] + def _postprocess_html(self, soup): + if self.extra_css is not None: + head = soup.find('head') + if head: + style = BeautifulSoup(u''%self.extra_css).find('style') + head.insert(len(head.contents), style) + return soup + def download(self): ''' Download and pre-process all articles from the feeds in this recipe. @@ -297,6 +310,7 @@ class BasicNewsRecipe(object): @rtype: string ''' self.report_progress(0, _('Trying to download cover...')) + self.download_cover() res = self.build_index() self.cleanup() @@ -362,7 +376,7 @@ class BasicNewsRecipe(object): fetcher.current_dir = dir fetcher.show_progress = False res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links - if not res: + if not res or not os.path.exists(res): raise Exception(_('Could not fetch article. Run with --debug to see the reason')) return res, path, failures @@ -446,28 +460,44 @@ class BasicNewsRecipe(object): if dir is None: dir = self.output_dir mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__]) - opf = OPFCreator(mi) opf_path = os.path.join(dir, 'index.opf') + ncx_path = os.path.join(dir, 'index.ncx') + opf = OPFCreator(dir, mi) + + manifest = ['feed_%d'%i for i in range(len(feeds))] + manifest.append('index.html') cpath = getattr(self, 'cover_path', None) if cpath is not None and os.access(cpath, os.R_OK): opf.cover = cpath + manifest.append(cpath) + opf.create_manifest_from_files_in(manifest) entries = ['index.html'] + toc = TOC(base_path=dir) for i, f in enumerate(feeds): entries.append('feed_%d/index.html'%i) + feed = toc.add_item('feed_%d/index.html'%i, None, f.title) for j, a in enumerate(f): if getattr(a, 'downloaded', False): adir = 'feed_%d/article_%d/'%(i, j) entries.append('%sindex.html'%adir) + feed.add_item('%sindex.html'%adir, None, a.title if a.title else 'Untitled article') for sp in a.sub_pages: prefix = os.path.commonprefix([opf_path, sp]) relp = sp[len(prefix):] entries.append(relp.replace(os.sep, '/')) - opf.create_manifest(itertools.izip(entries, itertools.repeat('text/html'))) opf.create_spine(entries) - opf.write(open(opf_path, 'wb')) + opf.set_toc(toc) + + for i, f in enumerate(feeds): + + for j, a in enumerate(f): + if getattr(a, 'downloaded', False): + adir = 'feed_%d/article_%d/'%(i, j) + + opf.render(open(opf_path, 'wb'), open(ncx_path, 'wb')) def article_downloaded(self, request, result): @@ -516,7 +546,7 @@ class BasicNewsRecipe(object): title, url = None, obj else: title, url = obj - self.report_progress(0, _('Fetching feed %s...'%(title if title else url))) + self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url)) parsed_feeds.append(feed_from_xml(self.browser.open(url).read(), title=title, oldest_article=self.oldest_article, diff --git a/src/libprs500/web/feeds/recipes/newsweek.py b/src/libprs500/web/feeds/recipes/newsweek.py index 88ca183b08..0313e52f33 100644 --- a/src/libprs500/web/feeds/recipes/newsweek.py +++ b/src/libprs500/web/feeds/recipes/newsweek.py @@ -33,15 +33,15 @@ class Newsweek(BasicNewsRecipe): ('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'), ('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'), 'http://feeds.newsweek.com/newsweek/Columnists/ChristopherDickey', - 'http://feeds.newsweek.com/newsweek/Columnists/FareedZakaria', + 'http://feeds.newsweek.com/newsweek/Columnists/FareedZakaria', ('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'), ('Society', 'http://feeds.newsweek.com/newsweek/society'), ('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'), - 'http://feeds.newsweek.com/newsweek/columnists/GeorgeFWill', + 'http://feeds.newsweek.com/newsweek/columnists/GeorgeFWill', 'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen', ] - extra_css = '#content { font:serif,120%; }' + extra_css = '#content { font:serif 1.2em; }' keep_only_tags = [dict(name='div', id='content')] remove_tags = [ @@ -55,8 +55,8 @@ class Newsweek(BasicNewsRecipe): match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+'] # For testing - #feeds = feeds[:2] - #max_articles_per_feed = 1 + #feeds = feeds[3:5] + #max_articles_per_feed = 2 @@ -91,4 +91,4 @@ class Newsweek(BasicNewsRecipe): img = soup.find(alt='Cover') if img is not None and img.has_key('src'): small = img['src'] - return small.replace('coversmall', 'coverlarge') \ No newline at end of file + return small.replace('coversmall', 'coverlarge') diff --git a/src/libprs500/web/feeds/templates.py b/src/libprs500/web/feeds/templates.py index dd12a1b2ff..1d1becbb51 100644 --- a/src/libprs500/web/feeds/templates.py +++ b/src/libprs500/web/feeds/templates.py @@ -57,16 +57,17 @@ class NavBarTemplate(Template): @@ -159,4 +160,4 @@ class FeedTemplate(Template): ''') def generate(self, feed): - return Template.generate(self, feed=feed) \ No newline at end of file + return Template.generate(self, feed=feed) diff --git a/src/libprs500/web/fetch/simple.py b/src/libprs500/web/fetch/simple.py index 644f5bc241..b6622631e9 100644 --- a/src/libprs500/web/fetch/simple.py +++ b/src/libprs500/web/fetch/simple.py @@ -38,9 +38,9 @@ def basename(url): def save_soup(soup, target): nm = Tag(soup, '') - for meta in soup.find('meta', content=True): - if 'charset' in meta['content']: - meta.replaceWith(nm) + meta = soup.find('meta', content=True) + if meta and 'charset' in meta['content']: + meta.replaceWith(nm) f = codecs.open(target, 'w', 'utf-8') f.write(unicode(soup)) f.close() @@ -85,7 +85,7 @@ class RecursiveFetcher(object): self.remove_tags_after = getattr(options, 'remove_tags_after', None) self.keep_only_tags = getattr(options, 'keep_only_tags', []) self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup) - self.postprocess_html_ext= getattr(options, 'postprocess_html', lambda soup: soup) + self.postprocess_html_ext= getattr(options, 'postprocess_html', []) self.download_stylesheets = not options.no_stylesheets self.show_progress = True self.failed_links = [] @@ -336,7 +336,9 @@ class RecursiveFetcher(object): self.process_return_links(soup, iurl) self.logger.debug('Recursion limit reached. Skipping links in %s', iurl) - save_soup(self.postprocess_html_ext(soup), res) + for func in self.postprocess_html_ext: + soup = func(soup) + save_soup(soup, res) self.localize_link(tag, 'href', res) except Exception, err: