Refactored OPF creation code. Implemented Table of Contents support in feeds2disk.
This commit is contained in:
parent 748c184ccb
commit 6982652f92
Makefile (6 lines changed)

@@ -1,6 +1,6 @@
 PYTHON = python

-all : gui2 translations
+all : gui2 translations resources

 clean :
 	cd src/libprs500/gui2 && ${PYTHON} make.py clean
@@ -13,4 +13,8 @@ test : gui2

 translations :
 	cd src/libprs500 && ${PYTHON} translations/__init__.py
+
+resources:
+	${PYTHON} resources.py

resources.py (new file, 39 lines)

@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Compile resource files.
+'''
+import os, sys
+sys.path.insert(1, os.path.join(os.getcwd(), 'src'))
+from libprs500 import __appname__
+
+RESOURCES = dict(
+    opf_template = '%p/ebooks/metadata/opf.xml',
+    ncx_template = '%p/ebooks/metadata/ncx.xml',
+)
+
+def main(args=sys.argv):
+    data = ''
+    for key, value in RESOURCES.items():
+        path = value.replace('%p', 'src'+os.sep+__appname__)
+        bytes = repr(open(path, 'rb').read())
+        data += key + ' = ' + bytes + '\n\n'
+    open('src'+os.sep+__appname__+os.sep+'/resources.py', 'wb').write(data)
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
@@ -60,6 +60,8 @@ def update_css(ncss, ocss):

 def munge_paths(basepath, url):
     purl = urlparse(unquote(url),)
     path, fragment = purl[2], purl[5]
+    if path:
+        path = path.replace('/', os.sep)
     if not path:
         path = basepath
     elif not os.path.isabs(path):
@@ -223,7 +225,6 @@ class HTMLConverter(object):
         self.extra_toc_entries = [] #: TOC entries gleaned from semantic information
         self.image_memory = []
         self.id_counter = 0
-        self.toc_from_metadata = False #: If True means that the toc has been populated from metadata
         self.unused_target_blocks = [] #: Used to remove extra TextBlocks
         self.link_level = 0 #: Current link level
         self.memory = [] #: Used to ensure that duplicate CSS unhandled erros are not reported
@@ -543,7 +544,7 @@ class HTMLConverter(object):

         path, fragment = munge_paths(self.target_prefix, tag['href'])
         return {'para':para, 'text':text, 'path':os.path.abspath(path),
-                'fragment':fragment, 'in toc': (self.link_level == 0 and not self.toc_from_metadata)}
+                'fragment':fragment, 'in toc': (self.link_level == 0 and not self.use_spine)}

     def get_text(self, tag, limit=None):
@@ -637,13 +638,12 @@ class HTMLConverter(object):
         return outside_links

     def create_toc(self, toc):
-        for (path, fragment, txt) in toc:
-            ascii_text = txt.encode('ascii', 'ignore') # Bug in SONY LRF renderer
-            self.toc_from_metadata = True
-            if not fragment and path in self.tops:
-                self.book.addTocEntry(ascii_text, self.tops[path])
+        for item in toc.top_level_items():
+            ascii_text = item.text.encode('ascii', 'ignore') # Bug in SONY LRF renderer
+            if not item.fragment and item.abspath in self.tops:
+                self.book.addTocEntry(ascii_text, self.tops[item.abspath])
             else:
-                url = path+fragment
+                url = item.abspath+item.fragment
                 if url in self.targets:
                     self.book.addTocEntry(ascii_text, self.targets[url])
@@ -1846,6 +1846,7 @@ def try_opf(path, options, logger):
     options.cover = None
     cover = opf.cover
     if cover:
+        cover = cover.replace('/', os.sep)
         if not os.path.isabs(cover):
             cover = os.path.join(dirpath, cover)
         if os.access(cover, os.R_OK):

@@ -65,7 +65,7 @@ class LRFConverter(object):
     def create_metadata(self):
         self.logger.info('Reading metadata...')
         mi = get_metadata(self.lrf)
-        self.opf = OPFCreator(mi)
+        self.opf = OPFCreator(self.output_dir, mi)

     def create_page_styles(self):
         self.page_css = ''
@@ -126,4 +126,4 @@ def main(args=sys.argv):

 if __name__ == '__main__':
-    sys.exit(main())
+    sys.exit(main())

@@ -45,12 +45,13 @@ class MetaInformation(object):
         ans = MetaInformation(mi.title, mi.authors)
         for attr in ('author_sort', 'title_sort', 'comments', 'category',
                      'publisher', 'series', 'series_index', 'rating',
-                     'isbn', 'tags', 'cover_data', 'libprs_id'):
+                     'isbn', 'tags', 'cover_data', 'application_id',
+                     'manifest', 'spine', 'toc', 'cover'):
             if hasattr(mi, attr):
                 setattr(ans, attr, getattr(mi, attr))

-    def __init__(self, title, authors):
+    def __init__(self, title, authors=['Unknown']):
         '''
         @param title: title or "Unknown" or a MetaInformation object
         @param authors: List of strings or []
@@ -76,8 +77,11 @@ class MetaInformation(object):
         self.isbn = None if not mi else mi.isbn
         self.tags = [] if not mi else mi.tags
         self.cover_data = mi.cover_data if (mi and hasattr(mi, 'cover_data')) else (None, None)
-        self.libprs_id = mi.libprs_id if (mi and hasattr(mi, 'libprs_id')) else None
+        self.application_id = mi.application_id if (mi and hasattr(mi, 'application_id')) else None
+        self.manifest = getattr(mi, 'manifest', None)
+        self.toc = getattr(mi, 'toc', None)
+        self.spine = getattr(mi, 'spine', None)
+        self.cover = getattr(mi, 'cover', None)

     def smart_update(self, mi):
         '''
@@ -92,7 +96,7 @@ class MetaInformation(object):

         for attr in ('author_sort', 'title_sort', 'comments', 'category',
                      'publisher', 'series', 'series_index', 'rating',
-                     'isbn', 'libprs_id'):
+                     'isbn', 'application_id', 'manifest', 'spine', 'toc', 'cover'):
             if hasattr(mi, attr):
                 val = getattr(mi, attr)
                 if val is not None:
@@ -117,4 +121,4 @@ class MetaInformation(object):
         return ans.strip()

     def __nonzero__(self):
-        return bool(self.title or self.author or self.comments or self.category)
+        return bool(self.title or self.author or self.comments or self.category)

@@ -51,7 +51,7 @@ def metadata_from_formats(formats):
         ext = path_to_ext(path)
         stream = open(path, 'rb')
         mi.smart_update(get_metadata(stream, stream_type=ext, use_libprs_metadata=True))
-        if getattr(mi, 'libprs_id', None) is not None:
+        if getattr(mi, 'application_id', None) is not None:
             return mi

     return mi
@@ -69,7 +69,7 @@ def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False):
         if os.access(c, os.R_OK):
             opf = opf_metadata(os.path.abspath(c))

-    if use_libprs_metadata and getattr(opf, 'libprs_id', None) is not None:
+    if use_libprs_metadata and getattr(opf, 'application_id', None) is not None:
         return opf

     try:
@@ -147,7 +147,7 @@ def opf_metadata(opfpath):
     f = open(opfpath, 'rb')
     opf = OPFReader(f, os.path.dirname(opfpath))
     try:
-        if opf.libprs_id is not None:
+        if opf.application_id is not None:
             mi = MetaInformation(opf, None)
             if hasattr(opf, 'cover') and opf.cover:
                 cpath = os.path.join(os.path.dirname(opfpath), opf.cover)

src/libprs500/ebooks/metadata/ncx.xml (new file, 27 lines)

@@ -0,0 +1,27 @@
+<ncx version="2005-1"
+    xml:lang="en"
+    xmlns="http://www.daisy.org/z3986/2005/ncx/"
+    xmlns:py="http://genshi.edgewall.org/"
+>
+<head>
+    <meta name="dtb:uid" content="${uid}"/>
+    <meta name="dtb:depth" content="${toc.depth()}"/>
+    <meta name="dtb:generator" content="${__appname__}"/>
+    <meta name="dtb:totalPageCount" content="0"/>
+    <meta name="dtb:maxPageNumber" content="0"/>
+</head>
+<docTitle><text>Table of Contents</text></docTitle>
+
+<py:def function="navpoint(np, level)">
+${'%*s'%(4*level,'')}<navPoint playOrder="${str(np.play_order)}">
+${'%*s'%(4*level,'')}<navLabel>
+${'%*s'%(4*level,'')}    <text>${np.text}</text>
+${'%*s'%(4*level,'')}</navLabel>
+${'%*s'%(4*level,'')}<content src="${str(np.href)+(('#' + str(np.fragment)) if np.fragment else '')}" />
+<py:for each="np2 in np">${navpoint(np2, level+1)}</py:for>
+${'%*s'%(4*level,'')}</navPoint>
+</py:def>
+<navMap>
+<py:for each="np in toc">${navpoint(np, 0)}</py:for>
+</navMap>
+</ncx>
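
Note: a sketch of how this Genshi template gets filled in, mirroring the TOC.render method added later in this commit; `toc` is assumed to be a populated TOC object and the uid value is illustrative:

    from genshi.template import MarkupTemplate
    from libprs500 import __appname__
    from libprs500.resources import ncx_template  # generated by resources.py

    doctype = ('ncx', "-//NISO//DTD ncx 2005-1//EN",
               "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd")
    template = MarkupTemplate(ncx_template)
    raw = template.generate(uid='some-uuid', toc=toc, __appname__=__appname__)
    open('index.ncx', 'wb').write(raw.render(doctype=doctype))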
@@ -12,18 +12,21 @@
 ## You should have received a copy of the GNU General Public License along
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+import uuid
 '''Read/Write metadata from Open Packaging Format (.opf) files.'''

-import sys, re, os, glob
+import sys, re, os, mimetypes
 from urllib import unquote
 from urlparse import urlparse
 import xml.dom.minidom as dom
 from itertools import repeat

 from libprs500 import __appname__
 from libprs500.ebooks.metadata import MetaInformation
-from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
+from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
 from libprs500.ebooks.lrf import entity_to_unicode
 from libprs500.ebooks.metadata import get_parser
+from libprs500.ebooks.metadata.toc import TOC

 class ManifestItem(object):

     def __init__(self, item, cwd):
@@ -40,6 +43,14 @@ class ManifestItem(object):

     def __unicode__(self):
         return u'<item id="%s" href="%s" media-type="%s" />'%(self.id, self.href, self.media_type)

+    def __getitem__(self, index):
+        if index == 0:
+            return self.href
+        if index == 1:
+            return self.media_type
+        raise IndexError('%d out of bounds.'%index)
+

 class Manifest(list):

@@ -81,85 +92,11 @@ class Spine(object):

     def items(self):
         for i in self.linear_ids + self.nonlinear_ids:
             yield self.manifest.item(i)

+    def __iter__(self):
+        for i in self.linear_ids + self.nonlinear_ids:
+            yield i
+
-class TOC(list):
-
-    def __init__(self, opfreader, cwd):
-        self.toc = None
-        toc = opfreader.soup.find('spine', toc=True)
-        if toc is not None:
-            toc = toc['toc']
-        if toc is None:
-            try:
-                toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
-            except:
-                for item in opfreader.manifest:
-                    if 'toc' in item.href.lower():
-                        toc = item.href
-                        break
-
-        if toc is not None:
-            if toc.lower() != 'ncx':
-                toc = urlparse(unquote(toc))[2]
-                if not os.path.isabs(toc):
-                    toc = os.path.join(cwd, toc)
-                try:
-                    if not os.path.exists(toc):
-                        bn = os.path.basename(toc)
-                        bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
-                        toc = os.path.join(os.path.dirname(toc), bn)
-
-                    self.read_html_toc(toc, cwd)
-                    self.toc = toc
-                except:
-                    pass
-            else:
-                cwd = os.path.abspath(cwd)
-                m = glob.glob(os.path.join(cwd, '*.ncx'))
-                if m:
-                    toc = m[0]
-                    try:
-                        self.read_ncx_toc(toc)
-                        self.toc = toc
-                    except:
-                        raise
-                        pass
-
-    def read_ncx_toc(self, toc):
-        bdir = os.path.dirname(toc)
-        soup = BeautifulStoneSoup(open(toc, 'rb').read(),
-                                  convertEntities=BeautifulSoup.HTML_ENTITIES)
-        elems = soup.findAll('navpoint')
-        elems.sort(cmp=lambda x, y: cmp(int(x['playorder']), int(y['playorder'])))
-
-        for elem in elems:
-            txt = u''
-            for nl in elem.findAll('navlabel'):
-                for text in nl.findAll('text'):
-                    txt += ''.join([unicode(s) for s in text.findAll(text=True)])
-
-            content = elem.find('content')
-            if content is None or not content.has_key('src') or not txt:
-                continue
-
-            purl = urlparse(unquote(content['src']))
-            href, fragment = purl[2], purl[5]
-            if not os.path.isabs(href):
-                href = os.path.join(bdir, href)
-            self.append((href, fragment, txt))
-
-    def read_html_toc(self, toc, cwd):
-        soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
-        for a in soup.findAll('a'):
-            if not a.has_key('href'):
-                continue
-            purl = urlparse(unquote(a['href']))
-            href, fragment = purl[2], purl[5]
-            if not os.path.isabs(href):
-                href = os.path.join(cwd, href)
-            txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
-            self.append((href, fragment, txt))

 class standard_field(object):
@@ -178,21 +115,21 @@ class OPF(MetaInformation):

     MIMETYPE = 'application/oebps-package+xml'
     ENTITY_PATTERN = re.compile(r'&(\S+?);')

     uid = standard_field('uid')
-    libprs_id = standard_field('libprs_id')
+    application_id = standard_field('application_id')
     title = standard_field('title')
     authors = standard_field('authors')
     title_sort = standard_field('title_sort')
     author_sort = standard_field('author_sort')
     comments = standard_field('comments')
     category = standard_field('category')
     publisher = standard_field('publisher')
     isbn = standard_field('isbn')
     cover = standard_field('cover')
     series = standard_field('series')
     series_index = standard_field('series_index')
     rating = standard_field('rating')
     tags = standard_field('tags')

     HEADER = '''\
 <?xml version="1.0" encoding="UTF-8"?>
@@ -207,14 +144,14 @@ class OPF(MetaInformation):
         if not hasattr(self, 'soup'):
             self.soup = BeautifulStoneSoup(u'''\
 %s
-<package unique-identifier="libprs_id">
+<package unique-identifier="%s_id">
     <metadata>
         <dc-metadata
             xmlns:dc="http://purl.org/dc/elements/1.1/"
             xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/" />
     </metadata>
 </package>
-'''%self.HEADER)
+'''%(__appname__, self.HEADER))

     def _commit(self, doc):
         self.soup = BeautifulStoneSoup(doc.toxml('utf-8'), fromEncoding='utf-8')
@@ -403,15 +340,15 @@ class OPF(MetaInformation):
         self._set_metadata_element('dc:identifier', isbn, [('scheme', 'ISBN')],
                                    replace=True)

-    def get_libprs_id(self):
+    def get_application_id(self):
         for item in self.soup.package.metadata.findAll('dc:identifier'):
-            if item.has_key('scheme') and item['scheme'] == 'libprs':
+            if item.has_key('scheme') and item['scheme'] == __appname__:
                 return str(item.string).strip()
         return None

-    def set_libprs_id(self, val):
+    def set_application_id(self, val):
         if val:
-            self._set_metadata_element('dc:identifier', str(val), [('scheme', 'libprs'), ('id', 'libprs_id')],
+            self._set_metadata_element('dc:identifier', str(val), [('scheme', __appname__), ('id', __appname__+'_id')],
                                        replace=True)

     def get_cover(self):
@@ -564,61 +501,72 @@ class OPFReader(OPF):
         stream.close()
         self.manifest = Manifest(self.soup, dir)
         self.spine = Spine(self.soup, self.manifest)
-        self.toc = TOC(self, dir)
+        self.toc = TOC()
+        self.toc.read_from_opf(self)
         self.cover_data = (None, None)

-class OPFCreator(OPF):
+class OPFCreator(MetaInformation):

-    def __init__(self, mi):
-        self.title = mi.title
-        self.authors = mi.authors
-        if mi.category:
-            self.category = mi.category
-        if mi.comments:
-            self.comments = mi.comments
-        if mi.publisher:
-            self.publisher = mi.publisher
-        if mi.rating:
-            self.rating = mi.rating
-        if mi.series:
-            self.series = mi.series
-        if mi.series_index:
-            self.series_index = mi.series_index
-        if mi.tags:
-            self.tags = mi.tags
-        if mi.isbn:
-            self.isbn = mi.isbn
-        self.cover_data = mi.cover_data
-        if hasattr(mi, 'libprs_id'):
-            self.libprs_id = mi.libprs_id
-        if hasattr(mi, 'uid'):
-            self.uid = mi.uid
+    def __init__(self, base_path, *args, **kwargs):
+        '''
+        Initialize.
+        @param base_path: An absolute path to the directory in which this OPF file
+        will eventually be. This is used by the L{create_manifest} method
+        to convert paths to files into relative paths.
+        '''
+        MetaInformation.__init__(self, *args, **kwargs)
+        self.base_path = os.path.abspath(base_path)
+        if self.application_id is None:
+            self.application_id = str(uuid.uuid4())
+        self.toc = None
+        if isinstance(self.manifest, Manifest):
+            manifest = []
+            for path, mt in self.manifest:
+                if not path.startswith(self.base_path):
+                    raise ValueError('Inavlid manifest item %s for base path %s'%(path, self.base_path))
+                path = path[len(self.base_path)+1:]
+                manifest.append((path, mt))
+            self.manifest = manifest

     def create_manifest(self, entries):
         '''
         Create <manifest>
-        @param entries: List of (URL, mime-type)
+        @param entries: List of (path, mime-type)
+        @param base_path: It is used to convert each path into a path relative to itself
         @type entries: list of 2-tuples
         '''
-        doc = dom.parseString(self.soup.__str__('UTF-8').strip())
-        package = doc.documentElement
-        manifest = doc.createElement('manifest')
-        package.appendChild(manifest)
-        package.appendChild(doc.createTextNode('\n'))
-
-        self.href_map = {}
-
-        for href, media_type in entries:
-            item = doc.createElement('item')
-            item.setAttribute('href', href)
-            item.setAttribute('media-type', media_type)
-            self.href_map[href] = str(hash(href))
-            item.setAttribute('id', self.href_map[href])
-            manifest.appendChild(item)
-            manifest.appendChild(doc.createTextNode('\n'))
-
-        self._commit(doc)
+        rentries = []
+        base_path = self.base_path
+        mimetypes.init()
+        for href, mt in entries:
+            href = os.path.abspath(href)
+            if not href.startswith(base_path):
+                raise ValueError('OPF should only refer to files below it. %s is above %s'%(href, base_path))
+            href = href[len(base_path)+1:].replace(os.sep, '/')
+            if not mt:
+                mt = mimetypes.guess_type(href)[0]
+                if not mt:
+                    mt = ''
+            rentries.append((href, mt))
+
+        self.manifest = rentries
+
+    def create_manifest_from_files_in(self, files_and_dirs):
+        entries = []
+
+        def dodir(dir):
+            for root, dirs, files in os.walk(dir):
+                for name in files:
+                    path = os.path.join(root, name)
+                    entries.append((path, None))
+
+        for i in files_and_dirs:
+            if os.path.isdir(i):
+                dodir(i)
+            else:
+                entries.append((i, None))
+
+        self.create_manifest(entries)

     def create_spine(self, entries):
         '''
@@ -626,19 +574,43 @@ class OPFCreator(OPF):
         @param: List of paths
         @type param: list of strings
         '''
-        doc = dom.parseString(self.soup.__str__('UTF-8').strip())
-        package = doc.documentElement
-        spine = doc.createElement('spine')
-        package.appendChild(spine)
-        package.appendChild(doc.createTextNode('\n'))
-
-        for href in entries:
-            itemref = doc.createElement('itemref')
-            itemref.setAttribute('idref', self.href_map[href])
-            spine.appendChild(itemref)
-            spine.appendChild(doc.createTextNode('\n'))
-
-        self._commit(doc)
+        self.spine = []
+
+        for path in entries:
+            if not os.path.isabs(path):
+                path = os.path.join(self.base_path, path)
+            if not path.startswith(self.base_path):
+                raise ValueError('Invalid entry %s for base path %s'%(path, self.base_path))
+            href = path[len(self.base_path)+1:]
+            in_manifest = False
+            for i, m in enumerate(self.manifest):
+                if m[0] == href:
+                    in_manifest = True
+                    break
+            if not in_manifest:
+                raise ValueError('%s is not in the manifest. (%s)'%(href, path))
+            self.spine.append(i)
+
+    def set_toc(self, toc):
+        '''
+        Set the toc. You must call L{create_spine} before calling this
+        method.
+        @param toc: A Table of Contents
+        @type toc: L{TOC}
+        '''
+        self.toc = toc
+
+    def render(self, opf_stream, ncx_stream=None):
+        from libprs500.resources import opf_template
+        from genshi.template import MarkupTemplate
+        template = MarkupTemplate(opf_template)
+        opf = template.generate(__appname__=__appname__, mi=self).render('xml')
+        opf_stream.write(opf)
+        toc = getattr(self, 'toc', None)
+        if toc is not None and ncx_stream is not None:
+            toc.render(ncx_stream, self.application_id)

 def option_parser():
     return get_parser('opf')
@@ -649,7 +621,7 @@ def main(args=sys.argv):
     if len(args) != 2:
         parser.print_help()
         return 1
-    mi = OPFReader(open(args[1], 'rb'))
+    mi = MetaInformation(OPFReader(open(args[1], 'rb')))
     if opts.title is not None:
         mi.title = opts.title.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
     if opts.authors is not None:
@@ -660,7 +632,8 @@ def main(args=sys.argv):
     if opts.comment is not None:
         mi.comments = opts.comment.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
     print mi
-    mi.write(open(args[1], 'wb'))
+    mo = OPFCreator(os.getcwd(), mi)
+    mo.render(open(args[1], 'wb'))
     return 0

 if __name__ == '__main__':
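
Note: a minimal sketch of the new OPFCreator workflow, assuming an output directory that already contains the files to be listed; the directory, title, and entry names here are illustrative, not from the commit:

    import os
    from libprs500.ebooks.metadata import MetaInformation
    from libprs500.ebooks.metadata.opf import OPFCreator
    from libprs500.ebooks.metadata.toc import TOC

    dir = '/tmp/book'                        # assumed: contains index.html
    mi = MetaInformation('My Book', ['An Author'])
    opf = OPFCreator(dir, mi)                # base_path relativizes manifest hrefs
    opf.create_manifest_from_files_in([dir]) # walk dir, guess mime types
    opf.create_spine(['index.html'])         # entries must already be in the manifest
    toc = TOC(base_path=dir)
    toc.add_item('index.html', None, 'Start')
    opf.set_toc(toc)                         # must come after create_spine
    opf.render(open(os.path.join(dir, 'index.opf'), 'wb'),
               open(os.path.join(dir, 'index.ncx'), 'wb'))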
src/libprs500/ebooks/metadata/opf.xml (new file, 36 lines)

@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<package version="2.0"
+    xmlns:opf="http://www.idpf.org/2007/opf"
+    xmlns:py="http://genshi.edgewall.org/"
+    unique-identifier="${__appname__}_id"
+
+>
+    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
+        <dc:title py:with="attrs={'files-as':mi.title_sort}" py:attrs="attrs">${mi.title}</dc:title>
+        <dc:creator opf:role="aut" py:for="i, author in enumerate(mi.authors)" py:with="attrs={'file-as':mi.author_sort if i==0 else None}" py:attrs="attrs">${author}</dc:creator>
+        <dc:identifier scheme="${__appname__}" id="${__appname__}_id">${mi.application_id}</dc:identifier>
+
+        <dc:type py:if="mi.category">${mi.category}</dc:type>
+        <dc:description py:if="mi.comments">${mi.comments}</dc:description>
+        <dc:publisher py:if="mi.publisher">${mi.publisher}</dc:publisher>
+        <dc:identifier opf:scheme="ISBN" py:if="mi.isbn">${mi.isbn}</dc:identifier>
+        <series py:if="mi.series">${mi.series}</series>
+        <series-index py:if="mi.series_index is not None">${mi.series_index}</series-index>
+        <rating py:if="mi.rating is not None">${mi.rating}</rating>
+        <dc:subject py:if="mi.tags is not None" py:for="tag in mi.tags">${tag}</dc:subject>
+    </metadata>
+
+    <guide>
+        <reference py:if="mi.cover" type="cover" href="${mi.cover}" />
+    </guide>
+
+    <manifest>
+        <py:for each="i, m in enumerate(mi.manifest)">
+        <item id="${str(i)}" href="${m[0]}" media-type="${m[1]}" />
+        </py:for>
+    </manifest>
+
+    <spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
+        <itemref py:for="idref in mi.spine" idref="${str(idref)}" />
+    </spine>
+</package>
src/libprs500/ebooks/metadata/toc.py (new file, 154 lines)

@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+import os, glob
+from urlparse import urlparse
+from urllib import unquote
+
+from libprs500 import __appname__
+from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
+
+class NCXSoup(BeautifulStoneSoup):
+
+    NESTABLE_TAGS = {'navpoint':[]}
+
+    def __init__(self, raw):
+        BeautifulStoneSoup.__init__(self, raw,
+                                    convertEntities=BeautifulSoup.HTML_ENTITIES,
+                                    selfClosingTags=['meta', 'content'])
+
+class TOC(list):
+
+    def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=1,
+                 base_path=os.getcwd()):
+        self.href = href
+        self.fragment = fragment
+        self.text = text
+        self.parent = parent
+        self.base_path = base_path
+        self.play_order = play_order
+
+    def add_item(self, href, fragment, text):
+        self.append(TOC(href=href, fragment=fragment, text=text, parent=self, base_path=self.base_path))
+        return self[-1]
+
+    def top_level_items(self):
+        for item in self:
+            if item.text is not None:
+                yield item
+
+    def depth(self):
+        depth = 1
+        for obj in self:
+            c = obj.depth()
+            if c > depth - 1:
+                depth = c + 1
+        return depth
+
+    @apply
+    def abspath():
+        doc='Return the file this toc entry points to as a absolute path to a file on the system.'
+        def fget(self):
+            path = self.href.replace('/', os.sep)
+            if not os.path.isabs(path):
+                path = os.path.join(self.base_path, path)
+            return path
+        return property(fget=fget, doc=doc)
+
+    def read_from_opf(self, opfreader):
+        toc = opfreader.soup.find('spine', toc=True)
+        if toc is not None:
+            toc = toc['toc']
+        if toc is None:
+            try:
+                toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
+            except:
+                for item in opfreader.manifest:
+                    if 'toc' in item.href.lower():
+                        toc = item.href
+                        break
+
+        if toc is not None:
+            if toc.lower() != 'ncx':
+                toc = urlparse(unquote(toc))[2]
+                toc = toc.replace('/', os.sep)
+                if not os.path.isabs(toc):
+                    toc = os.path.join(self.base_path, toc)
+                try:
+                    if not os.path.exists(toc):
+                        bn = os.path.basename(toc)
+                        bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
+                        toc = os.path.join(os.path.dirname(toc), bn)
+
+                    self.read_html_toc(toc, self.base_path)
+                except:
+                    pass
+            else:
+                cwd = os.path.abspath(self.base_path)
+                m = glob.glob(os.path.join(cwd, '*.ncx'))
+                if m:
+                    toc = m[0]
+                    self.read_ncx_toc(toc)
+
+    def read_ncx_toc(self, toc):
+        self.base_path = os.path.dirname(toc)
+        soup = NCXSoup(open(toc, 'rb').read())
+
+        def process_navpoint(np, dest):
+            play_order = np.get('playOrder', 1)
+            href = fragment = text = None
+            nl = np.find('navlabel')
+            if nl is not None:
+                text = u''
+                for txt in nl.findAll('text'):
+                    text += ''.join([unicode(s) for s in txt.findAll(text=True)])
+            content = elem.find('content')
+            if content is None or not content.has_key('src') or not txt:
+                return
+
+            purl = urlparse(unquote(content['src']))
+            href, fragment = purl[2], purl[5]
+            nd = dest.add_item(href, fragment, text)
+            nd.play_order = play_order
+
+            for c in np:
+                if getattr(c, 'name', None) == 'navpoint':
+                    process_navpoint(c, nd)
+
+        nm = soup.find('navmap')
+        for elem in nm:
+            if getattr(elem, 'name', None) == 'navpoint':
+                process_navpoint(elem, self)
+
+    def read_html_toc(self, toc):
+        self.base_path = os.path.dirname(toc)
+        soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
+        for a in soup.findAll('a'):
+            if not a.has_key('href'):
+                continue
+            purl = urlparse(unquote(a['href']))
+            href, fragment = purl[2], purl[5]
+            txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
+            self.add_item(href, fragment, txt)
+
+    def render(self, stream, uid):
+        from libprs500.resources import ncx_template
+        from genshi.template import MarkupTemplate
+        doctype = ('ncx', "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd")
+        template = MarkupTemplate(ncx_template)
+        raw = template.generate(uid=uid, toc=self, __appname__=__appname__)
+        raw = raw.render(doctype=doctype)
+        stream.write(raw)
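
Note: TOC subclasses list, so nested entries are built by calling add_item on the child it returns; a small sketch, with illustrative paths:

    from libprs500.ebooks.metadata.toc import TOC

    toc = TOC(base_path='/tmp/book')
    feed = toc.add_item('feed_0/index.html', None, 'World News')      # top level
    feed.add_item('feed_0/article_0/index.html', None, 'Article 1')   # nested child
    print toc.depth()  # 3: root -> feed -> article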
@@ -186,11 +186,11 @@ class MobiReader(object):

         if self.book_header.exth is not None:
             opf = self.create_opf(htmlfile)
-            opf.write(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))
+            opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))

     def create_opf(self, htmlfile):
         mi = self.book_header.exth.mi
-        opf = OPFCreator(mi)
+        opf = OPFCreator(os.path.dirname(htmlfile), mi)
         if hasattr(self.book_header.exth, 'cover_offset'):
             opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
         manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')]
@@ -333,4 +333,4 @@ def main(args=sys.argv):
     return 0

 if __name__ == '__main__':
-    sys.exit(main())
+    sys.exit(main())

@@ -1340,7 +1340,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
         mi.rating = self.rating(idx, index_is_id=index_is_id)
         mi.isbn = self.isbn(idx, index_is_id=index_is_id)
         id = idx if index_is_id else self.id(idx)
-        mi.libprs_id = id
+        mi.application_id = id
         return mi

     def vacuum(self):
@@ -1382,7 +1382,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
             name += '_'+id
         base = dir if single_dir else tpath

-        mi = OPFCreator(self.get_metadata(idx, index_is_id=index_is_id))
+        mi = OPFCreator(base, self.get_metadata(idx, index_is_id=index_is_id))
         cover = self.cover(idx, index_is_id=index_is_id)
         if cover is not None:
             cname = name + '.jpg'
@@ -1390,7 +1390,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
             open(cpath, 'wb').write(cover)
             mi.cover = cname
         f = open(os.path.join(base, name+'.opf'), 'wb')
-        mi.write(f)
+        mi.render(f)
         f.close()

         for fmt in self.formats(idx, index_is_id=index_is_id).split(','):

@@ -44,6 +44,7 @@ entry_points = {
                 'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main',
                 'web2disk = libprs500.web.fetch.simple:main',
                 'feeds2disk = libprs500.web.feeds.main:main',
+                'feeds2lrf = libprs500.ebooks.lrf.feeds.convert_from:main',
                 'web2lrf = libprs500.ebooks.lrf.web.convert_from:main',
                 'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main',
                 'mobi2lrf = libprs500.ebooks.lrf.mobi.convert_from:main',

@@ -201,6 +201,7 @@ class ProgressBar:
                          self.term.BOL + self.term.UP + self.term.CLEAR_EOL +
                          (self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) +
                          self.term.CLEAR_EOL + msg)
+        sys.stdout.flush()

     def clear(self):
         if not self.cleared:

@@ -17,12 +17,13 @@
 The backend to parse feeds and create HTML that can then be converted
 to an ebook.
 '''
-import logging, os, cStringIO, time, itertools, traceback
+import logging, os, cStringIO, time, traceback
 import urlparse

 from libprs500 import browser, __appname__
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup
 from libprs500.ebooks.metadata.opf import OPFCreator
+from libprs500.ebooks.metadata.toc import TOC
 from libprs500.ebooks.metadata import MetaInformation
 from libprs500.web.feeds import feed_from_xml, templates
 from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
@@ -94,6 +95,9 @@ class BasicNewsRecipe(object):
     #: using cp1252. If None, try to detect the encoding.
     encoding = None

+    #: Specify any extra CSS that should be addded to downloaded HTML files
+    extra_css = None
+
     #: List of regular expressions that determines which links to follow
     #: If empty, it is ignored.
     #: Only one of L{match_regexps} or L{filter_regexps} should be defined
@@ -276,8 +280,9 @@ class BasicNewsRecipe(object):

         self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
         for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
-                      'preprocess_html', 'remove_tags_after', 'postprocess_html'):
+                      'preprocess_html', 'remove_tags_after'):
             setattr(self.web2disk_options, extra, getattr(self, extra))
+        self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]

         if self.delay > 0:
             self.simultaneous_downloads = 1
@@ -288,6 +293,14 @@ class BasicNewsRecipe(object):
         self.failed_downloads = []
         self.partial_failures = []

+    def _postprocess_html(self, soup):
+        if self.extra_css is not None:
+            head = soup.find('head')
+            if head:
+                style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
+                head.insert(len(head.contents), style)
+        return soup
+
     def download(self):
         '''
         Download and pre-process all articles from the feeds in this recipe.
@@ -297,6 +310,7 @@ class BasicNewsRecipe(object):
         @rtype: string
         '''
         self.report_progress(0, _('Trying to download cover...'))
+        self.download_cover()
         res = self.build_index()
         self.cleanup()
@@ -362,7 +376,7 @@ class BasicNewsRecipe(object):
             fetcher.current_dir = dir
             fetcher.show_progress = False
             res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
-            if not res:
+            if not res or not os.path.exists(res):
                 raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
             return res, path, failures

@@ -446,28 +460,44 @@ class BasicNewsRecipe(object):
         if dir is None:
             dir = self.output_dir
         mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
-        opf = OPFCreator(mi)
         opf_path = os.path.join(dir, 'index.opf')
+        ncx_path = os.path.join(dir, 'index.ncx')
+        opf = OPFCreator(dir, mi)
+
+        manifest = ['feed_%d'%i for i in range(len(feeds))]
+        manifest.append('index.html')
+        cpath = getattr(self, 'cover_path', None)
+        if cpath is not None and os.access(cpath, os.R_OK):
+            opf.cover = cpath
+            manifest.append(cpath)
+        opf.create_manifest_from_files_in(manifest)

         entries = ['index.html']
+        toc = TOC(base_path=dir)
         for i, f in enumerate(feeds):
             entries.append('feed_%d/index.html'%i)
+            feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
             for j, a in enumerate(f):
                 if getattr(a, 'downloaded', False):
                     adir = 'feed_%d/article_%d/'%(i, j)
                     entries.append('%sindex.html'%adir)
+                    feed.add_item('%sindex.html'%adir, None, a.title if a.title else 'Untitled article')
                     for sp in a.sub_pages:
                         prefix = os.path.commonprefix([opf_path, sp])
                         relp = sp[len(prefix):]
                         entries.append(relp.replace(os.sep, '/'))

-        opf.create_manifest(itertools.izip(entries, itertools.repeat('text/html')))
         opf.create_spine(entries)
-        opf.write(open(opf_path, 'wb'))
+        opf.set_toc(toc)
+
+        opf.render(open(opf_path, 'wb'), open(ncx_path, 'wb'))
@@ -516,7 +546,7 @@ class BasicNewsRecipe(object):
             title, url = None, obj
         else:
             title, url = obj
-        self.report_progress(0, _('Fetching feed %s...'%(title if title else url)))
+        self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url))
         parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
                                           title=title,
                                           oldest_article=self.oldest_article,

@@ -33,15 +33,15 @@ class Newsweek(BasicNewsRecipe):
             ('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
             ('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
-            'http://feeds.newsweek.com/newsweek/Columnists/ChristopherDickey',
             'http://feeds.newsweek.com/newsweek/Columnists/FareedZakaria',
             ('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
             ('Society', 'http://feeds.newsweek.com/newsweek/society'),
             ('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
             'http://feeds.newsweek.com/newsweek/columnists/GeorgeFWill',
+            'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
            ]

-    extra_css = '#content { font:serif,120%; }'
+    extra_css = '#content { font:serif 1.2em; }'
     keep_only_tags = [dict(name='div', id='content')]

     remove_tags = [
@@ -55,8 +55,8 @@ class Newsweek(BasicNewsRecipe):
     match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']

     # For testing
-    #feeds = feeds[:2]
-    #max_articles_per_feed = 1
+    #feeds = feeds[3:5]
+    #max_articles_per_feed = 2

@@ -91,4 +91,4 @@ class Newsweek(BasicNewsRecipe):
         img = soup.find(alt='Cover')
         if img is not None and img.has_key('src'):
             small = img['src']
-        return small.replace('coversmall', 'coverlarge')
+        return small.replace('coversmall', 'coverlarge')

@@ -57,16 +57,17 @@ class NavBarTemplate(Template):
         <body>
             <div class="navbar" style="text-align:center">
                 <hr py:if="bottom" />
-                <a href="../index.html#article_${str(art)}">Up one level</a>
-                <py:if test="art != num - 1">
-                    | <a href="../article_${str(art+1)}/index.html">Next</a>
-                </py:if>
+                | <a href="../index.html#article_${str(art)}">Up one level</a>
+                <py:if test="two_levels">
+                    | <a href="../../index.html#_${str(feed)}">Up two levels</a>
+                </py:if>
                 <py:if test="art != 0">
                     | <a href="../article_${str(art-1)}/index.html">Previous</a>
                 </py:if>
+                <py:if test="art != num - 1">
+                    | <a href="../article_${str(art+1)}/index.html">Next</a>
+                </py:if>
                 <hr py:if="not bottom" />
             </div>
         </body>
@@ -159,4 +160,4 @@ class FeedTemplate(Template):
         ''')

     def generate(self, feed):
-        return Template.generate(self, feed=feed)
+        return Template.generate(self, feed=feed)

@@ -38,9 +38,9 @@ def basename(url):

 def save_soup(soup, target):
     nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
-    for meta in soup.find('meta', content=True):
-        if 'charset' in meta['content']:
-            meta.replaceWith(nm)
+    meta = soup.find('meta', content=True)
+    if meta and 'charset' in meta['content']:
+        meta.replaceWith(nm)
     f = codecs.open(target, 'w', 'utf-8')
     f.write(unicode(soup))
     f.close()
@@ -85,7 +85,7 @@ class RecursiveFetcher(object):
         self.remove_tags_after = getattr(options, 'remove_tags_after', None)
         self.keep_only_tags = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
-        self.postprocess_html_ext= getattr(options, 'postprocess_html', lambda soup: soup)
+        self.postprocess_html_ext= getattr(options, 'postprocess_html', [])
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
         self.failed_links = []
@@ -336,7 +336,9 @@ class RecursiveFetcher(object):
                     self.process_return_links(soup, iurl)
                     self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)

-                save_soup(self.postprocess_html_ext(soup), res)
+                for func in self.postprocess_html_ext:
+                    soup = func(soup)
+                save_soup(soup, res)

                 self.localize_link(tag, 'href', res)
             except Exception, err:
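
Note: with this change postprocess_html is a list of callables rather than a single function, applied in order to each downloaded page; BasicNewsRecipe uses it to chain its internal _postprocess_html (extra_css injection) with the recipe's own hook. A minimal sketch of a custom hook; the selector is illustrative, not from the commit:

    def remove_ads(soup):
        # drop elements a site injects into every article
        for tag in soup.findAll('div', attrs={'class': 'advert'}):
            tag.extract()
        return soup

    # RecursiveFetcher then applies each hook in turn:
    #     for func in self.postprocess_html_ext:
    #         soup = func(soup)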