mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Working HTML/OPF input plugin. Also fixed feeds download and removed cover processing from OEBBook
This commit is contained in:
parent
296853cd43
commit
95d1b58ae3
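For orientation: most of the diff below registers one small MetadataReaderPlugin subclass per format. A minimal sketch of that pattern (the base-class import path is my assumption; the attribute and method names mirror the readers shown in the diff, and the EXAMPLE format and helper module are purely hypothetical):

from calibre.customize import MetadataReaderPlugin  # import path assumed

class ExampleMetadataReader(MetadataReaderPlugin):
    # Hypothetical format used only for illustration
    name        = 'Read EXAMPLE metadata'
    file_types  = set(['example'])
    description = _('Read metadata from %s files') % 'EXAMPLE'

    def get_metadata(self, stream, ftype):
        # Each reader in the diff delegates to a per-format helper like this
        from calibre.ebooks.metadata.example import get_metadata  # hypothetical module
        return get_metadata(stream)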
@@ -18,7 +18,7 @@ every time you add an HTML file to the library.\
file_types = set(['html', 'htm', 'xhtml', 'xhtm'])
supported_platforms = ['windows', 'osx', 'linux']
on_import = True

def run(self, htmlfile):
of = self.temporary_file('_plugin_html2zip.zip')
from calibre.ebooks.html import gui_main as html2oeb

@@ -26,172 +26,173 @@ every time you add an HTML file to the library.\
return of.name

class OPFMetadataReader(MetadataReaderPlugin):
name = 'Read OPF metadata'
file_types = set(['opf'])
description = _('Read metadata from %s files')%'OPF'

def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.metadata import MetaInformation
return MetaInformation(OPF(stream, os.getcwd()))

class RTFMetadataReader(MetadataReaderPlugin):
name = 'Read RTF metadata'
name = 'Read RTF metadata'
file_types = set(['rtf'])
description = _('Read metadata from %s files')%'RTF'

def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.rtf import get_metadata
return get_metadata(stream)

class FB2MetadataReader(MetadataReaderPlugin):
name = 'Read FB2 metadata'
file_types = set(['fb2'])
description = _('Read metadata from %s files')%'FB2'

def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.fb2 import get_metadata
return get_metadata(stream)

class LRFMetadataReader(MetadataReaderPlugin):
name = 'Read LRF metadata'
file_types = set(['lrf'])
description = _('Read metadata from %s files')%'LRF'

def get_metadata(self, stream, ftype):
from calibre.ebooks.lrf.meta import get_metadata
return get_metadata(stream)

class PDFMetadataReader(MetadataReaderPlugin):
name = 'Read PDF metadata'
file_types = set(['pdf'])
description = _('Read metadata from %s files')%'PDF'

def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.pdf import get_metadata
return get_metadata(stream)

class LITMetadataReader(MetadataReaderPlugin):
name = 'Read LIT metadata'
file_types = set(['lit'])
description = _('Read metadata from %s files')%'LIT'

def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.lit import get_metadata
return get_metadata(stream)

class IMPMetadataReader(MetadataReaderPlugin):
name = 'Read IMP metadata'
file_types = set(['imp'])
description = _('Read metadata from %s files')%'IMP'
author = 'Ashish Kulkarni'

def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.imp import get_metadata
return get_metadata(stream)

class RBMetadataReader(MetadataReaderPlugin):
name = 'Read RB metadata'
file_types = set(['rb'])
description = _('Read metadata from %s files')%'RB'
author = 'Ashish Kulkarni'

def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.rb import get_metadata
return get_metadata(stream)

class EPUBMetadataReader(MetadataReaderPlugin):
name = 'Read EPUB metadata'
file_types = set(['epub'])
description = _('Read metadata from %s files')%'EPUB'

def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.epub import get_metadata
return get_metadata(stream)

class HTMLMetadataReader(MetadataReaderPlugin):
name = 'Read HTML metadata'
file_types = set(['html'])
description = _('Read metadata from %s files')%'HTML'

def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.html import get_metadata
return get_metadata(stream)

class MOBIMetadataReader(MetadataReaderPlugin):
name = 'Read MOBI metadata'
file_types = set(['mobi', 'prc', 'azw'])
description = _('Read metadata from %s files')%'MOBI'

def get_metadata(self, stream, ftype):
from calibre.ebooks.mobi.reader import get_metadata
return get_metadata(stream)

class TOPAZMetadataReader(MetadataReaderPlugin):
name = 'Read Topaz metadata'
file_types = set(['tpz', 'azw1'])
description = _('Read metadata from %s files')%'MOBI'

def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.topaz import get_metadata
return get_metadata(stream)

class ODTMetadataReader(MetadataReaderPlugin):
name = 'Read ODT metadata'
file_types = set(['odt'])
description = _('Read metadata from %s files')%'ODT'

def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.odt import get_metadata
return get_metadata(stream)

class TXTMetadataReader(MetadataReaderPlugin):
name = 'Read TXT metadata'
file_types = set(['txt'])
description = _('Read metadata from %s files') % 'TXT'

def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.txt import get_metadata
return get_metadata(stream)

class LRXMetadataReader(MetadataReaderPlugin):
name = 'Read LRX metadata'
file_types = set(['lrx'])
description = _('Read metadata from %s files')%'LRX'

def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.lrx import get_metadata
return get_metadata(stream)

class ComicMetadataReader(MetadataReaderPlugin):
name = 'Read comic metadata'
file_types = set(['cbr', 'cbz'])
description = _('Extract cover from comic files')

def get_metadata(self, stream, ftype):
if ftype == 'cbr':
from calibre.libunrar import extract_member as extract_first
extract_first
else:
from calibre.libunzip import extract_member as extract_first
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata import MetaInformation
ret = extract_first(stream)
mi = MetaInformation(None, None)
if ret is not None:

@@ -199,65 +200,65 @@ class ComicMetadataReader(MetadataReaderPlugin):
ext = os.path.splitext(path)[1][1:]
mi.cover_data = (ext.lower(), data)
return mi

class ZipMetadataReader(MetadataReaderPlugin):
name = 'Read ZIP metadata'
file_types = set(['zip', 'oebzip'])
description = _('Read metadata from ebooks in ZIP archives')

def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.zip import get_metadata
return get_metadata(stream)

class RARMetadataReader(MetadataReaderPlugin):
name = 'Read RAR metadata'
file_types = set(['rar'])
description = _('Read metadata from ebooks in RAR archives')

def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.rar import get_metadata
return get_metadata(stream)

class EPUBMetadataWriter(MetadataWriterPlugin):
name = 'Set EPUB metadata'
file_types = set(['epub'])
description = _('Set metadata in %s files')%'EPUB'

def set_metadata(self, stream, mi, type):
from calibre.ebooks.metadata.epub import set_metadata
set_metadata(stream, mi)

class LRFMetadataWriter(MetadataWriterPlugin):
name = 'Set LRF metadata'
file_types = set(['lrf'])
description = _('Set metadata in %s files')%'LRF'

def set_metadata(self, stream, mi, type):
from calibre.ebooks.lrf.meta import set_metadata
set_metadata(stream, mi)

class RTFMetadataWriter(MetadataWriterPlugin):
name = 'Set RTF metadata'
file_types = set(['rtf'])
description = _('Set metadata in %s files')%'RTF'

def set_metadata(self, stream, mi, type):
from calibre.ebooks.metadata.rtf import set_metadata
set_metadata(stream, mi)

class MOBIMetadataWriter(MetadataWriterPlugin):
name = 'Set MOBI metadata'
file_types = set(['mobi', 'prc', 'azw'])
description = _('Set metadata in %s files')%'MOBI'
author = 'Marshall T. Vandegrift'

def set_metadata(self, stream, mi, type):
from calibre.ebooks.metadata.mobi import set_metadata
set_metadata(stream, mi)

@@ -267,14 +268,16 @@ from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.mobi.input import MOBIInput
from calibre.ebooks.pdf.input import PDFInput
from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.html.input import HTMLInput
from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.pdf.output import PDFOutput
from calibre.customize.profiles import input_profiles, output_profiles

plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, TXTInput, OEBOutput, TXTOutput, PDFOutput]
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
TXTInput, OEBOutput, TXTOutput, PDFOutput]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataWriter')]
plugins += input_profiles + output_profiles
plugins += input_profiles + output_profiles
@@ -163,9 +163,9 @@ class InputFormatPlugin(Plugin):
for x in os.listdir('.'):
shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)

ret = self.convert(stream, options, file_ext,
log, accelerators)

if options.debug_input is not None:
options.debug_input = os.path.abspath(options.debug_input)
if not os.path.exists(options.debug_input):
@@ -17,7 +17,7 @@ def tostring(root, strip_comments=False, pretty_print=False):
root.set('xmlns', 'http://www.w3.org/1999/xhtml')
root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink')
for x in root.iter():
if x.tag.rpartition('}')[-1].lower() == 'svg':
if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg':
x.set('xmlns', 'http://www.w3.org/2000/svg')

ans = _tostring(root, encoding='utf-8', pretty_print=pretty_print)
@@ -11,14 +11,12 @@ __docformat__ = 'restructuredtext en'
Input plugin for HTML or OPF ebooks.
'''

import os, re, sys, cStringIO
import os, re, sys
from urlparse import urlparse, urlunparse
from urllib import unquote

from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.chardet import xml_to_unicode
from calibre.customize.conversion import OptionRecommendation
from calibre import unicode_path

@@ -213,72 +211,21 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
sys.setrecursionlimit(orec)

def opf_traverse(opf_reader, verbose=0, encoding=None):
'''
Return a list of :class:`HTMLFile` objects in the order specified by the
`<spine>` element of the OPF.

:param opf_reader: An :class:`calibre.ebooks.metadata.opf2.OPF` instance.
:param encoding: Specify character encoding of HTML files. If `None` it is
auto-detected.
'''
if not opf_reader.spine:
raise ValueError('OPF does not have a spine')
flat = []
for path in opf_reader.spine.items():
path = os.path.abspath(path)
if path not in flat:
flat.append(os.path.abspath(path))
for item in opf_reader.manifest:
if 'html' in item.mime_type:
path = os.path.abspath(item.path)
if path not in flat:
flat.append(path)
for i, path in enumerate(flat):
if not os.path.exists(path):
path = path.replace('&', '%26')
if os.path.exists(path):
flat[i] = path
for item in opf_reader.itermanifest():
item.set('href', item.get('href').replace('&', '%26'))
ans = []
for path in flat:
if os.path.exists(path):
ans.append(HTMLFile(path, 0, encoding, verbose))
else:
print 'WARNING: OPF spine item %s does not exist'%path
ans = [f for f in ans if not f.is_binary]
return ans

def search_for_opf(dir):
for f in os.listdir(dir):
if f.lower().endswith('.opf'):
return OPF(open(os.path.join(dir, f), 'rb'), dir)

def get_filelist(htmlfile, dir, opts, log):
'''
Build list of files referenced by html file or try to detect and use an
OPF file instead.
'''
print 'Building file list...'
opf = search_for_opf(dir)
filelist = None
if opf is not None:
try:
filelist = opf_traverse(opf, verbose=opts.verbose,
encoding=opts.input_encoding)
except:
pass
if not filelist:
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
verbose=opts.verbose,
encoding=opts.input_encoding)\
[0 if opts.breadth_first else 1]
log.info('Building file list...')
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
verbose=opts.verbose,
encoding=opts.input_encoding)\
[0 if opts.breadth_first else 1]
if opts.verbose:
log.debug('\tFound files...')
for f in filelist:
log.debug('\t\t', f)
return opf, filelist
return filelist

class HTMLInput(InputFormatPlugin):

@@ -309,34 +256,32 @@ class HTMLInput(InputFormatPlugin):

def convert(self, stream, opts, file_ext, log,
accelerators):
from calibre.ebooks.metadata.meta import get_metadata

basedir = os.getcwd()

if hasattr(stream, 'name'):
basedir = os.path.dirname(stream.name)
if file_ext == 'opf':
opf = OPF(stream, basedir)
filelist = opf_traverse(opf, verbose=opts.verbose,
encoding=opts.input_encoding)
mi = MetaInformation(opf)
opfpath = stream.name
else:
opf, filelist = get_filelist(stream.name, basedir, opts, log)
mi = MetaInformation(opf)
mi.smart_update(get_metadata(stream, 'html'))
filelist = get_filelist(stream.name, basedir, opts, log)
mi = get_metadata(stream, 'html')
mi = OPFCreator(os.getcwdu(), mi)
mi.guide = None
entries = [(f.path, 'application/xhtml+xml') for f in filelist]
mi.create_manifest(entries)
mi.create_spine([f.path for f in filelist])

mi = OPFCreator(os.getcwdu(), mi)
mi.guide = None
entries = [(f.path, 'application/xhtml+xml') for f in filelist]
mi.create_manifest(entries)
mi.create_spine([f.path for f in filelist])

tocbuf = cStringIO.StringIO()
mi.render(open('metadata.opf', 'wb'), tocbuf, 'toc.ncx')
toc = tocbuf.getvalue()
if toc:
open('toc.ncx', 'wb').write(toc)
mi.render(open('metadata.opf', 'wb'))
opfpath = os.path.abspath('metadata.opf')

from calibre.ebooks.conversion.plumber import create_oebbook
return create_oebbook(log, os.path.abspath('metadata.opf'))

oeb = create_oebbook(log, opfpath)

from calibre.ebooks.oeb.transforms.package import Package
Package(os.getcwdu())(oeb, opts)

return oeb
@@ -573,7 +573,7 @@ class OEBReader(object):
item = self._find_ncx(opf)
self._toc_from_opf(opf, item)
self._pages_from_opf(opf, item)
self._ensure_cover_image()
#self._ensure_cover_image()

def main(argv=sys.argv):
@@ -6,13 +6,14 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os
import os, re
from urllib import unquote as urlunquote
from functools import partial

from lxml import etree
import cssutils

from calibre import sanitize_file_name
from calibre.constants import islinux
from calibre.ebooks.oeb.base import OEB_DOCS, urlnormalize, urldefrag, \
rewrite_links

@@ -36,15 +37,21 @@ class Package(object):
self.new_base_path = os.path.abspath(base)

def rewrite_links_in(self, item):
base = os.path.join(self.new_base_path, *item.href.split('/'))
old_href = item.old_href.split('#')[0]
new_href = item.href.split('#')[0]
base = os.path.join(self.old_base_path, *old_href.split('/'))
base = os.path.dirname(base)
self.log.debug('\tRewriting links in', base+'/'+
item.href.rpartition('/')[-1])
new_base = os.path.join(self.new_base_path, *new_href.split('/'))
new_base = os.path.dirname(new_base)

if etree.iselement(item.data):
self.rewrite_links_in_xml(item.data, base)
self.rewrite_links_in_xml(item.data, base, new_base)
elif hasattr(item.data, 'cssText'):
self.rewrite_links_in_css(item.data, base)
self.rewrite_links_in_css(item.data, base, new_base)

def link_replacer(self, link_, base=''):
def link_replacer(self, link_, base='', new_base=''):
link = urlnormalize(link_)
link, frag = urldefrag(link)
link = urlunquote(link).replace('/', os.sep)

@@ -55,20 +62,33 @@ class Package(object):
link = link.lower()
if link not in self.map:
return link_
nlink = os.path.relpath(self.map[link], base)
nlink = os.path.relpath(self.map[link], new_base)
if frag:
nlink = '#'.join(nlink, frag)
nlink = '#'.join((nlink, frag))
return nlink.replace(os.sep, '/')

def rewrite_links_in_css(self, sheet, base):
repl = partial(self.link_replacer, base=base)
def rewrite_links_in_css(self, sheet, base, new_base):
repl = partial(self.link_replacer, base=base, new_base=new_base)
cssutils.replaceUrls(sheet, repl)

def rewrite_links_in_xml(self, root, base):
repl = partial(self.link_replacer, base=base)
def rewrite_links_in_xml(self, root, base, new_base):
repl = partial(self.link_replacer, base=base, new_base=new_base)
rewrite_links(root, repl)

def move_manifest_item(self, item):
def uniqify_name(self, new_href, hrefs):
c = 0
while new_href in hrefs:
c += 1
parts = new_href.split('/')
name, ext = os.path.splitext(parts[-1])
name = re.sub(r'_\d+$', '', name)
name += '_%d'%c
parts[-1] = name + ext
new_href = '/'.join(parts)
return new_href

def move_manifest_item(self, item, hrefs):
item.data # Make sure the data has been loaded and cached
old_abspath = os.path.join(self.old_base_path,
*(urldefrag(item.href)[0].split('/')))

@@ -79,11 +99,17 @@ class Package(object):
new_href = 'content/'
elif item.href.lower().endswith('.ncx'):
new_href = ''
new_href += bname
new_href += sanitize_file_name(bname)

if new_href in hrefs:
new_href = self.uniqify_name(new_href, hrefs)
hrefs.add(new_href)

new_abspath = os.path.join(self.new_base_path, *new_href.split('/'))
new_abspath = os.path.abspath(new_abspath)
item.old_href = self.oeb.manifest.hrefs.pop(item.href).href
item.href = new_href
self.oeb.manifest.hrefs[item.href] = item
if not islinux:
old_abspath, new_abspath = old_abspath.lower(), new_abspath.lower()
if old_abspath != new_abspath:

@@ -91,25 +117,33 @@ class Package(object):

def rewrite_links_in_toc(self, toc):
if toc.href:
toc.href = self.link_replacer(toc.href, base=self.new_base_path)
toc.href = self.link_replacer(toc.href, base=self.old_base_path,
new_base=self.new_base_path)

for x in toc:
self.rewrite_links_in_toc(x)

def __call__(self, oeb, context):
self.map = {}
self.log = self.oeb.log
self.log = oeb.log
self.oeb = oeb
self.old_base_path = os.path.abspath(oeb.container.rootdir)

hrefs = set([])
for item in self.oeb.manifest:
self.move_manifest_item(item)
self.move_manifest_item(item, hrefs)

self.log.debug('Rewriting links in OEB documents...')
for item in self.oeb.manifest:
self.rewrite_links_in(item)

if getattr(oeb.toc, 'nodes', False):
self.log.debug('Rewriting links in TOC...')
self.rewrite_links_in_toc(oeb.toc)

if hasattr(oeb, 'guide'):
self.log.debug('Rewriting links in guide...')
for ref in oeb.guide.values():
ref.href = self.link_replacer(ref.href, base=self.new_base_path)
ref.href = self.link_replacer(ref.href,
base=self.old_base_path,
new_base=self.new_base_path)
@@ -48,7 +48,8 @@ class OEBWriter(object):
pretty_print=pretty_print)

def __call__(self, oeb, path):
"""Read the book in the :class:`OEBBook` object :param:`oeb` to a file
"""
Read the book in the :class:`OEBBook` object :param:`oeb` to a file
at :param:`path`.
"""
version = int(self.version[0])
@@ -1,11 +1,11 @@
#!/usr/bin/env python
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
CLI for downloading feeds.
'''

import sys, os, logging
import sys, os
from calibre.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
from calibre.web.fetch.simple import option_parser as _option_parser
from calibre.web.feeds.news import BasicNewsRecipe

@@ -14,13 +14,13 @@ from calibre.utils.config import Config, StringConfig
def config(defaults=None):
desc = _('Options to control the fetching of periodical content from the web.')
c = Config('feeds2disk', desc) if defaults is None else StringConfig(defaults, desc)

web2disk = c.add_group('web2disk', _('Customize the download engine'))
web2disk('timeout', ['-t', '--timeout'], default=10.0,
web2disk('timeout', ['-t', '--timeout'], default=10.0,
help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),)
web2disk('delay', ['--delay'], default=0,
web2disk('delay', ['--delay'], default=0,
help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
web2disk('encoding', ['--encoding'], default=None,
web2disk('encoding', ['--encoding'], default=None,
help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))
web2disk('match_regexps', ['--match-regexp'], default=[], action='append',
help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.'))

@@ -28,42 +28,42 @@ def config(defaults=None):
help=_('Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.'))
web2disk('no_stylesheets', ['--dont-download-stylesheets'], action='store_true', default=False,
help=_('Do not download CSS stylesheets.'))

c.add_opt('feeds', ['--feeds'], default=None,
help=_('''Specify a list of feeds to download. For example:
help=_('''Specify a list of feeds to download. For example:
"['http://feeds.newsweek.com/newsweek/TopNews', 'http://feeds.newsweek.com/headlines/politics']"
If you specify this option, any argument to %prog is ignored and a default recipe is used to download the feeds.'''))
c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
help=_('''Be more verbose while processing.'''))
c.add_opt('title', ['--title'], default=None,
help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.'))
c.add_opt('username', ['-u', '--username'], default=None,
c.add_opt('username', ['-u', '--username'], default=None,
help=_('Username for sites that require a login to access content.'))
c.add_opt('password', ['-p', '--password'], default=None,
c.add_opt('password', ['-p', '--password'], default=None,
help=_('Password for sites that require a login to access content.'))
c.add_opt('lrf', ['--lrf'], default=False, action='store_true',
c.add_opt('lrf', ['--lrf'], default=False, action='store_true',
help='Optimize fetching for subsequent conversion to LRF.')
c.add_opt('epub', ['--epub'], default=False, action='store_true',
c.add_opt('epub', ['--epub'], default=False, action='store_true',
help='Optimize fetching for subsequent conversion to EPUB.')
c.add_opt('mobi', ['--mobi'], default=False, action='store_true',
c.add_opt('mobi', ['--mobi'], default=False, action='store_true',
help='Optimize fetching for subsequent conversion to MOBI.')
c.add_opt('recursions', ['--recursions'], default=0,
help=_('Number of levels of links to follow on webpages that are linked to from feeds. Defaul %default'))
c.add_opt('output_dir', ['--output-dir'], default='.',
c.add_opt('output_dir', ['--output-dir'], default='.',
help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.'))
c.add_opt('no_progress_bar', ['--no-progress-bar'], default=False, action='store_true',
help=_("Don't show the progress bar"))
c.add_opt('debug', ['--debug'], action='store_true', default=False,
help=_('Very verbose output, useful for debugging.'))
c.add_opt('test', ['--test'], action='store_true', default=False,
c.add_opt('test', ['--test'], action='store_true', default=False,
help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.'))

return c

USAGE=_('''\
%%prog [options] ARG

%%prog parses an online source of articles, like an RSS or ATOM feed and
%%prog parses an online source of articles, like an RSS or ATOM feed and
fetches the article contents organized in a nice hierarchy.

ARG can be one of:

@@ -85,9 +85,9 @@ def option_parser(usage=USAGE):
p.remove_option('--verbose')
p.remove_option('--max-files')
p.subsume('WEB2DISK OPTIONS', _('Options to control web2disk (used to fetch websites linked from feeds)'))

p.add_option('--feeds', default=None,
help=_('''Specify a list of feeds to download. For example:
help=_('''Specify a list of feeds to download. For example:
"['http://feeds.newsweek.com/newsweek/TopNews', 'http://feeds.newsweek.com/headlines/politics']"
If you specify this option, any argument to %prog is ignored and a default recipe is used to download the feeds.'''))
p.add_option('--verbose', default=False, action='store_true',

@@ -99,70 +99,62 @@ If you specify this option, any argument to %prog is ignored and a default recip
p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.')
p.add_option('--recursions', default=0, type='int',
help=_('Number of levels of links to follow on webpages that are linked to from feeds. Defaul %default'))
p.add_option('--output-dir', default=os.getcwd(),
p.add_option('--output-dir', default=os.getcwd(),
help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.'))
p.add_option('--no-progress-bar', dest='no_progress_bar', default=False, action='store_true',
help=_('Dont show the progress bar'))
p.add_option('--debug', action='store_true', default=False,
help=_('Very verbose output, useful for debugging.'))
p.add_option('--test', action='store_true', default=False,
p.add_option('--test', action='store_true', default=False,
help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.'))

return p

class RecipeError(Exception):
pass

def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
def run_recipe(opts, recipe_arg, parser, notification=None):
if notification is None:
from calibre.utils.terminfo import TerminalController, ProgressBar
term = TerminalController(sys.stdout)
pb = ProgressBar(term, _('Fetching feeds...'), no_progress_bar=opts.no_progress_bar)
notification = pb.update

recipe = None
if opts.feeds is not None:
recipe = BasicNewsRecipe
else:
try:
if os.access(recipe_arg, os.R_OK):
recipe = compile_recipe(open(recipe_arg).read())
recipe = compile_recipe(open(recipe_arg).read())
else:
raise Exception('not file')
except:
recipe = get_builtin_recipe(recipe_arg)
if recipe is None:
recipe = compile_recipe(recipe_arg)

if recipe is None:
raise RecipeError(recipe_arg+ ' is an invalid recipe')

if handler is None:
from calibre import ColoredFormatter
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG if opts.debug else logging.INFO if opts.verbose else logging.WARN)
handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
logging.getLogger('feeds2disk').addHandler(handler)

recipe = recipe(opts, parser, notification)

if not os.path.exists(recipe.output_dir):
os.makedirs(recipe.output_dir)
recipe.download(for_lrf=True)

return recipe

def main(args=sys.argv, notification=None, handler=None):
def main(args=sys.argv, notification=None):
p = option_parser()
opts, args = p.parse_args(args=args[1:])

if len(args) != 1 and opts.feeds is None:
p.print_help()
return 1
recipe_arg = args[0] if len(args) > 0 else None
run_recipe(opts, recipe_arg, p, notification=notification, handler=handler)

run_recipe(opts, recipe_arg, p, notification=notification)

return 0

if __name__ == '__main__':
@@ -7,7 +7,7 @@ Defines various abstract base classes that can be subclassed to create powerful
__docformat__ = "restructuredtext en"

import logging, os, cStringIO, time, traceback, re, urlparse, sys
import os, time, traceback, re, urlparse, sys
from collections import defaultdict
from functools import partial
from contextlib import nested, closing

@@ -27,6 +27,7 @@ from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.web.fetch.simple import option_parser as web2disk_option_parser
from calibre.web.fetch.simple import RecursiveFetcher
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
from calibre.utils.logging import Log
from calibre.ptempfile import PersistentTemporaryFile, \
PersistentTemporaryDirectory

@@ -423,7 +424,7 @@ class BasicNewsRecipe(object):
'''
raise NotImplementedError

def get_obfuscated_article(self, url, logger):
def get_obfuscated_article(self, url):
'''
If you set :member:`articles_are_obfuscated` this method is called with
every article URL. It should return the path to a file on the filesystem

@@ -443,6 +444,7 @@ class BasicNewsRecipe(object):
:param parser: Command line option parser. Used to intelligently merge options.
:param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
'''
self.log = Log()
if not isinstance(self.title, unicode):
self.title = unicode(self.title, 'utf-8', 'replace')

@@ -455,7 +457,6 @@ class BasicNewsRecipe(object):

if self.debug:
logging.getLogger('feeds2disk').setLevel(logging.DEBUG)
self.verbose = True
self.report_progress = progress_reporter

@@ -560,20 +561,20 @@ class BasicNewsRecipe(object):
res = self.build_index()
self.report_progress(1, _('Download finished'))
if self.failed_downloads:
self.log_warning(_('Failed to download the following articles:'))
self.log.warning(_('Failed to download the following articles:'))
for feed, article, debug in self.failed_downloads:
self.log_warning(article.title+_(' from ')+feed.title)
self.log_debug(article.url)
self.log_debug(debug)
self.log.warning(article.title+_(' from ')+feed.title)
self.log.debug(article.url)
self.log.debug(debug)
if self.partial_failures:
self.log_warning(_('Failed to download parts of the following articles:'))
self.log.warning(_('Failed to download parts of the following articles:'))
for feed, atitle, aurl, debug in self.partial_failures:
self.log_warning(atitle + _(' from ') + feed)
self.log_debug(aurl)
self.log_warning(_('\tFailed links:'))
self.log.warning(atitle + _(' from ') + feed)
self.log.debug(aurl)
self.log.warning(_('\tFailed links:'))
for l, tb in debug:
self.log_warning(l)
self.log_debug(tb)
self.log.warning(l)
self.log.debug(tb)
return res
finally:
self.cleanup()

@@ -636,20 +637,11 @@ class BasicNewsRecipe(object):
extra_css=self.extra_css).render(doctype='xhtml')

def create_logger(self, feed_number, article_number):
logger = logging.getLogger('feeds2disk.article_%d_%d'%(feed_number, article_number))
out = cStringIO.StringIO()
handler = logging.StreamHandler(out)
handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
handler.setLevel(logging.INFO if self.verbose else logging.WARNING)
if self.debug:
handler.setLevel(logging.DEBUG)
logger.addHandler(handler)
return logger, out

def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
def _fetch_article(self, url, dir, f, a, num_of_feeds):
self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser
fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
fetcher = RecursiveFetcher(self.web2disk_options, self.log,
self.image_map, self.css_map,
(url, f, a, num_of_feeds))
fetcher.base_dir = dir
fetcher.current_dir = dir
fetcher.show_progress = False

@@ -661,21 +653,21 @@ class BasicNewsRecipe(object):
raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
return res, path, failures

def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
def fetch_article(self, url, dir, f, a, num_of_feeds):
return self._fetch_article(url, dir, f, a, num_of_feeds)

def fetch_obfuscated_article(self, url, dir, logger, f, a, num_of_feeds):
path = os.path.abspath(self.get_obfuscated_article(url, logger))
def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds):
path = os.path.abspath(self.get_obfuscated_article(url))
url = ('file:'+path) if iswindows else ('file://'+path)
return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
return self._fetch_article(url, dir, f, a, num_of_feeds)

def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
templ = templates.EmbeddedContent()
raw = templ.generate(article).render('html')
with PersistentTemporaryFile('_feeds2disk.html') as pt:
pt.write(raw)
url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
return self._fetch_article(url, dir, f, a, num_of_feeds)

def build_index(self):

@@ -716,7 +708,6 @@ class BasicNewsRecipe(object):
art_dir = os.path.join(feed_dir, 'article_%d'%a)
if not os.path.isdir(art_dir):
os.makedirs(art_dir)
logger, stream = self.create_logger(f, a)
try:
url = self.print_version(article.url)
except NotImplementedError:

@@ -726,10 +717,9 @@ class BasicNewsRecipe(object):
func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
((self.fetch_obfuscated_article if self.articles_are_obfuscated \
else self.fetch_article), url)
req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)),
req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
{}, (f, a), self.article_downloaded,
self.error_in_article_download)
req.stream = stream
req.feed = feed
req.article = article
req.feed_dir = feed_dir

@@ -768,8 +758,8 @@ class BasicNewsRecipe(object):
cu = self.get_cover_url()
except Exception, err:
cu = None
self.log_error(_('Could not download cover: %s')%str(err))
self.log_debug(traceback.format_exc())
self.log.error(_('Could not download cover: %s')%str(err))
self.log.debug(traceback.format_exc())
if cu is not None:
ext = cu.rpartition('.')[-1]
if '?' in ext:

@@ -841,8 +831,8 @@ class BasicNewsRecipe(object):
f.write(html.encode('utf-8'))
renderer = render_html(hf)
if renderer.tb is not None:
self.logger.warning('Failed to render default cover')
self.logger.debug(renderer.tb)
self.log.warning('Failed to render default cover')
self.log.debug(renderer.tb)
else:
cover_file.write(renderer.data)
cover_file.flush()

@@ -863,7 +853,7 @@ class BasicNewsRecipe(object):
manifest.append(os.path.join(dir, 'index.ncx'))
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = PersistentTemporaryFile('_recipe_cover.jpg')
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
self.default_cover(pf)
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):

@@ -944,7 +934,7 @@ class BasicNewsRecipe(object):
a = request.requestID[1]

article = request.article
self.log_debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
self.log.debug(_('\nDownloaded article %s from %s')%(article.title, article.url))
article.orig_url = article.url
article.url = 'article_%d/index.html'%a
article.downloaded = True

@@ -956,11 +946,11 @@ class BasicNewsRecipe(object):

def error_in_article_download(self, request, traceback):
self.jobs_done += 1
self.log_error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
self.log.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
debug = request.stream.getvalue().decode('utf-8', 'ignore')
self.log_debug(debug)
self.log_debug(traceback)
self.log_debug('\n')
self.log.debug(debug)
self.log.debug(traceback)
self.log.debug('\n')
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
self.failed_downloads.append((request.feed, request.article, debug))

@@ -990,7 +980,7 @@ class BasicNewsRecipe(object):
feed.populate_from_preparsed_feed(msg, [])
feed.description = unicode(err)
parsed_feeds.append(feed)
self.log_exception(msg)
self.log.exception(msg)

return parsed_feeds

@@ -1057,7 +1047,7 @@ class CustomIndexRecipe(BasicNewsRecipe):
index = os.path.abspath(self.custom_index())
url = 'file:'+index if iswindows else 'file://'+index
self.web2disk_options.browser = self.browser
fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
fetcher = RecursiveFetcher(self.web2disk_options, self.log)
fetcher.base_dir = self.output_dir
fetcher.current_dir = self.output_dir
fetcher.show_progress = False

@@ -1069,7 +1059,7 @@ class AutomaticNewsRecipe(BasicNewsRecipe):

keep_only_tags = [dict(name=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]

def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
if self.use_embedded_content:
self.web2disk_options.keep_only_tags = []
return BasicNewsRecipe.fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds)
return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds)
@@ -7,18 +7,19 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
Fetch a webpage and its links recursively. The webpages are saved to disk in
UTF-8 encoding with any charset declarations removed.
'''
import sys, socket, os, urlparse, logging, re, time, copy, urllib2, threading, traceback
import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
from urllib import url2pathname, quote
from threading import RLock
from httplib import responses
from PIL import Image
from cStringIO import StringIO

from calibre import setup_cli_handlers, browser, sanitize_file_name, \
from calibre import browser, sanitize_file_name, \
relpath, unicode_path
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.config import OptionParser
from calibre.utils.logging import Log

class FetchError(Exception):
pass

@@ -28,10 +29,10 @@ class closing(object):

def __init__(self, thing):
self.thing = thing

def __enter__(self):
return self.thing

def __exit__(self, *exc_info):
try:
self.thing.close()

@@ -55,47 +56,48 @@ def save_soup(soup, target):
for meta in metas:
if 'charset' in meta.get('content', '').lower():
meta.replaceWith(nm)

selfdir = os.path.dirname(target)

for tag in soup.findAll(['img', 'link', 'a']):
for key in ('src', 'href'):
path = tag.get(key, None)
if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

html = unicode(soup)
with open(target, 'wb') as f:
f.write(html.encode('utf-8'))

class response(str):

def __new__(cls, *args):
obj = super(response, cls).__new__(cls, *args)
obj.newurl = None
return obj

class DummyLock(object):

def __enter__(self, *args): return self
def __exit__(self, *args): pass

class RecursiveFetcher(object):
LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
#ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
# (
#
#
# )
# )
CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
DUMMY_LOCK = DummyLock()

def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
if not os.path.exists(self.base_dir):
os.makedirs(self.base_dir)
self.log = log
self.default_timeout = socket.getdefaulttimeout()
socket.setdefaulttimeout(options.timeout)
self.verbose = options.verbose

@@ -122,19 +124,19 @@ class RecursiveFetcher(object):
self.remove_tags_after = getattr(options, 'remove_tags_after', None)
self.remove_tags_before = getattr(options, 'remove_tags_before', None)
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
self.download_stylesheets = not options.no_stylesheets
self.show_progress = True
self.failed_links = []
self.job_info = job_info

def get_soup(self, src):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(self.preprocess_regexps)
nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup
soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

if self.keep_only_tags:
body = Tag(soup, 'body')
try:

@@ -146,7 +148,7 @@ class RecursiveFetcher(object):
soup.find('body').replaceWith(body)
except AttributeError: # soup has no body element
pass

def remove_beyond(tag, next):
while tag is not None and tag.name != 'body':
after = getattr(tag, next)

@@ -155,27 +157,27 @@ class RecursiveFetcher(object):
after.extract()
after = ns
tag = tag.parent

if self.remove_tags_after is not None:
rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
for spec in rt:
tag = soup.find(**spec)
remove_beyond(tag, 'nextSibling')

if self.remove_tags_before is not None:
tag = soup.find(**self.remove_tags_before)
remove_beyond(tag, 'previousSibling')

for kwds in self.remove_tags:
for tag in soup.findAll(**kwds):
tag.extract()
return self.preprocess_html_ext(soup)

def fetch_url(self, url):
data = None
self.log_debug('Fetching %s', url)
delta = time.time() - self.last_fetch_at
self.log.debug('Fetching', url)
delta = time.time() - self.last_fetch_at
if delta < self.delay:
time.sleep(delta)
if re.search(r'\s+', url) is not None:

@@ -190,43 +192,43 @@ class RecursiveFetcher(object):
raise FetchError, responses[err.code]
if getattr(err, 'reason', [0])[0] == 104 or \
getattr(getattr(err, 'args', [None])[0], 'errno', None) == -2: # Connection reset by peer or Name or service not know
self.log_debug('Temporary error, retrying in 1 second')
self.log.debug('Temporary error, retrying in 1 second')
time.sleep(1)
with closing(self.browser.open(url)) as f:
data = response(f.read()+f.read())
data.newurl = f.geturl()
else:
else:
raise err
finally:
self.last_fetch_at = time.time()
return data

def start_fetch(self, url):
soup = BeautifulSoup(u'<a href="'+url+'" />')
self.log_info('Downloading')
self.log.debug('Downloading')
res = self.process_links(soup, url, 0, into_dir='')
self.log_info('%s saved to %s', url, res)
self.log.debug('%s saved to %s'%( url, res))
return res

def is_link_ok(self, url):
for i in self.__class__.LINK_FILTER:
if i.search(url):
return False
return True

def is_link_wanted(self, url):
if self.filter_regexps:
for f in self.filter_regexps:
if f.search(url):
return False
return False
if self.match_regexps:
for m in self.match_regexps:
if m.search(url):
return True
return False
return True

def process_stylesheets(self, soup, baseurl):
diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
if not os.path.exists(diskpath):

@@ -243,8 +245,7 @@ class RecursiveFetcher(object):
try:
data = self.fetch_url(iurl)
except Exception, err:
self.log_debug('Could not fetch stylesheet %s', iurl)
self.log_debug('Error: %s', str(err), exc_info=True)
self.log.exception('Could not fetch stylesheet %s'% iurl)
continue
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
with self.stylemap_lock:

@@ -253,7 +254,7 @@ class RecursiveFetcher(object):
x.write(data)
tag['href'] = stylepath
else:
for ns in tag.findAll(text=True):
for ns in tag.findAll(text=True):
src = str(ns)
m = self.__class__.CSS_IMPORT_PATTERN.search(src)
if m:

@@ -267,8 +268,7 @@ class RecursiveFetcher(object):
try:
data = self.fetch_url(iurl)
except Exception, err:
self.log_warning('Could not fetch stylesheet %s', iurl)
self.log_debug('Error: %s', str(err), exc_info=True)
self.log.exception('Could not fetch stylesheet %s'% iurl)
continue
c += 1
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')

@@ -277,9 +277,9 @@ class RecursiveFetcher(object):
with open(stylepath, 'wb') as x:
x.write(data)
ns.replaceWith(src.replace(m.group(1), stylepath))

def process_images(self, soup, baseurl):
diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
if not os.path.exists(diskpath):

@@ -291,9 +291,6 @@ class RecursiveFetcher(object):
iurl = self.image_url_processor(baseurl, iurl)
ext = os.path.splitext(iurl)[1]
ext = ext[:5]
#if not ext:
# self.log_debug('Skipping extensionless image %s', iurl)
# continue
if not urlparse.urlsplit(iurl).scheme:
iurl = urlparse.urljoin(baseurl, iurl, False)
with self.imagemap_lock:

@@ -303,8 +300,7 @@ class RecursiveFetcher(object):
try:
data = self.fetch_url(iurl)
except Exception, err:
self.log_warning('Could not fetch image %s', iurl)
self.log_debug('Error: %s', str(err), exc_info=True)
self.log.exception('Could not fetch image %s'% iurl)
continue
c += 1
fname = sanitize_file_name('img'+str(c)+ext)

@@ -322,7 +318,7 @@ class RecursiveFetcher(object):
traceback.print_exc()
continue

def absurl(self, baseurl, tag, key, filter=True):
def absurl(self, baseurl, tag, key, filter=True):
iurl = tag[key]
parts = urlparse.urlsplit(iurl)
if not parts.netloc and not parts.path:

@@ -330,32 +326,32 @@ class RecursiveFetcher(object):
if not parts.scheme:
iurl = urlparse.urljoin(baseurl, iurl, False)
if not self.is_link_ok(iurl):
self.log_debug('Skipping invalid link: %s', iurl)
self.log.debug('Skipping invalid link:', iurl)
return None
if filter and not self.is_link_wanted(iurl):
self.log_debug('Filtered link: '+iurl)
self.log.debug('Filtered link: '+iurl)
return None
return iurl

def normurl(self, url):
parts = list(urlparse.urlsplit(url))
parts[4] = ''
return urlparse.urlunsplit(parts)

def localize_link(self, tag, key, path):
parts = urlparse.urlsplit(tag[key])
suffix = '#'+parts.fragment if parts.fragment else ''
tag[key] = path+suffix

def process_return_links(self, soup, baseurl):
for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
iurl = self.absurl(baseurl, tag, 'href')
iurl = self.absurl(baseurl, tag, 'href')
if not iurl:
continue
nurl = self.normurl(iurl)
if self.filemap.has_key(nurl):
self.localize_link(tag, 'href', self.filemap[nurl])

def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
res = ''
diskpath = os.path.join(self.current_dir, into_dir)

@@ -365,7 +361,7 @@ class RecursiveFetcher(object):
try:
self.current_dir = diskpath
tags = list(soup.findAll('a', href=True))

for c, tag in enumerate(tags):
if self.show_progress:
print '.',

@@ -395,17 +391,17 @@ class RecursiveFetcher(object):
dsrc = dsrc.decode(self.encoding, 'ignore')
else:
dsrc = xml_to_unicode(dsrc, self.verbose)[0]

soup = self.get_soup(dsrc)

base = soup.find('base', href=True)
if base is not None:
newbaseurl = base['href']
self.log_debug('Processing images...')
self.log.debug('Processing images...')
self.process_images(soup, newbaseurl)
if self.download_stylesheets:
self.process_stylesheets(soup, newbaseurl)

_fname = basename(iurl)
if not isinstance(_fname, unicode):
_fname.decode('latin1', 'replace')

@@ -416,56 +412,55 @@ class RecursiveFetcher(object):
self.downloaded_paths.append(res)
self.filemap[nurl] = res
if recursion_level < self.max_recursions:
self.log_debug('Processing links...')
self.log.debug('Processing links...')
self.process_links(soup, newbaseurl, recursion_level+1)
else:
self.process_return_links(soup, newbaseurl)
self.log_debug('Recursion limit reached. Skipping links in %s', iurl)
self.process_return_links(soup, newbaseurl)
self.log.debug('Recursion limit reached. Skipping links in', iurl)

if callable(self.postprocess_html_ext):
soup = self.postprocess_html_ext(soup,
soup = self.postprocess_html_ext(soup,
c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
self.job_info)

if c==0 and recursion_level == 0:
self.called_first = True

save_soup(soup, res)
self.localize_link(tag, 'href', res)
except Exception, err:
self.failed_links.append((iurl, traceback.format_exc()))
self.log_warning('Could not fetch link %s', iurl)
self.log_debug('Error: %s', str(err), exc_info=True)
self.log.exception('Could not fetch link', iurl)
finally:
self.current_dir = diskpath
self.files += 1
self.files += 1
finally:
self.current_dir = prev_dir
if self.show_progress:
print
return res

def __del__(self):
dt = getattr(self, 'default_timeout', None)
if dt is not None:
socket.setdefaulttimeout(dt)

def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.com')):
parser = OptionParser(usage=usage)
parser.add_option('-d', '--base-dir',
parser.add_option('-d', '--base-dir',
help=_('Base directory into which URL is saved. Default is %default'),
default='.', type='string', dest='dir')
parser.add_option('-t', '--timeout',
parser.add_option('-t', '--timeout',
help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),
default=10.0, type='float', dest='timeout')
parser.add_option('-r', '--max-recursions', default=1,
parser.add_option('-r', '--max-recursions', default=1,
help=_('Maximum number of levels to recurse i.e. depth of links to follow. Default %default'),
type='int', dest='max_recursions')
parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files',
help=_('The maximum number of files to download. This only applies to files from <a href> tags. Default is %default'))
parser.add_option('--delay', default=0, dest='delay', type='int',
help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
parser.add_option('--encoding', default=None,
parser.add_option('--encoding', default=None,
help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))
parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.'))

@@ -478,23 +473,21 @@ def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.c
return parser

def create_fetcher(options, logger=None, image_map={}):
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('web2disk')
setup_cli_handlers(logger, level)
return RecursiveFetcher(options, logger, image_map={})
def create_fetcher(options, image_map={}, log=None):
if log is None:
log = Log()
return RecursiveFetcher(options, log, image_map={})

def main(args=sys.argv):
parser = option_parser()
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
return 1

fetcher = create_fetcher(options)
fetcher.start_fetch(args[1])

if __name__ == '__main__':
fetcher = create_fetcher(options)
fetcher.start_fetch(args[1])

if __name__ == '__main__':
sys.exit(main())
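The final hunks above change create_fetcher() to build a calibre.utils.logging.Log instead of taking a logging logger. A short usage sketch based on the new signature shown in the diff (the URL and argument list are placeholders, not from the commit):

from calibre.web.fetch.simple import option_parser, create_fetcher

parser = option_parser()
# mimic main(): args[0] is the program name, args[1] the URL to fetch
options, args = parser.parse_args(['web2disk', 'http://example.com'])
fetcher = create_fetcher(options)   # builds a Log() internally when log is None
fetcher.start_fetch(args[1])        # saves the page and linked resources under options.dir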