Working HTML/OPF input plugin. Also fixed feeds download and removed cover processing from OEBBook

commit 95d1b58ae3
parent 296853cd43
Author: Kovid Goyal
Date:   2009-04-10 21:12:27 -07:00

10 changed files with 295 additions and 337 deletions

src/calibre/customize/builtins.py

@@ -189,6 +189,7 @@ class ComicMetadataReader(MetadataReaderPlugin):
     def get_metadata(self, stream, ftype):
         if ftype == 'cbr':
             from calibre.libunrar import extract_member as extract_first
+            extract_first
         else:
             from calibre.libunzip import extract_member as extract_first
         from calibre.ebooks.metadata import MetaInformation
@@ -267,12 +268,14 @@ from calibre.ebooks.epub.input import EPUBInput
 from calibre.ebooks.mobi.input import MOBIInput
 from calibre.ebooks.pdf.input import PDFInput
 from calibre.ebooks.txt.input import TXTInput
+from calibre.ebooks.html.input import HTMLInput
 from calibre.ebooks.oeb.output import OEBOutput
 from calibre.ebooks.txt.output import TXTOutput
 from calibre.ebooks.pdf.output import PDFOutput
 from calibre.customize.profiles import input_profiles, output_profiles
-plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, TXTInput, OEBOutput, TXTOutput, PDFOutput]
+plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
+        TXTInput, OEBOutput, TXTOutput, PDFOutput]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
         x.__name__.endswith('MetadataReader')]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \

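Note: the trailing plugins += comprehensions auto-register every class defined
in the module whose name ends with 'MetadataReader' (the second comprehension,
truncated above, scans for another suffix in the same way). A standalone sketch
of the idiom, with hypothetical reader classes:

    class TXTMetadataReader(object):
        pass

    class FB2MetadataReader(object):
        pass

    plugins = []
    # collect every class in this namespace named *MetadataReader
    plugins += [x for x in list(locals().values()) if isinstance(x, type) and
            x.__name__.endswith('MetadataReader')]
    print sorted(x.__name__ for x in plugins)
    # ['FB2MetadataReader', 'TXTMetadataReader']
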
src/calibre/customize/conversion.py

@@ -163,9 +163,9 @@ class InputFormatPlugin(Plugin):
             for x in os.listdir('.'):
                 shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
-                ret = self.convert(stream, options, file_ext,
-                        log, accelerators)
+            ret = self.convert(stream, options, file_ext,
+                    log, accelerators)
         if options.debug_input is not None:
             options.debug_input = os.path.abspath(options.debug_input)
             if not os.path.exists(options.debug_input):

src/calibre/ebooks/oeb/base.py

@@ -17,7 +17,7 @@ def tostring(root, strip_comments=False, pretty_print=False):
     root.set('xmlns', 'http://www.w3.org/1999/xhtml')
     root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink')
     for x in root.iter():
-        if x.tag.rpartition('}')[-1].lower() == 'svg':
+        if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg':
             x.set('xmlns', 'http://www.w3.org/2000/svg')
     ans = _tostring(root, encoding='utf-8', pretty_print=pretty_print)

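Note: the hasattr() guard added above matters because lxml gives comment and
processing-instruction nodes a callable .tag rather than a string, so calling
.rpartition on it raises AttributeError. A minimal reproduction (sketch):

    from lxml import etree

    root = etree.fromstring('<root><!-- a comment --><svg/></root>')
    for x in root.iter():
        # the comment node's .tag is the function etree.Comment, which has
        # no string methods; the guard skips such nodes
        if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg':
            x.set('xmlns', 'http://www.w3.org/2000/svg')
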
src/calibre/ebooks/html/input.py

@@ -11,14 +11,12 @@ __docformat__ = 'restructuredtext en'
 Input plugin for HTML or OPF ebooks.
 '''
 
-import os, re, sys, cStringIO
+import os, re, sys
 from urlparse import urlparse, urlunparse
 from urllib import unquote
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.metadata.meta import get_metadata
-from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
-from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.customize.conversion import OptionRecommendation
 from calibre import unicode_path
@@ -213,72 +211,21 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
         sys.setrecursionlimit(orec)
 
-def opf_traverse(opf_reader, verbose=0, encoding=None):
-    '''
-    Return a list of :class:`HTMLFile` objects in the order specified by the
-    `<spine>` element of the OPF.
-
-    :param opf_reader: An :class:`calibre.ebooks.metadata.opf2.OPF` instance.
-    :param encoding: Specify character encoding of HTML files. If `None` it is
-                     auto-detected.
-    '''
-    if not opf_reader.spine:
-        raise ValueError('OPF does not have a spine')
-    flat = []
-    for path in opf_reader.spine.items():
-        path = os.path.abspath(path)
-        if path not in flat:
-            flat.append(os.path.abspath(path))
-    for item in opf_reader.manifest:
-        if 'html' in item.mime_type:
-            path = os.path.abspath(item.path)
-            if path not in flat:
-                flat.append(path)
-    for i, path in enumerate(flat):
-        if not os.path.exists(path):
-            path = path.replace('&', '%26')
-            if os.path.exists(path):
-                flat[i] = path
-                for item in opf_reader.itermanifest():
-                    item.set('href', item.get('href').replace('&', '%26'))
-    ans = []
-    for path in flat:
-        if os.path.exists(path):
-            ans.append(HTMLFile(path, 0, encoding, verbose))
-        else:
-            print 'WARNING: OPF spine item %s does not exist'%path
-    ans = [f for f in ans if not f.is_binary]
-    return ans
-
-def search_for_opf(dir):
-    for f in os.listdir(dir):
-        if f.lower().endswith('.opf'):
-            return OPF(open(os.path.join(dir, f), 'rb'), dir)
-
 def get_filelist(htmlfile, dir, opts, log):
     '''
     Build list of files referenced by html file or try to detect and use an
     OPF file instead.
     '''
-    print 'Building file list...'
-    opf = search_for_opf(dir)
-    filelist = None
-    if opf is not None:
-        try:
-            filelist = opf_traverse(opf, verbose=opts.verbose,
-                    encoding=opts.input_encoding)
-        except:
-            pass
-    if not filelist:
-        filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
-                        verbose=opts.verbose,
-                        encoding=opts.input_encoding)\
-                    [0 if opts.breadth_first else 1]
+    log.info('Building file list...')
+    filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
+                    verbose=opts.verbose,
+                    encoding=opts.input_encoding)\
+                [0 if opts.breadth_first else 1]
     if opts.verbose:
         log.debug('\tFound files...')
         for f in filelist:
             log.debug('\t\t', f)
-    return opf, filelist
+    return filelist
 
 class HTMLInput(InputFormatPlugin):
@@ -309,34 +256,32 @@ class HTMLInput(InputFormatPlugin):
     def convert(self, stream, opts, file_ext, log,
                 accelerators):
+        from calibre.ebooks.metadata.meta import get_metadata
         basedir = os.getcwd()
         if hasattr(stream, 'name'):
             basedir = os.path.dirname(stream.name)
         if file_ext == 'opf':
-            opf = OPF(stream, basedir)
-            filelist = opf_traverse(opf, verbose=opts.verbose,
-                    encoding=opts.input_encoding)
-            mi = MetaInformation(opf)
+            opfpath = stream.name
         else:
-            opf, filelist = get_filelist(stream.name, basedir, opts, log)
-            mi = MetaInformation(opf)
-        mi.smart_update(get_metadata(stream, 'html'))
-
-        mi = OPFCreator(os.getcwdu(), mi)
-        mi.guide = None
-        entries = [(f.path, 'application/xhtml+xml') for f in filelist]
-        mi.create_manifest(entries)
-        mi.create_spine([f.path for f in filelist])
-
-        tocbuf = cStringIO.StringIO()
-        mi.render(open('metadata.opf', 'wb'), tocbuf, 'toc.ncx')
-        toc = tocbuf.getvalue()
-        if toc:
-            open('toc.ncx', 'wb').write(toc)
+            filelist = get_filelist(stream.name, basedir, opts, log)
+            mi = get_metadata(stream, 'html')
+            mi = OPFCreator(os.getcwdu(), mi)
+            mi.guide = None
+            entries = [(f.path, 'application/xhtml+xml') for f in filelist]
+            mi.create_manifest(entries)
+            mi.create_spine([f.path for f in filelist])
+            mi.render(open('metadata.opf', 'wb'))
+            opfpath = os.path.abspath('metadata.opf')
 
         from calibre.ebooks.conversion.plumber import create_oebbook
-        return create_oebbook(log, os.path.abspath('metadata.opf'))
+        oeb = create_oebbook(log, opfpath)
+        from calibre.ebooks.oeb.transforms.package import Package
+        Package(os.getcwdu())(oeb, opts)
+        return oeb

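Note: for bare HTML input the plugin now synthesizes an OPF itself instead of
hunting for one with search_for_opf()/opf_traverse(). A sketch of the
OPFCreator calls involved, with hypothetical file names (the real list comes
from traverse()):

    import os
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ebooks.metadata.opf2 import OPFCreator

    mi = MetaInformation('Unknown', ['Unknown'])  # placeholder metadata
    paths = ['index.html', 'chapter1.html']       # hypothetical input files
    mi = OPFCreator(os.getcwdu(), mi)
    mi.guide = None
    mi.create_manifest([(p, 'application/xhtml+xml') for p in paths])
    mi.create_spine(paths)
    mi.render(open('metadata.opf', 'wb'))
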
src/calibre/ebooks/oeb/reader.py

@@ -573,7 +573,7 @@ class OEBReader(object):
         item = self._find_ncx(opf)
         self._toc_from_opf(opf, item)
         self._pages_from_opf(opf, item)
-        self._ensure_cover_image()
+        #self._ensure_cover_image()
 
 def main(argv=sys.argv):

src/calibre/ebooks/oeb/transforms/package.py

@@ -6,13 +6,14 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import os
+import os, re
 from urllib import unquote as urlunquote
 from functools import partial
 
 from lxml import etree
 import cssutils
 
+from calibre import sanitize_file_name
 from calibre.constants import islinux
 from calibre.ebooks.oeb.base import OEB_DOCS, urlnormalize, urldefrag, \
     rewrite_links
@@ -36,15 +37,21 @@ class Package(object):
         self.new_base_path = os.path.abspath(base)
 
     def rewrite_links_in(self, item):
-        base = os.path.join(self.new_base_path, *item.href.split('/'))
+        old_href = item.old_href.split('#')[0]
+        new_href = item.href.split('#')[0]
+        base = os.path.join(self.old_base_path, *old_href.split('/'))
         base = os.path.dirname(base)
+        self.log.debug('\tRewriting links in', base+'/'+
+                item.href.rpartition('/')[-1])
+        new_base = os.path.join(self.new_base_path, *new_href.split('/'))
+        new_base = os.path.dirname(new_base)
         if etree.iselement(item.data):
-            self.rewrite_links_in_xml(item.data, base)
+            self.rewrite_links_in_xml(item.data, base, new_base)
         elif hasattr(item.data, 'cssText'):
-            self.rewrite_links_in_css(item.data, base)
+            self.rewrite_links_in_css(item.data, base, new_base)
 
-    def link_replacer(self, link_, base=''):
+    def link_replacer(self, link_, base='', new_base=''):
         link = urlnormalize(link_)
         link, frag = urldefrag(link)
         link = urlunquote(link).replace('/', os.sep)
@@ -55,20 +62,33 @@ class Package(object):
             link = link.lower()
         if link not in self.map:
             return link_
-        nlink = os.path.relpath(self.map[link], base)
+        nlink = os.path.relpath(self.map[link], new_base)
         if frag:
-            nlink = '#'.join(nlink, frag)
+            nlink = '#'.join((nlink, frag))
         return nlink.replace(os.sep, '/')
 
-    def rewrite_links_in_css(self, sheet, base):
-        repl = partial(self.link_replacer, base=base)
+    def rewrite_links_in_css(self, sheet, base, new_base):
+        repl = partial(self.link_replacer, base=base, new_base=new_base)
         cssutils.replaceUrls(sheet, repl)
 
-    def rewrite_links_in_xml(self, root, base):
-        repl = partial(self.link_replacer, base=base)
+    def rewrite_links_in_xml(self, root, base, new_base):
+        repl = partial(self.link_replacer, base=base, new_base=new_base)
         rewrite_links(root, repl)
 
-    def move_manifest_item(self, item):
+    def uniqify_name(self, new_href, hrefs):
+        c = 0
+        while new_href in hrefs:
+            c += 1
+            parts = new_href.split('/')
+            name, ext = os.path.splitext(parts[-1])
+            name = re.sub(r'_\d+$', '', name)
+            name += '_%d'%c
+            parts[-1] = name + ext
+            new_href = '/'.join(parts)
+        return new_href
+
+    def move_manifest_item(self, item, hrefs):
         item.data # Make sure the data has been loaded and cached
         old_abspath = os.path.join(self.old_base_path,
                 *(urldefrag(item.href)[0].split('/')))
@@ -79,11 +99,17 @@ class Package(object):
             new_href = 'content/'
         elif item.href.lower().endswith('.ncx'):
             new_href = ''
-        new_href += bname
+        new_href += sanitize_file_name(bname)
+        if new_href in hrefs:
+            new_href = self.uniqify_name(new_href, hrefs)
+        hrefs.add(new_href)
 
         new_abspath = os.path.join(self.new_base_path, *new_href.split('/'))
         new_abspath = os.path.abspath(new_abspath)
+        item.old_href = self.oeb.manifest.hrefs.pop(item.href).href
         item.href = new_href
+        self.oeb.manifest.hrefs[item.href] = item
         if not islinux:
             old_abspath, new_abspath = old_abspath.lower(), new_abspath.lower()
         if old_abspath != new_abspath:
@@ -91,25 +117,33 @@ class Package(object):
     def rewrite_links_in_toc(self, toc):
         if toc.href:
-            toc.href = self.link_replacer(toc.href, base=self.new_base_path)
+            toc.href = self.link_replacer(toc.href, base=self.old_base_path,
+                    new_base=self.new_base_path)
 
         for x in toc:
             self.rewrite_links_in_toc(x)
 
     def __call__(self, oeb, context):
         self.map = {}
-        self.log = self.oeb.log
+        self.log = oeb.log
+        self.oeb = oeb
         self.old_base_path = os.path.abspath(oeb.container.rootdir)
+        hrefs = set([])
         for item in self.oeb.manifest:
-            self.move_manifest_item(item)
+            self.move_manifest_item(item, hrefs)
+        self.log.debug('Rewriting links in OEB documents...')
         for item in self.oeb.manifest:
             self.rewrite_links_in(item)
 
         if getattr(oeb.toc, 'nodes', False):
+            self.log.debug('Rewriting links in TOC...')
             self.rewrite_links_in_toc(oeb.toc)
 
         if hasattr(oeb, 'guide'):
+            self.log.debug('Rewriting links in guide...')
             for ref in oeb.guide.values():
-                ref.href = self.link_replacer(ref.href, base=self.new_base_path)
+                ref.href = self.link_replacer(ref.href,
+                        base=self.old_base_path,
+                        new_base=self.new_base_path)

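Note: uniqify_name() strips any existing _N suffix before appending the next
counter, so repeated collisions replace the number instead of stacking
suffixes. The same logic as a standalone sketch:

    import os, re

    def uniqify_name(new_href, hrefs):
        c = 0
        while new_href in hrefs:
            c += 1
            parts = new_href.split('/')
            name, ext = os.path.splitext(parts[-1])
            name = re.sub(r'_\d+$', '', name)  # drop a previous _N suffix
            name += '_%d'%c
            parts[-1] = name + ext
            new_href = '/'.join(parts)
        return new_href

    hrefs = set(['content/index.html', 'content/index_1.html'])
    print uniqify_name('content/index.html', hrefs)  # content/index_2.html
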
src/calibre/ebooks/oeb/writer.py

@@ -48,7 +48,8 @@ class OEBWriter(object):
             pretty_print=pretty_print)
 
     def __call__(self, oeb, path):
-        """Read the book in the :class:`OEBBook` object :param:`oeb` to a file
+        """
+        Read the book in the :class:`OEBBook` object :param:`oeb` to a file
         at :param:`path`.
         """
         version = int(self.version[0])

src/calibre/web/feeds/main.py

@@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 CLI for downloading feeds.
 '''
-import sys, os, logging
+import sys, os
 from calibre.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
 from calibre.web.fetch.simple import option_parser as _option_parser
 from calibre.web.feeds.news import BasicNewsRecipe
@@ -113,7 +113,7 @@ If you specify this option, any argument to %prog is ignored and a default recip
 class RecipeError(Exception):
     pass
 
-def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
+def run_recipe(opts, recipe_arg, parser, notification=None):
     if notification is None:
         from calibre.utils.terminfo import TerminalController, ProgressBar
         term = TerminalController(sys.stdout)
@@ -137,14 +137,6 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
     if recipe is None:
         raise RecipeError(recipe_arg+ ' is an invalid recipe')
 
-    if handler is None:
-        from calibre import ColoredFormatter
-        handler = logging.StreamHandler(sys.stdout)
-        handler.setLevel(logging.DEBUG if opts.debug else logging.INFO if opts.verbose else logging.WARN)
-        handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is needed because of the progress bar
-        logging.getLogger('feeds2disk').addHandler(handler)
-
     recipe = recipe(opts, parser, notification)
     if not os.path.exists(recipe.output_dir):
@@ -153,7 +145,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
     return recipe
 
-def main(args=sys.argv, notification=None, handler=None):
+def main(args=sys.argv, notification=None):
     p = option_parser()
     opts, args = p.parse_args(args=args[1:])
@@ -161,7 +153,7 @@ def main(args=sys.argv, notification=None, handler=None):
         p.print_help()
         return 1
     recipe_arg = args[0] if len(args) > 0 else None
-    run_recipe(opts, recipe_arg, p, notification=notification, handler=handler)
+    run_recipe(opts, recipe_arg, p, notification=notification)
 
     return 0

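Note: with the handler plumbing gone, callers of main()/run_recipe() pass at
most a progress notification callable. A hypothetical invocation (the recipe
argument stands in for whatever title or recipe file you would pass on the
command line):

    import sys
    from calibre.web.feeds.main import main

    # equivalent to running: feeds2disk "Some Recipe"
    sys.exit(main(['feeds2disk', 'Some Recipe']))
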
src/calibre/web/feeds/news.py

@@ -7,7 +7,7 @@ Defines various abstract base classes that can be subclassed to create powerful
 __docformat__ = "restructuredtext en"
 
-import logging, os, cStringIO, time, traceback, re, urlparse, sys
+import os, time, traceback, re, urlparse, sys
 from collections import defaultdict
 from functools import partial
 from contextlib import nested, closing
@@ -27,6 +27,7 @@ from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
 from calibre.web.fetch.simple import option_parser as web2disk_option_parser
 from calibre.web.fetch.simple import RecursiveFetcher
 from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
+from calibre.utils.logging import Log
 from calibre.ptempfile import PersistentTemporaryFile, \
     PersistentTemporaryDirectory
@@ -423,7 +424,7 @@ class BasicNewsRecipe(object):
         '''
         raise NotImplementedError
 
-    def get_obfuscated_article(self, url, logger):
+    def get_obfuscated_article(self, url):
         '''
         If you set :member:`articles_are_obfuscated` this method is called with
         every article URL. It should return the path to a file on the filesystem
@@ -443,6 +444,7 @@ class BasicNewsRecipe(object):
         :param parser: Command line option parser. Used to intelligently merge options.
         :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
         '''
+        self.log = Log()
         if not isinstance(self.title, unicode):
             self.title = unicode(self.title, 'utf-8', 'replace')
@@ -455,7 +457,6 @@ class BasicNewsRecipe(object):
         if self.debug:
-            logging.getLogger('feeds2disk').setLevel(logging.DEBUG)
             self.verbose = True
         self.report_progress = progress_reporter
@@ -560,20 +561,20 @@ class BasicNewsRecipe(object):
             res = self.build_index()
             self.report_progress(1, _('Download finished'))
             if self.failed_downloads:
-                self.log_warning(_('Failed to download the following articles:'))
+                self.log.warning(_('Failed to download the following articles:'))
                 for feed, article, debug in self.failed_downloads:
-                    self.log_warning(article.title+_(' from ')+feed.title)
-                    self.log_debug(article.url)
-                    self.log_debug(debug)
+                    self.log.warning(article.title+_(' from ')+feed.title)
+                    self.log.debug(article.url)
+                    self.log.debug(debug)
             if self.partial_failures:
-                self.log_warning(_('Failed to download parts of the following articles:'))
+                self.log.warning(_('Failed to download parts of the following articles:'))
                 for feed, atitle, aurl, debug in self.partial_failures:
-                    self.log_warning(atitle + _(' from ') + feed)
-                    self.log_debug(aurl)
-                    self.log_warning(_('\tFailed links:'))
+                    self.log.warning(atitle + _(' from ') + feed)
+                    self.log.debug(aurl)
+                    self.log.warning(_('\tFailed links:'))
                     for l, tb in debug:
-                        self.log_warning(l)
-                        self.log_debug(tb)
+                        self.log.warning(l)
+                        self.log.debug(tb)
             return res
         finally:
             self.cleanup()
@@ -636,20 +637,11 @@ class BasicNewsRecipe(object):
                 extra_css=self.extra_css).render(doctype='xhtml')
 
-    def create_logger(self, feed_number, article_number):
-        logger = logging.getLogger('feeds2disk.article_%d_%d'%(feed_number, article_number))
-        out = cStringIO.StringIO()
-        handler = logging.StreamHandler(out)
-        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
-        handler.setLevel(logging.INFO if self.verbose else logging.WARNING)
-        if self.debug:
-            handler.setLevel(logging.DEBUG)
-        logger.addHandler(handler)
-        return logger, out
-
-    def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
+    def _fetch_article(self, url, dir, f, a, num_of_feeds):
         self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser
-        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
+        fetcher = RecursiveFetcher(self.web2disk_options, self.log,
+                self.image_map, self.css_map,
+                (url, f, a, num_of_feeds))
         fetcher.base_dir = dir
         fetcher.current_dir = dir
         fetcher.show_progress = False
@@ -661,21 +653,21 @@ class BasicNewsRecipe(object):
             raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
         return res, path, failures
 
-    def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
-        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
+    def fetch_article(self, url, dir, f, a, num_of_feeds):
+        return self._fetch_article(url, dir, f, a, num_of_feeds)
 
-    def fetch_obfuscated_article(self, url, dir, logger, f, a, num_of_feeds):
-        path = os.path.abspath(self.get_obfuscated_article(url, logger))
+    def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds):
+        path = os.path.abspath(self.get_obfuscated_article(url))
         url = ('file:'+path) if iswindows else ('file://'+path)
-        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
+        return self._fetch_article(url, dir, f, a, num_of_feeds)
 
-    def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
+    def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
         templ = templates.EmbeddedContent()
         raw = templ.generate(article).render('html')
         with PersistentTemporaryFile('_feeds2disk.html') as pt:
             pt.write(raw)
             url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
-        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
+        return self._fetch_article(url, dir, f, a, num_of_feeds)
 
     def build_index(self):
@@ -716,7 +708,6 @@ class BasicNewsRecipe(object):
                 art_dir = os.path.join(feed_dir, 'article_%d'%a)
                 if not os.path.isdir(art_dir):
                     os.makedirs(art_dir)
-                logger, stream = self.create_logger(f, a)
                 try:
                     url = self.print_version(article.url)
                 except NotImplementedError:
@@ -726,10 +717,9 @@ class BasicNewsRecipe(object):
                 func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
                             ((self.fetch_obfuscated_article if self.articles_are_obfuscated \
                                 else self.fetch_article), url)
-                req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)),
+                req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
                                   {}, (f, a), self.article_downloaded,
                                   self.error_in_article_download)
-                req.stream = stream
                 req.feed = feed
                 req.article = article
                 req.feed_dir = feed_dir
@@ -768,8 +758,8 @@ class BasicNewsRecipe(object):
                 cu = self.get_cover_url()
             except Exception, err:
                 cu = None
-                self.log_error(_('Could not download cover: %s')%str(err))
-                self.log_debug(traceback.format_exc())
+                self.log.error(_('Could not download cover: %s')%str(err))
+                self.log.debug(traceback.format_exc())
             if cu is not None:
                 ext = cu.rpartition('.')[-1]
                 if '?' in ext:
@@ -841,8 +831,8 @@ class BasicNewsRecipe(object):
             f.write(html.encode('utf-8'))
         renderer = render_html(hf)
         if renderer.tb is not None:
-            self.logger.warning('Failed to render default cover')
-            self.logger.debug(renderer.tb)
+            self.log.warning('Failed to render default cover')
+            self.log.debug(renderer.tb)
         else:
             cover_file.write(renderer.data)
             cover_file.flush()
@@ -863,7 +853,7 @@ class BasicNewsRecipe(object):
             manifest.append(os.path.join(dir, 'index.ncx'))
             cpath = getattr(self, 'cover_path', None)
             if cpath is None:
-                pf = PersistentTemporaryFile('_recipe_cover.jpg')
+                pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
                 self.default_cover(pf)
                 cpath = pf.name
             if cpath is not None and os.access(cpath, os.R_OK):
@@ -944,7 +934,7 @@ class BasicNewsRecipe(object):
         a = request.requestID[1]
 
         article = request.article
-        self.log_debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
+        self.log.debug(_('\nDownloaded article %s from %s')%(article.title, article.url))
         article.orig_url = article.url
         article.url = 'article_%d/index.html'%a
         article.downloaded = True
@@ -956,11 +946,11 @@ class BasicNewsRecipe(object):
     def error_in_article_download(self, request, traceback):
         self.jobs_done += 1
-        self.log_error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
+        self.log.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
         debug = request.stream.getvalue().decode('utf-8', 'ignore')
-        self.log_debug(debug)
-        self.log_debug(traceback)
-        self.log_debug('\n')
+        self.log.debug(debug)
+        self.log.debug(traceback)
+        self.log.debug('\n')
         self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
         self.failed_downloads.append((request.feed, request.article, debug))
@@ -990,7 +980,7 @@ class BasicNewsRecipe(object):
                 feed.populate_from_preparsed_feed(msg, [])
                 feed.description = unicode(err)
                 parsed_feeds.append(feed)
-                self.log_exception(msg)
+                self.log.exception(msg)
 
         return parsed_feeds
@@ -1057,7 +1047,7 @@ class CustomIndexRecipe(BasicNewsRecipe):
         index = os.path.abspath(self.custom_index())
         url = 'file:'+index if iswindows else 'file://'+index
         self.web2disk_options.browser = self.browser
-        fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
+        fetcher = RecursiveFetcher(self.web2disk_options, self.log)
         fetcher.base_dir = self.output_dir
         fetcher.current_dir = self.output_dir
         fetcher.show_progress = False
@@ -1069,7 +1059,7 @@ class AutomaticNewsRecipe(BasicNewsRecipe):
     keep_only_tags = [dict(name=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
 
-    def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
+    def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
         if self.use_embedded_content:
             self.web2disk_options.keep_only_tags = []
-        return BasicNewsRecipe.fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds)
+        return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds)

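Note: the per-article logging.Logger/cStringIO pairs are replaced by a single
calibre.utils.logging.Log shared by the recipe and its fetchers. As the diff
shows, its methods accept print-style positional arguments. A sketch:

    from calibre.utils.logging import Log

    log = Log()
    log.debug('Fetching', 'http://example.com')  # multiple args, like print
    log.warning('Failed to download the following articles:')
    log.error('Could not download cover: %s'%'timeout')
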
src/calibre/web/fetch/simple.py

@@ -7,18 +7,19 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 Fetch a webpage and its links recursively. The webpages are saved to disk in
 UTF-8 encoding with any charset declarations removed.
 '''
-import sys, socket, os, urlparse, logging, re, time, copy, urllib2, threading, traceback
+import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
 from urllib import url2pathname, quote
 from threading import RLock
 from httplib import responses
 from PIL import Image
 from cStringIO import StringIO
 
-from calibre import setup_cli_handlers, browser, sanitize_file_name, \
+from calibre import browser, sanitize_file_name, \
                     relpath, unicode_path
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.config import OptionParser
+from calibre.utils.logging import Log
 
 class FetchError(Exception):
     pass
@@ -92,10 +93,11 @@ class RecursiveFetcher(object):
     default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
     DUMMY_LOCK = DummyLock()
 
-    def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
+    def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
         if not os.path.exists(self.base_dir):
             os.makedirs(self.base_dir)
+        self.log = log
         self.default_timeout = socket.getdefaulttimeout()
         socket.setdefaulttimeout(options.timeout)
         self.verbose = options.verbose
@@ -174,7 +176,7 @@ class RecursiveFetcher(object):
     def fetch_url(self, url):
         data = None
-        self.log_debug('Fetching %s', url)
+        self.log.debug('Fetching', url)
         delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(delta)
@@ -190,7 +192,7 @@ class RecursiveFetcher(object):
                 raise FetchError, responses[err.code]
             if getattr(err, 'reason', [0])[0] == 104 or \
                 getattr(getattr(err, 'args', [None])[0], 'errno', None) == -2: # Connection reset by peer or Name or service not known
-                self.log_debug('Temporary error, retrying in 1 second')
+                self.log.debug('Temporary error, retrying in 1 second')
                 time.sleep(1)
                 with closing(self.browser.open(url)) as f:
                     data = response(f.read()+f.read())
@@ -204,9 +206,9 @@ class RecursiveFetcher(object):
     def start_fetch(self, url):
         soup = BeautifulSoup(u'<a href="'+url+'" />')
-        self.log_info('Downloading')
+        self.log.debug('Downloading')
         res = self.process_links(soup, url, 0, into_dir='')
-        self.log_info('%s saved to %s', url, res)
+        self.log.debug('%s saved to %s'%( url, res))
         return res
 
     def is_link_ok(self, url):
@@ -243,8 +245,7 @@ class RecursiveFetcher(object):
                     try:
                         data = self.fetch_url(iurl)
                     except Exception, err:
-                        self.log_debug('Could not fetch stylesheet %s', iurl)
-                        self.log_debug('Error: %s', str(err), exc_info=True)
+                        self.log.exception('Could not fetch stylesheet %s'% iurl)
                         continue
                     stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                     with self.stylemap_lock:
@@ -267,8 +268,7 @@ class RecursiveFetcher(object):
                         try:
                             data = self.fetch_url(iurl)
                         except Exception, err:
-                            self.log_warning('Could not fetch stylesheet %s', iurl)
-                            self.log_debug('Error: %s', str(err), exc_info=True)
+                            self.log.exception('Could not fetch stylesheet %s'% iurl)
                             continue
                         c += 1
                         stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@@ -291,9 +291,6 @@ class RecursiveFetcher(object):
                 iurl = self.image_url_processor(baseurl, iurl)
             ext = os.path.splitext(iurl)[1]
             ext = ext[:5]
-            #if not ext:
-            #    self.log_debug('Skipping extensionless image %s', iurl)
-            #    continue
             if not urlparse.urlsplit(iurl).scheme:
                 iurl = urlparse.urljoin(baseurl, iurl, False)
             with self.imagemap_lock:
@@ -303,8 +300,7 @@ class RecursiveFetcher(object):
             try:
                 data = self.fetch_url(iurl)
             except Exception, err:
-                self.log_warning('Could not fetch image %s', iurl)
-                self.log_debug('Error: %s', str(err), exc_info=True)
+                self.log.exception('Could not fetch image %s'% iurl)
                 continue
             c += 1
             fname = sanitize_file_name('img'+str(c)+ext)
@@ -330,10 +326,10 @@ class RecursiveFetcher(object):
         if not parts.scheme:
             iurl = urlparse.urljoin(baseurl, iurl, False)
         if not self.is_link_ok(iurl):
-            self.log_debug('Skipping invalid link: %s', iurl)
+            self.log.debug('Skipping invalid link:', iurl)
             return None
         if filter and not self.is_link_wanted(iurl):
-            self.log_debug('Filtered link: '+iurl)
+            self.log.debug('Filtered link: '+iurl)
             return None
         return iurl
@@ -401,7 +397,7 @@ class RecursiveFetcher(object):
             base = soup.find('base', href=True)
             if base is not None:
                 newbaseurl = base['href']
-            self.log_debug('Processing images...')
+            self.log.debug('Processing images...')
             self.process_images(soup, newbaseurl)
             if self.download_stylesheets:
                 self.process_stylesheets(soup, newbaseurl)
@@ -416,11 +412,11 @@ class RecursiveFetcher(object):
                         self.downloaded_paths.append(res)
                         self.filemap[nurl] = res
                         if recursion_level < self.max_recursions:
-                            self.log_debug('Processing links...')
+                            self.log.debug('Processing links...')
                             self.process_links(soup, newbaseurl, recursion_level+1)
                         else:
                             self.process_return_links(soup, newbaseurl)
-                            self.log_debug('Recursion limit reached. Skipping links in %s', iurl)
+                            self.log.debug('Recursion limit reached. Skipping links in', iurl)
 
                     if callable(self.postprocess_html_ext):
                         soup = self.postprocess_html_ext(soup,
@@ -434,8 +430,7 @@ class RecursiveFetcher(object):
                         self.localize_link(tag, 'href', res)
                 except Exception, err:
                     self.failed_links.append((iurl, traceback.format_exc()))
-                    self.log_warning('Could not fetch link %s', iurl)
-                    self.log_debug('Error: %s', str(err), exc_info=True)
+                    self.log.exception('Could not fetch link', iurl)
                 finally:
                     self.current_dir = diskpath
                     self.files += 1
@@ -478,12 +473,10 @@ def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.c
     return parser
 
-def create_fetcher(options, logger=None, image_map={}):
-    if logger is None:
-        level = logging.DEBUG if options.verbose else logging.INFO
-        logger = logging.getLogger('web2disk')
-        setup_cli_handlers(logger, level)
-    return RecursiveFetcher(options, logger, image_map={})
+def create_fetcher(options, image_map={}, log=None):
+    if log is None:
+        log = Log()
+    return RecursiveFetcher(options, log, image_map={})
 
 def main(args=sys.argv):
     parser = option_parser()
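
Note: create_fetcher() now defaults to a fresh Log() instead of wiring up a
logging handler via setup_cli_handlers(). A hypothetical use:

    from calibre.web.fetch.simple import option_parser, create_fetcher

    parser = option_parser()
    opts, args = parser.parse_args(['http://example.com'])  # stands in for sys.argv[1:]
    fetcher = create_fetcher(opts)  # log=None, so a Log() is created internally
    fetcher.start_fetch('http://example.com')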