mirror of https://github.com/kovidgoyal/calibre.git

commit 95d1b58ae3 (parent 296853cd43)

Working HTML/OPF input plugin. Also fixed feeds download and removed cover processing from OEBBook
@@ -189,6 +189,7 @@ class ComicMetadataReader(MetadataReaderPlugin):
def get_metadata(self, stream, ftype):
if ftype == 'cbr':
from calibre.libunrar import extract_member as extract_first
extract_first
else:
from calibre.libunzip import extract_member as extract_first
from calibre.ebooks.metadata import MetaInformation

@@ -267,12 +268,14 @@ from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.mobi.input import MOBIInput
from calibre.ebooks.pdf.input import PDFInput
from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.html.input import HTMLInput
from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.pdf.output import PDFOutput
from calibre.customize.profiles import input_profiles, output_profiles

plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, TXTInput, OEBOutput, TXTOutput, PDFOutput]
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
TXTInput, OEBOutput, TXTOutput, PDFOutput]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \

@@ -163,9 +163,9 @@ class InputFormatPlugin(Plugin):
for x in os.listdir('.'):
shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)

ret = self.convert(stream, options, file_ext,
log, accelerators)

if options.debug_input is not None:
options.debug_input = os.path.abspath(options.debug_input)
if not os.path.exists(options.debug_input):

@@ -17,7 +17,7 @@ def tostring(root, strip_comments=False, pretty_print=False):
root.set('xmlns', 'http://www.w3.org/1999/xhtml')
root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink')
for x in root.iter():
if x.tag.rpartition('}')[-1].lower() == 'svg':
if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg':
x.set('xmlns', 'http://www.w3.org/2000/svg')

ans = _tostring(root, encoding='utf-8', pretty_print=pretty_print)
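The hasattr() guard added in the hunk above matters because root.iter() in lxml also yields comments and processing instructions, whose .tag is not a string (it is a callable factory), so calling .rpartition() on it raises AttributeError. A small standalone sketch of the same guard, with a made-up document:

from lxml import etree

root = etree.fromstring('<root><!-- a comment --><svg/></root>')
svgs = []
for x in root.iter():
    # Comments have a callable .tag, not a string; the hasattr() check skips
    # them instead of crashing, which is exactly what the changed line does.
    if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg':
        svgs.append(x)
assert len(svgs) == 1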
@@ -11,14 +11,12 @@ __docformat__ = 'restructuredtext en'
Input plugin for HTML or OPF ebooks.
'''

import os, re, sys, cStringIO
import os, re, sys
from urlparse import urlparse, urlunparse
from urllib import unquote

from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.chardet import xml_to_unicode
from calibre.customize.conversion import OptionRecommendation
from calibre import unicode_path

@@ -213,72 +211,21 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
sys.setrecursionlimit(orec)


def opf_traverse(opf_reader, verbose=0, encoding=None):
'''
Return a list of :class:`HTMLFile` objects in the order specified by the
`<spine>` element of the OPF.

:param opf_reader: An :class:`calibre.ebooks.metadata.opf2.OPF` instance.
:param encoding: Specify character encoding of HTML files. If `None` it is
auto-detected.
'''
if not opf_reader.spine:
raise ValueError('OPF does not have a spine')
flat = []
for path in opf_reader.spine.items():
path = os.path.abspath(path)
if path not in flat:
flat.append(os.path.abspath(path))
for item in opf_reader.manifest:
if 'html' in item.mime_type:
path = os.path.abspath(item.path)
if path not in flat:
flat.append(path)
for i, path in enumerate(flat):
if not os.path.exists(path):
path = path.replace('&', '%26')
if os.path.exists(path):
flat[i] = path
for item in opf_reader.itermanifest():
item.set('href', item.get('href').replace('&', '%26'))
ans = []
for path in flat:
if os.path.exists(path):
ans.append(HTMLFile(path, 0, encoding, verbose))
else:
print 'WARNING: OPF spine item %s does not exist'%path
ans = [f for f in ans if not f.is_binary]
return ans
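A hedged usage sketch of opf_traverse() (the OPF file name is hypothetical): the OPF constructor takes a stream plus the directory used to resolve relative hrefs, matching how it is called elsewhere in this diff, and the returned HTMLFile objects expose .path and .is_binary as used above:

import os
from calibre.ebooks.metadata.opf2 import OPF

opf = OPF(open('book.opf', 'rb'), os.getcwdu())
spine_files = opf_traverse(opf, verbose=1, encoding=None)
for f in spine_files:
    # f.path is the absolute path of one HTML file, in spine order;
    # binary spine items have already been filtered out
    assert not f.is_binary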

def search_for_opf(dir):
for f in os.listdir(dir):
if f.lower().endswith('.opf'):
return OPF(open(os.path.join(dir, f), 'rb'), dir)

def get_filelist(htmlfile, dir, opts, log):
'''
Build list of files referenced by html file or try to detect and use an
OPF file instead.
'''
print 'Building file list...'
opf = search_for_opf(dir)
filelist = None
if opf is not None:
try:
filelist = opf_traverse(opf, verbose=opts.verbose,
encoding=opts.input_encoding)
except:
pass
if not filelist:
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
verbose=opts.verbose,
encoding=opts.input_encoding)\
[0 if opts.breadth_first else 1]
log.info('Building file list...')
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
verbose=opts.verbose,
encoding=opts.input_encoding)\
[0 if opts.breadth_first else 1]
if opts.verbose:
log.debug('\tFound files...')
for f in filelist:
log.debug('\t\t', f)
return opf, filelist
return filelist
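A minimal, hypothetical call of the new get_filelist(): the stand-in options object below only supplies the attributes the function reads (max_levels, breadth_first, verbose, input_encoding), and the Log instance comes from calibre.utils.logging as used elsewhere in this commit; the root HTML file name is made up:

import os
from calibre.utils.logging import Log

class Opts(object):
    max_levels = 5          # how deep to follow links from the root HTML file
    breadth_first = False   # pick the depth-first ordering returned by traverse()
    verbose = 0
    input_encoding = None   # let the encoding be auto-detected

filelist = get_filelist(os.path.abspath('index.html'), os.getcwd(), Opts(), Log())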

class HTMLInput(InputFormatPlugin):

@@ -309,34 +256,32 @@ class HTMLInput(InputFormatPlugin):

def convert(self, stream, opts, file_ext, log,
accelerators):
from calibre.ebooks.metadata.meta import get_metadata

basedir = os.getcwd()

if hasattr(stream, 'name'):
basedir = os.path.dirname(stream.name)
if file_ext == 'opf':
opf = OPF(stream, basedir)
filelist = opf_traverse(opf, verbose=opts.verbose,
encoding=opts.input_encoding)
mi = MetaInformation(opf)
opfpath = stream.name
else:
opf, filelist = get_filelist(stream.name, basedir, opts, log)
mi = MetaInformation(opf)
mi.smart_update(get_metadata(stream, 'html'))
filelist = get_filelist(stream.name, basedir, opts, log)
mi = get_metadata(stream, 'html')
mi = OPFCreator(os.getcwdu(), mi)
mi.guide = None
entries = [(f.path, 'application/xhtml+xml') for f in filelist]
mi.create_manifest(entries)
mi.create_spine([f.path for f in filelist])

mi = OPFCreator(os.getcwdu(), mi)
mi.guide = None
entries = [(f.path, 'application/xhtml+xml') for f in filelist]
mi.create_manifest(entries)
mi.create_spine([f.path for f in filelist])

tocbuf = cStringIO.StringIO()
mi.render(open('metadata.opf', 'wb'), tocbuf, 'toc.ncx')
toc = tocbuf.getvalue()
if toc:
open('toc.ncx', 'wb').write(toc)
mi.render(open('metadata.opf', 'wb'))
opfpath = os.path.abspath('metadata.opf')

from calibre.ebooks.conversion.plumber import create_oebbook
return create_oebbook(log, os.path.abspath('metadata.opf'))


oeb = create_oebbook(log, opfpath)

from calibre.ebooks.oeb.transforms.package import Package
Package(os.getcwdu())(oeb, opts)

return oeb
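The core of the new HTML-input flow, reduced to a sketch with the same calls the diff uses (the placeholder metadata and the single a.html entry are hypothetical): build an OPF describing the discovered files, write it to metadata.opf, then let create_oebbook() parse that OPF back into an OEBBook:

import os
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.conversion.plumber import create_oebbook
from calibre.utils.logging import Log

mi = MetaInformation('Unknown', ['Unknown'])       # placeholder title/authors
opf = OPFCreator(os.getcwdu(), mi)
opf.create_manifest([('a.html', 'application/xhtml+xml')])
opf.create_spine(['a.html'])
opf.render(open('metadata.opf', 'wb'))             # serialize the OPF to disk
oeb = create_oebbook(Log(), os.path.abspath('metadata.opf'))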

@@ -573,7 +573,7 @@ class OEBReader(object):
item = self._find_ncx(opf)
self._toc_from_opf(opf, item)
self._pages_from_opf(opf, item)
self._ensure_cover_image()
#self._ensure_cover_image()


def main(argv=sys.argv):

@@ -6,13 +6,14 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os
import os, re
from urllib import unquote as urlunquote
from functools import partial

from lxml import etree
import cssutils

from calibre import sanitize_file_name
from calibre.constants import islinux
from calibre.ebooks.oeb.base import OEB_DOCS, urlnormalize, urldefrag, \
rewrite_links

@@ -36,15 +37,21 @@ class Package(object):
self.new_base_path = os.path.abspath(base)

def rewrite_links_in(self, item):
base = os.path.join(self.new_base_path, *item.href.split('/'))
old_href = item.old_href.split('#')[0]
new_href = item.href.split('#')[0]
base = os.path.join(self.old_base_path, *old_href.split('/'))
base = os.path.dirname(base)
self.log.debug('\tRewriting links in', base+'/'+
item.href.rpartition('/')[-1])
new_base = os.path.join(self.new_base_path, *new_href.split('/'))
new_base = os.path.dirname(new_base)

if etree.iselement(item.data):
self.rewrite_links_in_xml(item.data, base)
self.rewrite_links_in_xml(item.data, base, new_base)
elif hasattr(item.data, 'cssText'):
self.rewrite_links_in_css(item.data, base)
self.rewrite_links_in_css(item.data, base, new_base)

def link_replacer(self, link_, base=''):
def link_replacer(self, link_, base='', new_base=''):
link = urlnormalize(link_)
link, frag = urldefrag(link)
link = urlunquote(link).replace('/', os.sep)

@@ -55,20 +62,33 @@ class Package(object):
link = link.lower()
if link not in self.map:
return link_
nlink = os.path.relpath(self.map[link], base)
nlink = os.path.relpath(self.map[link], new_base)
if frag:
nlink = '#'.join(nlink, frag)
nlink = '#'.join((nlink, frag))
return nlink.replace(os.sep, '/')
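Why the relative link must now be computed against new_base rather than the old location: self.map (filled in while moving manifest items) appears to hold each item's new absolute path, so the link has to be taken relative to the directory the referring file ends up in after the move. A small illustration of the os.path.relpath step with hypothetical paths:

import os

target   = os.path.join('out', 'content', 'images', 'cover.jpg')   # self.map[link]
new_base = os.path.join('out', 'content')                          # dir of the referring file after the move
rel = os.path.relpath(target, new_base).replace(os.sep, '/')
assert rel == 'images/cover.jpg'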

def rewrite_links_in_css(self, sheet, base):
repl = partial(self.link_replacer, base=base)
def rewrite_links_in_css(self, sheet, base, new_base):
repl = partial(self.link_replacer, base=base, new_base=new_base)
cssutils.replaceUrls(sheet, repl)

def rewrite_links_in_xml(self, root, base):
repl = partial(self.link_replacer, base=base)
def rewrite_links_in_xml(self, root, base, new_base):
repl = partial(self.link_replacer, base=base, new_base=new_base)
rewrite_links(root, repl)

def move_manifest_item(self, item):
def uniqify_name(self, new_href, hrefs):
c = 0
while new_href in hrefs:
c += 1
parts = new_href.split('/')
name, ext = os.path.splitext(parts[-1])
name = re.sub(r'_\d+$', '', name)
name += '_%d'%c
parts[-1] = name + ext
new_href = '/'.join(parts)
return new_href
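A worked example of the uniqify_name() logic added above, with the method body lifted out as a standalone function so its behaviour can be checked in isolation (the hrefs below are hypothetical): when a sanitized href collides, the counter is appended before the extension, and any existing _<n> suffix is stripped first so names do not grow into index_1_2_3.html on repeated collisions:

import os, re

def uniqify_name(new_href, hrefs):
    # same logic as the method above, copied out for illustration
    c = 0
    while new_href in hrefs:
        c += 1
        parts = new_href.split('/')
        name, ext = os.path.splitext(parts[-1])
        name = re.sub(r'_\d+$', '', name)
        name += '_%d'%c
        parts[-1] = name + ext
        new_href = '/'.join(parts)
    return new_href

assert uniqify_name('content/index.html',
        set(['content/index.html', 'content/index_1.html'])) == 'content/index_2.html'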


def move_manifest_item(self, item, hrefs):
item.data # Make sure the data has been loaded and cached
old_abspath = os.path.join(self.old_base_path,
*(urldefrag(item.href)[0].split('/')))

@@ -79,11 +99,17 @@ class Package(object):
new_href = 'content/'
elif item.href.lower().endswith('.ncx'):
new_href = ''
new_href += bname
new_href += sanitize_file_name(bname)

if new_href in hrefs:
new_href = self.uniqify_name(new_href, hrefs)
hrefs.add(new_href)

new_abspath = os.path.join(self.new_base_path, *new_href.split('/'))
new_abspath = os.path.abspath(new_abspath)
item.old_href = self.oeb.manifest.hrefs.pop(item.href).href
item.href = new_href
self.oeb.manifest.hrefs[item.href] = item
if not islinux:
old_abspath, new_abspath = old_abspath.lower(), new_abspath.lower()
if old_abspath != new_abspath:

@@ -91,25 +117,33 @@ class Package(object):

def rewrite_links_in_toc(self, toc):
if toc.href:
toc.href = self.link_replacer(toc.href, base=self.new_base_path)
toc.href = self.link_replacer(toc.href, base=self.old_base_path,
new_base=self.new_base_path)

for x in toc:
self.rewrite_links_in_toc(x)

def __call__(self, oeb, context):
self.map = {}
self.log = self.oeb.log
self.log = oeb.log
self.oeb = oeb
self.old_base_path = os.path.abspath(oeb.container.rootdir)

hrefs = set([])
for item in self.oeb.manifest:
self.move_manifest_item(item)
self.move_manifest_item(item, hrefs)

self.log.debug('Rewriting links in OEB documents...')
for item in self.oeb.manifest:
self.rewrite_links_in(item)

if getattr(oeb.toc, 'nodes', False):
self.log.debug('Rewriting links in TOC...')
self.rewrite_links_in_toc(oeb.toc)

if hasattr(oeb, 'guide'):
self.log.debug('Rewriting links in guide...')
for ref in oeb.guide.values():
ref.href = self.link_replacer(ref.href, base=self.new_base_path)
ref.href = self.link_replacer(ref.href,
base=self.old_base_path,
new_base=self.new_base_path)
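The order of operations in __call__ above is presumably deliberate: every manifest item is moved (and the old-to-new mapping filled in) before any link is rewritten, so references to items that have not been visited yet still resolve. A toy, calibre-free illustration of the same two-pass idea:

# pass 1: decide the new location of every resource first
mapping = {}
for old in ('chapters/one.html', 'images/pic.png'):
    mapping[old] = 'content/' + old.split('/')[-1]

# pass 2: only now rewrite references, so a file can safely point at
# resources whose new names were decided later in pass 1
html = '<a href="chapters/one.html"><img src="images/pic.png"/></a>'
for old, new in mapping.items():
    html = html.replace(old, new)
assert 'content/one.html' in html and 'content/pic.png' in html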

@@ -48,7 +48,8 @@ class OEBWriter(object):
pretty_print=pretty_print)

def __call__(self, oeb, path):
"""Read the book in the :class:`OEBBook` object :param:`oeb` to a file
"""
Read the book in the :class:`OEBBook` object :param:`oeb` to a file
at :param:`path`.
"""
version = int(self.version[0])
@@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
CLI for downloading feeds.
'''

import sys, os, logging
import sys, os
from calibre.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
from calibre.web.fetch.simple import option_parser as _option_parser
from calibre.web.feeds.news import BasicNewsRecipe

@@ -113,7 +113,7 @@ If you specify this option, any argument to %prog is ignored and a default recip
class RecipeError(Exception):
pass

def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
def run_recipe(opts, recipe_arg, parser, notification=None):
if notification is None:
from calibre.utils.terminfo import TerminalController, ProgressBar
term = TerminalController(sys.stdout)

@@ -137,14 +137,6 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
if recipe is None:
raise RecipeError(recipe_arg+ ' is an invalid recipe')


if handler is None:
from calibre import ColoredFormatter
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG if opts.debug else logging.INFO if opts.verbose else logging.WARN)
handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
logging.getLogger('feeds2disk').addHandler(handler)

recipe = recipe(opts, parser, notification)

if not os.path.exists(recipe.output_dir):

@@ -153,7 +145,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):

return recipe

def main(args=sys.argv, notification=None, handler=None):
def main(args=sys.argv, notification=None):
p = option_parser()
opts, args = p.parse_args(args=args[1:])

@@ -161,7 +153,7 @@ def main(args=sys.argv, notification=None, handler=None):
p.print_help()
return 1
recipe_arg = args[0] if len(args) > 0 else None
run_recipe(opts, recipe_arg, p, notification=notification, handler=handler)
run_recipe(opts, recipe_arg, p, notification=notification)

return 0
@@ -7,7 +7,7 @@ Defines various abstract base classes that can be subclassed to create powerful
__docformat__ = "restructuredtext en"


import logging, os, cStringIO, time, traceback, re, urlparse, sys
import os, time, traceback, re, urlparse, sys
from collections import defaultdict
from functools import partial
from contextlib import nested, closing

@@ -27,6 +27,7 @@ from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.web.fetch.simple import option_parser as web2disk_option_parser
from calibre.web.fetch.simple import RecursiveFetcher
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
from calibre.utils.logging import Log
from calibre.ptempfile import PersistentTemporaryFile, \
PersistentTemporaryDirectory

@@ -423,7 +424,7 @@ class BasicNewsRecipe(object):
'''
raise NotImplementedError

def get_obfuscated_article(self, url, logger):
def get_obfuscated_article(self, url):
'''
If you set :member:`articles_are_obfuscated` this method is called with
every article URL. It should return the path to a file on the filesystem

@@ -443,6 +444,7 @@ class BasicNewsRecipe(object):
:param parser: Command line option parser. Used to intelligently merge options.
:param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
'''
self.log = Log()
if not isinstance(self.title, unicode):
self.title = unicode(self.title, 'utf-8', 'replace')

@@ -455,7 +457,6 @@ class BasicNewsRecipe(object):


if self.debug:
logging.getLogger('feeds2disk').setLevel(logging.DEBUG)
self.verbose = True
self.report_progress = progress_reporter

@@ -560,20 +561,20 @@ class BasicNewsRecipe(object):
res = self.build_index()
self.report_progress(1, _('Download finished'))
if self.failed_downloads:
self.log_warning(_('Failed to download the following articles:'))
self.log.warning(_('Failed to download the following articles:'))
for feed, article, debug in self.failed_downloads:
self.log_warning(article.title+_(' from ')+feed.title)
self.log_debug(article.url)
self.log_debug(debug)
self.log.warning(article.title+_(' from ')+feed.title)
self.log.debug(article.url)
self.log.debug(debug)
if self.partial_failures:
self.log_warning(_('Failed to download parts of the following articles:'))
self.log.warning(_('Failed to download parts of the following articles:'))
for feed, atitle, aurl, debug in self.partial_failures:
self.log_warning(atitle + _(' from ') + feed)
self.log_debug(aurl)
self.log_warning(_('\tFailed links:'))
self.log.warning(atitle + _(' from ') + feed)
self.log.debug(aurl)
self.log.warning(_('\tFailed links:'))
for l, tb in debug:
self.log_warning(l)
self.log_debug(tb)
self.log.warning(l)
self.log.debug(tb)
return res
finally:
self.cleanup()
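All of the old log_debug/log_warning helpers are being replaced by the calibre.utils.logging.Log instance the recipe now creates in __init__ (self.log = Log()). As the rewritten calls show, the Log methods take plain positional arguments rather than %-style format strings, e.g. self.log.debug('Fetching', url). A minimal sketch (the exact output formatting is an assumption on my part):

from calibre.utils.logging import Log

log = Log()
log.warning('Failed to download the following articles:')
log.debug('Downloaded article', 'Some title', 'from', 'http://example.com/feed')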

@@ -636,20 +637,11 @@ class BasicNewsRecipe(object):
extra_css=self.extra_css).render(doctype='xhtml')


def create_logger(self, feed_number, article_number):
logger = logging.getLogger('feeds2disk.article_%d_%d'%(feed_number, article_number))
out = cStringIO.StringIO()
handler = logging.StreamHandler(out)
handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
handler.setLevel(logging.INFO if self.verbose else logging.WARNING)
if self.debug:
handler.setLevel(logging.DEBUG)
logger.addHandler(handler)
return logger, out

def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
def _fetch_article(self, url, dir, f, a, num_of_feeds):
self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser
fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
fetcher = RecursiveFetcher(self.web2disk_options, self.log,
self.image_map, self.css_map,
(url, f, a, num_of_feeds))
fetcher.base_dir = dir
fetcher.current_dir = dir
fetcher.show_progress = False

@@ -661,21 +653,21 @@ class BasicNewsRecipe(object):
raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
return res, path, failures

def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
def fetch_article(self, url, dir, f, a, num_of_feeds):
return self._fetch_article(url, dir, f, a, num_of_feeds)

def fetch_obfuscated_article(self, url, dir, logger, f, a, num_of_feeds):
path = os.path.abspath(self.get_obfuscated_article(url, logger))
def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds):
path = os.path.abspath(self.get_obfuscated_article(url))
url = ('file:'+path) if iswindows else ('file://'+path)
return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
return self._fetch_article(url, dir, f, a, num_of_feeds)

def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
templ = templates.EmbeddedContent()
raw = templ.generate(article).render('html')
with PersistentTemporaryFile('_feeds2disk.html') as pt:
pt.write(raw)
url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
return self._fetch_article(url, dir, f, a, num_of_feeds)


def build_index(self):

@@ -716,7 +708,6 @@ class BasicNewsRecipe(object):
art_dir = os.path.join(feed_dir, 'article_%d'%a)
if not os.path.isdir(art_dir):
os.makedirs(art_dir)
logger, stream = self.create_logger(f, a)
try:
url = self.print_version(article.url)
except NotImplementedError:

@@ -726,10 +717,9 @@ class BasicNewsRecipe(object):
func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
((self.fetch_obfuscated_article if self.articles_are_obfuscated \
else self.fetch_article), url)
req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)),
req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
{}, (f, a), self.article_downloaded,
self.error_in_article_download)
req.stream = stream
req.feed = feed
req.article = article
req.feed_dir = feed_dir

@@ -768,8 +758,8 @@ class BasicNewsRecipe(object):
cu = self.get_cover_url()
except Exception, err:
cu = None
self.log_error(_('Could not download cover: %s')%str(err))
self.log_debug(traceback.format_exc())
self.log.error(_('Could not download cover: %s')%str(err))
self.log.debug(traceback.format_exc())
if cu is not None:
ext = cu.rpartition('.')[-1]
if '?' in ext:
@@ -841,8 +831,8 @@ class BasicNewsRecipe(object):
f.write(html.encode('utf-8'))
renderer = render_html(hf)
if renderer.tb is not None:
self.logger.warning('Failed to render default cover')
self.logger.debug(renderer.tb)
self.log.warning('Failed to render default cover')
self.log.debug(renderer.tb)
else:
cover_file.write(renderer.data)
cover_file.flush()

@@ -863,7 +853,7 @@ class BasicNewsRecipe(object):
manifest.append(os.path.join(dir, 'index.ncx'))
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = PersistentTemporaryFile('_recipe_cover.jpg')
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
self.default_cover(pf)
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):

@@ -944,7 +934,7 @@ class BasicNewsRecipe(object):
a = request.requestID[1]

article = request.article
self.log_debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
self.log.debug(_('\nDownloaded article %s from %s')%(article.title, article.url))
article.orig_url = article.url
article.url = 'article_%d/index.html'%a
article.downloaded = True

@@ -956,11 +946,11 @@ class BasicNewsRecipe(object):

def error_in_article_download(self, request, traceback):
self.jobs_done += 1
self.log_error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
self.log.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
debug = request.stream.getvalue().decode('utf-8', 'ignore')
self.log_debug(debug)
self.log_debug(traceback)
self.log_debug('\n')
self.log.debug(debug)
self.log.debug(traceback)
self.log.debug('\n')
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
self.failed_downloads.append((request.feed, request.article, debug))

@@ -990,7 +980,7 @@ class BasicNewsRecipe(object):
feed.populate_from_preparsed_feed(msg, [])
feed.description = unicode(err)
parsed_feeds.append(feed)
self.log_exception(msg)
self.log.exception(msg)


return parsed_feeds

@@ -1057,7 +1047,7 @@ class CustomIndexRecipe(BasicNewsRecipe):
index = os.path.abspath(self.custom_index())
url = 'file:'+index if iswindows else 'file://'+index
self.web2disk_options.browser = self.browser
fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
fetcher = RecursiveFetcher(self.web2disk_options, self.log)
fetcher.base_dir = self.output_dir
fetcher.current_dir = self.output_dir
fetcher.show_progress = False

@@ -1069,7 +1059,7 @@ class AutomaticNewsRecipe(BasicNewsRecipe):

keep_only_tags = [dict(name=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]

def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
if self.use_embedded_content:
self.web2disk_options.keep_only_tags = []
return BasicNewsRecipe.fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds)
return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds)
@@ -7,18 +7,19 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
Fetch a webpage and its links recursively. The webpages are saved to disk in
UTF-8 encoding with any charset declarations removed.
'''
import sys, socket, os, urlparse, logging, re, time, copy, urllib2, threading, traceback
import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
from urllib import url2pathname, quote
from threading import RLock
from httplib import responses
from PIL import Image
from cStringIO import StringIO

from calibre import setup_cli_handlers, browser, sanitize_file_name, \
from calibre import browser, sanitize_file_name, \
relpath, unicode_path
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.config import OptionParser
from calibre.utils.logging import Log

class FetchError(Exception):
pass

@@ -92,10 +93,11 @@ class RecursiveFetcher(object):
default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
DUMMY_LOCK = DummyLock()

def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
if not os.path.exists(self.base_dir):
os.makedirs(self.base_dir)
self.log = log
self.default_timeout = socket.getdefaulttimeout()
socket.setdefaulttimeout(options.timeout)
self.verbose = options.verbose

@@ -174,7 +176,7 @@ class RecursiveFetcher(object):

def fetch_url(self, url):
data = None
self.log_debug('Fetching %s', url)
self.log.debug('Fetching', url)
delta = time.time() - self.last_fetch_at
if delta < self.delay:
time.sleep(delta)

@@ -190,7 +192,7 @@ class RecursiveFetcher(object):
raise FetchError, responses[err.code]
if getattr(err, 'reason', [0])[0] == 104 or \
getattr(getattr(err, 'args', [None])[0], 'errno', None) == -2: # Connection reset by peer or Name or service not know
self.log_debug('Temporary error, retrying in 1 second')
self.log.debug('Temporary error, retrying in 1 second')
time.sleep(1)
with closing(self.browser.open(url)) as f:
data = response(f.read()+f.read())

@@ -204,9 +206,9 @@ class RecursiveFetcher(object):

def start_fetch(self, url):
soup = BeautifulSoup(u'<a href="'+url+'" />')
self.log_info('Downloading')
self.log.debug('Downloading')
res = self.process_links(soup, url, 0, into_dir='')
self.log_info('%s saved to %s', url, res)
self.log.debug('%s saved to %s'%( url, res))
return res

def is_link_ok(self, url):

@@ -243,8 +245,7 @@ class RecursiveFetcher(object):
try:
data = self.fetch_url(iurl)
except Exception, err:
self.log_debug('Could not fetch stylesheet %s', iurl)
self.log_debug('Error: %s', str(err), exc_info=True)
self.log.exception('Could not fetch stylesheet %s'% iurl)
continue
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
with self.stylemap_lock:

@@ -267,8 +268,7 @@ class RecursiveFetcher(object):
try:
data = self.fetch_url(iurl)
except Exception, err:
self.log_warning('Could not fetch stylesheet %s', iurl)
self.log_debug('Error: %s', str(err), exc_info=True)
self.log.exception('Could not fetch stylesheet %s'% iurl)
continue
c += 1
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@@ -291,9 +291,6 @@ class RecursiveFetcher(object):
iurl = self.image_url_processor(baseurl, iurl)
ext = os.path.splitext(iurl)[1]
ext = ext[:5]
#if not ext:
# self.log_debug('Skipping extensionless image %s', iurl)
# continue
if not urlparse.urlsplit(iurl).scheme:
iurl = urlparse.urljoin(baseurl, iurl, False)
with self.imagemap_lock:

@@ -303,8 +300,7 @@ class RecursiveFetcher(object):
try:
data = self.fetch_url(iurl)
except Exception, err:
self.log_warning('Could not fetch image %s', iurl)
self.log_debug('Error: %s', str(err), exc_info=True)
self.log.exception('Could not fetch image %s'% iurl)
continue
c += 1
fname = sanitize_file_name('img'+str(c)+ext)

@@ -330,10 +326,10 @@ class RecursiveFetcher(object):
if not parts.scheme:
iurl = urlparse.urljoin(baseurl, iurl, False)
if not self.is_link_ok(iurl):
self.log_debug('Skipping invalid link: %s', iurl)
self.log.debug('Skipping invalid link:', iurl)
return None
if filter and not self.is_link_wanted(iurl):
self.log_debug('Filtered link: '+iurl)
self.log.debug('Filtered link: '+iurl)
return None
return iurl

@@ -401,7 +397,7 @@ class RecursiveFetcher(object):
base = soup.find('base', href=True)
if base is not None:
newbaseurl = base['href']
self.log_debug('Processing images...')
self.log.debug('Processing images...')
self.process_images(soup, newbaseurl)
if self.download_stylesheets:
self.process_stylesheets(soup, newbaseurl)

@@ -416,11 +412,11 @@ class RecursiveFetcher(object):
self.downloaded_paths.append(res)
self.filemap[nurl] = res
if recursion_level < self.max_recursions:
self.log_debug('Processing links...')
self.log.debug('Processing links...')
self.process_links(soup, newbaseurl, recursion_level+1)
else:
self.process_return_links(soup, newbaseurl)
self.log_debug('Recursion limit reached. Skipping links in %s', iurl)
self.log.debug('Recursion limit reached. Skipping links in', iurl)

if callable(self.postprocess_html_ext):
soup = self.postprocess_html_ext(soup,

@@ -434,8 +430,7 @@ class RecursiveFetcher(object):
self.localize_link(tag, 'href', res)
except Exception, err:
self.failed_links.append((iurl, traceback.format_exc()))
self.log_warning('Could not fetch link %s', iurl)
self.log_debug('Error: %s', str(err), exc_info=True)
self.log.exception('Could not fetch link', iurl)
finally:
self.current_dir = diskpath
self.files += 1

@@ -478,12 +473,10 @@ def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.c
return parser


def create_fetcher(options, logger=None, image_map={}):
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('web2disk')
setup_cli_handlers(logger, level)
return RecursiveFetcher(options, logger, image_map={})
def create_fetcher(options, image_map={}, log=None):
if log is None:
log = Log()
return RecursiveFetcher(options, log, image_map={})

def main(args=sys.argv):
parser = option_parser()