Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-07 18:24:30 -04:00)

commit 95d1b58ae3 (parent 296853cd43)

    Working HTML/OPF input plugin. Also fixed feeds download and removed cover
    processing from OEBBook.
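
For orientation, here is a condensed sketch of the convert() flow that the new HTML/OPF input plugin introduces, pieced together from the hunks below; indentation and the surrounding class body are reconstructed rather than quoted verbatim:

    # Sketch of HTMLInput.convert() as introduced by this commit (reconstructed from the diff).
    def convert(self, stream, opts, file_ext, log, accelerators):
        from calibre.ebooks.metadata.meta import get_metadata
        basedir = os.getcwd()
        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
        if file_ext == 'opf':
            # An OPF file was passed in directly: use it as the book manifest.
            opfpath = stream.name
        else:
            # Follow links from the HTML file to build the file list, then
            # synthesise an OPF (manifest + spine) for those files.
            filelist = get_filelist(stream.name, basedir, opts, log)
            mi = get_metadata(stream, 'html')
            mi = OPFCreator(os.getcwdu(), mi)
            mi.guide = None
            entries = [(f.path, 'application/xhtml+xml') for f in filelist]
            mi.create_manifest(entries)
            mi.create_spine([f.path for f in filelist])
            mi.render(open('metadata.opf', 'wb'))
            opfpath = os.path.abspath('metadata.opf')
        # Build the OEB book from the OPF, then normalise file layout and links.
        from calibre.ebooks.conversion.plumber import create_oebbook
        oeb = create_oebbook(log, opfpath)
        from calibre.ebooks.oeb.transforms.package import Package
        Package(os.getcwdu())(oeb, opts)
        return oeb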
@@ -18,7 +18,7 @@ every time you add an HTML file to the library.\
    file_types = set(['html', 'htm', 'xhtml', 'xhtm'])
    supported_platforms = ['windows', 'osx', 'linux']
    on_import = True

    def run(self, htmlfile):
        of = self.temporary_file('_plugin_html2zip.zip')
        from calibre.ebooks.html import gui_main as html2oeb
@@ -26,172 +26,173 @@ every time you add an HTML file to the library.\
        return of.name

class OPFMetadataReader(MetadataReaderPlugin):

    name = 'Read OPF metadata'
    file_types = set(['opf'])
    description = _('Read metadata from %s files')%'OPF'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.opf2 import OPF
        from calibre.ebooks.metadata import MetaInformation
        return MetaInformation(OPF(stream, os.getcwd()))

class RTFMetadataReader(MetadataReaderPlugin):

    name = 'Read RTF metadata'
    file_types = set(['rtf'])
    description = _('Read metadata from %s files')%'RTF'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.rtf import get_metadata
        return get_metadata(stream)

class FB2MetadataReader(MetadataReaderPlugin):

    name = 'Read FB2 metadata'
    file_types = set(['fb2'])
    description = _('Read metadata from %s files')%'FB2'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.fb2 import get_metadata
        return get_metadata(stream)


class LRFMetadataReader(MetadataReaderPlugin):

    name = 'Read LRF metadata'
    file_types = set(['lrf'])
    description = _('Read metadata from %s files')%'LRF'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.lrf.meta import get_metadata
        return get_metadata(stream)

class PDFMetadataReader(MetadataReaderPlugin):

    name = 'Read PDF metadata'
    file_types = set(['pdf'])
    description = _('Read metadata from %s files')%'PDF'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.pdf import get_metadata
        return get_metadata(stream)

class LITMetadataReader(MetadataReaderPlugin):

    name = 'Read LIT metadata'
    file_types = set(['lit'])
    description = _('Read metadata from %s files')%'LIT'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.lit import get_metadata
        return get_metadata(stream)

class IMPMetadataReader(MetadataReaderPlugin):

    name = 'Read IMP metadata'
    file_types = set(['imp'])
    description = _('Read metadata from %s files')%'IMP'
    author = 'Ashish Kulkarni'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.imp import get_metadata
        return get_metadata(stream)

class RBMetadataReader(MetadataReaderPlugin):

    name = 'Read RB metadata'
    file_types = set(['rb'])
    description = _('Read metadata from %s files')%'RB'
    author = 'Ashish Kulkarni'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.rb import get_metadata
        return get_metadata(stream)

class EPUBMetadataReader(MetadataReaderPlugin):

    name = 'Read EPUB metadata'
    file_types = set(['epub'])
    description = _('Read metadata from %s files')%'EPUB'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.epub import get_metadata
        return get_metadata(stream)

class HTMLMetadataReader(MetadataReaderPlugin):

    name = 'Read HTML metadata'
    file_types = set(['html'])
    description = _('Read metadata from %s files')%'HTML'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.html import get_metadata
        return get_metadata(stream)

class MOBIMetadataReader(MetadataReaderPlugin):

    name = 'Read MOBI metadata'
    file_types = set(['mobi', 'prc', 'azw'])
    description = _('Read metadata from %s files')%'MOBI'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.mobi.reader import get_metadata
        return get_metadata(stream)


class TOPAZMetadataReader(MetadataReaderPlugin):

    name = 'Read Topaz metadata'
    file_types = set(['tpz', 'azw1'])
    description = _('Read metadata from %s files')%'MOBI'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.topaz import get_metadata
        return get_metadata(stream)

class ODTMetadataReader(MetadataReaderPlugin):

    name = 'Read ODT metadata'
    file_types = set(['odt'])
    description = _('Read metadata from %s files')%'ODT'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.odt import get_metadata
        return get_metadata(stream)

class TXTMetadataReader(MetadataReaderPlugin):

    name = 'Read TXT metadata'
    file_types = set(['txt'])
    description = _('Read metadata from %s files') % 'TXT'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.txt import get_metadata
        return get_metadata(stream)

class LRXMetadataReader(MetadataReaderPlugin):

    name = 'Read LRX metadata'
    file_types = set(['lrx'])
    description = _('Read metadata from %s files')%'LRX'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.lrx import get_metadata
        return get_metadata(stream)

class ComicMetadataReader(MetadataReaderPlugin):

    name = 'Read comic metadata'
    file_types = set(['cbr', 'cbz'])
    description = _('Extract cover from comic files')

    def get_metadata(self, stream, ftype):
        if ftype == 'cbr':
            from calibre.libunrar import extract_member as extract_first
+            extract_first
        else:
            from calibre.libunzip import extract_member as extract_first
        from calibre.ebooks.metadata import MetaInformation
        ret = extract_first(stream)
        mi = MetaInformation(None, None)
        if ret is not None:
@@ -199,65 +200,65 @@ class ComicMetadataReader(MetadataReaderPlugin):
            ext = os.path.splitext(path)[1][1:]
            mi.cover_data = (ext.lower(), data)
        return mi

class ZipMetadataReader(MetadataReaderPlugin):

    name = 'Read ZIP metadata'
    file_types = set(['zip', 'oebzip'])
    description = _('Read metadata from ebooks in ZIP archives')

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.zip import get_metadata
        return get_metadata(stream)

class RARMetadataReader(MetadataReaderPlugin):

    name = 'Read RAR metadata'
    file_types = set(['rar'])
    description = _('Read metadata from ebooks in RAR archives')

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.rar import get_metadata
        return get_metadata(stream)


class EPUBMetadataWriter(MetadataWriterPlugin):

    name = 'Set EPUB metadata'
    file_types = set(['epub'])
    description = _('Set metadata in %s files')%'EPUB'

    def set_metadata(self, stream, mi, type):
        from calibre.ebooks.metadata.epub import set_metadata
        set_metadata(stream, mi)

class LRFMetadataWriter(MetadataWriterPlugin):

    name = 'Set LRF metadata'
    file_types = set(['lrf'])
    description = _('Set metadata in %s files')%'LRF'

    def set_metadata(self, stream, mi, type):
        from calibre.ebooks.lrf.meta import set_metadata
        set_metadata(stream, mi)

class RTFMetadataWriter(MetadataWriterPlugin):

    name = 'Set RTF metadata'
    file_types = set(['rtf'])
    description = _('Set metadata in %s files')%'RTF'

    def set_metadata(self, stream, mi, type):
        from calibre.ebooks.metadata.rtf import set_metadata
        set_metadata(stream, mi)

class MOBIMetadataWriter(MetadataWriterPlugin):

    name = 'Set MOBI metadata'
    file_types = set(['mobi', 'prc', 'azw'])
    description = _('Set metadata in %s files')%'MOBI'
    author = 'Marshall T. Vandegrift'

    def set_metadata(self, stream, mi, type):
        from calibre.ebooks.metadata.mobi import set_metadata
        set_metadata(stream, mi)
@@ -267,14 +268,16 @@ from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.mobi.input import MOBIInput
from calibre.ebooks.pdf.input import PDFInput
from calibre.ebooks.txt.input import TXTInput
+from calibre.ebooks.html.input import HTMLInput
from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.pdf.output import PDFOutput
from calibre.customize.profiles import input_profiles, output_profiles

-plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, TXTInput, OEBOutput, TXTOutput, PDFOutput]
+plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
+           TXTInput, OEBOutput, TXTOutput, PDFOutput]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
        x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
        x.__name__.endswith('MetadataWriter')]
plugins += input_profiles + output_profiles
@@ -163,9 +163,9 @@ class InputFormatPlugin(Plugin):
            for x in os.listdir('.'):
                shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)


        ret = self.convert(stream, options, file_ext,
                           log, accelerators)

        if options.debug_input is not None:
            options.debug_input = os.path.abspath(options.debug_input)
            if not os.path.exists(options.debug_input):
@@ -17,7 +17,7 @@ def tostring(root, strip_comments=False, pretty_print=False):
    root.set('xmlns', 'http://www.w3.org/1999/xhtml')
    root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink')
    for x in root.iter():
-        if x.tag.rpartition('}')[-1].lower() == 'svg':
+        if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg':
            x.set('xmlns', 'http://www.w3.org/2000/svg')

    ans = _tostring(root, encoding='utf-8', pretty_print=pretty_print)
@@ -11,14 +11,12 @@ __docformat__ = 'restructuredtext en'
Input plugin for HTML or OPF ebooks.
'''

-import os, re, sys, cStringIO
+import os, re, sys
from urlparse import urlparse, urlunparse
from urllib import unquote

from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.metadata.meta import get_metadata
-from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
-from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.chardet import xml_to_unicode
from calibre.customize.conversion import OptionRecommendation
from calibre import unicode_path
@@ -213,72 +211,21 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
        sys.setrecursionlimit(orec)


-def opf_traverse(opf_reader, verbose=0, encoding=None):
-    '''
-    Return a list of :class:`HTMLFile` objects in the order specified by the
-    `<spine>` element of the OPF.
-
-    :param opf_reader: An :class:`calibre.ebooks.metadata.opf2.OPF` instance.
-    :param encoding: Specify character encoding of HTML files. If `None` it is
-                     auto-detected.
-    '''
-    if not opf_reader.spine:
-        raise ValueError('OPF does not have a spine')
-    flat = []
-    for path in opf_reader.spine.items():
-        path = os.path.abspath(path)
-        if path not in flat:
-            flat.append(os.path.abspath(path))
-    for item in opf_reader.manifest:
-        if 'html' in item.mime_type:
-            path = os.path.abspath(item.path)
-            if path not in flat:
-                flat.append(path)
-    for i, path in enumerate(flat):
-        if not os.path.exists(path):
-            path = path.replace('&', '%26')
-            if os.path.exists(path):
-                flat[i] = path
-                for item in opf_reader.itermanifest():
-                    item.set('href', item.get('href').replace('&', '%26'))
-    ans = []
-    for path in flat:
-        if os.path.exists(path):
-            ans.append(HTMLFile(path, 0, encoding, verbose))
-        else:
-            print 'WARNING: OPF spine item %s does not exist'%path
-    ans = [f for f in ans if not f.is_binary]
-    return ans
-
-def search_for_opf(dir):
-    for f in os.listdir(dir):
-        if f.lower().endswith('.opf'):
-            return OPF(open(os.path.join(dir, f), 'rb'), dir)

def get_filelist(htmlfile, dir, opts, log):
    '''
    Build list of files referenced by html file or try to detect and use an
    OPF file instead.
    '''
-    print 'Building file list...'
-    opf = search_for_opf(dir)
-    filelist = None
-    if opf is not None:
-        try:
-            filelist = opf_traverse(opf, verbose=opts.verbose,
-                                    encoding=opts.input_encoding)
-        except:
-            pass
-    if not filelist:
-        filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
-                            verbose=opts.verbose,
-                            encoding=opts.input_encoding)\
-                   [0 if opts.breadth_first else 1]
+    log.info('Building file list...')
+    filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
+                        verbose=opts.verbose,
+                        encoding=opts.input_encoding)\
+               [0 if opts.breadth_first else 1]
    if opts.verbose:
        log.debug('\tFound files...')
        for f in filelist:
            log.debug('\t\t', f)
-    return opf, filelist
+    return filelist


class HTMLInput(InputFormatPlugin):
@@ -309,34 +256,32 @@ class HTMLInput(InputFormatPlugin):

    def convert(self, stream, opts, file_ext, log,
                accelerators):
+        from calibre.ebooks.metadata.meta import get_metadata

        basedir = os.getcwd()

        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
        if file_ext == 'opf':
-            opf = OPF(stream, basedir)
-            filelist = opf_traverse(opf, verbose=opts.verbose,
-                                    encoding=opts.input_encoding)
-            mi = MetaInformation(opf)
+            opfpath = stream.name
        else:
-            opf, filelist = get_filelist(stream.name, basedir, opts, log)
-            mi = MetaInformation(opf)
-            mi.smart_update(get_metadata(stream, 'html'))
-
-        mi = OPFCreator(os.getcwdu(), mi)
-        mi.guide = None
-        entries = [(f.path, 'application/xhtml+xml') for f in filelist]
-        mi.create_manifest(entries)
-        mi.create_spine([f.path for f in filelist])
-
-        tocbuf = cStringIO.StringIO()
-        mi.render(open('metadata.opf', 'wb'), tocbuf, 'toc.ncx')
-        toc = tocbuf.getvalue()
-        if toc:
-            open('toc.ncx', 'wb').write(toc)
+            filelist = get_filelist(stream.name, basedir, opts, log)
+            mi = get_metadata(stream, 'html')
+            mi = OPFCreator(os.getcwdu(), mi)
+            mi.guide = None
+            entries = [(f.path, 'application/xhtml+xml') for f in filelist]
+            mi.create_manifest(entries)
+            mi.create_spine([f.path for f in filelist])
+
+            mi.render(open('metadata.opf', 'wb'))
+            opfpath = os.path.abspath('metadata.opf')

        from calibre.ebooks.conversion.plumber import create_oebbook
-        return create_oebbook(log, os.path.abspath('metadata.opf'))
+        oeb = create_oebbook(log, opfpath)
+
+        from calibre.ebooks.oeb.transforms.package import Package
+        Package(os.getcwdu())(oeb, opts)
+
+        return oeb

@@ -573,7 +573,7 @@ class OEBReader(object):
        item = self._find_ncx(opf)
        self._toc_from_opf(opf, item)
        self._pages_from_opf(opf, item)
-        self._ensure_cover_image()
+        #self._ensure_cover_image()


def main(argv=sys.argv):
@@ -6,13 +6,14 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

-import os
+import os, re
from urllib import unquote as urlunquote
from functools import partial

from lxml import etree
import cssutils

+from calibre import sanitize_file_name
from calibre.constants import islinux
from calibre.ebooks.oeb.base import OEB_DOCS, urlnormalize, urldefrag, \
    rewrite_links
@@ -36,15 +37,21 @@ class Package(object):
        self.new_base_path = os.path.abspath(base)

    def rewrite_links_in(self, item):
-        base = os.path.join(self.new_base_path, *item.href.split('/'))
+        old_href = item.old_href.split('#')[0]
+        new_href = item.href.split('#')[0]
+        base = os.path.join(self.old_base_path, *old_href.split('/'))
        base = os.path.dirname(base)
+        self.log.debug('\tRewriting links in', base+'/'+
+                item.href.rpartition('/')[-1])
+        new_base = os.path.join(self.new_base_path, *new_href.split('/'))
+        new_base = os.path.dirname(new_base)

        if etree.iselement(item.data):
-            self.rewrite_links_in_xml(item.data, base)
+            self.rewrite_links_in_xml(item.data, base, new_base)
        elif hasattr(item.data, 'cssText'):
-            self.rewrite_links_in_css(item.data, base)
+            self.rewrite_links_in_css(item.data, base, new_base)

-    def link_replacer(self, link_, base=''):
+    def link_replacer(self, link_, base='', new_base=''):
        link = urlnormalize(link_)
        link, frag = urldefrag(link)
        link = urlunquote(link).replace('/', os.sep)
@@ -55,20 +62,33 @@ class Package(object):
            link = link.lower()
        if link not in self.map:
            return link_
-        nlink = os.path.relpath(self.map[link], base)
+        nlink = os.path.relpath(self.map[link], new_base)
        if frag:
-            nlink = '#'.join(nlink, frag)
+            nlink = '#'.join((nlink, frag))
        return nlink.replace(os.sep, '/')

-    def rewrite_links_in_css(self, sheet, base):
-        repl = partial(self.link_replacer, base=base)
+    def rewrite_links_in_css(self, sheet, base, new_base):
+        repl = partial(self.link_replacer, base=base, new_base=new_base)
        cssutils.replaceUrls(sheet, repl)

-    def rewrite_links_in_xml(self, root, base):
-        repl = partial(self.link_replacer, base=base)
+    def rewrite_links_in_xml(self, root, base, new_base):
+        repl = partial(self.link_replacer, base=base, new_base=new_base)
        rewrite_links(root, repl)

-    def move_manifest_item(self, item):
+    def uniqify_name(self, new_href, hrefs):
+        c = 0
+        while new_href in hrefs:
+            c += 1
+            parts = new_href.split('/')
+            name, ext = os.path.splitext(parts[-1])
+            name = re.sub(r'_\d+$', '', name)
+            name += '_%d'%c
+            parts[-1] = name + ext
+            new_href = '/'.join(parts)
+        return new_href


+    def move_manifest_item(self, item, hrefs):
        item.data # Make sure the data has been loaded and cached
        old_abspath = os.path.join(self.old_base_path,
                *(urldefrag(item.href)[0].split('/')))
@@ -79,11 +99,17 @@ class Package(object):
            new_href = 'content/'
        elif item.href.lower().endswith('.ncx'):
            new_href = ''
-        new_href += bname
+        new_href += sanitize_file_name(bname)
+
+        if new_href in hrefs:
+            new_href = self.uniqify_name(new_href, hrefs)
+        hrefs.add(new_href)

        new_abspath = os.path.join(self.new_base_path, *new_href.split('/'))
        new_abspath = os.path.abspath(new_abspath)
+        item.old_href = self.oeb.manifest.hrefs.pop(item.href).href
        item.href = new_href
+        self.oeb.manifest.hrefs[item.href] = item
        if not islinux:
            old_abspath, new_abspath = old_abspath.lower(), new_abspath.lower()
        if old_abspath != new_abspath:
@@ -91,25 +117,33 @@ class Package(object):

    def rewrite_links_in_toc(self, toc):
        if toc.href:
-            toc.href = self.link_replacer(toc.href, base=self.new_base_path)
+            toc.href = self.link_replacer(toc.href, base=self.old_base_path,
+                    new_base=self.new_base_path)

        for x in toc:
            self.rewrite_links_in_toc(x)

    def __call__(self, oeb, context):
        self.map = {}
-        self.log = self.oeb.log
+        self.log = oeb.log
+        self.oeb = oeb
        self.old_base_path = os.path.abspath(oeb.container.rootdir)

+        hrefs = set([])
        for item in self.oeb.manifest:
-            self.move_manifest_item(item)
+            self.move_manifest_item(item, hrefs)

+        self.log.debug('Rewriting links in OEB documents...')
        for item in self.oeb.manifest:
            self.rewrite_links_in(item)

        if getattr(oeb.toc, 'nodes', False):
+            self.log.debug('Rewriting links in TOC...')
            self.rewrite_links_in_toc(oeb.toc)

        if hasattr(oeb, 'guide'):
+            self.log.debug('Rewriting links in guide...')
            for ref in oeb.guide.values():
-                ref.href = self.link_replacer(ref.href, base=self.new_base_path)
+                ref.href = self.link_replacer(ref.href,
+                        base=self.old_base_path,
+                        new_base=self.new_base_path)
@@ -48,7 +48,8 @@ class OEBWriter(object):
            pretty_print=pretty_print)

    def __call__(self, oeb, path):
-        """Read the book in the :class:`OEBBook` object :param:`oeb` to a file
+        """
+        Read the book in the :class:`OEBBook` object :param:`oeb` to a file
        at :param:`path`.
        """
        version = int(self.version[0])
@@ -1,11 +1,11 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
CLI for downloading feeds.
'''

-import sys, os, logging
+import sys, os
from calibre.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
from calibre.web.fetch.simple import option_parser as _option_parser
from calibre.web.feeds.news import BasicNewsRecipe
@@ -14,13 +14,13 @@ from calibre.utils.config import Config, StringConfig
def config(defaults=None):
    desc = _('Options to control the fetching of periodical content from the web.')
    c = Config('feeds2disk', desc) if defaults is None else StringConfig(defaults, desc)

    web2disk = c.add_group('web2disk', _('Customize the download engine'))
    web2disk('timeout', ['-t', '--timeout'], default=10.0,
              help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),)
    web2disk('delay', ['--delay'], default=0,
              help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
    web2disk('encoding', ['--encoding'], default=None,
              help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))
    web2disk('match_regexps', ['--match-regexp'], default=[], action='append',
              help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.'))
@@ -28,42 +28,42 @@ def config(defaults=None):
              help=_('Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.'))
    web2disk('no_stylesheets', ['--dont-download-stylesheets'], action='store_true', default=False,
              help=_('Do not download CSS stylesheets.'))

    c.add_opt('feeds', ['--feeds'], default=None,
              help=_('''Specify a list of feeds to download. For example:
"['http://feeds.newsweek.com/newsweek/TopNews', 'http://feeds.newsweek.com/headlines/politics']"
If you specify this option, any argument to %prog is ignored and a default recipe is used to download the feeds.'''))
    c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
              help=_('''Be more verbose while processing.'''))
    c.add_opt('title', ['--title'], default=None,
              help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.'))
    c.add_opt('username', ['-u', '--username'], default=None,
              help=_('Username for sites that require a login to access content.'))
    c.add_opt('password', ['-p', '--password'], default=None,
              help=_('Password for sites that require a login to access content.'))
    c.add_opt('lrf', ['--lrf'], default=False, action='store_true',
              help='Optimize fetching for subsequent conversion to LRF.')
    c.add_opt('epub', ['--epub'], default=False, action='store_true',
              help='Optimize fetching for subsequent conversion to EPUB.')
    c.add_opt('mobi', ['--mobi'], default=False, action='store_true',
              help='Optimize fetching for subsequent conversion to MOBI.')
    c.add_opt('recursions', ['--recursions'], default=0,
              help=_('Number of levels of links to follow on webpages that are linked to from feeds. Defaul %default'))
    c.add_opt('output_dir', ['--output-dir'], default='.',
              help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.'))
    c.add_opt('no_progress_bar', ['--no-progress-bar'], default=False, action='store_true',
              help=_("Don't show the progress bar"))
    c.add_opt('debug', ['--debug'], action='store_true', default=False,
              help=_('Very verbose output, useful for debugging.'))
    c.add_opt('test', ['--test'], action='store_true', default=False,
              help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.'))

    return c

USAGE=_('''\
%%prog [options] ARG

%%prog parses an online source of articles, like an RSS or ATOM feed and
fetches the article contents organized in a nice hierarchy.

ARG can be one of:
@@ -85,9 +85,9 @@ def option_parser(usage=USAGE):
    p.remove_option('--verbose')
    p.remove_option('--max-files')
    p.subsume('WEB2DISK OPTIONS', _('Options to control web2disk (used to fetch websites linked from feeds)'))

    p.add_option('--feeds', default=None,
                 help=_('''Specify a list of feeds to download. For example:
"['http://feeds.newsweek.com/newsweek/TopNews', 'http://feeds.newsweek.com/headlines/politics']"
If you specify this option, any argument to %prog is ignored and a default recipe is used to download the feeds.'''))
    p.add_option('--verbose', default=False, action='store_true',
@@ -99,70 +99,62 @@ If you specify this option, any argument to %prog is ignored and a default recip
    p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.')
    p.add_option('--recursions', default=0, type='int',
                 help=_('Number of levels of links to follow on webpages that are linked to from feeds. Defaul %default'))
    p.add_option('--output-dir', default=os.getcwd(),
                 help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.'))
    p.add_option('--no-progress-bar', dest='no_progress_bar', default=False, action='store_true',
                 help=_('Dont show the progress bar'))
    p.add_option('--debug', action='store_true', default=False,
                 help=_('Very verbose output, useful for debugging.'))
    p.add_option('--test', action='store_true', default=False,
                 help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.'))

    return p

class RecipeError(Exception):
    pass

-def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
+def run_recipe(opts, recipe_arg, parser, notification=None):
    if notification is None:
        from calibre.utils.terminfo import TerminalController, ProgressBar
        term = TerminalController(sys.stdout)
        pb = ProgressBar(term, _('Fetching feeds...'), no_progress_bar=opts.no_progress_bar)
        notification = pb.update

    recipe = None
    if opts.feeds is not None:
        recipe = BasicNewsRecipe
    else:
        try:
            if os.access(recipe_arg, os.R_OK):
                recipe = compile_recipe(open(recipe_arg).read())
            else:
                raise Exception('not file')
        except:
            recipe = get_builtin_recipe(recipe_arg)
            if recipe is None:
                recipe = compile_recipe(recipe_arg)

    if recipe is None:
        raise RecipeError(recipe_arg+ ' is an invalid recipe')


-    if handler is None:
-        from calibre import ColoredFormatter
-        handler = logging.StreamHandler(sys.stdout)
-        handler.setLevel(logging.DEBUG if opts.debug else logging.INFO if opts.verbose else logging.WARN)
-        handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
-        logging.getLogger('feeds2disk').addHandler(handler)

    recipe = recipe(opts, parser, notification)

    if not os.path.exists(recipe.output_dir):
        os.makedirs(recipe.output_dir)
    recipe.download(for_lrf=True)

    return recipe

-def main(args=sys.argv, notification=None, handler=None):
+def main(args=sys.argv, notification=None):
    p = option_parser()
    opts, args = p.parse_args(args=args[1:])

    if len(args) != 1 and opts.feeds is None:
        p.print_help()
        return 1
    recipe_arg = args[0] if len(args) > 0 else None
-    run_recipe(opts, recipe_arg, p, notification=notification, handler=handler)
+    run_recipe(opts, recipe_arg, p, notification=notification)

    return 0

if __name__ == '__main__':
@@ -7,7 +7,7 @@ Defines various abstract base classes that can be subclassed to create powerful
__docformat__ = "restructuredtext en"


-import logging, os, cStringIO, time, traceback, re, urlparse, sys
+import os, time, traceback, re, urlparse, sys
from collections import defaultdict
from functools import partial
from contextlib import nested, closing
@@ -27,6 +27,7 @@ from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.web.fetch.simple import option_parser as web2disk_option_parser
from calibre.web.fetch.simple import RecursiveFetcher
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
+from calibre.utils.logging import Log
from calibre.ptempfile import PersistentTemporaryFile, \
                              PersistentTemporaryDirectory

@@ -423,7 +424,7 @@ class BasicNewsRecipe(object):
        '''
        raise NotImplementedError

-    def get_obfuscated_article(self, url, logger):
+    def get_obfuscated_article(self, url):
        '''
        If you set :member:`articles_are_obfuscated` this method is called with
        every article URL. It should return the path to a file on the filesystem
@@ -443,6 +444,7 @@ class BasicNewsRecipe(object):
        :param parser: Command line option parser. Used to intelligently merge options.
        :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
        '''
+        self.log = Log()
        if not isinstance(self.title, unicode):
            self.title = unicode(self.title, 'utf-8', 'replace')

@@ -455,7 +457,6 @@ class BasicNewsRecipe(object):


        if self.debug:
-            logging.getLogger('feeds2disk').setLevel(logging.DEBUG)
            self.verbose = True
        self.report_progress = progress_reporter

@@ -560,20 +561,20 @@ class BasicNewsRecipe(object):
            res = self.build_index()
            self.report_progress(1, _('Download finished'))
            if self.failed_downloads:
-                self.log_warning(_('Failed to download the following articles:'))
+                self.log.warning(_('Failed to download the following articles:'))
                for feed, article, debug in self.failed_downloads:
-                    self.log_warning(article.title+_(' from ')+feed.title)
-                    self.log_debug(article.url)
-                    self.log_debug(debug)
+                    self.log.warning(article.title+_(' from ')+feed.title)
+                    self.log.debug(article.url)
+                    self.log.debug(debug)
            if self.partial_failures:
-                self.log_warning(_('Failed to download parts of the following articles:'))
+                self.log.warning(_('Failed to download parts of the following articles:'))
                for feed, atitle, aurl, debug in self.partial_failures:
-                    self.log_warning(atitle + _(' from ') + feed)
-                    self.log_debug(aurl)
-                    self.log_warning(_('\tFailed links:'))
+                    self.log.warning(atitle + _(' from ') + feed)
+                    self.log.debug(aurl)
+                    self.log.warning(_('\tFailed links:'))
                    for l, tb in debug:
-                        self.log_warning(l)
-                        self.log_debug(tb)
+                        self.log.warning(l)
+                        self.log.debug(tb)
            return res
        finally:
            self.cleanup()
@@ -636,20 +637,11 @@ class BasicNewsRecipe(object):
                extra_css=self.extra_css).render(doctype='xhtml')


-    def create_logger(self, feed_number, article_number):
-        logger = logging.getLogger('feeds2disk.article_%d_%d'%(feed_number, article_number))
-        out = cStringIO.StringIO()
-        handler = logging.StreamHandler(out)
-        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
-        handler.setLevel(logging.INFO if self.verbose else logging.WARNING)
-        if self.debug:
-            handler.setLevel(logging.DEBUG)
-        logger.addHandler(handler)
-        return logger, out
-
-    def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
+    def _fetch_article(self, url, dir, f, a, num_of_feeds):
        self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser
-        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
+        fetcher = RecursiveFetcher(self.web2disk_options, self.log,
+                self.image_map, self.css_map,
+                (url, f, a, num_of_feeds))
        fetcher.base_dir = dir
        fetcher.current_dir = dir
        fetcher.show_progress = False
@@ -661,21 +653,21 @@ class BasicNewsRecipe(object):
            raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
        return res, path, failures

-    def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
-        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
+    def fetch_article(self, url, dir, f, a, num_of_feeds):
+        return self._fetch_article(url, dir, f, a, num_of_feeds)

-    def fetch_obfuscated_article(self, url, dir, logger, f, a, num_of_feeds):
-        path = os.path.abspath(self.get_obfuscated_article(url, logger))
+    def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds):
+        path = os.path.abspath(self.get_obfuscated_article(url))
        url = ('file:'+path) if iswindows else ('file://'+path)
-        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
+        return self._fetch_article(url, dir, f, a, num_of_feeds)

-    def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
+    def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
        templ = templates.EmbeddedContent()
        raw = templ.generate(article).render('html')
        with PersistentTemporaryFile('_feeds2disk.html') as pt:
            pt.write(raw)
        url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
-        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
+        return self._fetch_article(url, dir, f, a, num_of_feeds)


    def build_index(self):
@@ -716,7 +708,6 @@ class BasicNewsRecipe(object):
                 art_dir = os.path.join(feed_dir, 'article_%d'%a)
                 if not os.path.isdir(art_dir):
                     os.makedirs(art_dir)
-                logger, stream = self.create_logger(f, a)
                 try:
                     url = self.print_version(article.url)
                 except NotImplementedError:
@@ -726,10 +717,9 @@ class BasicNewsRecipe(object):
                 func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
                             ((self.fetch_obfuscated_article if self.articles_are_obfuscated \
                               else self.fetch_article), url)
-                req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)),
+                req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
                                   {}, (f, a), self.article_downloaded,
                                   self.error_in_article_download)
-                req.stream = stream
                 req.feed = feed
                 req.article = article
                 req.feed_dir = feed_dir
@@ -768,8 +758,8 @@ class BasicNewsRecipe(object):
             cu = self.get_cover_url()
         except Exception, err:
             cu = None
-            self.log_error(_('Could not download cover: %s')%str(err))
-            self.log_debug(traceback.format_exc())
+            self.log.error(_('Could not download cover: %s')%str(err))
+            self.log.debug(traceback.format_exc())
         if cu is not None:
             ext = cu.rpartition('.')[-1]
             if '?' in ext:
@@ -841,8 +831,8 @@ class BasicNewsRecipe(object):
                 f.write(html.encode('utf-8'))
             renderer = render_html(hf)
             if renderer.tb is not None:
-                self.logger.warning('Failed to render default cover')
-                self.logger.debug(renderer.tb)
+                self.log.warning('Failed to render default cover')
+                self.log.debug(renderer.tb)
             else:
                 cover_file.write(renderer.data)
                 cover_file.flush()
@@ -863,7 +853,7 @@ class BasicNewsRecipe(object):
         manifest.append(os.path.join(dir, 'index.ncx'))
         cpath = getattr(self, 'cover_path', None)
         if cpath is None:
-            pf = PersistentTemporaryFile('_recipe_cover.jpg')
+            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
             self.default_cover(pf)
             cpath = pf.name
         if cpath is not None and os.access(cpath, os.R_OK):
@@ -944,7 +934,7 @@ class BasicNewsRecipe(object):
         a = request.requestID[1]

         article = request.article
-        self.log_debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
+        self.log.debug(_('\nDownloaded article %s from %s')%(article.title, article.url))
         article.orig_url = article.url
         article.url = 'article_%d/index.html'%a
         article.downloaded = True
@@ -956,11 +946,11 @@ class BasicNewsRecipe(object):

     def error_in_article_download(self, request, traceback):
         self.jobs_done += 1
-        self.log_error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
+        self.log.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
         debug = request.stream.getvalue().decode('utf-8', 'ignore')
-        self.log_debug(debug)
-        self.log_debug(traceback)
-        self.log_debug('\n')
+        self.log.debug(debug)
+        self.log.debug(traceback)
+        self.log.debug('\n')
         self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
         self.failed_downloads.append((request.feed, request.article, debug))

@@ -990,7 +980,7 @@ class BasicNewsRecipe(object):
                 feed.populate_from_preparsed_feed(msg, [])
                 feed.description = unicode(err)
                 parsed_feeds.append(feed)
-                self.log_exception(msg)
+                self.log.exception(msg)


         return parsed_feeds
@@ -1057,7 +1047,7 @@ class CustomIndexRecipe(BasicNewsRecipe):
         index = os.path.abspath(self.custom_index())
         url = 'file:'+index if iswindows else 'file://'+index
         self.web2disk_options.browser = self.browser
-        fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
+        fetcher = RecursiveFetcher(self.web2disk_options, self.log)
         fetcher.base_dir = self.output_dir
         fetcher.current_dir = self.output_dir
         fetcher.show_progress = False
@@ -1069,7 +1059,7 @@ class AutomaticNewsRecipe(BasicNewsRecipe):

     keep_only_tags = [dict(name=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]

-    def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
+    def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
         if self.use_embedded_content:
             self.web2disk_options.keep_only_tags = []
-        return BasicNewsRecipe.fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds)
+        return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds)
@@ -7,18 +7,19 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 Fetch a webpage and its links recursively. The webpages are saved to disk in
 UTF-8 encoding with any charset declarations removed.
 '''
-import sys, socket, os, urlparse, logging, re, time, copy, urllib2, threading, traceback
+import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
 from urllib import url2pathname, quote
 from threading import RLock
 from httplib import responses
 from PIL import Image
 from cStringIO import StringIO

-from calibre import setup_cli_handlers, browser, sanitize_file_name, \
+from calibre import browser, sanitize_file_name, \
                     relpath, unicode_path
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.config import OptionParser
+from calibre.utils.logging import Log

 class FetchError(Exception):
     pass
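The stdlib logging import (and setup_cli_handlers) is dropped in favour of the Log class from calibre.utils.logging. Its implementation is not part of this diff; judging only from the calls made on it below (self.log.debug('Fetching', url), self.log.exception('Could not fetch stylesheet %s'% iurl), self.log.warning(...), self.log.error(...)), a stand-in would need roughly this shape. A sketch under that assumption, not the actual calibre class:

    import traceback

    class MinimalLog(object):
        # Each level accepts one or more positional arguments that are joined,
        # mirroring calls such as self.log.debug('Fetching', url) in this diff.
        def _prt(self, level, *args):
            print '%s: %s' % (level, ' '.join(map(str, args)))

        def debug(self, *args):   self._prt('DEBUG', *args)
        def warning(self, *args): self._prt('WARNING', *args)
        def error(self, *args):   self._prt('ERROR', *args)

        def exception(self, *args):
            # callers use this inside except blocks, so also dump the traceback
            self._prt('ERROR', *args)
            self._prt('DEBUG', traceback.format_exc())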
@@ -28,10 +29,10 @@ class closing(object):

     def __init__(self, thing):
         self.thing = thing

     def __enter__(self):
         return self.thing

     def __exit__(self, *exc_info):
         try:
             self.thing.close()
@@ -55,47 +56,48 @@ def save_soup(soup, target):
     for meta in metas:
         if 'charset' in meta.get('content', '').lower():
             meta.replaceWith(nm)

     selfdir = os.path.dirname(target)

     for tag in soup.findAll(['img', 'link', 'a']):
         for key in ('src', 'href'):
             path = tag.get(key, None)
             if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                 tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

     html = unicode(soup)
     with open(target, 'wb') as f:
         f.write(html.encode('utf-8'))

 class response(str):

     def __new__(cls, *args):
         obj = super(response, cls).__new__(cls, *args)
         obj.newurl = None
         return obj

 class DummyLock(object):

     def __enter__(self, *args): return self
     def __exit__(self, *args): pass

 class RecursiveFetcher(object):
     LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                         ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
     #ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
     #                       (
     #
     #                       )
     #                      )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
     default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
     DUMMY_LOCK = DummyLock()

-    def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
+    def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
         if not os.path.exists(self.base_dir):
             os.makedirs(self.base_dir)
+        self.log = log
         self.default_timeout = socket.getdefaulttimeout()
         socket.setdefaulttimeout(options.timeout)
         self.verbose = options.verbose
@@ -122,19 +124,19 @@ class RecursiveFetcher(object):
         self.remove_tags_after = getattr(options, 'remove_tags_after', None)
         self.remove_tags_before = getattr(options, 'remove_tags_before', None)
         self.keep_only_tags = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
         self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
         self.failed_links = []
         self.job_info = job_info

     def get_soup(self, src):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
         nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup
         soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

         if self.keep_only_tags:
             body = Tag(soup, 'body')
             try:
@@ -146,7 +148,7 @@ class RecursiveFetcher(object):
                 soup.find('body').replaceWith(body)
             except AttributeError: # soup has no body element
                 pass

         def remove_beyond(tag, next):
             while tag is not None and tag.name != 'body':
                 after = getattr(tag, next)
@@ -155,27 +157,27 @@ class RecursiveFetcher(object):
                     after.extract()
                     after = ns
                 tag = tag.parent

         if self.remove_tags_after is not None:
             rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
             for spec in rt:
                 tag = soup.find(**spec)
                 remove_beyond(tag, 'nextSibling')

         if self.remove_tags_before is not None:
             tag = soup.find(**self.remove_tags_before)
             remove_beyond(tag, 'previousSibling')

         for kwds in self.remove_tags:
             for tag in soup.findAll(**kwds):
                 tag.extract()
         return self.preprocess_html_ext(soup)


     def fetch_url(self, url):
         data = None
-        self.log_debug('Fetching %s', url)
+        self.log.debug('Fetching', url)
         delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(delta)
         if re.search(r'\s+', url) is not None:
@@ -190,43 +192,43 @@ class RecursiveFetcher(object):
                 raise FetchError, responses[err.code]
             if getattr(err, 'reason', [0])[0] == 104 or \
                 getattr(getattr(err, 'args', [None])[0], 'errno', None) == -2: # Connection reset by peer or Name or service not know
-                self.log_debug('Temporary error, retrying in 1 second')
+                self.log.debug('Temporary error, retrying in 1 second')
                 time.sleep(1)
                 with closing(self.browser.open(url)) as f:
                     data = response(f.read()+f.read())
                     data.newurl = f.geturl()
             else:
                 raise err
         finally:
             self.last_fetch_at = time.time()
         return data


     def start_fetch(self, url):
         soup = BeautifulSoup(u'<a href="'+url+'" />')
-        self.log_info('Downloading')
+        self.log.debug('Downloading')
         res = self.process_links(soup, url, 0, into_dir='')
-        self.log_info('%s saved to %s', url, res)
+        self.log.debug('%s saved to %s'%( url, res))
         return res

     def is_link_ok(self, url):
         for i in self.__class__.LINK_FILTER:
             if i.search(url):
                 return False
         return True

     def is_link_wanted(self, url):
         if self.filter_regexps:
             for f in self.filter_regexps:
                 if f.search(url):
                     return False
         if self.match_regexps:
             for m in self.match_regexps:
                 if m.search(url):
                     return True
             return False
         return True

     def process_stylesheets(self, soup, baseurl):
         diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
         if not os.path.exists(diskpath):
@@ -243,8 +245,7 @@ class RecursiveFetcher(object):
                 try:
                     data = self.fetch_url(iurl)
                 except Exception, err:
-                    self.log_debug('Could not fetch stylesheet %s', iurl)
-                    self.log_debug('Error: %s', str(err), exc_info=True)
+                    self.log.exception('Could not fetch stylesheet %s'% iurl)
                     continue
                 stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                 with self.stylemap_lock:
@@ -253,7 +254,7 @@ class RecursiveFetcher(object):
                     x.write(data)
                 tag['href'] = stylepath
             else:
                 for ns in tag.findAll(text=True):
                     src = str(ns)
                     m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                     if m:
@@ -267,8 +268,7 @@ class RecursiveFetcher(object):
                         try:
                             data = self.fetch_url(iurl)
                         except Exception, err:
-                            self.log_warning('Could not fetch stylesheet %s', iurl)
-                            self.log_debug('Error: %s', str(err), exc_info=True)
+                            self.log.exception('Could not fetch stylesheet %s'% iurl)
                             continue
                         c += 1
                         stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@@ -277,9 +277,9 @@ class RecursiveFetcher(object):
                         with open(stylepath, 'wb') as x:
                             x.write(data)
                         ns.replaceWith(src.replace(m.group(1), stylepath))



     def process_images(self, soup, baseurl):
         diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
         if not os.path.exists(diskpath):
@@ -291,9 +291,6 @@ class RecursiveFetcher(object):
             iurl = self.image_url_processor(baseurl, iurl)
             ext = os.path.splitext(iurl)[1]
             ext = ext[:5]
-            #if not ext:
-            #    self.log_debug('Skipping extensionless image %s', iurl)
-            #    continue
             if not urlparse.urlsplit(iurl).scheme:
                 iurl = urlparse.urljoin(baseurl, iurl, False)
             with self.imagemap_lock:
@@ -303,8 +300,7 @@ class RecursiveFetcher(object):
             try:
                 data = self.fetch_url(iurl)
             except Exception, err:
-                self.log_warning('Could not fetch image %s', iurl)
-                self.log_debug('Error: %s', str(err), exc_info=True)
+                self.log.exception('Could not fetch image %s'% iurl)
                 continue
             c += 1
             fname = sanitize_file_name('img'+str(c)+ext)
@@ -322,7 +318,7 @@ class RecursiveFetcher(object):
                     traceback.print_exc()
                     continue

     def absurl(self, baseurl, tag, key, filter=True):
         iurl = tag[key]
         parts = urlparse.urlsplit(iurl)
         if not parts.netloc and not parts.path:
@@ -330,32 +326,32 @@ class RecursiveFetcher(object):
         if not parts.scheme:
             iurl = urlparse.urljoin(baseurl, iurl, False)
         if not self.is_link_ok(iurl):
-            self.log_debug('Skipping invalid link: %s', iurl)
+            self.log.debug('Skipping invalid link:', iurl)
             return None
         if filter and not self.is_link_wanted(iurl):
-            self.log_debug('Filtered link: '+iurl)
+            self.log.debug('Filtered link: '+iurl)
             return None
         return iurl

     def normurl(self, url):
         parts = list(urlparse.urlsplit(url))
         parts[4] = ''
         return urlparse.urlunsplit(parts)

     def localize_link(self, tag, key, path):
         parts = urlparse.urlsplit(tag[key])
         suffix = '#'+parts.fragment if parts.fragment else ''
         tag[key] = path+suffix

     def process_return_links(self, soup, baseurl):
         for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
             iurl = self.absurl(baseurl, tag, 'href')
             if not iurl:
                 continue
             nurl = self.normurl(iurl)
             if self.filemap.has_key(nurl):
                 self.localize_link(tag, 'href', self.filemap[nurl])

     def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
         res = ''
         diskpath = os.path.join(self.current_dir, into_dir)
@@ -365,7 +361,7 @@ class RecursiveFetcher(object):
         try:
             self.current_dir = diskpath
             tags = list(soup.findAll('a', href=True))

             for c, tag in enumerate(tags):
                 if self.show_progress:
                     print '.',
@@ -395,17 +391,17 @@ class RecursiveFetcher(object):
                         dsrc = dsrc.decode(self.encoding, 'ignore')
                     else:
                         dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                     soup = self.get_soup(dsrc)

                     base = soup.find('base', href=True)
                     if base is not None:
                         newbaseurl = base['href']
-                    self.log_debug('Processing images...')
+                    self.log.debug('Processing images...')
                     self.process_images(soup, newbaseurl)
                     if self.download_stylesheets:
                         self.process_stylesheets(soup, newbaseurl)

                     _fname = basename(iurl)
                     if not isinstance(_fname, unicode):
                         _fname.decode('latin1', 'replace')
@@ -416,56 +412,55 @@ class RecursiveFetcher(object):
                     self.downloaded_paths.append(res)
                     self.filemap[nurl] = res
                     if recursion_level < self.max_recursions:
-                        self.log_debug('Processing links...')
+                        self.log.debug('Processing links...')
                         self.process_links(soup, newbaseurl, recursion_level+1)
                     else:
                         self.process_return_links(soup, newbaseurl)
-                        self.log_debug('Recursion limit reached. Skipping links in %s', iurl)
+                        self.log.debug('Recursion limit reached. Skipping links in', iurl)

                     if callable(self.postprocess_html_ext):
                         soup = self.postprocess_html_ext(soup,
                                 c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
                                 self.job_info)

                         if c==0 and recursion_level == 0:
                             self.called_first = True

                     save_soup(soup, res)
                     self.localize_link(tag, 'href', res)
                 except Exception, err:
                     self.failed_links.append((iurl, traceback.format_exc()))
-                    self.log_warning('Could not fetch link %s', iurl)
-                    self.log_debug('Error: %s', str(err), exc_info=True)
+                    self.log.exception('Could not fetch link', iurl)
                 finally:
                     self.current_dir = diskpath
                     self.files += 1
         finally:
             self.current_dir = prev_dir
         if self.show_progress:
             print
         return res

     def __del__(self):
         dt = getattr(self, 'default_timeout', None)
         if dt is not None:
             socket.setdefaulttimeout(dt)

 def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.com')):
     parser = OptionParser(usage=usage)
     parser.add_option('-d', '--base-dir',
                       help=_('Base directory into which URL is saved. Default is %default'),
                       default='.', type='string', dest='dir')
     parser.add_option('-t', '--timeout',
                       help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),
                       default=10.0, type='float', dest='timeout')
     parser.add_option('-r', '--max-recursions', default=1,
                       help=_('Maximum number of levels to recurse i.e. depth of links to follow. Default %default'),
                       type='int', dest='max_recursions')
     parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files',
                       help=_('The maximum number of files to download. This only applies to files from <a href> tags. Default is %default'))
     parser.add_option('--delay', default=0, dest='delay', type='int',
                       help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
     parser.add_option('--encoding', default=None,
                       help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))
     parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
                       help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.'))
@@ -478,23 +473,21 @@ def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.com')):
     return parser


-def create_fetcher(options, logger=None, image_map={}):
-    if logger is None:
-        level = logging.DEBUG if options.verbose else logging.INFO
-        logger = logging.getLogger('web2disk')
-        setup_cli_handlers(logger, level)
-    return RecursiveFetcher(options, logger, image_map={})
+def create_fetcher(options, image_map={}, log=None):
+    if log is None:
+        log = Log()
+    return RecursiveFetcher(options, log, image_map={})

 def main(args=sys.argv):
     parser = option_parser()
     options, args = parser.parse_args(args)
     if len(args) != 2:
         parser.print_help()
         return 1

     fetcher = create_fetcher(options)
     fetcher.start_fetch(args[1])


 if __name__ == '__main__':
     sys.exit(main())
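create_fetcher() no longer wires up a stdlib logger through setup_cli_handlers; it builds a Log() itself when the caller does not pass one. A usage sketch based only on the functions shown in this diff (the URL is a placeholder):

    parser = option_parser()
    opts, args = parser.parse_args(['web2disk', 'http://example.com'])
    fetcher = create_fetcher(opts)        # constructs a default Log() internally
    index = fetcher.start_fetch(args[1])  # path of the saved start page, per start_fetch()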