mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
PC Quest and Living Digital. News download: Implement is_link_wanted, a method for context-sensitive link filtering
This commit is contained in:
parent
00bf66065d
commit
2c37b1c36b
14
resources/recipes/living_digital.recipe
Normal file
14
resources/recipes/living_digital.recipe
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
from calibre.web.feeds.news import CalibrePeriodical


class LivingDigital(CalibrePeriodical):
    '''
    Calibre periodical recipe for the Living Digital magazine (India).

    Downloads issues via the calibre periodicals service, identified by
    :attr:`calibre_periodicals_slug`.
    '''

    title = 'Living Digital'
    # Identifier for this periodical on the calibre periodicals server.
    calibre_periodicals_slug = 'living-digital'

    description = '''
    Catch the latest buzz in the digital world with Living Digital. Enjoy
    reviews, news, features and recommendations on a wide range of consumer
    technology products - from smartphones to flat panel TVs, netbooks to
    cameras, and many more consumer lifestyle gadgets.
    '''

    # Indian English content.
    language = 'en_IN'
|
14
resources/recipes/pc_quest_india.recipe
Normal file
14
resources/recipes/pc_quest_india.recipe
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
from calibre.web.feeds.news import CalibrePeriodical


class PCQ(CalibrePeriodical):
    '''
    Calibre periodical recipe for PCQuest (India).

    Downloads issues via the calibre periodicals service, identified by
    :attr:`calibre_periodicals_slug`.
    '''

    title = 'PCQuest'
    # Identifier for this periodical on the calibre periodicals server.
    calibre_periodicals_slug = 'pc-quest-india'

    description = '''
    Buying a tech product? Seeking a tech solution? Consult PCQuest, India's
    market-leading selection and implementation guide for the latest
    technologies: servers, business apps, security, open source, gadgets and
    more.
    '''

    # Indian English content.
    language = 'en_IN'
|
@ -296,6 +296,10 @@ class MobiReader(object):
|
|||||||
self.add_anchors()
|
self.add_anchors()
|
||||||
self.processed_html = self.processed_html.decode(self.book_header.codec,
|
self.processed_html = self.processed_html.decode(self.book_header.codec,
|
||||||
'ignore')
|
'ignore')
|
||||||
|
self.processed_html = self.processed_html.replace('</</', '</')
|
||||||
|
self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
|
||||||
|
self.processed_html)
|
||||||
|
|
||||||
for pat in ENCODING_PATS:
|
for pat in ENCODING_PATS:
|
||||||
self.processed_html = pat.sub('', self.processed_html)
|
self.processed_html = pat.sub('', self.processed_html)
|
||||||
e2u = functools.partial(entity_to_unicode,
|
e2u = functools.partial(entity_to_unicode,
|
||||||
@ -320,7 +324,6 @@ class MobiReader(object):
|
|||||||
from lxml.html import soupparser
|
from lxml.html import soupparser
|
||||||
self.log.warning('Malformed markup, parsing using BeautifulSoup')
|
self.log.warning('Malformed markup, parsing using BeautifulSoup')
|
||||||
try:
|
try:
|
||||||
self.processed_html = self.processed_html.replace('</</', '</')
|
|
||||||
root = soupparser.fromstring(self.processed_html)
|
root = soupparser.fromstring(self.processed_html)
|
||||||
except Exception:
|
except Exception:
|
||||||
self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
|
self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
|
||||||
|
@ -150,7 +150,8 @@ class BasicNewsRecipe(Recipe):
|
|||||||
remove_empty_feeds = False
|
remove_empty_feeds = False
|
||||||
|
|
||||||
#: List of regular expressions that determines which links to follow
|
#: List of regular expressions that determines which links to follow
|
||||||
#: If empty, it is ignored. For example::
|
#: If empty, it is ignored. Used only if is_link_wanted is
|
||||||
|
#: not implemented. For example::
|
||||||
#:
|
#:
|
||||||
#: match_regexps = [r'page=[0-9]+']
|
#: match_regexps = [r'page=[0-9]+']
|
||||||
#:
|
#:
|
||||||
@ -161,7 +162,8 @@ class BasicNewsRecipe(Recipe):
|
|||||||
match_regexps = []
|
match_regexps = []
|
||||||
|
|
||||||
#: List of regular expressions that determines which links to ignore
|
#: List of regular expressions that determines which links to ignore
|
||||||
#: If empty it is ignored. For example::
|
#: If empty it is ignored. Used only if is_link_wanted is not
|
||||||
|
#: implemented. For example::
|
||||||
#:
|
#:
|
||||||
#: filter_regexps = [r'ads\.doubleclick\.net']
|
#: filter_regexps = [r'ads\.doubleclick\.net']
|
||||||
#:
|
#:
|
||||||
@ -291,6 +293,17 @@ class BasicNewsRecipe(Recipe):
|
|||||||
def short_title(self):
|
def short_title(self):
|
||||||
return self.title
|
return self.title
|
||||||
|
|
||||||
|
def is_link_wanted(self, url, tag):
    '''
    Decide whether a link encountered during download should be followed.

    Return True to follow the link, False to skip it. The default
    implementation raises :class:`NotImplementedError`, which signals the
    downloader to ignore this hook and fall back to its regular
    regex-based filtering.

    :param url: The URL to be followed
    :param tag: The Tag from which the URL was derived
    '''
    # Recipes override this for context-sensitive filtering; the bare
    # raise is the sentinel meaning "no recipe-level decision".
    raise NotImplementedError
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
'''
|
'''
|
||||||
Return a :term:`URL` to the cover image for this issue or `None`.
|
Return a :term:`URL` to the cover image for this issue or `None`.
|
||||||
@ -575,7 +588,8 @@ class BasicNewsRecipe(Recipe):
|
|||||||
|
|
||||||
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
|
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
|
||||||
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
|
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
|
||||||
'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
|
'preprocess_html', 'remove_tags_after',
|
||||||
|
'remove_tags_before', 'is_link_wanted'):
|
||||||
setattr(self.web2disk_options, extra, getattr(self, extra))
|
setattr(self.web2disk_options, extra, getattr(self, extra))
|
||||||
self.web2disk_options.postprocess_html = self._postprocess_html
|
self.web2disk_options.postprocess_html = self._postprocess_html
|
||||||
self.web2disk_options.encoding = self.encoding
|
self.web2disk_options.encoding = self.encoding
|
||||||
|
@ -91,6 +91,9 @@ class DummyLock(object):
|
|||||||
def __enter__(self, *args): return self
|
def __enter__(self, *args): return self
|
||||||
def __exit__(self, *args): pass
|
def __exit__(self, *args): pass
|
||||||
|
|
||||||
|
def default_is_link_wanted(url, tag):
    '''
    Fallback hook used when the caller supplies no is_link_wanted option.

    Always raises :class:`NotImplementedError`; the fetcher catches that
    and presumably falls back to its regex-based link filtering — confirm
    against RecursiveFetcher.is_link_wanted.
    '''
    raise NotImplementedError()
|
||||||
|
|
||||||
class RecursiveFetcher(object):
|
class RecursiveFetcher(object):
|
||||||
LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
|
LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
|
||||||
('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
|
('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
|
||||||
@ -134,6 +137,8 @@ class RecursiveFetcher(object):
|
|||||||
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
|
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
|
||||||
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
|
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
|
||||||
self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
|
self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
|
||||||
|
self._is_link_wanted = getattr(options, 'is_link_wanted',
|
||||||
|
default_is_link_wanted)
|
||||||
self.download_stylesheets = not options.no_stylesheets
|
self.download_stylesheets = not options.no_stylesheets
|
||||||
self.show_progress = True
|
self.show_progress = True
|
||||||
self.failed_links = []
|
self.failed_links = []
|
||||||
@ -233,7 +238,13 @@ class RecursiveFetcher(object):
|
|||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def is_link_wanted(self, url):
|
def is_link_wanted(self, url, tag):
|
||||||
|
try:
|
||||||
|
return self._is_link_wanted(url, tag)
|
||||||
|
except NotImplementedError:
|
||||||
|
pass
|
||||||
|
except:
|
||||||
|
return False
|
||||||
if self.filter_regexps:
|
if self.filter_regexps:
|
||||||
for f in self.filter_regexps:
|
for f in self.filter_regexps:
|
||||||
if f.search(url):
|
if f.search(url):
|
||||||
@ -342,7 +353,7 @@ class RecursiveFetcher(object):
|
|||||||
if not self.is_link_ok(iurl):
|
if not self.is_link_ok(iurl):
|
||||||
self.log.debug('Skipping invalid link:', iurl)
|
self.log.debug('Skipping invalid link:', iurl)
|
||||||
return None
|
return None
|
||||||
if filter and not self.is_link_wanted(iurl):
|
if filter and not self.is_link_wanted(iurl, tag):
|
||||||
self.log.debug('Filtered link: '+iurl)
|
self.log.debug('Filtered link: '+iurl)
|
||||||
return None
|
return None
|
||||||
return iurl
|
return iurl
|
||||||
|
Loading…
x
Reference in New Issue
Block a user