mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
PC Quest and Living Digital. News download: Implement is_link_wanted, a method for context-sensitive link filtering
This commit is contained in:
parent
00bf66065d
commit
2c37b1c36b
14
resources/recipes/living_digital.recipe
Normal file
14
resources/recipes/living_digital.recipe
Normal file
@ -0,0 +1,14 @@
|
||||
from calibre.web.feeds.news import CalibrePeriodical
|
||||
|
||||
# Recipe for the 'Living Digital' periodical. The class defines no methods;
# it only sets the metadata attributes below, so all behaviour comes from
# the CalibrePeriodical base class.
# NOTE(review): this listing is a flattened diff view, so the attributes are
# presumably indented as class attributes in the real file - confirm.
class LivingDigital(CalibrePeriodical):
|
||||
|
||||
# Display title of the periodical.
title = 'Living Digital'
|
||||
# presumably the identifier CalibrePeriodical uses to locate this
# publication - TODO confirm against the base class.
calibre_periodicals_slug = 'living-digital'
|
||||
|
||||
# Human-readable blurb describing the periodical.
description = '''
|
||||
Catch the latest buzz in the digital world with Living Digital. Enjoy
|
||||
reviews, news, features and recommendations on a wide range of consumer
|
||||
technology products - from smartphones to flat panel TVs, netbooks to
|
||||
cameras, and many more consumer lifestyle gadgets.
|
||||
'''
|
||||
# Language/locale tag of the content: English (India).
language = 'en_IN'
|
14
resources/recipes/pc_quest_india.recipe
Normal file
14
resources/recipes/pc_quest_india.recipe
Normal file
@ -0,0 +1,14 @@
|
||||
from calibre.web.feeds.news import CalibrePeriodical
|
||||
|
||||
# Recipe for the 'PCQuest' periodical. The class defines no methods; it only
# sets the metadata attributes below, so all behaviour comes from the
# CalibrePeriodical base class.
# NOTE(review): this listing is a flattened diff view, so the attributes are
# presumably indented as class attributes in the real file - confirm.
class PCQ(CalibrePeriodical):
|
||||
|
||||
# Display title of the periodical.
title = 'PCQuest'
|
||||
# presumably the identifier CalibrePeriodical uses to locate this
# publication - TODO confirm against the base class.
calibre_periodicals_slug = 'pc-quest-india'
|
||||
|
||||
# Human-readable blurb describing the periodical.
description = '''
|
||||
Buying a tech product? Seeking a tech solution? Consult PCQuest, India's
|
||||
market-leading selection and implementation guide for the latest
|
||||
technologies: servers, business apps, security, open source, gadgets and
|
||||
more.
|
||||
'''
|
||||
# Language/locale tag of the content: English (India).
language = 'en_IN'
|
@ -296,6 +296,10 @@ class MobiReader(object):
|
||||
self.add_anchors()
|
||||
self.processed_html = self.processed_html.decode(self.book_header.codec,
|
||||
'ignore')
|
||||
self.processed_html = self.processed_html.replace('</</', '</')
|
||||
self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
|
||||
self.processed_html)
|
||||
|
||||
for pat in ENCODING_PATS:
|
||||
self.processed_html = pat.sub('', self.processed_html)
|
||||
e2u = functools.partial(entity_to_unicode,
|
||||
@ -320,7 +324,6 @@ class MobiReader(object):
|
||||
from lxml.html import soupparser
|
||||
self.log.warning('Malformed markup, parsing using BeautifulSoup')
|
||||
try:
|
||||
self.processed_html = self.processed_html.replace('</</', '</')
|
||||
root = soupparser.fromstring(self.processed_html)
|
||||
except Exception:
|
||||
self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
|
||||
|
@ -150,7 +150,8 @@ class BasicNewsRecipe(Recipe):
|
||||
remove_empty_feeds = False
|
||||
|
||||
#: List of regular expressions that determines which links to follow
|
||||
#: If empty, it is ignored. For example::
|
||||
#: If empty, it is ignored. Used only if is_link_wanted is
|
||||
#: not implemented. For example::
|
||||
#:
|
||||
#: match_regexps = [r'page=[0-9]+']
|
||||
#:
|
||||
@ -161,7 +162,8 @@ class BasicNewsRecipe(Recipe):
|
||||
match_regexps = []
|
||||
|
||||
#: List of regular expressions that determines which links to ignore
|
||||
#: If empty it is ignored. For example::
|
||||
#: If empty it is ignored. Used only if is_link_wanted is not
|
||||
#: implemented. For example::
|
||||
#:
|
||||
#: filter_regexps = [r'ads\.doubleclick\.net']
|
||||
#:
|
||||
@ -291,6 +293,17 @@ class BasicNewsRecipe(Recipe):
|
||||
def short_title(self):
|
||||
return self.title
|
||||
|
||||
def is_link_wanted(self, url, tag):
    '''
    Decide whether the downloader should follow a link. Override this in
    a recipe to filter links based on their context.

    Return True to follow the link and False to skip it. This base
    implementation always raises NotImplementedError, which signals that
    no custom filter exists; the downloader then falls back to its
    regular-expression based filtering (match_regexps/filter_regexps).
    If an override raises any other exception, the link is skipped.

    :param url: The URL to be followed
    :param tag: The Tag from which the URL was derived
    '''
    # Deliberately unimplemented: the exception is the "no opinion" signal.
    raise NotImplementedError()
|
||||
|
||||
def get_cover_url(self):
|
||||
'''
|
||||
Return a :term:`URL` to the cover image for this issue or `None`.
|
||||
@ -575,7 +588,8 @@ class BasicNewsRecipe(Recipe):
|
||||
|
||||
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
|
||||
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
|
||||
'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
|
||||
'preprocess_html', 'remove_tags_after',
|
||||
'remove_tags_before', 'is_link_wanted'):
|
||||
setattr(self.web2disk_options, extra, getattr(self, extra))
|
||||
self.web2disk_options.postprocess_html = self._postprocess_html
|
||||
self.web2disk_options.encoding = self.encoding
|
||||
|
@ -91,6 +91,9 @@ class DummyLock(object):
|
||||
def __enter__(self, *args): return self
|
||||
def __exit__(self, *args): pass
|
||||
|
||||
def default_is_link_wanted(url, tag):
    '''
    Fallback link filter used when the options object supplies no
    ``is_link_wanted`` callable.

    Always raises NotImplementedError; the fetcher catches that and falls
    through to its regular-expression based filtering.

    :param url: The URL being considered (unused)
    :param tag: The tag the URL was taken from (unused)
    '''
    raise NotImplementedError
|
||||
|
||||
class RecursiveFetcher(object):
|
||||
LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
|
||||
('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
|
||||
@ -134,6 +137,8 @@ class RecursiveFetcher(object):
|
||||
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
|
||||
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
|
||||
self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
|
||||
self._is_link_wanted = getattr(options, 'is_link_wanted',
|
||||
default_is_link_wanted)
|
||||
self.download_stylesheets = not options.no_stylesheets
|
||||
self.show_progress = True
|
||||
self.failed_links = []
|
||||
@ -233,7 +238,13 @@ class RecursiveFetcher(object):
|
||||
return False
|
||||
return True
|
||||
|
||||
def is_link_wanted(self, url):
|
||||
def is_link_wanted(self, url, tag):
|
||||
try:
|
||||
return self._is_link_wanted(url, tag)
|
||||
except NotImplementedError:
|
||||
pass
|
||||
except:
|
||||
return False
|
||||
if self.filter_regexps:
|
||||
for f in self.filter_regexps:
|
||||
if f.search(url):
|
||||
@ -342,7 +353,7 @@ class RecursiveFetcher(object):
|
||||
if not self.is_link_ok(iurl):
|
||||
self.log.debug('Skipping invalid link:', iurl)
|
||||
return None
|
||||
if filter and not self.is_link_wanted(iurl):
|
||||
if filter and not self.is_link_wanted(iurl, tag):
|
||||
self.log.debug('Filtered link: '+iurl)
|
||||
return None
|
||||
return iurl
|
||||
|
Loading…
x
Reference in New Issue
Block a user