PC Quest and Living Digital recipes. News download: Implement is_link_wanted(), a method for context-sensitive link filtering

Kovid Goyal 2010-04-08 17:19:47 +05:30
parent 00bf66065d
commit 2c37b1c36b
5 changed files with 62 additions and 6 deletions
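
The headline change is the new is_link_wanted hook: during a news download, each candidate link is passed to the recipe together with the tag it was found in, so a recipe can filter links by their context in the page rather than by URL pattern alone. Returning True follows the link, returning False skips it, and raising NotImplementedError (the default) falls back to the old match_regexps/filter_regexps behaviour. A minimal sketch of a recipe using the hook (the class name and the attribute test are hypothetical, not part of this commit):

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        title = 'Example'

        def is_link_wanted(self, url, tag):
            # Hypothetical rule: follow only pagination links that occur
            # inside anchors the site marks as "next page".
            if tag.name == 'a' and 'next' in (tag.get('class') or ''):
                return 'page=' in url
            return False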


@@ -0,0 +1,14 @@
+from calibre.web.feeds.news import CalibrePeriodical
+
+class LivingDigital(CalibrePeriodical):
+
+    title = 'Living Digital'
+    calibre_periodicals_slug = 'living-digital'
+
+    description = '''
+    Catch the latest buzz in the digital world with Living Digital. Enjoy
+    reviews, news, features and recommendations on a wide range of consumer
+    technology products - from smartphones to flat panel TVs, netbooks to
+    cameras, and many more consumer lifestyle gadgets.
+    '''
+    language = 'en_IN'


@@ -0,0 +1,14 @@
+from calibre.web.feeds.news import CalibrePeriodical
+
+class PCQ(CalibrePeriodical):
+
+    title = 'PCQuest'
+    calibre_periodicals_slug = 'pc-quest-india'
+
+    description = '''
+    Buying a tech product? Seeking a tech solution? Consult PCQuest, India's
+    market-leading selection and implementation guide for the latest
+    technologies: servers, business apps, security, open source, gadgets and
+    more.
+    '''
+    language = 'en_IN'


@@ -296,6 +296,10 @@ class MobiReader(object):
         self.add_anchors()
         self.processed_html = self.processed_html.decode(self.book_header.codec,
             'ignore')
+        self.processed_html = self.processed_html.replace('</</', '</')
+        self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
+                self.processed_html)
         for pat in ENCODING_PATS:
             self.processed_html = pat.sub('', self.processed_html)
         e2u = functools.partial(entity_to_unicode,
@@ -320,7 +324,6 @@ class MobiReader(object):
                 from lxml.html import soupparser
                 self.log.warning('Malformed markup, parsing using BeautifulSoup')
                 try:
-                    self.processed_html = self.processed_html.replace('</</', '</')
                     root = soupparser.fromstring(self.processed_html)
                 except Exception:
                     self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
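
The MobiReader change broadens an existing cleanup: the '</</' fix used to run only in the BeautifulSoup fallback path (removed from there above) and now runs for every file, joined by a new regex that restores the '>' missing from truncated closing tags. A quick illustration of the two substitutions (the sample string is made up, not from the commit):

    import re

    s = '</p<div>text</div</b></</i>'
    s = s.replace('</</', '</')                  # '</</i>' -> '</i>'
    s = re.sub(r'</([a-zA-Z]+)<', r'</\1><', s)  # '</p<'   -> '</p><'
    print(s)  # '</p><div>text</div></b></i>'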


@@ -150,7 +150,8 @@ class BasicNewsRecipe(Recipe):
     remove_empty_feeds = False

     #: List of regular expressions that determines which links to follow
-    #: If empty, it is ignored. For example::
+    #: If empty, it is ignored. Used only if is_link_wanted is
+    #: not implemented. For example::
     #:
     #:     match_regexps = [r'page=[0-9]+']
     #:
@@ -161,7 +162,8 @@ class BasicNewsRecipe(Recipe):
     match_regexps = []

     #: List of regular expressions that determines which links to ignore
-    #: If empty it is ignored. For example::
+    #: If empty it is ignored. Used only if is_link_wanted is not
+    #: implemented. For example::
     #:
     #:     filter_regexps = [r'ads\.doubleclick\.net']
     #:
@@ -291,6 +293,17 @@ class BasicNewsRecipe(Recipe):
     def short_title(self):
         return self.title

+    def is_link_wanted(self, url, tag):
+        '''
+        Return True if the link should be followed or False otherwise. By
+        default, raises NotImplementedError which causes the downloader to
+        ignore it.
+
+        :param url: The URL to be followed
+        :param tag: The Tag from which the URL was derived
+        '''
+        raise NotImplementedError
+
     def get_cover_url(self):
         '''
         Return a :term:`URL` to the cover image for this issue or `None`.
@@ -575,7 +588,8 @@ class BasicNewsRecipe(Recipe):
         self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
         for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
-                'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
+                'preprocess_html', 'remove_tags_after',
+                'remove_tags_before', 'is_link_wanted'):
             setattr(self.web2disk_options, extra, getattr(self, extra))
         self.web2disk_options.postprocess_html = self._postprocess_html
         self.web2disk_options.encoding = self.encoding
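
The loop above is the only wiring the recipe side needs: each named recipe attribute, now including is_link_wanted, is copied onto the web2disk options object, and the fetcher later retrieves it with getattr without ever importing the recipe. A stripped-down sketch of that plumbing pattern (the class and variable names here are hypothetical):

    class Options(object):
        pass

    class MyRecipe(object):
        def is_link_wanted(self, url, tag):
            raise NotImplementedError

    recipe, options = MyRecipe(), Options()
    for extra in ('is_link_wanted',):
        # Copy the bound method onto the plain options object...
        setattr(options, extra, getattr(recipe, extra))
    # ...so downstream code can call it with no reference to the recipe:
    hook = getattr(options, 'is_link_wanted', None)
    assert hook is not None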


@@ -91,6 +91,9 @@ class DummyLock(object):
     def __enter__(self, *args): return self
     def __exit__(self, *args): pass

+def default_is_link_wanted(url, tag):
+    raise NotImplementedError()
+
 class RecursiveFetcher(object):
     LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                 ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
@@ -134,6 +137,8 @@ class RecursiveFetcher(object):
         self.keep_only_tags = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
         self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
+        self._is_link_wanted = getattr(options, 'is_link_wanted',
+                default_is_link_wanted)
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
         self.failed_links = []
@@ -233,7 +238,13 @@ class RecursiveFetcher(object):
                 return False
         return True

-    def is_link_wanted(self, url):
+    def is_link_wanted(self, url, tag):
+        try:
+            return self._is_link_wanted(url, tag)
+        except NotImplementedError:
+            pass
+        except:
+            return False
         if self.filter_regexps:
             for f in self.filter_regexps:
                 if f.search(url):
@@ -342,7 +353,7 @@ class RecursiveFetcher(object):
             if not self.is_link_ok(iurl):
                 self.log.debug('Skipping invalid link:', iurl)
                 return None
-            if filter and not self.is_link_wanted(iurl):
+            if filter and not self.is_link_wanted(iurl, tag):
                 self.log.debug('Filtered link: '+iurl)
                 return None
         return iurl
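
Putting the fetcher pieces together, the new dispatch order is: ask the recipe hook first, fall back to the regexp filters when it is not implemented, and treat a crashing hook as "not wanted". A standalone sketch of that control flow (the function and parameter names are mine, and the regexp branch follows the documented match_regexps/filter_regexps semantics rather than quoting the full method, which the diff truncates):

    def link_wanted(url, tag, hook, filter_regexps, match_regexps):
        try:
            return hook(url, tag)       # the recipe decides first
        except NotImplementedError:
            pass                        # hook not overridden: use the regexps
        except Exception:
            return False                # a buggy hook drops the link
        if any(f.search(url) for f in filter_regexps):
            return False                # explicitly filtered out
        if match_regexps:
            return any(m.search(url) for m in match_regexps)
        return True                     # no filters: follow by default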