PC Quest and Living Digital. News download: Implement is_link_wanted, a method for context-sensitive link filtering

2026-02-27 13:40:09 -05:00 · 2010-04-08 17:19:47 +05:30 · 2010-04-08 17:19:47 +05:30 · 2c37b1c36b
commit 2c37b1c36b
parent 00bf66065d
5 changed files with 62 additions and 6 deletions
--- a/resources/recipes/living_digital.recipe
+++ b/resources/recipes/living_digital.recipe
@ -0,0 +1,14 @@
+from calibre.web.feeds.news import CalibrePeriodical
+
+class LivingDigital(CalibrePeriodical):
+    # Declares metadata only; no CalibrePeriodical methods are overridden.
+    title = 'Living Digital'
+    calibre_periodicals_slug = 'living-digital'
+
+    description = '''
+    Catch the latest buzz in the digital world with Living Digital. Enjoy
+    reviews, news, features and recommendations on a wide range of consumer
+    technology products - from smartphones to flat panel TVs, netbooks to
+    cameras, and many more consumer lifestyle gadgets.
+    '''
+    language = 'en_IN'
--- a/resources/recipes/pc_quest_india.recipe
+++ b/resources/recipes/pc_quest_india.recipe
@ -0,0 +1,14 @@
+from calibre.web.feeds.news import CalibrePeriodical
+
+class PCQ(CalibrePeriodical):
+    # Declares metadata only; no CalibrePeriodical methods are overridden.
+    title = 'PCQuest'
+    calibre_periodicals_slug = 'pc-quest-india'
+
+    description = '''
+    Buying a tech product? Seeking a tech solution? Consult PCQuest, India's
+    market-leading selection and implementation guide for the latest
+    technologies: servers, business apps, security, open source, gadgets and
+    more.
+    '''
+    language = 'en_IN'
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@ -296,6 +296,10 @@ class MobiReader(object):
        self.add_anchors()
        self.processed_html = self.processed_html.decode(self.book_header.codec,
            'ignore')
+        self.processed_html = self.processed_html.replace('</</', '</')
+        self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
+                self.processed_html)
+
        for pat in ENCODING_PATS:
            self.processed_html = pat.sub('', self.processed_html)
        e2u = functools.partial(entity_to_unicode,
@ -320,7 +324,6 @@ class MobiReader(object):
            from lxml.html import soupparser
            self.log.warning('Malformed markup, parsing using BeautifulSoup')
            try:
-                self.processed_html = self.processed_html.replace('</</', '</')
                root = soupparser.fromstring(self.processed_html)
            except Exception:
                self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@ -150,7 +150,8 @@ class BasicNewsRecipe(Recipe):
    remove_empty_feeds = False

    #: List of regular expressions that determines which links to follow
-    #: If empty, it is ignored. For example::
+    #: If empty, it is ignored. Used only if is_link_wanted is
+    #: not implemented. For example::
    #:
    #:     match_regexps = [r'page=[0-9]+']
    #:
@ -161,7 +162,8 @@ class BasicNewsRecipe(Recipe):
    match_regexps         = []

    #: List of regular expressions that determines which links to ignore
-    #: If empty it is ignored. For example::
+    #: If empty it is ignored. Used only if is_link_wanted is not
+    #: implemented. For example::
    #:
    #:     filter_regexps = [r'ads\.doubleclick\.net']
    #:
@ -291,6 +293,17 @@ class BasicNewsRecipe(Recipe):
    def short_title(self):
        return self.title

+    def is_link_wanted(self, url, tag):
+        '''
+        Return True if the link should be followed or False otherwise. By
+        default, raises NotImplementedError, which causes the downloader to
+        ignore this method and use match_regexps and filter_regexps instead.
+
+        :param url: The URL to be followed
+        :param tag: The Tag from which the URL was derived
+        '''
+        raise NotImplementedError
+
    def get_cover_url(self):
        '''
        Return a :term:`URL` to the cover image for this issue or `None`.
@ -575,7 +588,8 @@ class BasicNewsRecipe(Recipe):

        self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
        for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
-                      'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
+                      'preprocess_html', 'remove_tags_after',
+                      'remove_tags_before', 'is_link_wanted'):
            setattr(self.web2disk_options, extra, getattr(self, extra))
        self.web2disk_options.postprocess_html = self._postprocess_html
        self.web2disk_options.encoding = self.encoding
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@ -91,6 +91,9 @@ class DummyLock(object):
    def __enter__(self, *args): return self
    def __exit__(self, *args): pass

+def default_is_link_wanted(url, tag):
+    raise NotImplementedError()  # caught by RecursiveFetcher.is_link_wanted
+
 class RecursiveFetcher(object):
    LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
@ -134,6 +137,8 @@ class RecursiveFetcher(object):
        self.keep_only_tags      = getattr(options, 'keep_only_tags', [])
        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
        self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
+        self._is_link_wanted     = getattr(options, 'is_link_wanted',
+                default_is_link_wanted)
        self.download_stylesheets = not options.no_stylesheets
        self.show_progress = True
        self.failed_links = []
@ -233,7 +238,13 @@ class RecursiveFetcher(object):
                return False
        return True

-    def is_link_wanted(self, url):
+    def is_link_wanted(self, url, tag):
+        try:
+            return self._is_link_wanted(url, tag)
+        except NotImplementedError:
+            pass
+        except:
+            return False
        if self.filter_regexps:
            for f in self.filter_regexps:
                if f.search(url):
@ -342,7 +353,7 @@ class RecursiveFetcher(object):
        if not self.is_link_ok(iurl):
            self.log.debug('Skipping invalid link:', iurl)
            return None
-        if filter and not self.is_link_wanted(iurl):
+        if filter and not self.is_link_wanted(iurl, tag):
            self.log.debug('Filtered link: '+iurl)
            return None
        return iurl