calibre/recipes/pc_lab.recipe

#!/usr/bin/env  python2
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment
class PCLab(BasicNewsRecipe):
    """Recipe that downloads news, articles and guides from pclab.pl.

    Articles on the site can be split across several pages; the extra pages
    are fetched and merged into the first one by ``append_page`` (called from
    ``preprocess_html``).
    """

    cover_url             = 'http://pclab.pl/img/logo.png'
    title                 = u"PC Lab"
    __author__            = 'ravcio - rlelusz[at]gmail.com'
    description           = u"Articles from PC Lab website"
    language              = 'pl'
    oldest_article        = 30
    max_articles_per_feed = 100
    recursions            = 0
    encoding              = 'iso-8859-2'
    no_stylesheets        = True
    remove_javascript     = True
    remove_empty_feeds = True
    use_embedded_content  = False

    # Site root used to turn the relative links found in pages into
    # absolute URLs.
    _site_url = 'http://pclab.pl'

    keep_only_tags = [
            dict(name='div', attrs={'class':['substance']})
                     ]

    remove_tags = [
            dict(name='div', attrs={'class':['toc first', 'toc', 'tags', 'recommendedarticles', 'name', 'zumi', 'chapters']})
                  ]

    # links to RSS feeds
    feeds = [
             (u'Aktualności', 'http://pclab.pl/xml/aktualnosci.xml'),
             (u'Artykuły', u'http://pclab.pl/xml/artykuly.xml'),
             (u'Poradniki', 'http://pclab.pl/xml/poradniki.xml')
             ]

    def append_page(self, soup, appendtag):
        """Append the content of the second and subsequent pages.

        :param soup: full first page, including the 'next' button.
        :param appendtag: tag to which the content of each following page
            is appended (modified in place).

        The ``div.navigation`` block of ``soup`` is removed before returning.
        Pages whose expected content container is missing are skipped
        instead of aborting the whole article.
        """
        # find the 'Next' button
        pager = soup.find('div', attrs={'class':'navigation'})
        if pager:
            a = pager.find('a')
            # The navigation block may contain no link at all (or a link
            # without href); treat that as "not a news pager".
            if a is not None and 'news' in a.get('href', ''):
                pager = None
            else:
                pager = pager.find('div', attrs={'class':'next'})

        # URLs already fetched; guards against a 'next' link that points
        # back to an earlier page, which would otherwise loop forever.
        visited = set()
        while pager:
            # search for 'a' element with link to next page (exit if not found)
            a = pager.find('a')
            nexturl = a.get('href', None) if a is not None else None
            if not nexturl or nexturl in visited:
                break
            visited.add(nexturl)

            soup2 = self.index_to_soup(self._site_url + nexturl)
            pager = soup2.find('div', attrs={'class':'next'})

            pagetext = soup2.find('div', attrs={'class':'substance'})
            if pagetext is not None:
                pagetext = pagetext.find('div', attrs={'class':'data'})
            if pagetext is None:
                # Unexpected page layout: nothing to append from this page,
                # but keep following the pager found above.
                continue

            # strip HTML comments from the fetched fragment
            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
            for comment in comments:
                comment.extract()

            # append at the end of the target tag
            appendtag.insert(len(appendtag.contents), pagetext)

        # the navigation block is no longer needed once pages are merged
        pager = soup.find('div', attrs={'class':'navigation'})
        if pager:
            pager.extract()

    def preprocess_html(self, soup):
        """Merge multi-page articles and make site-relative links absolute.

        :param soup: parsed article page.
        :return: the same ``soup`` object, modified in place.
        """
        # soup.body contains no title and no navigator, they are in soup
        self.append_page(soup, soup.body)
        for link in soup.findAll('a'):
            href = link.get('href', None)
            if href and href.startswith('/'):
                link['href'] = self._site_url + href

        return soup