calibre/recipes/pc_lab.recipe

#!/usr/bin/env  python

from calibre.web.feeds.recipes import BasicNewsRecipe

class PCLab(BasicNewsRecipe):
    """Calibre news recipe for the PC Lab (pclab.pl) articles feed.

    ``preprocess_html`` follows each page's 'next' link and merges the
    content of the follow-up pages into the first page, so a paginated
    article ends up as a single document.
    """

    cover_url             = 'http://pclab.pl/img/logo.png'
    title                 = u"PC Lab"
    __author__            = 'ravcio - rlelusz[at]gmail.com'
    description           = u"Articles from PC Lab website"
    language              = 'pl'
    oldest_article        = 30.0
    max_articles_per_feed = 100
    recursions            = 0
    encoding              = 'iso-8859-2'
    no_stylesheets        = True
    remove_javascript     = True
    use_embedded_content  = False

    # Only the article container is kept from each downloaded page.
    keep_only_tags = [
            dict(name='div', attrs={'class':['substance']})
                     ]

    # Chapter index and an ad slot are dropped from the kept content.
    remove_tags = [
            dict(name='div', attrs={'class':['chapters']})
            ,dict(name='div', attrs={'id':['script_bxad_slot_display_list_bxad_slot']})
                  ]

    remove_tags_after = [
            dict(name='div', attrs={'class':['navigation']})
                ]

    # links to RSS feeds
    feeds = [ ('PCLab', u'http://pclab.pl/xml/artykuly.xml') ]

    def append_page(self, soup, appendtag):
        """Append the content of every follow-up page to ``appendtag``.

        soup      -- parsed page that may contain a ``div.next`` pager
        appendtag -- tag that receives each follow-up page's ``div.data``

        Paging stops quietly when there is no usable 'next' link, when a
        fetched page lacks the expected ``div.substance`` / ``div.data``
        structure, or when a link resolves to a URL already fetched by
        this call (protects against a pager that loops).
        """
        # Local import keeps the recipe usable on both Python 3 and Python 2.
        try:
            from urllib.parse import urljoin
        except ImportError:
            from urlparse import urljoin

        fetched = set()
        page = soup
        while True:
            # find the 'Next' button and the link inside it
            pager = page.find('div', attrs={'class':'next'})
            link = pager.find('a') if pager is not None else None
            href = link.get('href') if link is not None else None
            if not href:
                return

            # urljoin copes with relative, root-relative and absolute hrefs;
            # plain concatenation only worked for the relative form.
            nexturl = urljoin('http://pclab.pl/', href)
            if nexturl in fetched:
                return
            fetched.add(nexturl)

            page = self.index_to_soup(nexturl)

            substance = page.find('div', attrs={'class':'substance'})
            pagetext = None
            if substance is not None:
                pagetext = substance.find('div', attrs={'class':'data'})
            if pagetext is None:
                # Unexpected page layout: keep what was merged so far.
                return

            # Detach the content from the fetched page, then add it at the
            # end of the target tag.
            pagetext.extract()
            appendtag.insert(len(appendtag.contents), pagetext)

    def preprocess_html(self, soup):
        """Merge follow-up pages into ``soup`` and strip leftover blocks.

        Returns the same ``soup`` object, modified in place.
        """
        # The pager is looked up in the whole document, while the new
        # content is appended to its body.
        body = soup.body
        if body is not None:
            self.append_page(soup, body)

        # finally remove some tags (also covers content merged in above)
        leftovers = soup.findAll('div',attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']})
        for tag in leftovers:
            tag.extract()

        return soup