Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-07 18:24:30 -04:00
pclab.pl by ravcio. Fixes #7545 (recipe for http://pclab.pl)
parent a6ebb4c040
commit 377da4abad
resources/recipes/pc_lab.recipe | 70 ++++++++++ (new file)
@@ -0,0 +1,70 @@
#!/usr/bin/env python

from calibre.web.feeds.recipes import BasicNewsRecipe

class PCLab(BasicNewsRecipe):
    cover_url = 'http://pclab.pl/img/logo.png'
    title = u"PC Lab"
    __author__ = 'ravcio - rlelusz[at]gmail.com'
    description = u"Articles from PC Lab website"
    language = 'pl'
    oldest_article = 30.0
    max_articles_per_feed = 100
    recursions = 0
    encoding = 'iso-8859-2'
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False

    keep_only_tags = [
        dict(name='div', attrs={'class':['substance']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['chapters']}),
        dict(name='div', attrs={'id':['script_bxad_slot_display_list_bxad_slot']})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'class':['navigation']})
    ]

    # links to RSS feeds
    feeds = [('PCLab', u'http://pclab.pl/xml/artykuly.xml')]

    # Load the second and subsequent pages of an article.
    #   soup      - the full page containing the 'next' button
    #   appendtag - the tag to which new page content is appended
    def append_page(self, soup, appendtag):
        # find the 'next' pager
        pager = soup.find('div', attrs={'class':'next'})
        if pager:
            # look for the 'a' element linking to the next page (stop if absent)
            a = pager.find('a')
            if a:
                nexturl = a['href']
                soup2 = self.index_to_soup('http://pclab.pl/' + nexturl)

                # pull the article body out of the next page
                pagetext_substance = soup2.find('div', attrs={'class':'substance'})
                pagetext = pagetext_substance.find('div', attrs={'class':'data'})
                pagetext.extract()

                # splice it onto the end of the accumulated article
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)

                # recurse until there is no 'next' link
                self.append_page(soup2, appendtag)

    def preprocess_html(self, soup):
        # the title and navigator live on soup itself, not in soup.body
        self.append_page(soup, soup.body)

        # finally, remove leftover navigation, tag and ad blocks
        tags = soup.findAll('div', attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']})
        for tag in tags:
            tag.extract()

        return soup
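
The pagination logic is the interesting part of this recipe: append_page follows the 'next' link recursively and splices each subsequent page's 'data' div onto the article accumulated so far. Below is a minimal standalone sketch of that follow-the-next-link pattern; it uses bs4 (BeautifulSoup 4) instead of calibre's bundled BeautifulSoup, and the in-memory pages, class names, and URLs are invented for illustration.

# Standalone sketch of the follow-the-'next'-link pattern used by
# append_page. All markup and URLs here are illustrative assumptions;
# only the technique matches the recipe.
from bs4 import BeautifulSoup

PAGES = {  # stand-in for the network: url -> HTML of that page
    'p1': '<body><div class="data">part one</div>'
          '<div class="next"><a href="p2">next</a></div></body>',
    'p2': '<body><div class="data">part two</div></body>',
}

def fetch(url):
    return BeautifulSoup(PAGES[url], 'html.parser')

def append_pages(soup, appendtag):
    pager = soup.find('div', attrs={'class': 'next'})
    a = pager.find('a') if pager else None
    if a:
        soup2 = fetch(a['href'])
        pagetext = soup2.find('div', attrs={'class': 'data'})
        pagetext.extract()                                   # detach from its own tree...
        appendtag.insert(len(appendtag.contents), pagetext)  # ...and splice it in
        append_pages(soup2, appendtag)                       # recurse until no 'next' link

first = fetch('p1')
append_pages(first, first.body)
print(first.body.get_text(' ', strip=True))  # -> part one next part two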
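Before preprocess_html runs, calibre applies the declarative filters: keep_only_tags reduces each fetched page to the 'substance' div, and remove_tags prunes the chapter list and the ad slot inside it. A rough sketch of what that pruning amounts to, again with bs4 and invented sample HTML (only the class and id names come from the recipe):

# Sketch of the effect of keep_only_tags / remove_tags, using bs4.
# The sample HTML is invented; only the class/id names match the recipe.
from bs4 import BeautifulSoup

html = '''<html><body>
<div class="header">site chrome</div>
<div class="substance">
  <div class="chapters">chapter list</div>
  <p>article text</p>
  <div id="script_bxad_slot_display_list_bxad_slot">ad</div>
</div>
</body></html>'''

soup = BeautifulSoup(html, 'html.parser')

# keep_only_tags: reduce the document to the matched element(s)
kept = soup.find('div', attrs={'class': 'substance'})

# remove_tags: prune unwanted elements inside what was kept
for selector in ({'class': 'chapters'}, {'id': 'script_bxad_slot_display_list_bxad_slot'}):
    for tag in kept.find_all('div', attrs=selector):
        tag.extract()

print(kept.get_text(strip=True))  # -> article text

To exercise the recipe end to end, calibre can build it directly with something like "ebook-convert pc_lab.recipe .epub --test", which downloads only a couple of articles per feed.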