Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-07 18:24:30 -04:00
pclab.pl by ravcio. Fixes #7545 (recipe for http://pclab.pl)
parent a6ebb4c040
commit 377da4abad
resources/recipes/pc_lab.recipe | 70 ++++++++++ (new file)
@@ -0,0 +1,70 @@
#!/usr/bin/env python

from calibre.web.feeds.recipes import BasicNewsRecipe

class PCLab(BasicNewsRecipe):
    cover_url = 'http://pclab.pl/img/logo.png'
    title = u"PC Lab"
    __author__ = 'ravcio - rlelusz[at]gmail.com'
    description = u"Articles from PC Lab website"
    language = 'pl'
    oldest_article = 30.0
    max_articles_per_feed = 100
    recursions = 0
    encoding = 'iso-8859-2'
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False

    keep_only_tags = [
        dict(name='div', attrs={'class':['substance']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['chapters']}),
        dict(name='div', attrs={'id':['script_bxad_slot_display_list_bxad_slot']})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'class':['navigation']})
    ]

    # links to RSS feeds
    feeds = [('PCLab', u'http://pclab.pl/xml/artykuly.xml')]

    # Load the second and subsequent pages of an article.
    #   soup      - the full page containing the 'next' button
    #   appendtag - the tag to which new page content is appended
    def append_page(self, soup, appendtag):
        # find the 'next' pager
        pager = soup.find('div', attrs={'class':'next'})
        if pager:
            # look for the 'a' element linking to the next page (stop if absent)
            a = pager.find('a')
            if a:
                nexturl = a['href']
                soup2 = self.index_to_soup('http://pclab.pl/' + nexturl)

                # pull the article body out of the next page
                pagetext_substance = soup2.find('div', attrs={'class':'substance'})
                pagetext = pagetext_substance.find('div', attrs={'class':'data'})
                pagetext.extract()

                # splice it onto the end of the accumulated article
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)

                # recurse until there is no 'next' link
                self.append_page(soup2, appendtag)

    def preprocess_html(self, soup):
        # the title and navigator live on soup itself, not in soup.body
        self.append_page(soup, soup.body)

        # finally, remove leftover navigation, tag and ad blocks
        tags = soup.findAll('div', attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']})
        for tag in tags:
            tag.extract()

        return soup
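
The pagination logic is the interesting part of this recipe: append_page follows the 'next' link recursively and splices each subsequent page's 'data' div onto the article accumulated so far. Below is a minimal standalone sketch of that follow-the-next-link pattern; it uses bs4 (BeautifulSoup 4) instead of calibre's bundled BeautifulSoup, and the in-memory pages, class names, and URLs are invented for illustration.

# Standalone sketch of the follow-the-'next'-link pattern used by
# append_page. All markup and URLs here are illustrative assumptions;
# only the technique matches the recipe.
from bs4 import BeautifulSoup

PAGES = {  # stand-in for the network: url -> HTML of that page
    'p1': '<body><div class="data">part one</div>'
          '<div class="next"><a href="p2">next</a></div></body>',
    'p2': '<body><div class="data">part two</div></body>',
}

def fetch(url):
    return BeautifulSoup(PAGES[url], 'html.parser')

def append_pages(soup, appendtag):
    pager = soup.find('div', attrs={'class': 'next'})
    a = pager.find('a') if pager else None
    if a:
        soup2 = fetch(a['href'])
        pagetext = soup2.find('div', attrs={'class': 'data'})
        pagetext.extract()                                   # detach from its own tree...
        appendtag.insert(len(appendtag.contents), pagetext)  # ...and splice it in
        append_pages(soup2, appendtag)                       # recurse until no 'next' link

first = fetch('p1')
append_pages(first, first.body)
print(first.body.get_text(' ', strip=True))  # -> part one next part two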
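Before preprocess_html runs, calibre applies the declarative filters: keep_only_tags reduces each fetched page to the 'substance' div, and remove_tags prunes the chapter list and the ad slot inside it. A rough sketch of what that pruning amounts to, again with bs4 and invented sample HTML (only the class and id names come from the recipe):

# Sketch of the effect of keep_only_tags / remove_tags, using bs4.
# The sample HTML is invented; only the class/id names match the recipe.
from bs4 import BeautifulSoup

html = '''<html><body>
<div class="header">site chrome</div>
<div class="substance">
  <div class="chapters">chapter list</div>
  <p>article text</p>
  <div id="script_bxad_slot_display_list_bxad_slot">ad</div>
</div>
</body></html>'''

soup = BeautifulSoup(html, 'html.parser')

# keep_only_tags: reduce the document to the matched element(s)
kept = soup.find('div', attrs={'class': 'substance'})

# remove_tags: prune unwanted elements inside what was kept
for selector in ({'class': 'chapters'}, {'id': 'script_bxad_slot_display_list_bxad_slot'}):
    for tag in kept.find_all('div', attrs=selector):
        tag.extract()

print(kept.get_text(strip=True))  # -> article text

To exercise the recipe end to end, calibre can build it directly with something like "ebook-convert pc_lab.recipe .epub --test", which downloads only a couple of articles per feed.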