Arrêt sur images by François D.

This commit is contained in:
Kovid Goyal 2013-03-31 07:21:58 +05:30
parent 9be9b8fb36
commit a671413989
3 changed files with 62 additions and 4 deletions

View File

@@ -0,0 +1,54 @@
from __future__ import unicode_literals

__license__ = 'WTFPL'
__author__ = '2013, François D. <franek at chicour.net>'
__description__ = 'Get some fresh news from Arrêt sur images'

from calibre.web.feeds.recipes import BasicNewsRecipe


class Asi(BasicNewsRecipe):

    title = 'Arrêt sur images'
    __author__ = 'François D. (aka franek)'
    description = 'Global news in french from news site "Arrêt sur images"'

    oldest_article = 7.0
    language = 'fr'
    needs_subscription = True
    max_articles_per_feed = 100
    simultaneous_downloads = 1
    timefmt = '[%a, %d %b %Y %I:%M +0200]'
    cover_url = 'http://www.arretsurimages.net/images/header/menu/menu_1.png'
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    feeds = [
        ('vite dit et gratuit', 'http://www.arretsurimages.net/vite-dit.rss'),
        ('Toutes les chroniques', 'http://www.arretsurimages.net/chroniques.rss'),
        ('Contenus et dossiers', 'http://www.arretsurimages.net/dossiers.rss'),
    ]

    conversion_options = {'smarten_punctuation': True}

    remove_tags = [
        dict(id='vite-titre'), dict(id='header'), dict(id='wrap-connexion'),
        dict(id='col_right'), dict(name='div', attrs={'class': 'bloc-chroniqueur-2'}),
        dict(id='footercontainer'),
    ]

    def print_version(self, url):
        return url.replace('contenu.php', 'contenu-imprimable.php')

    def get_browser(self):
        # Need to use robust HTML parser
        br = BasicNewsRecipe.get_browser(self, use_robust_parser=True)
        if self.username is not None and self.password is not None:
            br.open('http://www.arretsurimages.net/index.php')
            br.select_form(nr=0)
            br.form.set_all_readonly(False)
            br['redir'] = 'forum/login.php'
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br
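Note on the use_robust_parser keyword used by get_browser() above: this commit only touches the browser() helper and the Browser wrapper shown in the next two files, so it assumes that BasicNewsRecipe.get_browser forwards extra keyword arguments to calibre's browser() function. A minimal sketch of that assumed call chain (the class and method below are illustrative, not calibre's actual source):

# Sketch only: assumed forwarding of keyword arguments from a recipe's
# get_browser() call down to the patched calibre.browser() helper.
from calibre import browser

class RecipeBrowserSketch(object):
    def get_browser(self, *args, **kwargs):
        # use_robust_parser=True is assumed to pass through unchanged, so the
        # mechanize browser is built with RobustFactory (see the next file).
        return browser(*args, **kwargs)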

View File

@@ -376,7 +376,7 @@ def random_user_agent(choose=None):
         choose = random.randint(0, len(choices)-1)
     return choices[choose]
 
-def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
+def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None, use_robust_parser=False):
     '''
     Create a mechanize browser for web scraping. The browser handles cookies,
     refresh requests and ignores robots.txt. Also uses proxy if available.
@@ -385,7 +385,11 @@ def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
     :param max_time: Maximum time in seconds to wait during a refresh request
     '''
     from calibre.utils.browser import Browser
-    opener = Browser()
+    if use_robust_parser:
+        import mechanize
+        opener = Browser(factory=mechanize.RobustFactory())
+    else:
+        opener = Browser()
     opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
     opener.set_handle_robots(False)
     if user_agent is None:
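With the new keyword in place, a caller can request mechanize's tolerant parser directly. A small usage sketch, assuming only the change above (the URL is simply the one from the recipe, presumably the markup that trips up the default parser):

# Usage sketch for the new use_robust_parser flag.
from calibre import browser

br = browser(use_robust_parser=True)  # mechanize browser built with RobustFactory
br.open('http://www.arretsurimages.net/index.php')
br.select_form(nr=0)  # robust parsing helps mechanize find forms in messy HTML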

View File

@@ -17,10 +17,10 @@ class Browser(B):
     cookie jar. All clones share the same browser configuration.
     '''
 
-    def __init__(self):
+    def __init__(self, *args, **kwargs):
         self._clone_actions = {}
-        B.__init__(self)
+        B.__init__(self, *args, **kwargs)
         self.set_cookiejar(CookieJar())
 
     def set_handle_refresh(self, *args, **kwargs):
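The widened __init__ signature is what lets the patched browser() construct the wrapper as Browser(factory=mechanize.RobustFactory()). A rough standalone illustration of the same pattern, assuming B is mechanize.Browser as in calibre.utils.browser:

# Standalone sketch of why *args/**kwargs must be forwarded to mechanize.
import mechanize

class Browser(mechanize.Browser):

    def __init__(self, *args, **kwargs):
        self._clone_actions = {}
        # With the old zero-argument signature, passing factory=... raised a
        # TypeError; forwarding everything lets mechanize pick the HTML parser.
        mechanize.Browser.__init__(self, *args, **kwargs)

br = Browser(factory=mechanize.RobustFactory())  # tag-soup tolerant parsing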